Merge pull request #6288 from dearblue/closing
[mruby.git] / src / string.c
blobf8ff4afc2138695d1fa60ebff98be3225d8e44a7
1 /*
2 ** string.c - String class
3 **
4 ** See Copyright Notice in mruby.h
5 */
7 #ifdef _MSC_VER
8 # define _CRT_NONSTDC_NO_DEPRECATE
9 # define WIN32_LEAN_AND_MEAN
10 #endif
12 #include <mruby.h>
13 #include <mruby/array.h>
14 #include <mruby/class.h>
15 #include <mruby/range.h>
16 #include <mruby/string.h>
17 #include <mruby/numeric.h>
18 #include <mruby/internal.h>
19 #include <mruby/presym.h>
20 #include <string.h>
22 typedef struct mrb_shared_string {
23 int refcnt;
24 mrb_int capa;
25 char *ptr;
26 } mrb_shared_string;
28 const char mrb_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz";
30 #define mrb_obj_alloc_string(mrb) MRB_OBJ_ALLOC((mrb), MRB_TT_STRING, (mrb)->string_class)
32 #ifndef MRB_STR_LENGTH_MAX
33 #if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__)
34 #define MRB_STR_LENGTH_MAX 0
35 #else
36 #define MRB_STR_LENGTH_MAX 1048576
37 #endif
38 #endif
40 static void
41 str_check_length(mrb_state *mrb, mrb_int len)
43 if (len < 0) {
44 mrb_raise(mrb, E_ARGUMENT_ERROR, "negative (or overflowed) string size");
46 #if MRB_STR_LENGTH_MAX != 0
47 if (len > MRB_STR_LENGTH_MAX-1) {
48 mrb_raisef(mrb, E_ARGUMENT_ERROR, "string too long (len=%i max=" MRB_STRINGIZE(MRB_STR_LENGTH_MAX) ")", len);
50 #endif
53 static struct RString*
54 str_init_normal_capa(mrb_state *mrb, struct RString *s,
55 const char *p, mrb_int len, mrb_int capa)
57 str_check_length(mrb, capa);
58 char *dst = (char*)mrb_malloc(mrb, capa + 1);
59 if (p) memcpy(dst, p, len);
60 dst[len] = '\0';
61 s->as.heap.ptr = dst;
62 s->as.heap.len = len;
63 s->as.heap.aux.capa = capa;
64 RSTR_UNSET_TYPE_FLAG(s);
65 return s;
68 static struct RString*
69 str_init_normal(mrb_state *mrb, struct RString *s, const char *p, mrb_int len)
71 return str_init_normal_capa(mrb, s, p, len, len);
74 static struct RString*
75 str_init_embed(struct RString *s, const char *p, mrb_int len)
77 mrb_assert(len >= 0);
78 if (p) memcpy(RSTR_EMBED_PTR(s), p, len);
79 RSTR_EMBED_PTR(s)[len] = '\0';
80 RSTR_SET_TYPE_FLAG(s, EMBED);
81 RSTR_SET_EMBED_LEN(s, len);
82 return s;
85 static struct RString*
86 str_init_nofree(struct RString *s, const char *p, mrb_int len)
88 s->as.heap.ptr = (char*)p;
89 s->as.heap.len = len;
90 s->as.heap.aux.capa = 0; /* nofree */
91 RSTR_SET_TYPE_FLAG(s, NOFREE);
92 return s;
95 static struct RString*
96 str_init_shared(mrb_state *mrb, const struct RString *orig, struct RString *s, mrb_shared_string *shared)
98 if (shared) {
99 shared->refcnt++;
101 else {
102 shared = (mrb_shared_string*)mrb_malloc(mrb, sizeof(mrb_shared_string));
103 shared->refcnt = 1;
104 shared->ptr = orig->as.heap.ptr;
105 shared->capa = orig->as.heap.aux.capa;
107 s->as.heap.ptr = orig->as.heap.ptr;
108 s->as.heap.len = orig->as.heap.len;
109 s->as.heap.aux.shared = shared;
110 RSTR_SET_TYPE_FLAG(s, SHARED);
111 return s;
114 static struct RString*
115 str_init_fshared(const struct RString *orig, struct RString *s, struct RString *fshared)
117 s->as.heap.ptr = orig->as.heap.ptr;
118 s->as.heap.len = orig->as.heap.len;
119 s->as.heap.aux.fshared = fshared;
120 RSTR_SET_TYPE_FLAG(s, FSHARED);
121 return s;
124 static struct RString*
125 str_init_modifiable(mrb_state *mrb, struct RString *s, const char *p, mrb_int len)
127 if (RSTR_EMBEDDABLE_P(len)) {
128 return str_init_embed(s, p, len);
130 return str_init_normal(mrb, s, p, len);
133 static struct RString*
134 str_new_static(mrb_state *mrb, const char *p, mrb_int len)
136 if (RSTR_EMBEDDABLE_P(len)) {
137 return str_init_embed(mrb_obj_alloc_string(mrb), p, len);
139 return str_init_nofree(mrb_obj_alloc_string(mrb), p, len);
142 static struct RString*
143 str_new(mrb_state *mrb, const char *p, mrb_int len)
145 str_check_length(mrb, len);
146 if (RSTR_EMBEDDABLE_P(len)) {
147 return str_init_embed(mrb_obj_alloc_string(mrb), p, len);
149 if (p && mrb_ro_data_p(p)) {
150 return str_init_nofree(mrb_obj_alloc_string(mrb), p, len);
152 return str_init_normal(mrb, mrb_obj_alloc_string(mrb), p, len);
155 MRB_API mrb_value
156 mrb_str_new_capa(mrb_state *mrb, mrb_int capa)
158 struct RString *s = mrb_obj_alloc_string(mrb);
160 if (RSTR_EMBEDDABLE_P(capa)) {
161 s = str_init_embed(s, NULL, 0);
163 else {
164 s = str_init_normal_capa(mrb, s, NULL, 0, capa);
166 return mrb_obj_value(s);
169 static void
170 resize_capa(mrb_state *mrb, struct RString *s, mrb_int capacity)
172 if (RSTR_EMBED_P(s)) {
173 if (!RSTR_EMBEDDABLE_P(capacity)) {
174 str_init_normal_capa(mrb, s, RSTR_EMBED_PTR(s), RSTR_EMBED_LEN(s), capacity);
177 else {
178 str_check_length(mrb, capacity);
179 s->as.heap.ptr = (char*)mrb_realloc(mrb, RSTR_PTR(s), capacity+1);
180 s->as.heap.aux.capa = (mrb_ssize)capacity;
184 MRB_API mrb_value
185 mrb_str_new(mrb_state *mrb, const char *p, mrb_int len)
187 return mrb_obj_value(str_new(mrb, p, len));
190 MRB_API mrb_value
191 mrb_str_new_cstr(mrb_state *mrb, const char *p)
193 struct RString *s;
194 mrb_int len;
196 if (p) {
197 len = strlen(p);
199 else {
200 len = 0;
203 s = str_new(mrb, p, len);
205 return mrb_obj_value(s);
208 MRB_API mrb_value
209 mrb_str_new_static(mrb_state *mrb, const char *p, mrb_int len)
211 struct RString *s = str_new_static(mrb, p, len);
212 return mrb_obj_value(s);
215 static void
216 str_decref(mrb_state *mrb, mrb_shared_string *shared)
218 shared->refcnt--;
219 if (shared->refcnt == 0) {
220 mrb_free(mrb, shared->ptr);
221 mrb_free(mrb, shared);
225 static void
226 str_modify_keep_ascii(mrb_state *mrb, struct RString *s)
228 if (RSTR_SHARED_P(s)) {
229 mrb_shared_string *shared = s->as.heap.aux.shared;
231 if (shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) {
232 s->as.heap.aux.capa = shared->capa;
233 s->as.heap.ptr[s->as.heap.len] = '\0';
234 RSTR_UNSET_SHARED_FLAG(s);
235 mrb_free(mrb, shared);
237 else {
238 str_init_modifiable(mrb, s, s->as.heap.ptr, s->as.heap.len);
239 str_decref(mrb, shared);
242 else if (RSTR_NOFREE_P(s) || RSTR_FSHARED_P(s)) {
243 str_init_modifiable(mrb, s, s->as.heap.ptr, s->as.heap.len);
247 static void
248 check_null_byte(mrb_state *mrb, struct RString *str)
250 const char *p = RSTR_PTR(str);
251 if (p && memchr(p, '\0', RSTR_LEN(str))) {
252 mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
256 void
257 mrb_gc_free_str(mrb_state *mrb, struct RString *str)
259 if (RSTR_EMBED_P(str))
260 /* no code */;
261 else if (RSTR_SHARED_P(str))
262 str_decref(mrb, str->as.heap.aux.shared);
263 else if (!RSTR_NOFREE_P(str) && !RSTR_FSHARED_P(str))
264 mrb_free(mrb, str->as.heap.ptr);
267 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
268 defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || \
269 defined(__powerpc64__) || defined(__POWERPC__) || defined(__aarch64__) || \
270 defined(__mc68020__)
271 # define ALIGNED_WORD_ACCESS 0
272 #else
273 # define ALIGNED_WORD_ACCESS 1
274 #endif
276 #ifdef MRB_64BIT
277 #define bitint uint64_t
278 #define MASK01 0x0101010101010101ull
279 #else
280 #define bitint uint32_t
281 #define MASK01 0x01010101ul
282 #endif
284 #ifdef MRB_UTF8_STRING
286 #define NOASCII(c) ((c) & 0x80)
288 #ifdef SIMPLE_SEARCH_NONASCII
/* The naive implementation. Define SIMPLE_SEARCH_NONASCII */
/* if you need it for any constraint (e.g. code size). */
/* Returns the first byte in [p, e) with the high bit set, or e if none. */
static const char*
search_nonascii(const char* p, const char *e)
{
  while (p < e) {
    if (NOASCII(*p)) return p;
    ++p;
  }
  return e;
}
300 #elif defined(__SSE2__)
301 # include <emmintrin.h>
/*
 * SSE2 implementation: returns the first byte in [p, e) whose high bit is
 * set, or e when every byte is ASCII.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
  if (sizeof(__m128i) < (size_t)(e - p) &&
      !_mm_movemask_epi8(_mm_loadu_si128((__m128i const*)p))) {
    /* the first (unaligned) 16 bytes are clean: walk the aligned blocks */
    const intptr_t lowbits = sizeof(__m128i) - 1;
    const __m128i *blk = (const __m128i*)(~lowbits & ((intptr_t)p + lowbits));
    const __m128i *blk_end = (const __m128i*)(~lowbits & (intptr_t)e);

    while (blk < blk_end && !_mm_movemask_epi8(_mm_load_si128(blk))) {
      ++blk;
    }
    p = (const char *)blk;
  }

  /* Here either at most 16 bytes remain, or the 16 bytes at p are known
     to contain a non-ASCII byte; a byte scan settles both cases. */
  for (; p < e; ++p) {
    if (NOASCII(*p)) return p;
  }
  return e;
}
341 #else
343 static const char*
344 search_nonascii(const char *p, const char *e)
346 ptrdiff_t byte_len = e - p;
348 const char *be = p + sizeof(bitint) * (byte_len / sizeof(bitint));
349 for (; p < be; p+=sizeof(bitint)) {
350 bitint t0;
352 memcpy(&t0, p, sizeof(bitint));
353 const bitint t1 = t0 & (MASK01*0x80);
354 if (t1) {
355 e = p + sizeof(bitint)-1;
356 byte_len = sizeof(bitint)-1;
357 break;
361 switch (byte_len % sizeof(bitint)) {
362 #ifdef MRB_64BIT
363 case 7: if (e[-7]&0x80) return e-7;
364 case 6: if (e[-6]&0x80) return e-6;
365 case 5: if (e[-5]&0x80) return e-5;
366 case 4: if (e[-4]&0x80) return e-4;
367 #endif
368 case 3: if (e[-3]&0x80) return e-3;
369 case 2: if (e[-2]&0x80) return e-2;
370 case 1: if (e[-1]&0x80) return e-1;
372 return e;
375 #endif /* SIMPLE_SEARCH_NONASCII */
377 #define utf8_islead(c) ((unsigned char)((c)&0xc0) != 0x80)
379 extern const char mrb_utf8len_table[];
380 const char mrb_utf8len_table[] = {
381 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
382 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
385 mrb_int
386 mrb_utf8len(const char* p, const char* e)
388 mrb_int len = mrb_utf8len_table[(unsigned char)p[0] >> 3];
389 if (len > e - p) return 1;
390 switch (len) {
391 case 0:
392 return 1;
393 case 4:
394 if (utf8_islead(p[3])) return 1;
395 case 3:
396 if (utf8_islead(p[2])) return 1;
397 case 2:
398 if (utf8_islead(p[1])) return 1;
400 return len;
#if defined(__GNUC__) || __has_builtin(__builtin_popcount)
# ifdef MRB_64BIT
#  define popcount(x) __builtin_popcountll(x)
# else
#  define popcount(x) __builtin_popcountl(x)
# endif
#else
/*
 * Portable population count of a `bitint` word (number of set bits).
 *
 * The three masking steps accumulate per-byte bit counts; multiplying by
 * MASK01 then sums all byte counts into the most significant byte, which
 * is shifted down. The shift must therefore be (width - 8) bits: 56 for a
 * 64-bit `bitint`, 24 for a 32-bit one. A fixed shift of 56 would be
 * undefined behaviour for the 32-bit word.
 */
static inline uint32_t popcount(bitint x)
{
  x = (x & (MASK01*0x55)) + ((x >>  1) & (MASK01*0x55));
  x = (x & (MASK01*0x33)) + ((x >>  2) & (MASK01*0x33));
  x = (x & (MASK01*0x0F)) + ((x >>  4) & (MASK01*0x0F));
  return (uint32_t)((x * MASK01) >> (sizeof(bitint) * 8 - 8));
}
#endif
419 mrb_int
420 mrb_utf8_strlen(const char *str, mrb_int byte_len)
422 const char *p = str;
423 const char *e = str + byte_len;
424 mrb_int len = 0;
426 while (p < e) {
427 const char *np = search_nonascii(p, e);
429 len += np - p;
430 if (np == e) break;
431 p = np;
432 while (NOASCII(*p)) {
433 p += mrb_utf8len(p, e);
434 len++;
437 return len;
440 static mrb_int
441 utf8_strlen(mrb_value str)
443 struct RString *s = mrb_str_ptr(str);
444 mrb_int byte_len = RSTR_LEN(s);
446 if (RSTR_SINGLE_BYTE_P(s)) {
447 return byte_len;
449 else {
450 mrb_int utf8_len = mrb_utf8_strlen(RSTR_PTR(s), byte_len);
451 mrb_assert(utf8_len <= byte_len);
452 if (byte_len == utf8_len) RSTR_SET_SINGLE_BYTE_FLAG(s);
453 return utf8_len;
457 #define RSTRING_CHAR_LEN(s) utf8_strlen(s)
459 /* map character index to byte offset index */
460 static mrb_int
461 chars2bytes(mrb_value s, mrb_int off, mrb_int idx)
463 if (RSTR_SINGLE_BYTE_P(mrb_str_ptr(s))) {
464 return idx;
467 const char *p0 = RSTRING_PTR(s) + off;
468 const char *p = p0;
469 const char *e = RSTRING_END(s);
470 mrb_int i = 0;
472 while (p<e && i<idx) {
473 if ((*p & 0x80) == 0) {
474 const char *np = search_nonascii(p, e);
475 ptrdiff_t alen = np - p;
476 if (idx < i+alen) {
477 p += idx-i;
478 i=idx;
480 else {
481 p = np;
482 i += alen;
485 else {
486 p += mrb_utf8len(p, e);
487 i++;
491 mrb_int len = (mrb_int)(p-p0);
492 if (i<idx) len++;
493 return len;
496 /* map byte offset to character index */
497 static mrb_int
498 bytes2chars(mrb_value s, mrb_int bi)
500 if (RSTR_SINGLE_BYTE_P(mrb_str_ptr(s))) {
501 return bi;
504 const char *p = RSTRING_PTR(s);
505 const char *e = p + RSTRING_LEN(s);
506 const char *pivot = p + bi;
507 mrb_int i = 0;
509 if (e < pivot) return -1;
510 while (p < pivot) {
511 if ((*p & 0x80) == 0) {
512 const char *np = search_nonascii(p, pivot);
513 i += np - p;
514 p = np;
516 else {
517 p += mrb_utf8len(p, e);
518 i++;
521 if (p != pivot) return -1;
522 return i;
/*
 * Move `ptr` forward to the nearest byte that is not a continuation byte,
 * looking at no more than four bytes and never at or past `end`.
 * Returns `ptr` unchanged when no such byte is found. `beg` is unused.
 */
static const char*
char_adjust(const char *beg, const char *end, const char *ptr)
{
  const ptrdiff_t avail = end - ptr;

  if (avail < 1) return ptr;
  for (ptrdiff_t i = 0; i < 4 && i < avail; i++) {
    if (utf8_islead(ptr[i])) return ptr + i;
  }
  return ptr;
}
/*
 * Step back from `end` to the previous byte that is not a continuation
 * byte, looking at no more than four bytes and never before `ptr`.
 * Falls back to `end - 1` when no such byte is found.
 */
static const char*
char_backtrack(const char *ptr, const char *end)
{
  const ptrdiff_t avail = end - ptr;

  if (avail < 1) return end - 1;
  for (ptrdiff_t k = 1; k <= 4 && k <= avail; k++) {
    if (utf8_islead(end[-k])) return end - k;
  }
  return end - 1;
}
547 static mrb_int
548 str_index_str_by_char(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
550 const char *ptr = RSTRING_PTR(sub);
551 mrb_int len = RSTRING_LEN(sub);
553 if (pos > 0) {
554 pos = chars2bytes(str, 0, pos);
557 pos = mrb_str_index(mrb, str, ptr, len, pos);
559 if (pos > 0) {
560 pos = bytes2chars(str, pos);
562 return pos;
565 #else
566 #define RSTRING_CHAR_LEN(s) RSTRING_LEN(s)
567 #define chars2bytes(s, off, ci) (ci)
568 #define bytes2chars(s, bi) (bi)
569 #define char_adjust(beg, end, ptr) (ptr)
570 #define char_backtrack(ptr, end) ((end) - 1)
571 #define str_index_str_by_char(mrb, str, sub, pos) str_index_str((mrb), (str), (sub), (pos))
572 #endif
574 /* memsearch_swar (SWAR stands for SIMD within a register) */
575 /* See https://en.wikipedia.org/wiki/SWAR */
576 /* The function is taken from http://0x80.pl/articles/simd-strfind.html */
577 /* The original source code is under 2-clause BSD license; see LEGAL file. */
578 /* The modifications:
579 * port from C++ to C
580 * returns mrb_int
581 * remove alignment issue
582 * support bigendian CPU
583 * fixed potential buffer overflow
585 static inline mrb_int
586 memsearch_swar(const char *xs, mrb_int m, const char *ys, mrb_int n)
588 #define MASK7f (MASK01*0x7f)
589 #define MASK80 (MASK01*0x80)
590 #if defined(MRB_ENDIAN_BIG)
591 #ifdef MRB_64BIT
592 #define MASKtop 0x8000000000000000ull
593 #else
594 #define MASKtop 0x80000000ul
595 #endif
596 #else
597 #define MASKtop 0x80
598 #endif
600 const bitint first = MASK01 * (uint8_t)xs[0];
601 const bitint last = MASK01 * (uint8_t)xs[m-1];
603 const char *s0 = ys;
604 const char *s1 = ys+m-1;
606 const mrb_int lim = n - m - (mrb_int)sizeof(bitint);
607 mrb_int i;
609 for (i=0; i < lim; i+=sizeof(bitint)) {
610 bitint t0, t1;
612 memcpy(&t0, s0+i, sizeof(bitint));
613 memcpy(&t1, s1+i, sizeof(bitint));
615 const bitint eq = (t0 ^ first) | (t1 ^ last);
616 bitint zeros = ((~eq & MASK7f) + MASK01) & (~eq & MASK80);
618 for (size_t j = 0; zeros; j++) {
619 if (zeros & MASKtop) {
620 const mrb_int idx = i + j;
621 const char* p = s0 + idx + 1;
622 if (memcmp(p, xs + 1, m - 2) == 0) {
623 return idx;
627 #if defined(MRB_ENDIAN_BIG)
628 zeros <<= 8;
629 #else
630 zeros >>= 8;
631 #endif
635 if (i+m < n) {
636 const char *p = s0;
637 const char *e = ys + n;
638 while (p<e) {
639 p = (const char*)memchr(p, *xs, e - p);
640 if (p == NULL || (e - p) < m) break;
641 if (memcmp(p+1, xs+1, m-1) == 0) return (mrb_int)(p - ys);
642 p++;
646 return -1;
649 static mrb_int
650 mrb_memsearch(const char *x, mrb_int m, const char *y, mrb_int n)
652 if (m > n) return -1;
653 else if (m == n) {
654 return memcmp(x, y, m) == 0 ? 0 : -1;
656 else if (m < 1) {
657 return 0;
659 else if (m == 1) {
660 const char *p = (const char*)memchr(y, *x, n);
662 if (p) return (mrb_int)(p - y);
663 return -1;
665 return memsearch_swar(x, m, y, n);
668 static void
669 str_share(mrb_state *mrb, struct RString *orig, struct RString *s)
671 size_t len = (size_t)orig->as.heap.len;
673 mrb_assert(!RSTR_EMBED_P(orig));
674 if (RSTR_NOFREE_P(orig)) {
675 str_init_nofree(s, orig->as.heap.ptr, len);
677 else if (RSTR_SHARED_P(orig)) {
678 str_init_shared(mrb, orig, s, orig->as.heap.aux.shared);
680 else if (RSTR_FSHARED_P(orig)) {
681 str_init_fshared(orig, s, orig->as.heap.aux.fshared);
683 else {
684 if (orig->as.heap.aux.capa > orig->as.heap.len) {
685 orig->as.heap.ptr = (char*)mrb_realloc(mrb, orig->as.heap.ptr, len+1);
686 orig->as.heap.aux.capa = (mrb_ssize)len;
688 str_init_shared(mrb, orig, s, NULL);
689 str_init_shared(mrb, orig, orig, s->as.heap.aux.shared);
693 mrb_value
694 mrb_str_byte_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
696 struct RString *orig = mrb_str_ptr(str);
697 struct RString *s = mrb_obj_alloc_string(mrb);
699 if (RSTR_EMBEDDABLE_P(len)) {
700 str_init_embed(s, RSTR_PTR(orig)+beg, len);
702 else {
703 str_share(mrb, orig, s);
704 s->as.heap.ptr += (mrb_ssize)beg;
705 s->as.heap.len = (mrb_ssize)len;
707 RSTR_COPY_SINGLE_BYTE_FLAG(s, orig);
708 return mrb_obj_value(s);
711 #ifdef MRB_UTF8_STRING
712 static inline mrb_value
713 str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
715 beg = chars2bytes(str, 0, beg);
716 len = chars2bytes(str, beg, len);
717 return mrb_str_byte_subseq(mrb, str, beg, len);
719 #else
720 #define str_subseq(mrb, str, beg, len) mrb_str_byte_subseq(mrb, str, beg, len)
721 #endif
723 mrb_bool
724 mrb_str_beg_len(mrb_int str_len, mrb_int *begp, mrb_int *lenp)
726 if (str_len < *begp || *lenp < 0) return FALSE;
727 if (*begp < 0) {
728 *begp += str_len;
729 if (*begp < 0) return FALSE;
731 if (*lenp > str_len - *begp)
732 *lenp = str_len - *begp;
733 if (*lenp <= 0) {
734 *lenp = 0;
736 return TRUE;
739 static mrb_value
740 str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
742 return mrb_str_beg_len(RSTRING_CHAR_LEN(str), &beg, &len) ?
743 str_subseq(mrb, str, beg, len) : mrb_nil_value();
746 MRB_API mrb_int
747 mrb_str_index(mrb_state *mrb, mrb_value str, const char *sptr, mrb_int slen, mrb_int offset)
749 mrb_int len = RSTRING_LEN(str);
751 if (offset < 0) {
752 offset += len;
753 if (offset < 0) return -1;
755 if (len - offset < slen) return -1;
757 char *s = RSTRING_PTR(str);
758 if (offset) {
759 s += offset;
761 if (slen == 0) return offset;
762 /* need proceed one character at a time */
763 len = RSTRING_LEN(str) - offset;
765 mrb_int pos = mrb_memsearch(sptr, slen, s, len);
766 if (pos < 0) return pos;
767 return pos + offset;
770 static mrb_int
771 str_index_str(mrb_state *mrb, mrb_value str, mrb_value str2, mrb_int offset)
773 const char *ptr = RSTRING_PTR(str2);
774 mrb_int len = RSTRING_LEN(str2);
776 return mrb_str_index(mrb, str, ptr, len, offset);
779 static mrb_value
780 str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2)
782 mrb_check_frozen(mrb, s1);
783 if (s1 == s2) return mrb_obj_value(s1);
784 RSTR_COPY_SINGLE_BYTE_FLAG(s1, s2);
785 if (RSTR_SHARED_P(s1)) {
786 str_decref(mrb, s1->as.heap.aux.shared);
788 else if (!RSTR_EMBED_P(s1) && !RSTR_NOFREE_P(s1) && !RSTR_FSHARED_P(s1)) {
789 mrb_free(mrb, s1->as.heap.ptr);
792 size_t len = (size_t)RSTR_LEN(s2);
793 if (RSTR_EMBEDDABLE_P(len)) {
794 str_init_embed(s1, RSTR_PTR(s2), len);
796 else {
797 str_share(mrb, s2, s1);
800 return mrb_obj_value(s1);
803 static mrb_int
804 str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
806 const char *s, *sbeg, *send, *t;
807 struct RString *ps = mrb_str_ptr(str);
808 mrb_int len = RSTRING_LEN(sub);
809 mrb_int slen = RSTR_LEN(ps);
811 /* substring longer than string */
812 if (slen < len) return -1;
813 if (slen - pos < len) {
814 pos = slen - len;
816 sbeg = RSTR_PTR(ps);
817 send = sbeg + slen;
818 s = sbeg + pos;
819 t = RSTRING_PTR(sub);
820 if (len) {
821 s = char_adjust(sbeg, send, s);
822 while (sbeg <= s) {
823 if ((mrb_int)(send - s) >= len && memcmp(s, t, len) == 0) {
824 return (mrb_int)(s - sbeg);
826 s = char_backtrack(sbeg, s);
828 return -1;
830 else {
831 return pos;
835 #ifdef _WIN32
836 #include <stdlib.h>
837 #include <malloc.h>
838 #include <windows.h>
840 char*
841 mrb_utf8_from_locale(const char *str, int len)
843 wchar_t* wcsp;
844 char* mbsp;
845 int mbssize, wcssize;
847 if (len == 0)
848 return strdup("");
849 if (len == -1)
850 len = (int)strlen(str);
851 wcssize = MultiByteToWideChar(GetACP(), 0, str, len, NULL, 0);
852 wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t));
853 if (!wcsp)
854 return NULL;
855 wcssize = MultiByteToWideChar(GetACP(), 0, str, len, wcsp, wcssize + 1);
856 wcsp[wcssize] = 0;
858 mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL);
859 mbsp = (char*) malloc((mbssize + 1));
860 if (!mbsp) {
861 free(wcsp);
862 return NULL;
864 mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL);
865 mbsp[mbssize] = 0;
866 free(wcsp);
867 return mbsp;
870 char*
871 mrb_locale_from_utf8(const char *utf8, int len)
873 wchar_t* wcsp;
874 char* mbsp;
875 int mbssize, wcssize;
877 if (len == 0)
878 return strdup("");
879 if (len == -1)
880 len = (int)strlen(utf8);
881 wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, NULL, 0);
882 wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t));
883 if (!wcsp)
884 return NULL;
885 wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, wcsp, wcssize + 1);
886 wcsp[wcssize] = 0;
887 mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL);
888 mbsp = (char*) malloc((mbssize + 1));
889 if (!mbsp) {
890 free(wcsp);
891 return NULL;
893 mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL);
894 mbsp[mbssize] = 0;
895 free(wcsp);
896 return mbsp;
898 #endif
900 MRB_API void
901 mrb_str_modify_keep_ascii(mrb_state *mrb, struct RString *s)
903 mrb_check_frozen(mrb, s);
904 str_modify_keep_ascii(mrb, s);
907 MRB_API void
908 mrb_str_modify(mrb_state *mrb, struct RString *s)
910 mrb_str_modify_keep_ascii(mrb, s);
911 RSTR_UNSET_SINGLE_BYTE_FLAG(s);
914 MRB_API mrb_value
915 mrb_str_resize(mrb_state *mrb, mrb_value str, mrb_int len)
917 mrb_int slen;
918 struct RString *s = mrb_str_ptr(str);
920 str_check_length(mrb, len);
921 mrb_str_modify(mrb, s);
922 slen = RSTR_LEN(s);
923 if (len != slen) {
924 if (slen < len || slen - len > 256) {
925 resize_capa(mrb, s, len);
927 RSTR_SET_LEN(s, len);
928 RSTR_PTR(s)[len] = '\0'; /* sentinel */
930 return str;
933 MRB_API char*
934 mrb_str_to_cstr(mrb_state *mrb, mrb_value str0)
936 struct RString *s;
938 const char *p = RSTRING_PTR(str0);
939 mrb_int len = RSTRING_LEN(str0);
940 check_null_byte(mrb, RSTRING(str0));
941 s = str_init_modifiable(mrb, mrb_obj_alloc_string(mrb), p, len);
942 return RSTR_PTR(s);
945 MRB_API void
946 mrb_str_concat(mrb_state *mrb, mrb_value self, mrb_value other)
948 other = mrb_obj_as_string(mrb, other);
949 mrb_str_cat_str(mrb, self, other);
952 MRB_API mrb_value
953 mrb_str_plus(mrb_state *mrb, mrb_value a, mrb_value b)
955 struct RString *s = mrb_str_ptr(a);
956 struct RString *s2 = mrb_str_ptr(b);
957 struct RString *t;
958 mrb_int slen = RSTR_LEN(s);
959 mrb_int s2len = RSTR_LEN(s2);
960 const char *p = RSTR_PTR(s);
961 const char *p2 = RSTR_PTR(s2);
963 t = str_new(mrb, 0, slen + s2len);
964 char *pt = RSTR_PTR(t);
965 memcpy(pt, p, slen);
966 memcpy(pt + slen, p2, s2len);
968 return mrb_obj_value(t);
971 /* 15.2.10.5.2 */
974 * call-seq:
975 * str + other_str -> new_str
977 * Concatenation---Returns a new <code>String</code> containing
978 * <i>other_str</i> concatenated to <i>str</i>.
980 * "Hello from " + self.to_s #=> "Hello from main"
982 static mrb_value
983 mrb_str_plus_m(mrb_state *mrb, mrb_value self)
985 mrb_value str;
987 mrb_get_args(mrb, "S", &str);
988 return mrb_str_plus(mrb, self, str);
991 /* 15.2.10.5.26 */
992 /* 15.2.10.5.33 */
994 * call-seq:
995 * "abcd".size => int
997 * Returns the length of string.
999 static mrb_value
1000 mrb_str_size(mrb_state *mrb, mrb_value self)
1002 mrb_int len = RSTRING_CHAR_LEN(self);
1003 return mrb_int_value(mrb, len);
1006 static mrb_value
1007 mrb_str_bytesize(mrb_state *mrb, mrb_value self)
1009 return mrb_int_value(mrb, RSTRING_LEN(self));
1012 /* 15.2.10.5.1 */
1014 * call-seq:
1015 * str * integer => new_str
1017 * Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
1018 * the receiver.
1020 * "Ho! " * 3 #=> "Ho! Ho! Ho! "
1022 static mrb_value
1023 mrb_str_times(mrb_state *mrb, mrb_value self)
1025 mrb_int len, times;
1027 mrb_get_args(mrb, "i", &times);
1028 if (times < 0) {
1029 mrb_raise(mrb, E_ARGUMENT_ERROR, "negative argument");
1031 if (mrb_int_mul_overflow(RSTRING_LEN(self), times, &len)) {
1032 mrb_raise(mrb, E_ARGUMENT_ERROR, "argument too big");
1035 struct RString *str2 = str_new(mrb, 0, len);
1036 char *p = RSTR_PTR(str2);
1037 if (len > 0) {
1038 mrb_int n = RSTRING_LEN(self);
1039 memcpy(p, RSTRING_PTR(self), n);
1040 while (n <= len/2) {
1041 memcpy(p + n, p, n);
1042 n *= 2;
1044 memcpy(p + n, p, len-n);
1046 p[RSTR_LEN(str2)] = '\0';
1047 RSTR_COPY_SINGLE_BYTE_FLAG(str2, mrb_str_ptr(self));
1049 return mrb_obj_value(str2);
1051 /* -------------------------------------------------------------- */
1053 #define lesser(a,b) (((a)>(b))?(b):(a))
1055 /* ---------------------------*/
1057 * call-seq:
1058 * mrb_value str1 <=> mrb_value str2 => int
1059 * > 1
1060 * = 0
1061 * < -1
1063 MRB_API int
1064 mrb_str_cmp(mrb_state *mrb, mrb_value str1, mrb_value str2)
1066 struct RString *s1 = mrb_str_ptr(str1);
1067 struct RString *s2 = mrb_str_ptr(str2);
1069 mrb_int len1 = RSTR_LEN(s1);
1070 mrb_int len2 = RSTR_LEN(s2);
1071 mrb_int len = lesser(len1, len2);
1072 mrb_int retval = memcmp(RSTR_PTR(s1), RSTR_PTR(s2), len);
1073 if (retval == 0) {
1074 if (len1 == len2) return 0;
1075 if (len1 > len2) return 1;
1076 return -1;
1078 if (retval > 0) return 1;
1079 return -1;
1082 /* 15.2.10.5.3 */
1085 * call-seq:
1086 * str <=> other_str => -1, 0, +1
1088 * Comparison---Returns -1 if <i>other_str</i> is less than, 0 if
1089 * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is greater than
1090 * <i>str</i>. If the strings are of different lengths, and the strings are
1091 * equal when compared up to the shortest length, then the longer string is
1092 * considered greater than the shorter one. If the variable <code>$=</code> is
1093 * <code>false</code>, the comparison is based on comparing the binary values
1094 * of each character in the string. In older versions of Ruby, setting
1095 * <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
1096 * in favor of using <code>String#casecmp</code>.
1098 * <code><=></code> is the basis for the methods <code><</code>,
1099 * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
1100 * included from module <code>Comparable</code>. The method
1101 * <code>String#==</code> does not use <code>Comparable#==</code>.
1103 * "abcdef" <=> "abcde" #=> 1
1104 * "abcdef" <=> "abcdef" #=> 0
1105 * "abcdef" <=> "abcdefg" #=> -1
1106 * "abcdef" <=> "ABCDEF" #=> 1
1108 static mrb_value
1109 mrb_str_cmp_m(mrb_state *mrb, mrb_value str1)
1111 mrb_value str2 = mrb_get_arg1(mrb);
1112 mrb_int result;
1114 if (!mrb_string_p(str2)) {
1115 return mrb_nil_value();
1117 else {
1118 result = mrb_str_cmp(mrb, str1, str2);
1120 return mrb_int_value(mrb, result);
1123 static mrb_bool
1124 str_eql(mrb_state *mrb, const mrb_value str1, const mrb_value str2)
1126 const mrb_int len = RSTRING_LEN(str1);
1128 if (len != RSTRING_LEN(str2)) return FALSE;
1129 if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), (size_t)len) == 0)
1130 return TRUE;
1131 return FALSE;
1134 MRB_API mrb_bool
1135 mrb_str_equal(mrb_state *mrb, mrb_value str1, mrb_value str2)
1137 if (!mrb_string_p(str2)) return FALSE;
1138 return str_eql(mrb, str1, str2);
1141 /* 15.2.10.5.4 */
1143 * call-seq:
1144 * str == obj => true or false
1146 * Equality---
1147 * If <i>obj</i> is not a <code>String</code>, returns <code>false</code>.
1148 * Otherwise, returns <code>false</code> or <code>true</code>
1150 * caution:if <i>str</i> <code><=></code> <i>obj</i> returns zero.
1152 static mrb_value
1153 mrb_str_equal_m(mrb_state *mrb, mrb_value str1)
1155 mrb_value str2 = mrb_get_arg1(mrb);
1157 return mrb_bool_value(mrb_str_equal(mrb, str1, str2));
1159 /* ---------------------------------- */
1161 MRB_API mrb_value
1162 mrb_str_dup(mrb_state *mrb, mrb_value str)
1164 struct RString *s = mrb_str_ptr(str);
1165 struct RString *dup = str_new(mrb, 0, 0);
1167 return str_replace(mrb, dup, s);
/* How str_convert_range() interpreted the index arguments. */
enum str_convert_range {
  /* `beg` and `len` are byte unit in `0 ... str.bytesize` */
  STR_BYTE_RANGE_CORRECTED = 1,

  /* `beg` and `len` are char unit in any range */
  STR_CHAR_RANGE = 2,

  /* `beg` and `len` are char unit in `0 ... str.size` */
  STR_CHAR_RANGE_CORRECTED = 3,

  /* `beg` is out of range */
  STR_OUT_OF_RANGE = -1
};
1184 static enum str_convert_range
1185 str_convert_range(mrb_state *mrb, mrb_value str, mrb_value indx, mrb_value alen, mrb_int *beg, mrb_int *len)
1187 if (!mrb_undef_p(alen)) {
1188 *beg = mrb_as_int(mrb, indx);
1189 *len = mrb_as_int(mrb, alen);
1190 return STR_CHAR_RANGE;
1192 else {
1193 switch (mrb_type(indx)) {
1194 default:
1195 indx = mrb_ensure_int_type(mrb, indx);
1196 /* fall through */
1197 case MRB_TT_INTEGER:
1198 *beg = mrb_integer(indx);
1199 *len = 1;
1200 return STR_CHAR_RANGE;
1202 case MRB_TT_STRING:
1203 *beg = str_index_str(mrb, str, indx, 0);
1204 if (*beg < 0) { break; }
1205 *len = RSTRING_LEN(indx);
1206 return STR_BYTE_RANGE_CORRECTED;
1208 case MRB_TT_RANGE:
1209 *len = RSTRING_CHAR_LEN(str);
1210 switch (mrb_range_beg_len(mrb, indx, beg, len, *len, TRUE)) {
1211 case MRB_RANGE_OK:
1212 return STR_CHAR_RANGE_CORRECTED;
1213 case MRB_RANGE_OUT:
1214 return STR_OUT_OF_RANGE;
1215 default:
1216 break;
1220 return STR_OUT_OF_RANGE;
1223 mrb_value
1224 mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx, mrb_value alen)
1226 mrb_int beg, len;
1228 switch (str_convert_range(mrb, str, indx, alen, &beg, &len)) {
1229 case STR_CHAR_RANGE_CORRECTED:
1230 return str_subseq(mrb, str, beg, len);
1231 case STR_CHAR_RANGE:
1232 str = str_substr(mrb, str, beg, len);
1233 if (mrb_undef_p(alen) && !mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value();
1234 return str;
1235 case STR_BYTE_RANGE_CORRECTED:
1236 if (mrb_string_p(indx)) {
1237 return mrb_str_dup(mrb, indx);
1239 else {
1240 return mrb_str_byte_subseq(mrb, str, beg, len);
1242 case STR_OUT_OF_RANGE:
1243 default:
1244 return mrb_nil_value();
1248 /* 15.2.10.5.6 */
1249 /* 15.2.10.5.34 */
1251 * call-seq:
1252 * str[int] => int or nil
1253 * str[int, int] => new_str or nil
1254 * str[range] => new_str or nil
1255 * str[other_str] => new_str or nil
1256 * str.slice(int) => int or nil
1257 * str.slice(int, int) => new_str or nil
1258 * str.slice(range) => new_str or nil
1259 * str.slice(other_str) => new_str or nil
1261 * Element Reference---If passed a single <code>Integer</code>, returns the code
1262 * of the character at that position. If passed two <code>Integer</code>
1263 * objects, returns a substring starting at the offset given by the first, and
1264 * a length given by the second. If given a range, a substring containing
1265 * characters at offsets given by the range is returned. In all three cases, if
1266 * an offset is negative, it is counted from the end of <i>str</i>. Returns
1267 * <code>nil</code> if the initial offset falls outside the string, the length
1268 * is negative, or the beginning of the range is greater than the end.
1270 * If a <code>String</code> is given, that string is returned if it occurs in
1271 * <i>str</i>. In both cases, <code>nil</code> is returned if there is no
1272 * match.
1274 * a = "hello there"
1275 * a[1] #=> 101(1.8.7) "e"(1.9.2)
1276 * a[1.1] #=> "e"(1.9.2)
1277 * a[1,3] #=> "ell"
1278 * a[1..3] #=> "ell"
1279 * a[-3,2] #=> "er"
1280 * a[-4..-2] #=> "her"
1281 * a[12..-1] #=> nil
1282 * a[-2..-4] #=> ""
1283 * a["lo"] #=> "lo"
1284 * a["bye"] #=> nil
1286 static mrb_value
1287 mrb_str_aref_m(mrb_state *mrb, mrb_value str)
1289 mrb_value a1, a2;
1291 if (mrb_get_args(mrb, "o|o", &a1, &a2) == 1) {
1292 a2 = mrb_undef_value();
1295 return mrb_str_aref(mrb, str, a1, a2);
1298 static mrb_noreturn void
1299 str_out_of_index(mrb_state *mrb, mrb_value index)
1301 mrb_raisef(mrb, E_INDEX_ERROR, "index %v out of string", index);
1304 static mrb_value
1305 str_replace_partial(mrb_state *mrb, mrb_value src, mrb_int pos, mrb_int end, mrb_value rep)
1307 const mrb_int shrink_threshold = 256;
1308 struct RString *str = mrb_str_ptr(src);
1309 mrb_int len = RSTR_LEN(str);
1310 mrb_int replen, newlen;
1311 char *strp;
1313 if (end > len) { end = len; }
1315 if (pos < 0 || pos > len) {
1316 str_out_of_index(mrb, mrb_int_value(mrb, pos));
1319 replen = (mrb_nil_p(rep) ? 0 : RSTRING_LEN(rep));
1320 if (mrb_int_add_overflow(replen, len - (end - pos), &newlen)) {
1321 mrb_raise(mrb, E_RUNTIME_ERROR, "string size too big");
1324 mrb_str_modify(mrb, str);
1326 if (len < newlen) {
1327 resize_capa(mrb, str, newlen);
1330 strp = RSTR_PTR(str);
1332 memmove(strp + newlen - (len - end), strp + end, len - end);
1333 if (!mrb_nil_p(rep)) {
1334 memmove(strp + pos, RSTRING_PTR(rep), replen);
1336 RSTR_SET_LEN(str, newlen);
1337 strp[newlen] = '\0';
1339 if (len - newlen >= shrink_threshold) {
1340 resize_capa(mrb, str, newlen);
1343 return src;
1346 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
1348 static mrb_value
1349 str_escape(mrb_state *mrb, mrb_value str, mrb_bool inspect)
1351 const char *p, *pend;
1352 char buf[4]; /* `\x??` or UTF-8 character */
1353 mrb_value result = mrb_str_new_lit(mrb, "\"");
1354 #ifdef MRB_UTF8_STRING
1355 uint32_t sb_flag = MRB_STR_SINGLE_BYTE;
1356 #endif
1358 p = RSTRING_PTR(str); pend = RSTRING_END(str);
1359 for (;p < pend; p++) {
1360 unsigned char c, cc;
1361 #ifdef MRB_UTF8_STRING
1362 if (inspect) {
1363 mrb_int clen = mrb_utf8len(p, pend);
1364 if (clen > 1) {
1365 mrb_str_cat(mrb, result, p, clen);
1366 p += clen-1;
1367 sb_flag = 0;
1368 continue;
1371 #endif
1372 c = *p;
1373 if (c == '"'|| c == '\\' || (c == '#' && IS_EVSTR(p+1, pend))) {
1374 buf[0] = '\\'; buf[1] = c;
1375 mrb_str_cat(mrb, result, buf, 2);
1376 continue;
1378 if (ISPRINT(c)) {
1379 buf[0] = c;
1380 mrb_str_cat(mrb, result, buf, 1);
1381 continue;
1383 switch (c) {
1384 case '\n': cc = 'n'; break;
1385 case '\r': cc = 'r'; break;
1386 case '\t': cc = 't'; break;
1387 case '\f': cc = 'f'; break;
1388 case '\013': cc = 'v'; break;
1389 case '\010': cc = 'b'; break;
1390 case '\007': cc = 'a'; break;
1391 case 033: cc = 'e'; break;
1392 default: cc = 0; break;
1394 buf[0] = '\\';
1395 if (cc) {
1396 buf[1] = (char)cc;
1397 mrb_str_cat(mrb, result, buf, 2);
1399 else {
1400 buf[1] = 'x';
1401 buf[3] = mrb_digitmap[c % 16]; c /= 16;
1402 buf[2] = mrb_digitmap[c % 16];
1403 mrb_str_cat(mrb, result, buf, 4);
1406 mrb_str_cat_lit(mrb, result, "\"");
1407 #ifdef MRB_UTF8_STRING
1408 if (inspect) {
1409 mrb_str_ptr(str)->flags |= sb_flag;
1410 mrb_str_ptr(result)->flags |= sb_flag;
1412 else {
1413 RSTR_SET_SINGLE_BYTE_FLAG(mrb_str_ptr(result));
1415 #endif
1417 return result;
1420 static void
1421 mrb_str_aset(mrb_state *mrb, mrb_value str, mrb_value indx, mrb_value alen, mrb_value replace)
1423 mrb_int beg, len, charlen;
1425 mrb_ensure_string_type(mrb, replace);
1426 switch (str_convert_range(mrb, str, indx, alen, &beg, &len)) {
1427 case STR_OUT_OF_RANGE:
1428 default:
1429 mrb_raise(mrb, E_INDEX_ERROR, "string not matched");
1430 case STR_CHAR_RANGE:
1431 if (len < 0) {
1432 mrb_raisef(mrb, E_INDEX_ERROR, "negative length %v", alen);
1434 charlen = RSTRING_CHAR_LEN(str);
1435 if (beg < 0) { beg += charlen; }
1436 if (beg < 0 || beg > charlen) { str_out_of_index(mrb, indx); }
1437 /* fall through */
1438 case STR_CHAR_RANGE_CORRECTED:
1439 beg = chars2bytes(str, 0, beg);
1440 len = chars2bytes(str, beg, len);
1441 /* fall through */
1442 case STR_BYTE_RANGE_CORRECTED:
1443 if (mrb_int_add_overflow(beg, len, &len)) {
1444 mrb_raise(mrb, E_RUNTIME_ERROR, "string index too big");
1446 str_replace_partial(mrb, str, beg, len, replace);
1451 * call-seq:
1452 * str[int] = replace
1453 * str[int, int] = replace
1454 * str[range] = replace
1455 * str[other_str] = replace
1457 * Modify +self+ by replacing the content of +self+.
1458 * The portion of the string affected is determined using the same criteria as +String#[]+.
1460 static mrb_value
1461 mrb_str_aset_m(mrb_state *mrb, mrb_value str)
1463 mrb_value indx, alen, replace;
1465 switch (mrb_get_args(mrb, "oo|S!", &indx, &alen, &replace)) {
1466 case 2:
1467 replace = alen;
1468 alen = mrb_undef_value();
1469 break;
1470 case 3:
1471 break;
1473 mrb_str_aset(mrb, str, indx, alen, replace);
1474 return str;
1477 /* 15.2.10.5.8 */
1479 * call-seq:
1480 * str.capitalize! => str or nil
1482 * Modifies <i>str</i> by converting the first character to uppercase and the
1483 * remainder to lowercase. Returns <code>nil</code> if no changes are made.
1485 * a = "hello"
1486 * a.capitalize! #=> "Hello"
1487 * a #=> "Hello"
1488 * a.capitalize! #=> nil
1490 static mrb_value
1491 mrb_str_capitalize_bang(mrb_state *mrb, mrb_value str)
1493 mrb_bool modify = FALSE;
1494 struct RString *s = mrb_str_ptr(str);
1495 mrb_int len = RSTR_LEN(s);
1497 mrb_str_modify_keep_ascii(mrb, s);
1498 char *p = RSTR_PTR(s);
1499 char *pend = RSTR_PTR(s) + len;
1500 if (len == 0 || p == NULL) return mrb_nil_value();
1501 if (ISLOWER(*p)) {
1502 *p = TOUPPER(*p);
1503 modify = TRUE;
1505 while (++p < pend) {
1506 if (ISUPPER(*p)) {
1507 *p = TOLOWER(*p);
1508 modify = TRUE;
1511 if (modify) return str;
1512 return mrb_nil_value();
1515 /* 15.2.10.5.7 */
1517 * call-seq:
1518 * str.capitalize => new_str
1520 * Returns a copy of <i>str</i> with the first character converted to uppercase
1521 * and the remainder to lowercase.
1523 * "hello".capitalize #=> "Hello"
1524 * "HELLO".capitalize #=> "Hello"
1525 * "123ABC".capitalize #=> "123abc"
1527 static mrb_value
1528 mrb_str_capitalize(mrb_state *mrb, mrb_value self)
1530 mrb_value str;
1532 str = mrb_str_dup(mrb, self);
1533 mrb_str_capitalize_bang(mrb, str);
1534 return str;
1537 /* 15.2.10.5.10 */
1539 * call-seq:
1540 * str.chomp!(separator="\n") => str or nil
1542 * Modifies <i>str</i> in place as described for <code>String#chomp</code>,
1543 * returning <i>str</i>, or <code>nil</code> if no modifications were made.
1545 static mrb_value
1546 mrb_str_chomp_bang(mrb_state *mrb, mrb_value str)
1548 mrb_value rs;
1549 mrb_int newline;
1550 char *p, *pp;
1551 mrb_int rslen;
1552 mrb_int len;
1553 mrb_int argc;
1554 struct RString *s = mrb_str_ptr(str);
1556 argc = mrb_get_args(mrb, "|S", &rs);
1557 mrb_str_modify_keep_ascii(mrb, s);
1558 len = RSTR_LEN(s);
1559 if (argc == 0) {
1560 if (len == 0) return mrb_nil_value();
1561 smart_chomp:
1562 if (RSTR_PTR(s)[len-1] == '\n') {
1563 RSTR_SET_LEN(s, RSTR_LEN(s) - 1);
1564 if (RSTR_LEN(s) > 0 &&
1565 RSTR_PTR(s)[RSTR_LEN(s)-1] == '\r') {
1566 RSTR_SET_LEN(s, RSTR_LEN(s) - 1);
1569 else if (RSTR_PTR(s)[len-1] == '\r') {
1570 RSTR_SET_LEN(s, RSTR_LEN(s) - 1);
1572 else {
1573 return mrb_nil_value();
1575 RSTR_PTR(s)[RSTR_LEN(s)] = '\0';
1576 return str;
1579 if (len == 0 || mrb_nil_p(rs)) return mrb_nil_value();
1580 p = RSTR_PTR(s);
1581 rslen = RSTRING_LEN(rs);
1582 if (rslen == 0) {
1583 while (len>0 && p[len-1] == '\n') {
1584 len--;
1585 if (len>0 && p[len-1] == '\r')
1586 len--;
1588 if (len < RSTR_LEN(s)) {
1589 RSTR_SET_LEN(s, len);
1590 p[len] = '\0';
1591 return str;
1593 return mrb_nil_value();
1595 if (rslen > len) return mrb_nil_value();
1596 newline = RSTRING_PTR(rs)[rslen-1];
1597 if (rslen == 1 && newline == '\n')
1598 newline = RSTRING_PTR(rs)[rslen-1];
1599 if (rslen == 1 && newline == '\n')
1600 goto smart_chomp;
1602 pp = p + len - rslen;
1603 if (p[len-1] == newline &&
1604 (rslen <= 1 ||
1605 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
1606 RSTR_SET_LEN(s, len - rslen);
1607 p[RSTR_LEN(s)] = '\0';
1608 return str;
1610 return mrb_nil_value();
1613 /* 15.2.10.5.9 */
1615 * call-seq:
1616 * str.chomp(separator="\n") => new_str
1618 * Returns a new <code>String</code> with the given record separator removed
1619 * from the end of <i>str</i> (if present). <code>chomp</code> also removes
1620 * carriage return characters (that is it will remove <code>\n</code>,
1621 * <code>\r</code>, and <code>\r\n</code>).
1623 * "hello".chomp #=> "hello"
1624 * "hello\n".chomp #=> "hello"
1625 * "hello\r\n".chomp #=> "hello"
1626 * "hello\n\r".chomp #=> "hello\n"
1627 * "hello\r".chomp #=> "hello"
1628 * "hello \n there".chomp #=> "hello \n there"
1629 * "hello".chomp("llo") #=> "he"
1631 static mrb_value
1632 mrb_str_chomp(mrb_state *mrb, mrb_value self)
1634 mrb_value str;
1636 str = mrb_str_dup(mrb, self);
1637 mrb_str_chomp_bang(mrb, str);
1638 return str;
1641 /* 15.2.10.5.12 */
1643 * call-seq:
1644 * str.chop! => str or nil
1646 * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
1647 * or <code>nil</code> if <i>str</i> is the empty string. See also
1648 * <code>String#chomp!</code>.
1650 static mrb_value
1651 mrb_str_chop_bang(mrb_state *mrb, mrb_value str)
1653 struct RString *s = mrb_str_ptr(str);
1655 mrb_str_modify_keep_ascii(mrb, s);
1656 if (RSTR_LEN(s) > 0) {
1657 mrb_int len;
1658 #ifdef MRB_UTF8_STRING
1659 const char* t = RSTR_PTR(s), *p = t;
1660 const char* e = p + RSTR_LEN(s);
1661 while (p<e) {
1662 mrb_int clen = mrb_utf8len(p, e);
1663 if (p + clen>=e) break;
1664 p += clen;
1666 len = p - t;
1667 #else
1668 len = RSTR_LEN(s) - 1;
1669 #endif
1670 if (RSTR_PTR(s)[len] == '\n') {
1671 if (len > 0 &&
1672 RSTR_PTR(s)[len-1] == '\r') {
1673 len--;
1676 RSTR_SET_LEN(s, len);
1677 RSTR_PTR(s)[len] = '\0';
1678 return str;
1680 return mrb_nil_value();
1683 /* 15.2.10.5.11 */
1685 * call-seq:
1686 * str.chop => new_str
1688 * Returns a new <code>String</code> with the last character removed. If the
1689 * string ends with <code>\r\n</code>, both characters are removed. Applying
1690 * <code>chop</code> to an empty string returns an empty
1691 * string. <code>String#chomp</code> is often a safer alternative, as it leaves
1692 * the string unchanged if it doesn't end in a record separator.
1694 * "string\r\n".chop #=> "string"
1695 * "string\n\r".chop #=> "string\n"
1696 * "string\n".chop #=> "string"
1697 * "string".chop #=> "strin"
1698 * "x".chop #=> ""
1700 static mrb_value
1701 mrb_str_chop(mrb_state *mrb, mrb_value self)
1703 mrb_value str;
1704 str = mrb_str_dup(mrb, self);
1705 mrb_str_chop_bang(mrb, str);
1706 return str;
1709 /* 15.2.10.5.14 */
1711 * call-seq:
1712 * str.downcase! => str or nil
1714 * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
1715 * changes were made.
1717 static mrb_value
1718 mrb_str_downcase_bang(mrb_state *mrb, mrb_value str)
1720 char *p, *pend;
1721 mrb_bool modify = FALSE;
1722 struct RString *s = mrb_str_ptr(str);
1724 mrb_str_modify_keep_ascii(mrb, s);
1725 p = RSTR_PTR(s);
1726 pend = RSTR_PTR(s) + RSTR_LEN(s);
1727 while (p < pend) {
1728 if (ISUPPER(*p)) {
1729 *p = TOLOWER(*p);
1730 modify = TRUE;
1732 p++;
1735 if (modify) return str;
1736 return mrb_nil_value();
1739 /* 15.2.10.5.13 */
1741 * call-seq:
1742 * str.downcase => new_str
1744 * Returns a copy of <i>str</i> with all uppercase letters replaced with their
1745 * lowercase counterparts. The operation is locale insensitive---only
1746 * characters 'A' to 'Z' are affected.
1748 * "hEllO".downcase #=> "hello"
1750 static mrb_value
1751 mrb_str_downcase(mrb_state *mrb, mrb_value self)
1753 mrb_value str;
1755 str = mrb_str_dup(mrb, self);
1756 mrb_str_downcase_bang(mrb, str);
1757 return str;
1760 /* 15.2.10.5.16 */
1762 * call-seq:
1763 * str.empty? => true or false
1765 * Returns <code>true</code> if <i>str</i> has a length of zero.
1767 * "hello".empty? #=> false
1768 * "".empty? #=> true
1770 static mrb_value
1771 mrb_str_empty_p(mrb_state *mrb, mrb_value self)
1773 struct RString *s = mrb_str_ptr(self);
1775 return mrb_bool_value(RSTR_LEN(s) == 0);
1778 /* 15.2.10.5.17 */
1780 * call-seq:
1781 * str.eql?(other) => true or false
1783 * Two strings are equal if the have the same length and content.
1785 static mrb_value
1786 mrb_str_eql(mrb_state *mrb, mrb_value self)
1788 mrb_value str2 = mrb_get_arg1(mrb);
1789 mrb_bool eql_p;
1791 eql_p = (mrb_string_p(str2)) && str_eql(mrb, self, str2);
1793 return mrb_bool_value(eql_p);
1796 MRB_API mrb_value
1797 mrb_str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
1799 return str_substr(mrb, str, beg, len);
1803 * 32 bit magic FNV-0 and FNV-1 prime
1804 b */
1805 #define FNV_32_PRIME ((uint32_t)0x01000193)
1806 #define FNV1_32_INIT ((uint32_t)0x811c9dc5)
1808 uint32_t
1809 mrb_byte_hash_step(const uint8_t *s, mrb_int len, uint32_t hval)
1811 const uint8_t *send = s + len;
1814 * FNV-1 hash each octet in the buffer
1816 while (s < send) {
1817 /* multiply by the 32 bit FNV magic prime mod 2^32 */
1818 #if defined(NO_FNV_GCC_OPTIMIZATION)
1819 hval *= FNV_32_PRIME;
1820 #else
1821 hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + (hval<<24);
1822 #endif
1824 /* xor the bottom with the current octet */
1825 hval ^= (uint32_t)*s++;
1828 /* return our new hash value */
1829 return hval;
1832 uint32_t
1833 mrb_byte_hash(const uint8_t *s, mrb_int len)
1835 return mrb_byte_hash_step(s, len, FNV1_32_INIT);
1838 uint32_t
1839 mrb_str_hash(mrb_state *mrb, mrb_value str)
1841 struct RString *s = mrb_str_ptr(str);
1842 return mrb_byte_hash((uint8_t*)RSTR_PTR(s), RSTR_LEN(s));
1845 /* 15.2.10.5.20 */
1847 * call-seq:
1848 * str.hash => int
1850 * Return a hash based on the string's length and content.
1852 static mrb_value
1853 mrb_str_hash_m(mrb_state *mrb, mrb_value self)
1855 mrb_int key = mrb_str_hash(mrb, self);
1856 return mrb_int_value(mrb, key);
1859 /* 15.2.10.5.21 */
1861 * call-seq:
1862 * str.include? other_str => true or false
1863 * str.include? int => true or false
1865 * Returns <code>true</code> if <i>str</i> contains the given string or
1866 * character.
1868 * "hello".include? "lo" #=> true
1869 * "hello".include? "ol" #=> false
1870 * "hello".include? ?h #=> true
1872 static mrb_value
1873 mrb_str_include(mrb_state *mrb, mrb_value self)
1875 mrb_value str2;
1877 mrb_get_args(mrb, "S", &str2);
1878 if (str_index_str(mrb, self, str2, 0) < 0)
1879 return mrb_bool_value(FALSE);
1880 return mrb_bool_value(TRUE);
1884 * call-seq:
1885 * str.byteindex(substring, offset = 0) -> integer or nil
1887 * Returns the \Integer byte-based index of the first occurrence of the given +substring+,
1888 * or +nil+ if none found:
1890 * 'foo'.byteindex('f') # => 0
1891 * 'foo'.byteindex('oo') # => 1
1892 * 'foo'.byteindex('ooo') # => nil
1894 static mrb_value
1895 mrb_str_byteindex_m(mrb_state *mrb, mrb_value str)
1897 mrb_value sub;
1898 mrb_int pos;
1900 if (mrb_get_args(mrb, "S|i", &sub, &pos) == 1) {
1901 pos = 0;
1903 else if (pos < 0) {
1904 pos += RSTRING_LEN(str);
1905 if (pos < 0) {
1906 return mrb_nil_value();
1909 pos = str_index_str(mrb, str, sub, pos);
1911 if (pos == -1) return mrb_nil_value();
1912 return mrb_int_value(mrb, pos);
/* 15.2.10.5.22 */
/*
 *  call-seq:
 *     str.index(substring [, offset])   => int or nil
 *
 *  Returns the index of the first occurrence of the given substring.
 *  Returns nil if not found. If the second parameter is present, it
 *  specifies the position in the string to begin the search.
 *
 *     "hello".index('l')             #=> 2
 *     "hello".index('lo')            #=> 3
 *     "hello".index('a')             #=> nil
 *     "hello".index('l', -2)         #=> 3
 */
#ifdef MRB_UTF8_STRING
static mrb_value
mrb_str_index_m(mrb_state *mrb, mrb_value str)
{
  /* strings flagged single-byte can use the byte-based search directly */
  if (RSTR_SINGLE_BYTE_P(mrb_str_ptr(str))) {
    return mrb_str_byteindex_m(mrb, str);
  }

  mrb_value sub;
  mrb_int pos;

  if (mrb_get_args(mrb, "S|i", &sub, &pos) == 1) {
    pos = 0;
  }
  else if (pos < 0) {
    /* negative offsets count characters from the end */
    mrb_int clen = RSTRING_CHAR_LEN(str);
    pos += clen;
    if (pos < 0) {
      return mrb_nil_value();
    }
  }
  pos = str_index_str_by_char(mrb, str, sub, pos);

  if (pos == -1) return mrb_nil_value();
  return mrb_int_value(mrb, pos);
}
#else
#define mrb_str_index_m mrb_str_byteindex_m
#endif
1960 /* 15.2.10.5.24 */
1961 /* 15.2.10.5.28 */
1963 * call-seq:
1964 * str.replace(other_str) => str
1966 * s = "hello" #=> "hello"
1967 * s.replace "world" #=> "world"
1969 static mrb_value
1970 mrb_str_replace(mrb_state *mrb, mrb_value str)
1972 mrb_value str2;
1974 mrb_get_args(mrb, "S", &str2);
1975 return str_replace(mrb, mrb_str_ptr(str), mrb_str_ptr(str2));
1978 /* 15.2.10.5.23 */
1980 * call-seq:
1981 * String.new(str="") => new_str
1983 * Returns a new string object containing a copy of <i>str</i>.
1985 static mrb_value
1986 mrb_str_init(mrb_state *mrb, mrb_value self)
1988 mrb_value str2;
1990 if (mrb_get_args(mrb, "|S", &str2) == 0) {
1991 struct RString *s = str_new(mrb, 0, 0);
1992 str2 = mrb_obj_value(s);
1994 str_replace(mrb, mrb_str_ptr(self), mrb_str_ptr(str2));
1995 return self;
1998 /* 15.2.10.5.25 */
1999 /* 15.2.10.5.41 */
2001 * call-seq:
2002 * str.intern => symbol
2003 * str.to_sym => symbol
2005 * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
2006 * symbol if it did not previously exist.
2008 * "Koala".intern #=> :Koala
2009 * s = 'cat'.to_sym #=> :cat
2010 * s == :cat #=> true
2011 * s = '@cat'.to_sym #=> :@cat
2012 * s == :@cat #=> true
2014 * This can also be used to create symbols that cannot be represented using the
2015 * <code>:xxx</code> notation.
2017 * 'cat and dog'.to_sym #=> :"cat and dog"
2019 MRB_API mrb_value
2020 mrb_str_intern(mrb_state *mrb, mrb_value self)
2022 return mrb_symbol_value(mrb_intern_str(mrb, self));
2024 /* ---------------------------------- */
2025 MRB_API mrb_value
2026 mrb_obj_as_string(mrb_state *mrb, mrb_value obj)
2028 switch (mrb_type(obj)) {
2029 case MRB_TT_STRING:
2030 return obj;
2031 case MRB_TT_SYMBOL:
2032 return mrb_sym_str(mrb, mrb_symbol(obj));
2033 case MRB_TT_INTEGER:
2034 return mrb_integer_to_str(mrb, obj, 10);
2035 case MRB_TT_SCLASS:
2036 case MRB_TT_CLASS:
2037 case MRB_TT_MODULE:
2038 return mrb_mod_to_s(mrb, obj);
2039 default:
2040 return mrb_type_convert(mrb, obj, MRB_TT_STRING, MRB_SYM(to_s));
2044 MRB_API mrb_value
2045 mrb_ptr_to_str(mrb_state *mrb, void *p)
2047 struct RString *p_str;
2048 char *p1;
2049 char *p2;
2050 uintptr_t n = (uintptr_t)p;
2052 p_str = str_new(mrb, NULL, 2 + sizeof(uintptr_t) * CHAR_BIT / 4);
2053 p1 = RSTR_PTR(p_str);
2054 *p1++ = '0';
2055 *p1++ = 'x';
2056 p2 = p1;
2058 do {
2059 *p2++ = mrb_digitmap[n % 16];
2060 n /= 16;
2061 } while (n > 0);
2062 *p2 = '\0';
2063 RSTR_SET_LEN(p_str, (mrb_int)(p2 - RSTR_PTR(p_str)));
2065 while (p1 < p2) {
2066 const char c = *p1;
2067 *p1++ = *--p2;
2068 *p2 = c;
2071 return mrb_obj_value(p_str);
/*
 * Reverse the bytes in the inclusive range [p, e] in place.
 * `e` points at the last byte, not one past it.
 */
static inline void
str_reverse(char *p, char *e)
{
  char c;

  while (p < e) {
    c = *p;
    *p++ = *e;
    *e-- = c;
  }
}
2086 /* 15.2.10.5.30 */
2088 * call-seq:
2089 * str.reverse! => str
2091 * Reverses <i>str</i> in place.
2093 static mrb_value
2094 mrb_str_reverse_bang(mrb_state *mrb, mrb_value str)
2096 struct RString *s = mrb_str_ptr(str);
2097 char *p, *e;
2099 #ifdef MRB_UTF8_STRING
2100 mrb_int utf8_len = RSTRING_CHAR_LEN(str);
2101 mrb_int len = RSTR_LEN(s);
2103 if (utf8_len < 2) return str;
2104 if (utf8_len < len) {
2105 mrb_str_modify(mrb, s);
2106 p = RSTR_PTR(s);
2107 e = p + RSTR_LEN(s);
2108 while (p<e) {
2109 mrb_int clen = mrb_utf8len(p, e);
2110 str_reverse(p, p + clen - 1);
2111 p += clen;
2113 goto bytes;
2115 #endif
2117 if (RSTR_LEN(s) > 1) {
2118 mrb_str_modify(mrb, s);
2119 goto bytes;
2121 return str;
2123 bytes:
2124 p = RSTR_PTR(s);
2125 e = p + RSTR_LEN(s) - 1;
2126 str_reverse(p, e);
2127 return str;
2130 /* ---------------------------------- */
2131 /* 15.2.10.5.29 */
2133 * call-seq:
2134 * str.reverse => new_str
2136 * Returns a new string with the characters from <i>str</i> in reverse order.
2138 * "stressed".reverse #=> "desserts"
2140 static mrb_value
2141 mrb_str_reverse(mrb_state *mrb, mrb_value str)
2143 mrb_value str2 = mrb_str_dup(mrb, str);
2144 mrb_str_reverse_bang(mrb, str2);
2145 return str2;
2149 * call-seq:
2150 * byterindex(substring, offset = self.bytesize) -> integer or nil
2152 * Returns the \Integer byte-based index of the _last_ occurrence of the given +substring+,
2153 * or +nil+ if none found:
2155 * 'foo'.byterindex('f') # => 0
2156 * 'foo'.byterindex('o') # => 2
2157 * 'foo'.byterindex('oo') # => 1
2158 * 'foo'.byterindex('ooo') # => nil
2160 static mrb_value
2161 mrb_str_byterindex_m(mrb_state *mrb, mrb_value str)
2163 mrb_value sub;
2164 mrb_int pos;
2165 mrb_int len = RSTRING_LEN(str);
2167 if (mrb_get_args(mrb, "S|i", &sub, &pos) == 1) {
2168 pos = len;
2170 else {
2171 if (pos < 0) {
2172 pos += len;
2173 if (pos < 0) {
2174 return mrb_nil_value();
2177 if (pos > len) pos = len;
2179 pos = str_rindex(mrb, str, sub, pos);
2180 if (pos < 0) {
2181 return mrb_nil_value();
2183 return mrb_int_value(mrb, pos);
/* 15.2.10.5.31 */
/*
 *  call-seq:
 *     str.rindex(substring [, offset])   => int or nil
 *
 *  Returns the index of the last occurrence of the given substring.
 *  Returns nil if not found. If the second parameter is present, it
 *  specifies the position in the string to end the search---characters
 *  beyond this point will not be considered.
 *
 *     "hello".rindex('e')             #=> 1
 *     "hello".rindex('l')             #=> 3
 *     "hello".rindex('a')             #=> nil
 *     "hello".rindex('l', 2)          #=> 2
 */
#ifdef MRB_UTF8_STRING
static mrb_value
mrb_str_rindex_m(mrb_state *mrb, mrb_value str)
{
  /* strings flagged single-byte can use the byte-based search directly */
  if (RSTR_SINGLE_BYTE_P(mrb_str_ptr(str))) {
    return mrb_str_byterindex_m(mrb, str);
  }

  mrb_value sub;
  mrb_int pos;

  /* convert the character offset into the byte offset str_rindex() expects */
  if (mrb_get_args(mrb, "S|i", &sub, &pos) == 1) {
    pos = RSTRING_LEN(str);
  }
  else if (pos >= 0) {
    pos = chars2bytes(str, 0, pos);
  }
  else {
    /* negative offset: step back one character per unit from the end */
    const char *p = RSTRING_PTR(str);
    const char *e = RSTRING_END(str);
    while (pos++ < 0 && p < e) {
      e = char_backtrack(p, e);
    }
    if (p == e) return mrb_nil_value();
    pos = (mrb_int)(e - p);
  }
  pos = str_rindex(mrb, str, sub, pos);
  if (pos >= 0) {
    /* report the hit as a character index */
    pos = bytes2chars(str, pos);
    if (pos < 0) return mrb_nil_value();
    return mrb_int_value(mrb, pos);
  }
  return mrb_nil_value();
}
#else
#define mrb_str_rindex_m mrb_str_byterindex_m
#endif
2239 /* 15.2.10.5.35 */
2242 * call-seq:
2243 * str.split(separator=nil, [limit]) => anArray
2245 * Divides <i>str</i> into substrings based on a delimiter, returning an array
2246 * of these substrings.
2248 * If <i>separator</i> is a <code>String</code>, then its contents are used as
2249 * the delimiter when splitting <i>str</i>. If <i>separator</i> is a single
2250 * space, <i>str</i> is split on whitespace, with leading whitespace and runs
2251 * of contiguous whitespace characters ignored.
2253 * If <i>separator</i> is omitted or <code>nil</code> (which is the default),
2254 * <i>str</i> is split on whitespace as if ' ' were specified.
2256 * If the <i>limit</i> parameter is omitted, trailing null fields are
2257 * suppressed. If <i>limit</i> is a positive number, at most that number of
2258 * fields will be returned (if <i>limit</i> is <code>1</code>, the entire
2259 * string is returned as the only entry in an array). If negative, there is no
2260 * limit to the number of fields returned, and trailing null fields are not
2261 * suppressed.
2263 * " now's the time".split #=> ["now's", "the", "time"]
2264 * " now's the time".split(' ') #=> ["now's", "the", "time"]
2266 * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
2267 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
2268 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
2269 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
2272 static mrb_value
2273 mrb_str_split_m(mrb_state *mrb, mrb_value str)
2275 mrb_int argc;
2276 mrb_value spat = mrb_nil_value();
2277 enum {awk, string} split_type = string;
2278 mrb_int i = 0;
2279 mrb_int beg;
2280 mrb_int end;
2281 mrb_int lim = 0;
2282 mrb_bool lim_p;
2283 mrb_value result, tmp;
2285 argc = mrb_get_args(mrb, "|oi", &spat, &lim);
2286 lim_p = (lim > 0 && argc == 2);
2287 if (argc == 2) {
2288 if (lim == 1) {
2289 if (RSTRING_LEN(str) == 0)
2290 return mrb_ary_new_capa(mrb, 0);
2291 return mrb_ary_new_from_values(mrb, 1, &str);
2293 i = 1;
2296 if (argc == 0 || mrb_nil_p(spat)) {
2297 split_type = awk;
2299 else if (!mrb_string_p(spat)) {
2300 mrb_raise(mrb, E_TYPE_ERROR, "expected String");
2302 else if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
2303 split_type = awk;
2306 result = mrb_ary_new(mrb);
2307 beg = 0;
2308 if (split_type == awk) {
2309 mrb_bool skip = TRUE;
2310 mrb_int idx = 0;
2311 mrb_int str_len = RSTRING_LEN(str);
2312 unsigned int c;
2313 int ai = mrb_gc_arena_save(mrb);
2315 idx = end = beg;
2316 while (idx < str_len) {
2317 c = (unsigned char)RSTRING_PTR(str)[idx++];
2318 if (skip) {
2319 if (ISSPACE(c)) {
2320 beg = idx;
2322 else {
2323 end = idx;
2324 skip = FALSE;
2325 if (lim_p && lim <= i) break;
2328 else if (ISSPACE(c)) {
2329 mrb_ary_push(mrb, result, mrb_str_byte_subseq(mrb, str, beg, end-beg));
2330 mrb_gc_arena_restore(mrb, ai);
2331 skip = TRUE;
2332 beg = idx;
2333 if (lim_p) i++;
2335 else {
2336 end = idx;
2340 else { /* split_type == string */
2341 mrb_int str_len = RSTRING_LEN(str);
2342 mrb_int pat_len = RSTRING_LEN(spat);
2343 mrb_int idx = 0;
2344 int ai = mrb_gc_arena_save(mrb);
2346 while (idx < str_len) {
2347 if (pat_len > 0) {
2348 end = mrb_memsearch(RSTRING_PTR(spat), pat_len, RSTRING_PTR(str)+idx, str_len - idx);
2349 if (end < 0) break;
2351 else {
2352 end = chars2bytes(str, idx, 1);
2354 mrb_ary_push(mrb, result, mrb_str_byte_subseq(mrb, str, idx, end));
2355 mrb_gc_arena_restore(mrb, ai);
2356 idx += end + pat_len;
2357 if (lim_p && lim <= ++i) break;
2359 beg = idx;
2361 if (RSTRING_LEN(str) > 0 && (lim_p || RSTRING_LEN(str) > beg || lim < 0)) {
2362 if (RSTRING_LEN(str) == beg) {
2363 tmp = mrb_str_new(mrb, 0, 0);
2365 else {
2366 tmp = mrb_str_byte_subseq(mrb, str, beg, RSTRING_LEN(str)-beg);
2368 mrb_ary_push(mrb, result, tmp);
2370 if (!lim_p && lim == 0) {
2371 mrb_int len;
2372 while ((len = RARRAY_LEN(result)) > 0 &&
2373 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
2374 mrb_ary_pop(mrb, result);
2377 return result;
2380 static mrb_bool
2381 trailingbad(const char *str, const char *p, const char *pend)
2383 if (p == str) return TRUE; /* no number */
2384 if (*(p - 1) == '_') return TRUE; /* trailing '_' */
2385 while (p<pend && ISSPACE(*p)) p++;
2386 if (p<pend) return TRUE; /* trailing garbage */
2387 return FALSE;
2390 static mrb_value
2391 mrb_str_len_to_integer(mrb_state *mrb, const char *str, size_t len, mrb_int base, int badcheck)
2393 const char *p = str;
2394 const char *pend = str + len;
2395 #ifdef MRB_USE_BIGINT
2396 const char *p2 = NULL;
2397 #endif
2398 char sign = 1;
2399 int c;
2400 mrb_int n = 0;
2401 mrb_int val;
2403 #define conv_digit(c) \
2404 (ISDIGIT(c) ? ((c) - '0') : \
2405 ISLOWER(c) ? ((c) - 'a' + 10) : \
2406 ISUPPER(c) ? ((c) - 'A' + 10) : \
2409 if (!p) {
2410 if (badcheck) goto bad;
2411 return mrb_fixnum_value(0);
2413 while (p<pend && ISSPACE(*p))
2414 p++;
2416 if (p[0] == '+') {
2417 p++;
2419 else if (p[0] == '-') {
2420 p++;
2421 sign = 0;
2423 if (base <= 0) {
2424 if (p[0] == '0') {
2425 switch (p[1]) {
2426 case 'x': case 'X':
2427 base = 16;
2428 break;
2429 case 'b': case 'B':
2430 base = 2;
2431 break;
2432 case 'o': case 'O':
2433 base = 8;
2434 break;
2435 case 'd': case 'D':
2436 base = 10;
2437 break;
2438 default:
2439 base = 8;
2440 break;
2443 else if (base < -1) {
2444 base = -base;
2446 else {
2447 base = 10;
2450 switch (base) {
2451 case 2:
2452 if (p[0] == '0' && (p[1] == 'b'||p[1] == 'B')) {
2453 p += 2;
2455 break;
2456 case 3:
2457 break;
2458 case 8:
2459 if (p[0] == '0' && (p[1] == 'o'||p[1] == 'O')) {
2460 p += 2;
2462 case 4: case 5: case 6: case 7:
2463 break;
2464 case 10:
2465 if (p[0] == '0' && (p[1] == 'd'||p[1] == 'D')) {
2466 p += 2;
2468 case 9: case 11: case 12: case 13: case 14: case 15:
2469 break;
2470 case 16:
2471 if (p[0] == '0' && (p[1] == 'x'||p[1] == 'X')) {
2472 p += 2;
2474 break;
2475 default:
2476 if (base < 2 || 36 < base) {
2477 mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %i", base);
2479 break;
2480 } /* end of switch (base) { */
2481 if (p>=pend) {
2482 if (badcheck) goto bad;
2483 return mrb_fixnum_value(0);
2485 if (*p == '0') { /* squeeze preceding 0s */
2486 p++;
2487 while (p<pend) {
2488 c = *p++;
2489 if (c == '_') {
2490 if (p<pend && *p == '_') {
2491 if (badcheck) goto bad;
2492 break;
2494 continue;
2496 if (c != '0') {
2497 p--;
2498 break;
2501 if (*(p - 1) == '0')
2502 p--;
2504 if (p == pend || *p == '_') {
2505 if (badcheck) goto bad;
2506 return mrb_fixnum_value(0);
2508 #ifdef MRB_USE_BIGINT
2509 p2 = p;
2510 #endif
2511 for (;p<pend; p++) {
2512 if (*p == '_') {
2513 p++;
2514 if (p==pend) {
2515 if (badcheck) goto bad;
2516 continue;
2518 if (*p == '_') {
2519 if (badcheck) goto bad;
2520 break;
2523 if (badcheck && *p == '\0') {
2524 mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
2526 c = conv_digit(*p);
2527 if (c < 0 || c >= base) {
2528 break;
2530 if (mrb_int_mul_overflow(n, base, &n)) goto overflow;
2531 if (MRB_INT_MAX - c < n) {
2532 if (sign == 0 && MRB_INT_MAX - n == c - 1) {
2533 n = MRB_INT_MIN;
2534 sign = 1;
2535 break;
2537 overflow:
2538 #ifdef MRB_USE_BIGINT
2540 const char *p3 = p2;
2541 while (p3 < pend) {
2542 char c = TOLOWER(*p3);
2543 const char *p4 = strchr(mrb_digitmap, c);
2544 if (p4 == NULL && c != '_') break;
2545 if (p4 - mrb_digitmap >= base) break;
2546 p3++;
2548 if (badcheck && trailingbad(str, p, pend)) goto bad;
2549 return mrb_bint_new_str(mrb, p2, (mrb_int)(p3-p2), sign ? base : -base);
2550 #else
2551 mrb_raisef(mrb, E_RANGE_ERROR, "string (%l) too big for integer", str, pend-str);
2552 #endif
2554 n += c;
2556 val = (mrb_int)n;
2557 if (badcheck && trailingbad(str, p, pend)) goto bad;
2558 return mrb_int_value(mrb, sign ? val : -val);
2559 bad:
2560 mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for number(%!l)", str, pend-str);
2561 /* not reached */
2562 return mrb_fixnum_value(0);
2565 /* obsolete: use RSTRING_CSTR() or mrb_string_cstr() */
2566 MRB_API const char*
2567 mrb_string_value_cstr(mrb_state *mrb, mrb_value *ptr)
2569 struct RString *ps;
2570 const char *p;
2571 mrb_int len;
2573 mrb_ensure_string_type(mrb, *ptr);
2574 ps = mrb_str_ptr(*ptr);
2575 check_null_byte(mrb, ps);
2576 p = RSTR_PTR(ps);
2577 len = RSTR_LEN(ps);
2578 if (p == NULL) return "";
2579 if (p[len] == '\0') {
2580 return p;
2584 * Even after str_modify_keep_ascii(), NULL termination is not ensured if
2585 * RSTR_SET_LEN() is used explicitly (e.g. String#delete_suffix!).
2587 str_modify_keep_ascii(mrb, ps);
2588 RSTR_PTR(ps)[len] = '\0';
2589 return RSTR_PTR(ps);
2592 MRB_API const char*
2593 mrb_string_cstr(mrb_state *mrb, mrb_value str)
2595 return mrb_string_value_cstr(mrb, &str);
2598 MRB_API mrb_value
2599 mrb_str_to_integer(mrb_state *mrb, mrb_value str, mrb_int base, mrb_bool badcheck)
2601 const char *s;
2602 mrb_int len;
2604 mrb_ensure_string_type(mrb, str);
2605 s = RSTRING_PTR(str);
2606 len = RSTRING_LEN(str);
2607 return mrb_str_len_to_integer(mrb, s, len, base, badcheck);
2610 /* 15.2.10.5.38 */
2612 * call-seq:
2613 * str.to_i(base=10) => integer
2615 * Returns the result of interpreting leading characters in <i>str</i> as an
2616 * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
2617 * end of a valid number are ignored. If there is not a valid number at the
2618 * start of <i>str</i>, <code>0</code> is returned. This method never raises an
2619 * exception.
2621 * "12345".to_i #=> 12345
2622 * "99 red balloons".to_i #=> 99
2623 * "0a".to_i #=> 0
2624 * "0a".to_i(16) #=> 10
2625 * "hello".to_i #=> 0
2626 * "1100101".to_i(2) #=> 101
2627 * "1100101".to_i(8) #=> 294977
2628 * "1100101".to_i(10) #=> 1100101
2629 * "1100101".to_i(16) #=> 17826049
2631 static mrb_value
2632 mrb_str_to_i(mrb_state *mrb, mrb_value self)
2634 mrb_int base = 10;
2636 mrb_get_args(mrb, "|i", &base);
2637 if (base < 0 || 36 < base) {
2638 mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %i", base);
2640 return mrb_str_to_integer(mrb, self, base, FALSE);
2643 #ifndef MRB_NO_FLOAT
2644 static double
2645 mrb_str_len_to_dbl(mrb_state *mrb, const char *s, size_t len, mrb_bool badcheck)
2647 char buf[DBL_DIG * 4 + 20];
2648 const char *p = s, *p2;
2649 const char *pend = p + len;
2650 char *end;
2651 char *n;
2652 char prev = 0;
2653 double d;
2654 mrb_bool dot = FALSE;
2656 if (!p) return 0.0;
2657 while (p<pend && ISSPACE(*p)) p++;
2658 p2 = p;
2660 if (pend - p > 2 && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
2661 mrb_value x;
2663 if (!badcheck) return 0.0;
2664 x = mrb_str_len_to_integer(mrb, p, pend-p, 0, badcheck);
2665 if (mrb_integer_p(x))
2666 d = (double)mrb_integer(x);
2667 else /* if (mrb_float_p(x)) */
2668 d = mrb_float(x);
2669 return d;
2671 while (p < pend) {
2672 if (!*p) {
2673 if (badcheck) {
2674 mrb_raise(mrb, E_ARGUMENT_ERROR, "string for Float contains null byte");
2675 /* not reached */
2677 pend = p;
2678 p = p2;
2679 goto nocopy;
2681 if (!badcheck && *p == ' ') {
2682 pend = p;
2683 p = p2;
2684 goto nocopy;
2686 if (*p == '_') break;
2687 p++;
2689 p = p2;
2690 n = buf;
2691 while (p < pend) {
2692 char c = *p++;
2693 if (c == '.') dot = TRUE;
2694 if (c == '_') {
2695 /* remove an underscore between digits */
2696 if (n == buf || !ISDIGIT(prev) || p == pend) {
2697 if (badcheck) goto bad;
2698 break;
2701 else if (badcheck && prev == '_' && !ISDIGIT(c)) goto bad;
2702 else {
2703 const char *bend = buf+sizeof(buf)-1;
2704 if (n==bend) { /* buffer overflow */
2705 if (dot) break; /* cut off remaining fractions */
2706 return INFINITY;
2708 *n++ = c;
2710 prev = c;
2712 *n = '\0';
2713 p = buf;
2714 pend = n;
2715 nocopy:
2716 if (mrb_read_float(p, &end, &d) == FALSE) {
2717 if (badcheck) {
2718 bad:
2719 mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for float(%!s)", s);
2720 /* not reached */
2722 return 0.0;
2724 if (badcheck) {
2725 if (!end || p == end) goto bad;
2726 while (end<pend && ISSPACE(*end)) end++;
2727 if (end<pend) goto bad;
2729 return d;
2732 MRB_API double
2733 mrb_str_to_dbl(mrb_state *mrb, mrb_value str, mrb_bool badcheck)
2735 return mrb_str_len_to_dbl(mrb, RSTRING_PTR(str), RSTRING_LEN(str), badcheck);
2738 /* 15.2.10.5.39 */
2740 * call-seq:
2741 * str.to_f => float
2743 * Returns the result of interpreting leading characters in <i>str</i> as a
2744 * floating-point number. Extraneous characters past the end of a valid number
2745 * are ignored. If there is not a valid number at the start of <i>str</i>,
2746 * <code>0.0</code> is returned. This method never raises an exception.
2748 * "123.45e1".to_f #=> 1234.5
2749 * "45.67 degrees".to_f #=> 45.67
2750 * "thx1138".to_f #=> 0.0
2752 static mrb_value
2753 mrb_str_to_f(mrb_state *mrb, mrb_value self)
2755 return mrb_float_value(mrb, mrb_str_to_dbl(mrb, self, FALSE));
2757 #endif
2759 /* 15.2.10.5.40 */
2761 * call-seq:
2762 * str.to_s => str
2764 * Returns the receiver.
2766 static mrb_value
2767 mrb_str_to_s(mrb_state *mrb, mrb_value self)
2769 if (mrb_obj_class(mrb, self) != mrb->string_class) {
2770 return mrb_str_dup(mrb, self);
2772 return self;
2775 /* 15.2.10.5.43 */
2777 * call-seq:
2778 * str.upcase! => str or nil
2780 * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
2781 * were made.
2783 static mrb_value
2784 mrb_str_upcase_bang(mrb_state *mrb, mrb_value str)
2786 struct RString *s = mrb_str_ptr(str);
2787 char *p, *pend;
2788 mrb_bool modify = FALSE;
2790 mrb_str_modify_keep_ascii(mrb, s);
2791 p = RSTRING_PTR(str);
2792 pend = RSTRING_END(str);
2793 while (p < pend) {
2794 if (ISLOWER(*p)) {
2795 *p = TOUPPER(*p);
2796 modify = TRUE;
2798 p++;
2801 if (modify) return str;
2802 return mrb_nil_value();
2805 /* 15.2.10.5.42 */
2807 * call-seq:
2808 * str.upcase => new_str
2810 * Returns a copy of <i>str</i> with all lowercase letters replaced with their
2811 * uppercase counterparts. The operation is locale insensitive---only
2812 * characters 'a' to 'z' are affected.
2814 * "hEllO".upcase #=> "HELLO"
2816 static mrb_value
2817 mrb_str_upcase(mrb_state *mrb, mrb_value self)
2819 mrb_value str;
2821 str = mrb_str_dup(mrb, self);
2822 mrb_str_upcase_bang(mrb, str);
2823 return str;
2827 * call-seq:
2828 * str.dump -> new_str
2830 * Produces a version of <i>str</i> with all nonprinting characters replaced by
2831 * <code>\nnn</code> notation and all special characters escaped.
2833 mrb_value
2834 mrb_str_dump(mrb_state *mrb, mrb_value str)
2836 return str_escape(mrb, str, FALSE);
2839 MRB_API mrb_value
2840 mrb_str_cat(mrb_state *mrb, mrb_value str, const char *ptr, size_t len)
2842 struct RString *s = mrb_str_ptr(str);
2843 mrb_int capa;
2844 mrb_int total;
2845 ptrdiff_t off = -1;
2847 if (len == 0) return str;
2848 mrb_str_modify(mrb, s);
2849 if (ptr >= RSTR_PTR(s) && ptr <= RSTR_PTR(s) + (size_t)RSTR_LEN(s)) {
2850 off = ptr - RSTR_PTR(s);
2853 capa = RSTR_CAPA(s);
2854 if (mrb_int_add_overflow(RSTR_LEN(s), len, &total)) {
2855 size_error:
2856 mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big");
2858 if (capa <= total) {
2859 if (capa == 0) capa = 1;
2860 while (capa <= total) {
2861 if (mrb_int_mul_overflow(capa, 2, &capa)) goto size_error;
2863 resize_capa(mrb, s, capa);
2865 if (off != -1) {
2866 ptr = RSTR_PTR(s) + off;
2868 memcpy(RSTR_PTR(s) + RSTR_LEN(s), ptr, len);
2869 RSTR_SET_LEN(s, total);
2870 RSTR_PTR(s)[total] = '\0'; /* sentinel */
2871 return str;
2874 MRB_API mrb_value
2875 mrb_str_cat_cstr(mrb_state *mrb, mrb_value str, const char *ptr)
2877 return mrb_str_cat(mrb, str, ptr, ptr ? strlen(ptr) : 0);
2880 MRB_API mrb_value
2881 mrb_str_cat_str(mrb_state *mrb, mrb_value str, mrb_value str2)
2883 if (mrb_str_ptr(str) == mrb_str_ptr(str2)) {
2884 mrb_str_modify(mrb, mrb_str_ptr(str));
2886 return mrb_str_cat(mrb, str, RSTRING_PTR(str2), RSTRING_LEN(str2));
2889 MRB_API mrb_value
2890 mrb_str_append(mrb_state *mrb, mrb_value str1, mrb_value str2)
2892 mrb_ensure_string_type(mrb, str2);
2893 return mrb_str_cat_str(mrb, str1, str2);
2897 * call-seq:
2898 * str.inspect -> string
2900 * Returns a printable version of _str_, surrounded by quote marks,
2901 * with special characters escaped.
2903 * str = "hello"
2904 * str[3] = "\b"
2905 * str.inspect #=> "\"hel\\bo\""
2907 mrb_value
2908 mrb_str_inspect(mrb_state *mrb, mrb_value str)
2910 return str_escape(mrb, str, TRUE);
2914 * call-seq:
2915 * str.bytes -> array of int
2917 * Returns an array of bytes in _str_.
2919 * str = "hello"
2920 * str.bytes #=> [104, 101, 108, 108, 111]
2922 static mrb_value
2923 mrb_str_bytes(mrb_state *mrb, mrb_value str)
2925 struct RString *s = mrb_str_ptr(str);
2926 mrb_value a = mrb_ary_new_capa(mrb, RSTR_LEN(s));
2927 unsigned char *p = (unsigned char*)(RSTR_PTR(s)), *pend = p + RSTR_LEN(s);
2929 while (p < pend) {
2930 mrb_ary_push(mrb, a, mrb_fixnum_value(p[0]));
2931 p++;
2933 return a;
2937 * call-seq:
2938 * str.getbyte(index) -> 0 .. 255
2940 * returns the <i>index</i>th byte as an integer.
2942 static mrb_value
2943 mrb_str_getbyte(mrb_state *mrb, mrb_value str)
2945 mrb_int pos;
2946 mrb_get_args(mrb, "i", &pos);
2948 if (pos < 0)
2949 pos += RSTRING_LEN(str);
2950 if (pos < 0 || RSTRING_LEN(str) <= pos)
2951 return mrb_nil_value();
2953 return mrb_fixnum_value((unsigned char)RSTRING_PTR(str)[pos]);
2957 * call-seq:
2958 * str.setbyte(index, integer) -> integer
2960 * modifies the <i>index</i>th byte as <i>integer</i>.
2962 static mrb_value
2963 mrb_str_setbyte(mrb_state *mrb, mrb_value str)
2965 mrb_int pos, byte;
2966 mrb_int len;
2968 mrb_get_args(mrb, "ii", &pos, &byte);
2970 len = RSTRING_LEN(str);
2971 if (pos < -len || len <= pos)
2972 mrb_raisef(mrb, E_INDEX_ERROR, "index %i out of string", pos);
2973 if (pos < 0)
2974 pos += len;
2976 mrb_str_modify(mrb, mrb_str_ptr(str));
2977 byte &= 0xff;
2978 RSTRING_PTR(str)[pos] = (unsigned char)byte;
2979 return mrb_fixnum_value((unsigned char)byte);
2983 * call-seq:
2984 * str.byteslice(integer) -> new_str or nil
2985 * str.byteslice(integer, integer) -> new_str or nil
2986 * str.byteslice(range) -> new_str or nil
2988 * Byte Reference---If passed a single Integer, returns a
2989 * substring of one byte at that position. If passed two Integer
2990 * objects, returns a substring starting at the offset given by the first, and
2991 * a length given by the second. If given a Range, a substring containing
2992 * bytes at offsets given by the range is returned. In all three cases, if
2993 * an offset is negative, it is counted from the end of <i>str</i>. Returns
2994 * <code>nil</code> if the initial offset falls outside the string, the length
2995 * is negative, or the beginning of the range is greater than the end.
2996 * The encoding of the resulted string keeps original encoding.
2998 * "hello".byteslice(1) #=> "e"
2999 * "hello".byteslice(-1) #=> "o"
3000 * "hello".byteslice(1, 2) #=> "el"
3001 * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
3002 * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
3004 static mrb_value
3005 mrb_str_byteslice(mrb_state *mrb, mrb_value str)
3007 mrb_value a1;
3008 mrb_int str_len, beg, len;
3009 mrb_bool empty = TRUE;
3011 len = mrb_get_argc(mrb);
3012 switch (len) {
3013 case 2:
3014 mrb_get_args(mrb, "ii", &beg, &len);
3015 str_len = RSTRING_LEN(str);
3016 break;
3017 case 1:
3018 a1 = mrb_get_arg1(mrb);
3019 str_len = RSTRING_LEN(str);
3020 if (mrb_range_p(a1)) {
3021 if (mrb_range_beg_len(mrb, a1, &beg, &len, str_len, TRUE) != MRB_RANGE_OK) {
3022 return mrb_nil_value();
3025 else {
3026 beg = mrb_as_int(mrb, a1);
3027 len = 1;
3028 empty = FALSE;
3030 break;
3031 default:
3032 mrb_argnum_error(mrb, len, 1, 2);
3033 break;
3035 if (mrb_str_beg_len(str_len, &beg, &len) && (empty || len != 0)) {
3036 return mrb_str_byte_subseq(mrb, str, beg, len);
3038 else {
3039 return mrb_nil_value();
3043 static mrb_value
3044 sub_replace(mrb_state *mrb, mrb_value self)
3046 char *p, *match;
3047 mrb_int plen, mlen;
3048 mrb_int found, offset;
3049 mrb_value result;
3051 mrb_get_args(mrb, "ssi", &p, &plen, &match, &mlen, &found);
3052 result = mrb_str_new(mrb, 0, 0);
3053 for (mrb_int i=0; i<plen; i++) {
3054 if (p[i] != '\\' || i+1==plen) {
3055 mrb_str_cat(mrb, result, p+i, 1);
3056 continue;
3058 i++;
3059 switch (p[i]) {
3060 case '\\':
3061 mrb_str_cat(mrb, result, "\\", 1);
3062 break;
3063 case '`':
3064 mrb_str_cat(mrb, result, RSTRING_PTR(self), found);
3065 break;
3066 case '&': case '0':
3067 mrb_str_cat(mrb, result, match, mlen);
3068 break;
3069 case '\'':
3070 offset = found + mlen;
3071 if (RSTRING_LEN(self) > offset) {
3072 mrb_str_cat(mrb, result, RSTRING_PTR(self)+offset, RSTRING_LEN(self)-offset);
3074 break;
3075 case '1': case '2': case '3':
3076 case '4': case '5': case '6':
3077 case '7': case '8': case '9':
3078 /* ignore sub-group match (no Regexp supported) */
3079 break;
3080 default:
3081 mrb_str_cat(mrb, result, &p[i-1], 2);
3082 break;
3085 return result;
3089 static mrb_value
3090 str_bytesplice(mrb_state *mrb, mrb_value str, mrb_int idx1, mrb_int len1, mrb_value replace, mrb_int idx2, mrb_int len2)
3092 struct RString *s = RSTRING(str);
3093 if (idx1 < 0) {
3094 idx1 += RSTR_LEN(s);
3096 if (idx2 < 0) {
3097 idx2 += RSTRING_LEN(replace);
3099 if (RSTR_LEN(s) < idx1 || idx1 < 0 || RSTRING_LEN(replace) < idx2 || idx2 < 0) {
3100 mrb_raise(mrb, E_INDEX_ERROR, "index out of string");
3102 if (len1 < 0 || len2 < 0) {
3103 mrb_raise(mrb, E_INDEX_ERROR, "negative length");
3105 mrb_int n;
3106 if (mrb_int_add_overflow(idx1, len1, &n) || RSTR_LEN(s) < n) {
3107 len1 = RSTR_LEN(s) - idx1;
3109 if (mrb_int_add_overflow(idx2, len2, &n) || RSTRING_LEN(replace) < n) {
3110 len2 = RSTRING_LEN(replace) - idx2;
3112 mrb_str_modify(mrb, s);
3113 if (len1 >= len2) {
3114 memmove(RSTR_PTR(s)+idx1, RSTRING_PTR(replace)+idx2, len2);
3115 if (len1 > len2) {
3116 memmove(RSTR_PTR(s)+idx1+len2, RSTR_PTR(s)+idx1+len1, RSTR_LEN(s)-(idx1+len1));
3117 RSTR_SET_LEN(s, RSTR_LEN(s)-(len1-len2));
3120 else { /* len1 < len2 */
3121 mrb_int slen = RSTR_LEN(s);
3122 mrb_str_resize(mrb, str, slen+len2-len1);
3123 memmove(RSTR_PTR(s)+idx1+len2, RSTR_PTR(s)+idx1+len1, slen-(idx1+len1));
3124 memmove(RSTR_PTR(s)+idx1, RSTRING_PTR(replace)+idx2, len2);
3126 return str;
3130 * call-seq:
3131 * bytesplice(index, length, str) -> string
3132 * bytesplice(index, length, str, str_index, str_length) -> string
3133 * bytesplice(range, str) -> string
3134 * bytesplice(range, str, str_range) -> string
3136 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
3137 * The portion of the string affected is determined using
3138 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
3139 * If the replacement string is not the same length as the text it is replacing,
3140 * the string will be adjusted accordingly.
3142 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
3144 * The form that take an Integer will raise an IndexError if the value is out
3145 * of range; the Range form will raise a RangeError.
3146 * If the beginning or ending offset does not land on character (codepoint)
3147 * boundary, an IndexError will be raised.
3149 static mrb_value
3150 mrb_str_bytesplice(mrb_state *mrb, mrb_value str)
3152 mrb_int idx1, len1, idx2, len2;
3153 mrb_value range1, range2, replace;
3154 switch (mrb_get_argc(mrb)) {
3155 case 3:
3156 mrb_get_args(mrb, "ooo", &range1, &replace, &range2);
3157 if (mrb_integer_p(range1)) {
3158 mrb_get_args(mrb, "iiS", &idx1, &len1, &replace);
3159 return str_bytesplice(mrb, str, idx1, len1, replace, 0, RSTRING_LEN(replace));
3161 mrb_ensure_string_type(mrb, replace);
3162 if (mrb_range_beg_len(mrb, range1, &idx1, &len1, RSTRING_LEN(str), FALSE) != MRB_RANGE_OK) break;
3163 if (mrb_range_beg_len(mrb, range2, &idx2, &len2, RSTRING_LEN(replace), FALSE) != MRB_RANGE_OK) break;
3164 return str_bytesplice(mrb, str, idx1, len1, replace, idx2, len2);
3165 case 5:
3166 mrb_get_args(mrb, "iiSii", &idx1, &len1, &replace, &idx2, &len2);
3167 return str_bytesplice(mrb, str, idx1, len1, replace, idx2, len2);
3168 case 2:
3169 mrb_get_args(mrb, "oS", &range1, &replace);
3170 if (mrb_range_beg_len(mrb, range1, &idx1, &len1, RSTRING_LEN(str), FALSE) == MRB_RANGE_OK) {
3171 return str_bytesplice(mrb, str, idx1, len1, replace, 0, RSTRING_LEN(replace));
3173 default:
3174 break;
3176 mrb_raise(mrb, E_ARGUMENT_ERROR, "wrong number of arumgnts");
3179 static mrb_value
3180 mrb_encoding(mrb_state *mrb, mrb_value self)
3182 mrb_get_args(mrb, "");
3183 #ifdef MRB_UTF8_STRING
3184 return mrb_str_new_lit(mrb, "UTF-8");
3185 #else
3186 return mrb_str_new_lit(mrb, "ASCII-8BIT");
3187 #endif
3190 /* ---------------------------*/
3191 void
3192 mrb_init_string(mrb_state *mrb)
3194 struct RClass *s;
3196 mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << MRB_STR_EMBED_LEN_BIT),
3197 "pointer size too big for embedded string");
3199 mrb->string_class = s = mrb_define_class_id(mrb, MRB_SYM(String), mrb->object_class); /* 15.2.10 */
3200 MRB_SET_INSTANCE_TT(s, MRB_TT_STRING);
3202 mrb_define_method_id(mrb, s, MRB_SYM(bytesize), mrb_str_bytesize, MRB_ARGS_NONE());
3204 mrb_define_method_id(mrb, s, MRB_OPSYM(cmp), mrb_str_cmp_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.1 */
3205 mrb_define_method_id(mrb, s, MRB_OPSYM(eq), mrb_str_equal_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.2 */
3206 mrb_define_method_id(mrb, s, MRB_OPSYM(add), mrb_str_plus_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.4 */
3207 mrb_define_method_id(mrb, s, MRB_OPSYM(mul), mrb_str_times, MRB_ARGS_REQ(1)); /* 15.2.10.5.5 */
3208 mrb_define_method_id(mrb, s, MRB_OPSYM(aref), mrb_str_aref_m, MRB_ARGS_ANY()); /* 15.2.10.5.6 */
3209 mrb_define_method_id(mrb, s, MRB_OPSYM(aset), mrb_str_aset_m, MRB_ARGS_ANY());
3210 mrb_define_method_id(mrb, s, MRB_SYM(capitalize), mrb_str_capitalize, MRB_ARGS_NONE()); /* 15.2.10.5.7 */
3211 mrb_define_method_id(mrb, s, MRB_SYM_B(capitalize), mrb_str_capitalize_bang, MRB_ARGS_NONE()); /* 15.2.10.5.8 */
3212 mrb_define_method_id(mrb, s, MRB_SYM(chomp), mrb_str_chomp, MRB_ARGS_ANY()); /* 15.2.10.5.9 */
3213 mrb_define_method_id(mrb, s, MRB_SYM_B(chomp), mrb_str_chomp_bang, MRB_ARGS_ANY()); /* 15.2.10.5.10 */
3214 mrb_define_method_id(mrb, s, MRB_SYM(chop), mrb_str_chop, MRB_ARGS_NONE()); /* 15.2.10.5.11 */
3215 mrb_define_method_id(mrb, s, MRB_SYM_B(chop), mrb_str_chop_bang, MRB_ARGS_NONE()); /* 15.2.10.5.12 */
3216 mrb_define_method_id(mrb, s, MRB_SYM(downcase), mrb_str_downcase, MRB_ARGS_NONE()); /* 15.2.10.5.13 */
3217 mrb_define_method_id(mrb, s, MRB_SYM_B(downcase), mrb_str_downcase_bang, MRB_ARGS_NONE()); /* 15.2.10.5.14 */
3218 mrb_define_method_id(mrb, s, MRB_SYM_Q(empty), mrb_str_empty_p, MRB_ARGS_NONE()); /* 15.2.10.5.16 */
3219 mrb_define_method_id(mrb, s, MRB_SYM_Q(eql), mrb_str_eql, MRB_ARGS_REQ(1)); /* 15.2.10.5.17 */
3221 mrb_define_method_id(mrb, s, MRB_SYM(hash), mrb_str_hash_m, MRB_ARGS_NONE()); /* 15.2.10.5.20 */
3222 mrb_define_method_id(mrb, s, MRB_SYM_Q(include), mrb_str_include, MRB_ARGS_REQ(1)); /* 15.2.10.5.21 */
3223 mrb_define_method_id(mrb, s, MRB_SYM(index), mrb_str_index_m, MRB_ARGS_ARG(1,1)); /* 15.2.10.5.22 */
3224 mrb_define_method_id(mrb, s, MRB_SYM(initialize), mrb_str_init, MRB_ARGS_REQ(1)); /* 15.2.10.5.23 */
3225 mrb_define_method_id(mrb, s, MRB_SYM(initialize_copy), mrb_str_replace, MRB_ARGS_REQ(1)); /* 15.2.10.5.24 */
3226 mrb_define_method_id(mrb, s, MRB_SYM(intern), mrb_str_intern, MRB_ARGS_NONE()); /* 15.2.10.5.25 */
3227 mrb_define_method_id(mrb, s, MRB_SYM(length), mrb_str_size, MRB_ARGS_NONE()); /* 15.2.10.5.26 */
3228 mrb_define_method_id(mrb, s, MRB_SYM(replace), mrb_str_replace, MRB_ARGS_REQ(1)); /* 15.2.10.5.28 */
3229 mrb_define_method_id(mrb, s, MRB_SYM(reverse), mrb_str_reverse, MRB_ARGS_NONE()); /* 15.2.10.5.29 */
3230 mrb_define_method_id(mrb, s, MRB_SYM_B(reverse), mrb_str_reverse_bang, MRB_ARGS_NONE()); /* 15.2.10.5.30 */
3231 mrb_define_method_id(mrb, s, MRB_SYM(rindex), mrb_str_rindex_m, MRB_ARGS_ANY()); /* 15.2.10.5.31 */
3232 mrb_define_method_id(mrb, s, MRB_SYM(size), mrb_str_size, MRB_ARGS_NONE()); /* 15.2.10.5.33 */
3233 mrb_define_method_id(mrb, s, MRB_SYM(slice), mrb_str_aref_m, MRB_ARGS_ANY()); /* 15.2.10.5.34 */
3234 mrb_define_method_id(mrb, s, MRB_SYM(split), mrb_str_split_m, MRB_ARGS_ANY()); /* 15.2.10.5.35 */
3236 #ifndef MRB_NO_FLOAT
3237 mrb_define_method_id(mrb, s, MRB_SYM(to_f), mrb_str_to_f, MRB_ARGS_NONE()); /* 15.2.10.5.38 */
3238 #endif
3239 mrb_define_method_id(mrb, s, MRB_SYM(to_i), mrb_str_to_i, MRB_ARGS_ANY()); /* 15.2.10.5.39 */
3240 mrb_define_method_id(mrb, s, MRB_SYM(to_s), mrb_str_to_s, MRB_ARGS_NONE()); /* 15.2.10.5.40 */
3241 mrb_define_method_id(mrb, s, MRB_SYM(to_str), mrb_str_to_s, MRB_ARGS_NONE());
3242 mrb_define_method_id(mrb, s, MRB_SYM(to_sym), mrb_str_intern, MRB_ARGS_NONE()); /* 15.2.10.5.41 */
3243 mrb_define_method_id(mrb, s, MRB_SYM(upcase), mrb_str_upcase, MRB_ARGS_NONE()); /* 15.2.10.5.42 */
3244 mrb_define_method_id(mrb, s, MRB_SYM_B(upcase), mrb_str_upcase_bang, MRB_ARGS_NONE()); /* 15.2.10.5.43 */
3245 mrb_define_method_id(mrb, s, MRB_SYM(inspect), mrb_str_inspect, MRB_ARGS_NONE()); /* 15.2.10.5.46(x) */
3246 mrb_define_method_id(mrb, s, MRB_SYM(bytes), mrb_str_bytes, MRB_ARGS_NONE());
3248 mrb_define_method_id(mrb, s, MRB_SYM(getbyte), mrb_str_getbyte, MRB_ARGS_REQ(1));
3249 mrb_define_method_id(mrb, s, MRB_SYM(setbyte), mrb_str_setbyte, MRB_ARGS_REQ(2));
3250 mrb_define_method_id(mrb, s, MRB_SYM(byteindex), mrb_str_byteindex_m, MRB_ARGS_ARG(1,1));
3251 mrb_define_method_id(mrb, s, MRB_SYM(byterindex), mrb_str_byterindex_m, MRB_ARGS_ARG(1,1));
3252 mrb_define_method_id(mrb, s, MRB_SYM(byteslice), mrb_str_byteslice, MRB_ARGS_ARG(1,1));
3253 mrb_define_method_id(mrb, s, MRB_SYM(bytesplice), mrb_str_bytesplice, MRB_ARGS_ANY());
3255 mrb_define_method_id(mrb, s, MRB_SYM(__sub_replace), sub_replace, MRB_ARGS_REQ(3)); /* internal */
3257 mrb_define_method_id(mrb, mrb->kernel_module, MRB_SYM(__ENCODING__), mrb_encoding, MRB_ARGS_NONE());