Merge pull request #6289 from dearblue/orphan-block
[mruby.git] / src / symbol.c
blob4ff16666d39588173ee408492b81ddf120fbd9a7
1 /*
2 ** symbol.c - Symbol class
3 **
4 ** See Copyright Notice in mruby.h
5 */
7 #include <string.h>
8 #include <mruby.h>
9 #include <mruby/khash.h>
10 #include <mruby/string.h>
11 #include <mruby/dump.h>
12 #include <mruby/class.h>
13 #include <mruby/internal.h>
14 #include <mruby/presym.h>
16 #ifndef MRB_NO_PRESYM
18 #ifndef MRB_PRESYM_SCANNING
19 /* const uint16_t presym_length_table[] */
20 /* const char * const presym_name_table[] */
21 # include <mruby/presym/table.h>
22 #endif
24 static mrb_sym
25 presym_find(const char *name, size_t len)
27 if (presym_length_table[MRB_PRESYM_MAX-1] < len) return 0;
29 mrb_sym presym_size = MRB_PRESYM_MAX;
30 for (mrb_sym start = 0; presym_size != 0; presym_size/=2) {
31 mrb_sym idx = start+presym_size/2;
32 int cmp = (int)len-(int)presym_length_table[idx];
33 if (cmp == 0) {
34 cmp = memcmp(name, presym_name_table[idx], len);
35 if (cmp == 0) return idx+1;
37 if (0 < cmp) {
38 start = ++idx;
39 --presym_size;
42 return 0;
45 static const char*
46 presym_sym2name(mrb_sym sym, mrb_int *lenp)
48 if (sym > MRB_PRESYM_MAX) return NULL;
49 if (lenp) *lenp = presym_length_table[sym-1];
50 return presym_name_table[sym-1];
53 #endif /* MRB_NO_PRESYM */
55 /* ------------------------------------------------------ */
56 static void
57 sym_validate_len(mrb_state *mrb, size_t len)
59 if (len >= UINT16_MAX) {
60 mrb_raise(mrb, E_ARGUMENT_ERROR, "symbol length too long");
64 #ifdef MRB_USE_ALL_SYMBOLS
65 # define SYMBOL_INLINE_P(sym) FALSE
66 # define sym_inline_pack(name, len) 0
67 # define sym_inline_unpack(sym, buf, lenp) NULL
68 #else
69 # define SYMBOL_INLINE_P(sym) ((sym) >= (1<<24))
71 static const char pack_table[] = "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
73 static mrb_sym
74 sym_inline_pack(const char *name, size_t len)
76 #if defined(MRB_WORD_BOXING) && defined(MRB_32BIT) && !defined(MRB_WORDBOX_NO_FLOAT_TRUNCATE)
77 const size_t pack_length_max = 4;
78 #else
79 const size_t pack_length_max = 5;
80 #endif
82 mrb_sym sym = 0;
84 if (len > pack_length_max) return 0; /* too long */
85 if (len == 0) return 0; /* empty string */
86 for (size_t i=0; i<len; i++) {
87 char c = name[i];
88 if (c == 0) return 0; /* NUL in name */
89 const char *p = strchr(pack_table, (int)c);
90 if (p == 0) return 0; /* non alnum char */
92 uint32_t bits = (uint32_t)(p - pack_table)+1;
93 sym |= bits<<(24-i*6);
95 mrb_assert(SYMBOL_INLINE_P(sym));
96 return sym;
99 static const char*
100 sym_inline_unpack(mrb_sym sym, char *buf, mrb_int *lenp)
102 int i;
104 mrb_assert(SYMBOL_INLINE_P(sym));
106 for (i=0; i<5; i++) {
107 uint32_t bits = sym>>(24-i*6) & 0x3f;
108 if (bits == 0) break;
109 buf[i] = pack_table[bits-1];
111 buf[i] = '\0';
112 if (lenp) *lenp = i;
113 return buf;
115 #endif
/*
 * mrb->symflags keeps one bit per slot of the dynamic symbol table.
 * A set bit means the symtbl entry is a plain NUL-terminated literal that
 * was stored without copying; a clear bit means the entry is a heap copy
 * prefixed with its packed length.
 *
 * Arguments and expansions are fully parenthesized so that expressions
 * such as `a | b` can be passed as the index and the macros can be used
 * inside larger expressions.
 */
#define sym_lit_p(mrb, i) ((mrb)->symflags[(i)>>3]&(1<<((i)&7)))
#define sym_lit_set(mrb, i) ((mrb)->symflags[(i)>>3]|=(1<<((i)&7)))
#define sym_flags_clear(mrb, i) ((mrb)->symflags[(i)>>3]&=~(1<<((i)&7)))
121 static mrb_bool
122 sym_check(mrb_state *mrb, const char *name, size_t len, mrb_sym i)
124 const char *symname = mrb->symtbl[i];
125 size_t symlen;
127 if (sym_lit_p(mrb, i)) {
128 symlen = strlen(symname);
130 else {
131 /* length in BER */
132 symlen = mrb_packed_int_decode((const uint8_t*)symname, (const uint8_t**)&symname);
134 if (len == symlen && memcmp(symname, name, len) == 0) {
135 return TRUE;
137 return FALSE;
140 static mrb_sym
141 find_symbol(mrb_state *mrb, const char *name, size_t len, uint8_t *hashp)
143 mrb_sym i;
144 uint8_t hash;
146 #ifndef MRB_NO_PRESYM
147 /* presym */
148 i = presym_find(name, len);
149 if (i > 0) return i;
150 #endif
152 /* inline symbol */
153 i = sym_inline_pack(name, len);
154 if (i > 0) return i;
156 hash = mrb_byte_hash((const uint8_t*)name, len);
157 if (hashp) *hashp = hash;
159 i = mrb->symhash[hash];
160 if (i == 0) return 0;
161 for (;;) {
162 if (sym_check(mrb, name, len, i)) {
163 return (i+MRB_PRESYM_MAX);
165 uint8_t diff = mrb->symlink[i];
166 if (diff == 0xff) {
167 i -= 0xff;
168 while (i > 0) {
169 if (sym_check(mrb, name, len, i)) {
170 return (i+MRB_PRESYM_MAX);
172 i--;
174 return 0;
176 if (diff == 0) return 0;
177 i -= diff;
179 return 0;
182 static mrb_sym
183 sym_intern(mrb_state *mrb, const char *name, size_t len, mrb_bool lit)
185 mrb_sym sym;
186 uint8_t hash;
188 sym_validate_len(mrb, len);
189 sym = find_symbol(mrb, name, len, &hash);
190 if (sym > 0) return sym;
192 /* registering a new symbol */
193 sym = mrb->symidx + 1;
194 if (mrb->symcapa <= sym) {
195 size_t symcapa = mrb->symcapa;
196 if (symcapa == 0) symcapa = 100;
197 else symcapa = (size_t)(symcapa * 6 / 5);
198 mrb->symtbl = (const char**)mrb_realloc(mrb, (void*)mrb->symtbl, sizeof(char*)*symcapa);
199 mrb->symflags = (uint8_t*)mrb_realloc(mrb, mrb->symflags, symcapa/8+1);
200 memset(mrb->symflags+mrb->symcapa/8+1, 0, (symcapa-mrb->symcapa)/8);
201 mrb->symlink = (uint8_t*)mrb_realloc(mrb, mrb->symlink, symcapa);
202 mrb->symcapa = symcapa;
204 sym_flags_clear(mrb, sym);
205 if ((lit || mrb_ro_data_p(name)) && name[len] == 0 && strlen(name) == len) {
206 sym_lit_set(mrb, sym);
207 mrb->symtbl[sym] = name;
209 else {
210 uint32_t ulen = (uint32_t)len;
211 size_t ilen = mrb_packed_int_len(ulen);
212 char *p = (char*)mrb_malloc(mrb, len+ilen+1);
213 mrb_packed_int_encode(ulen, (uint8_t*)p);
214 memcpy(p+ilen, name, len);
215 p[ilen+len] = 0;
216 mrb->symtbl[sym] = p;
218 if (mrb->symhash[hash]) {
219 mrb_sym i = sym - mrb->symhash[hash];
220 if (i > 0xff)
221 mrb->symlink[sym] = 0xff;
222 else
223 mrb->symlink[sym] = i;
225 else {
226 mrb->symlink[sym] = 0;
228 mrb->symhash[hash] = mrb->symidx = sym;
230 return (sym+MRB_PRESYM_MAX);
233 MRB_API mrb_sym
234 mrb_intern(mrb_state *mrb, const char *name, size_t len)
236 return sym_intern(mrb, name, len, FALSE);
239 MRB_API mrb_sym
240 mrb_intern_static(mrb_state *mrb, const char *name, size_t len)
242 return sym_intern(mrb, name, len, TRUE);
245 MRB_API mrb_sym
246 mrb_intern_cstr(mrb_state *mrb, const char *name)
248 return mrb_intern(mrb, name, strlen(name));
251 MRB_API mrb_sym
252 mrb_intern_str(mrb_state *mrb, mrb_value str)
254 return mrb_intern(mrb, RSTRING_PTR(str), RSTRING_LEN(str));
257 MRB_API mrb_sym
258 mrb_intern_check(mrb_state *mrb, const char *name, size_t len)
260 mrb_sym sym;
262 sym_validate_len(mrb, len);
263 sym = find_symbol(mrb, name, len, NULL);
264 if (sym > 0) return sym;
265 return 0;
268 MRB_API mrb_value
269 mrb_check_intern(mrb_state *mrb, const char *name, size_t len)
271 mrb_sym sym = mrb_intern_check(mrb, name, len);
272 if (sym == 0) return mrb_nil_value();
273 return mrb_symbol_value(sym);
276 MRB_API mrb_sym
277 mrb_intern_check_cstr(mrb_state *mrb, const char *name)
279 return mrb_intern_check(mrb, name, strlen(name));
282 MRB_API mrb_value
283 mrb_check_intern_cstr(mrb_state *mrb, const char *name)
285 mrb_sym sym = mrb_intern_check_cstr(mrb, name);
286 if (sym == 0) return mrb_nil_value();
287 return mrb_symbol_value(sym);
290 MRB_API mrb_sym
291 mrb_intern_check_str(mrb_state *mrb, mrb_value str)
293 return mrb_intern_check(mrb, RSTRING_PTR(str), RSTRING_LEN(str));
296 MRB_API mrb_value
297 mrb_check_intern_str(mrb_state *mrb, mrb_value str)
299 mrb_sym sym = mrb_intern_check_str(mrb, str);
300 if (sym == 0) return mrb_nil_value();
301 return mrb_symbol_value(sym);
304 static const char*
305 sym2name_len(mrb_state *mrb, mrb_sym sym, char *buf, mrb_int *lenp)
307 if (sym == 0) goto outofsym;
308 if (SYMBOL_INLINE_P(sym)) return sym_inline_unpack(sym, buf, lenp);
310 #ifndef MRB_NO_PRESYM
312 const char *name = presym_sym2name(sym, lenp);
313 if (name) return name;
315 #endif
316 sym -= MRB_PRESYM_MAX;
318 if (mrb->symidx < sym) {
319 outofsym:
320 if (lenp) *lenp = 0;
321 return NULL;
324 const char *symname = mrb->symtbl[sym];
325 if (!sym_lit_p(mrb, sym)) {
326 uint32_t len = mrb_packed_int_decode((const uint8_t*)symname, (const uint8_t**)&symname);
327 if (lenp) *lenp = (mrb_int)len;
329 else if (lenp) {
330 *lenp = (mrb_int)strlen(symname);
332 return symname;
335 MRB_API const char*
336 mrb_sym_name_len(mrb_state *mrb, mrb_sym sym, mrb_int *lenp)
338 #ifdef MRB_USE_ALL_SYMBOLS
339 return sym2name_len(mrb, sym, NULL, lenp);
340 #else
341 return sym2name_len(mrb, sym, mrb->symbuf, lenp);
342 #endif
345 void
346 mrb_free_symtbl(mrb_state *mrb)
348 mrb_sym i, lim;
350 for (i=1,lim=mrb->symidx+1; i<lim; i++) {
351 if (!sym_lit_p(mrb, i)) {
352 mrb_free(mrb, (char*)mrb->symtbl[i]);
355 mrb_free(mrb, (void*)mrb->symtbl);
356 mrb_free(mrb, (void*)mrb->symlink);
357 mrb_free(mrb, (void*)mrb->symflags);
360 void
361 mrb_init_symtbl(mrb_state *mrb)
/**********************************************************************
 * Document-class: Symbol
 *
 *  <code>Symbol</code> objects represent names and some strings
 *  inside the Ruby
 *  interpreter. They are generated using the <code>:name</code> and
 *  <code>:"string"</code> literal
 *  syntax, and by the various <code>to_sym</code> methods. The same
 *  <code>Symbol</code> object will be created for a given name or string
 *  for the duration of a program's execution, regardless of the context
 *  or meaning of that name. Thus if <code>Fred</code> is a constant in
 *  one context, a method in another, and a class in a third, the
 *  <code>Symbol</code> <code>:Fred</code> will be the same object in
 *  all three contexts.
 *
 *     module One
 *       class Fred
 *       end
 *       $f1 = :Fred
 *     end
 *     module Two
 *       Fred = 1
 *       $f2 = :Fred
 *     end
 *     def Fred()
 *     end
 *     $f3 = :Fred
 *     $f1.object_id   #=> 2514190
 *     $f2.object_id   #=> 2514190
 *     $f3.object_id   #=> 2514190
 *
 */
398 /* 15.2.11.3.2 */
399 /* 15.2.11.3.3 */
401 * call-seq:
402 * sym.to_s -> string
404 * Returns the name or string corresponding to <i>sym</i>.
406 * :fred.to_s #=> "fred"
408 static mrb_value
409 sym_to_s(mrb_state *mrb, mrb_value sym)
411 return mrb_sym_str(mrb, mrb_symbol(sym));
415 * call-seq:
416 * sym.name -> string
418 * Returns the name or string corresponding to <i>sym</i>. Unlike #to_s, the
419 * returned string is frozen.
421 * :fred.name #=> "fred"
422 * :fred.name.frozen? #=> true
424 static mrb_value
425 sym_name(mrb_state *mrb, mrb_value vsym)
427 mrb_sym sym = mrb_symbol(vsym);
428 mrb_int len;
429 const char *name = mrb_sym_name_len(mrb, sym, &len);
431 mrb_assert(name != NULL);
432 if (SYMBOL_INLINE_P(sym)) {
433 return mrb_str_new_frozen(mrb, name, len);
435 return mrb_str_new_static_frozen(mrb, name, len);
/* 15.2.11.3.4  */
/*
 * Document-method: Symbol#to_sym
 *
 *  call-seq:
 *     sym.to_sym   -> sym
 *     sym.intern   -> sym
 *
 *  In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
 *  to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
 *  in this case.
 */
/* 15.2.11.3.5(x)  */
/*
 *  call-seq:
 *     sym.inspect    -> string
 *
 *  Returns the representation of <i>sym</i> as a symbol literal.
 *
 *     :fred.inspect   #=> ":fred"
 */
/*
 * SIGN_EXTEND_CHAR(c) yields `c` as a signed 8-bit quantity, so that a
 * byte of 0xff compares equal to -1 whatever the signedness of `char`.
 */
#if __STDC__
# define SIGN_EXTEND_CHAR(c) ((signed char)(c))
#else  /* not __STDC__ */
/* As in Harbison and Steele.  */
# define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128)
#endif

/* identifier character: alphanumeric or '_', and never the byte 0xff */
#define is_identchar(c) (SIGN_EXTEND_CHAR(c) != -1 && (ISALNUM(c) || (c) == '_'))
469 static mrb_bool
470 is_special_global_name(const char* m)
472 switch (*m) {
473 case '~': case '*': case '$': case '?': case '!': case '@':
474 case '/': case '\\': case ';': case ',': case '.': case '=':
475 case ':': case '<': case '>': case '\"':
476 case '&': case '`': case '\'': case '+':
477 case '0':
478 m++;
479 break;
480 case '-':
481 m++;
482 if (is_identchar(*m)) m += 1;
483 break;
484 default:
485 if (!ISDIGIT(*m)) return FALSE;
486 do m++; while (ISDIGIT(*m));
487 break;
489 return !*m;
492 static mrb_bool
493 symname_p(const char *name)
495 const char *m = name;
496 mrb_bool localid = FALSE;
498 if (!m) return FALSE;
499 switch (*m) {
500 case '\0':
501 return FALSE;
503 case '$':
504 if (is_special_global_name(++m)) return TRUE;
505 goto id;
507 case '@':
508 if (*++m == '@') ++m;
509 goto id;
511 case '<':
512 switch (*++m) {
513 case '<': ++m; break;
514 case '=': if (*++m == '>') m++; break;
515 default: break;
517 break;
519 case '>':
520 switch (*++m) {
521 case '>': case '=': ++m; break;
522 default: break;
524 break;
526 case '=':
527 switch (*++m) {
528 case '~': m++; break;
529 case '=': if (*++m == '=') m++; break;
530 default: return FALSE;
532 break;
534 case '*':
535 if (*++m == '*') m++;
536 break;
537 case '!':
538 switch (*++m) {
539 case '=': case '~': m++;
541 break;
542 case '+': case '-':
543 if (*++m == '@') m++;
544 break;
545 case '|':
546 if (*++m == '|') m++;
547 break;
548 case '&':
549 if (*++m == '&') m++;
550 break;
552 case '^': case '/': case '%': case '~': case '`':
553 m++;
554 break;
556 case '[':
557 if (*++m != ']') return FALSE;
558 if (*++m == '=') m++;
559 break;
561 default:
562 localid = !ISUPPER(*m);
564 if (*m != '_' && !ISALPHA(*m)) return FALSE;
565 while (is_identchar(*m)) m += 1;
566 if (localid) {
567 switch (*m) {
568 case '!': case '?': case '=': m++;
569 default: break;
572 break;
574 return *m ? FALSE : TRUE;
577 static mrb_value
578 sym_inspect(mrb_state *mrb, mrb_value sym)
580 mrb_value str;
581 const char *name;
582 mrb_int len;
583 mrb_sym id = mrb_symbol(sym);
584 char *sp;
586 name = mrb_sym_name_len(mrb, id, &len);
587 str = mrb_str_new(mrb, NULL, len+1);
588 sp = RSTRING_PTR(str);
589 sp[0] = ':';
590 memcpy(sp+1, name, len);
591 mrb_assert_int_fit(mrb_int, len, size_t, SIZE_MAX);
592 if (!symname_p(name) || strlen(name) != (size_t)len) {
593 str = mrb_str_inspect(mrb, str);
594 sp = RSTRING_PTR(str);
595 sp[0] = ':';
596 sp[1] = '"';
598 #ifdef MRB_UTF8_STRING
599 if (SYMBOL_INLINE_P(id)) RSTR_SET_ASCII_FLAG(mrb_str_ptr(str));
600 #endif
601 return str;
604 MRB_API mrb_value
605 mrb_sym_str(mrb_state *mrb, mrb_sym sym)
607 mrb_int len;
608 const char *name = mrb_sym_name_len(mrb, sym, &len);
610 if (!name) return mrb_undef_value(); /* can't happen */
611 if (SYMBOL_INLINE_P(sym)) {
612 mrb_value str = mrb_str_new(mrb, name, len);
613 RSTR_SET_ASCII_FLAG(mrb_str_ptr(str));
614 return str;
616 return mrb_str_new_static(mrb, name, len);
619 static const char*
620 sym_cstr(mrb_state *mrb, mrb_sym sym, mrb_bool dump)
622 mrb_int len;
623 const char *name = mrb_sym_name_len(mrb, sym, &len);
625 if (!name) return NULL;
626 if (strlen(name) == (size_t)len && (!dump || symname_p(name))) {
627 return name;
629 else {
630 mrb_value str = mrb_str_new_static(mrb, name, len);
631 str = mrb_str_dump(mrb, str);
632 return RSTRING_PTR(str);
636 MRB_API const char*
637 mrb_sym_name(mrb_state *mrb, mrb_sym sym)
639 return sym_cstr(mrb, sym, FALSE);
642 MRB_API const char*
643 mrb_sym_dump(mrb_state *mrb, mrb_sym sym)
645 return sym_cstr(mrb, sym, TRUE);
648 #define lesser(a,b) (((a)>(b))?(b):(a))
650 static mrb_value
651 sym_cmp(mrb_state *mrb, mrb_value s1)
653 mrb_value s2 = mrb_get_arg1(mrb);
654 mrb_sym sym1, sym2;
656 if (!mrb_symbol_p(s2)) return mrb_nil_value();
657 sym1 = mrb_symbol(s1);
658 sym2 = mrb_symbol(s2);
659 if (sym1 == sym2) return mrb_fixnum_value(0);
660 else {
661 const char *p1, *p2;
662 int retval;
663 mrb_int len, len1, len2;
664 char buf1[8], buf2[8];
666 p1 = sym2name_len(mrb, sym1, buf1, &len1);
667 p2 = sym2name_len(mrb, sym2, buf2, &len2);
668 len = lesser(len1, len2);
669 retval = memcmp(p1, p2, len);
670 if (retval == 0) {
671 if (len1 == len2) return mrb_fixnum_value(0);
672 if (len1 > len2) return mrb_fixnum_value(1);
673 return mrb_fixnum_value(-1);
675 if (retval > 0) return mrb_fixnum_value(1);
676 return mrb_fixnum_value(-1);
680 void
681 mrb_init_symbol(mrb_state *mrb)
683 struct RClass *sym;
685 mrb->symbol_class = sym = mrb_define_class_id(mrb, MRB_SYM(Symbol), mrb->object_class); /* 15.2.11 */
686 MRB_SET_INSTANCE_TT(sym, MRB_TT_SYMBOL);
687 mrb_undef_class_method_id(mrb, sym, MRB_SYM(new));
689 mrb_define_method_id(mrb, sym, MRB_SYM(to_s), sym_to_s, MRB_ARGS_NONE()); /* 15.2.11.3.3 */
690 mrb_define_method_id(mrb, sym, MRB_SYM(name), sym_name, MRB_ARGS_NONE());
691 mrb_define_method_id(mrb, sym, MRB_SYM(to_sym), mrb_obj_itself, MRB_ARGS_NONE()); /* 15.2.11.3.4 */
692 mrb_define_method_id(mrb, sym, MRB_SYM(inspect), sym_inspect, MRB_ARGS_NONE()); /* 15.2.11.3.5(x) */
693 mrb_define_method_id(mrb, sym, MRB_OPSYM(cmp), sym_cmp, MRB_ARGS_REQ(1));
694 mrb_define_method_id(mrb, sym, MRB_OPSYM(eq), mrb_obj_equal_m, MRB_ARGS_REQ(1));