00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include <assert.h>
00018
/* Shorthand accessors for the begin/end offset of match register `no`.
 * Both expand to member accesses through an identifier named `regs`, so a
 * variable of that name must be in scope wherever they are used. */
00019 #define BEG(no) regs->beg[no]
00020 #define END(no) regs->end[no]
00021
00022 #include <math.h>
00023 #include <ctype.h>
00024
00025 #ifdef HAVE_UNISTD_H
00026 #include <unistd.h>
00027 #endif
00028
/* Element count of a true array (not a pointer), converted to int. */
00029 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00030
/* Remove any macro definitions of the names below. Several of them (for
 * example rb_str_new_cstr and rb_usascii_str_new_cstr) are defined as real
 * functions further down in this file; presumably the included headers supply
 * macro versions that would otherwise rewrite those definitions -- TODO
 * confirm against the headers. */
00031 #undef rb_str_new_cstr
00032 #undef rb_tainted_str_new_cstr
00033 #undef rb_usascii_str_new_cstr
00034 #undef rb_external_str_new_cstr
00035 #undef rb_locale_str_new_cstr
00036 #undef rb_str_new2
00037 #undef rb_str_new3
00038 #undef rb_str_new4
00039 #undef rb_str_new5
00040 #undef rb_tainted_str_new2
00041 #undef rb_usascii_str_new2
00042 #undef rb_str_dup_frozen
00043 #undef rb_str_buf_new_cstr
00044 #undef rb_str_buf_new2
00045 #undef rb_str_buf_cat2
00046 #undef rb_str_cat2
00047
/* Class handles used when allocating strings (rb_cString is passed to
 * str_new/str_alloc below). They are defined here but not assigned in the
 * portion of the file shown. */
00048 VALUE rb_cString;
00049 VALUE rb_cSymbol;
00050
/*
 * Per-string flag bits and predicates.
 *
 * STR_TMPLOCK  - when set, str_modifiable() refuses modification.
 * STR_NOEMBED  - the string uses the as.heap representation instead of the
 *                embedded one (see STR_EMBED_P).
 * STR_ASSOC    - as.heap.aux.shared holds an associated object
 *                (see rb_str_associate).
 * STR_SHARED_P - tests STR_NOEMBED together with ELTS_SHARED; STR_SHARED is
 *                defined as FL_USER2, presumably the same bit as ELTS_SHARED
 *                -- TODO confirm.
 * STR_NOCAPA_P - heap string whose aux slot is used for sharing/association,
 *                so aux.capa must not be read as a capacity.
 * STR_UNSET_NOCAPA - clears the shared/assoc bits on a heap string.
 */
00051 #define RUBY_MAX_CHAR_LEN 16
00052 #define STR_TMPLOCK FL_USER7
00053 #define STR_NOEMBED FL_USER1
00054 #define STR_SHARED FL_USER2
00055 #define STR_ASSOC FL_USER3
00056 #define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
00057 #define STR_ASSOC_P(s) FL_ALL(s, STR_NOEMBED|STR_ASSOC)
00058 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
00059 #define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
00060 #define STR_UNSET_NOCAPA(s) do {\
00061 if (FL_TEST(s,STR_NOEMBED)) FL_UNSET(s,(ELTS_SHARED|STR_ASSOC));\
00062 } while (0)
00063
00064
/*
 * Switching between the embedded and heap representations.
 *
 * STR_SET_NOEMBED   - marks the string as heap-backed and zeroes the embedded
 *                     length bits kept in the flags word.
 * STR_SET_EMBED     - marks the string as embedded (clears STR_NOEMBED only).
 * STR_EMBED_P       - true when STR_NOEMBED is not set.
 * STR_SET_EMBED_LEN - stores n in the embedded-length bit field of the flags
 *                     word; n is evaluated once into a local named tmp_n.
 */
00065 #define STR_SET_NOEMBED(str) do {\
00066 FL_SET(str, STR_NOEMBED);\
00067 STR_SET_EMBED_LEN(str, 0);\
00068 } while (0)
00069 #define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)
00070 #define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))
00071 #define STR_SET_EMBED_LEN(str, n) do { \
00072 long tmp_n = (n);\
00073 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
00074 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
00075 } while (0)
00076
/* Sets the byte length of str to n, writing either the embedded-length bits
 * or as.heap.len depending on the current representation. */
00077 #define STR_SET_LEN(str, n) do { \
00078 if (STR_EMBED_P(str)) {\
00079 STR_SET_EMBED_LEN(str, n);\
00080 }\
00081 else {\
00082 RSTRING(str)->as.heap.len = (n);\
00083 }\
00084 } while (0)
00085
/* Decrements the byte length of str by one in whichever representation is
 * active. No check is made that the length is positive. */
00086 #define STR_DEC_LEN(str) do {\
00087 if (STR_EMBED_P(str)) {\
00088 long n = RSTRING_LEN(str);\
00089 n--;\
00090 STR_SET_EMBED_LEN(str, n);\
00091 }\
00092 else {\
00093 RSTRING(str)->as.heap.len--;\
00094 }\
00095 } while (0)
00096
/*
 * Makes the buffer of +str+ able to hold +capacity+ bytes plus one extra
 * byte.
 *
 * Embedded string: nothing happens unless +capacity+ exceeds
 * RSTRING_EMBED_LEN_MAX; in that case a heap buffer is allocated, the current
 * bytes are copied into it and the string is switched to the heap
 * representation (the terminator byte is not written here).
 *
 * Heap string: the buffer is reallocated; aux.capa is only updated when the
 * aux slot is not in use for sharing/association (STR_NOCAPA_P).
 *
 * Every use of +capacity+ is parenthesized so that an expression argument
 * such as `a ? b : c` or `n << 1` expands as intended.  Note that +str+ and
 * +capacity+ are still evaluated more than once.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *tmp = ALLOC_N(char, (capacity)+1);\
            memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00114
/* Code-range predicates; both go through rb_enc_str_coderange(), which scans
 * the string and caches the result when the range is still unknown. */
00115 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00116 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
00117
/* Encoding object for the encoding index stored on str. */
00118 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00119
00120 static inline int
00121 single_byte_optimizable(VALUE str)
00122 {
00123 rb_encoding *enc;
00124
00125
00126 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00127 return 1;
00128
00129 enc = STR_ENC_GET(str);
00130 if (rb_enc_mbmaxlen(enc) == 1)
00131 return 1;
00132
00133
00134
00135 return 0;
00136 }
00137
/* Global VALUE slot; neither assigned nor read in the portion of the file
 * shown here. */
00138 VALUE rb_fs;
00139
/*
 * Finds the first non-ASCII byte in the half-open range [p, e).
 *
 * Returns a pointer to that byte, or NULL when every byte is ASCII.
 *
 * When SIZEOF_VALUE is 4 or 8, NONASCII_MASK is defined (and stays defined
 * for the rest of the file) and ranges longer than two VALUE words are
 * scanned one aligned word at a time; the bytes before the first aligned
 * word and from the stopping word onwards are checked individually.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p, last word boundary at or before e */
        const VALUE *word = (const VALUE *)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE *)(~align_mask & (VALUE)e);

        /* unaligned head, byte by byte */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p)) {
                return p;
            }
        }
        /* skip whole words that have no high bit set in any byte */
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }
        /* resume the byte scan at the offending word, or at the tail */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p)) {
            return p;
        }
    }
    return NULL;
}
00176
00177 static int
00178 coderange_scan(const char *p, long len, rb_encoding *enc)
00179 {
00180 const char *e = p + len;
00181
00182 if (rb_enc_to_index(enc) == 0) {
00183
00184 p = search_nonascii(p, e);
00185 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00186 }
00187
00188 if (rb_enc_asciicompat(enc)) {
00189 p = search_nonascii(p, e);
00190 if (!p) {
00191 return ENC_CODERANGE_7BIT;
00192 }
00193 while (p < e) {
00194 int ret = rb_enc_precise_mbclen(p, e, enc);
00195 if (!MBCLEN_CHARFOUND_P(ret)) {
00196 return ENC_CODERANGE_BROKEN;
00197 }
00198 p += MBCLEN_CHARFOUND_LEN(ret);
00199 if (p < e) {
00200 p = search_nonascii(p, e);
00201 if (!p) {
00202 return ENC_CODERANGE_VALID;
00203 }
00204 }
00205 }
00206 if (e < p) {
00207 return ENC_CODERANGE_BROKEN;
00208 }
00209 return ENC_CODERANGE_VALID;
00210 }
00211
00212 while (p < e) {
00213 int ret = rb_enc_precise_mbclen(p, e, enc);
00214
00215 if (!MBCLEN_CHARFOUND_P(ret)) {
00216 return ENC_CODERANGE_BROKEN;
00217 }
00218 p += MBCLEN_CHARFOUND_LEN(ret);
00219 }
00220 if (e < p) {
00221 return ENC_CODERANGE_BROKEN;
00222 }
00223 return ENC_CODERANGE_VALID;
00224 }
00225
00226 long
00227 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00228 {
00229 const char *p = s;
00230
00231 if (*cr == ENC_CODERANGE_BROKEN)
00232 return e - s;
00233
00234 if (rb_enc_to_index(enc) == 0) {
00235
00236 p = search_nonascii(p, e);
00237 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00238 return e - s;
00239 }
00240 else if (rb_enc_asciicompat(enc)) {
00241 p = search_nonascii(p, e);
00242 if (!p) {
00243 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00244 return e - s;
00245 }
00246 while (p < e) {
00247 int ret = rb_enc_precise_mbclen(p, e, enc);
00248 if (!MBCLEN_CHARFOUND_P(ret)) {
00249 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00250 return p - s;
00251 }
00252 p += MBCLEN_CHARFOUND_LEN(ret);
00253 if (p < e) {
00254 p = search_nonascii(p, e);
00255 if (!p) {
00256 *cr = ENC_CODERANGE_VALID;
00257 return e - s;
00258 }
00259 }
00260 }
00261 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00262 return p - s;
00263 }
00264 else {
00265 while (p < e) {
00266 int ret = rb_enc_precise_mbclen(p, e, enc);
00267 if (!MBCLEN_CHARFOUND_P(ret)) {
00268 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00269 return p - s;
00270 }
00271 p += MBCLEN_CHARFOUND_LEN(ret);
00272 }
00273 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00274 return p - s;
00275 }
00276 }
00277
00278 static inline void
00279 str_enc_copy(VALUE str1, VALUE str2)
00280 {
00281 rb_enc_set_index(str1, ENCODING_GET(str2));
00282 }
00283
00284 static void
00285 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00286 {
00287
00288
00289
00290 str_enc_copy(dest, src);
00291 switch (ENC_CODERANGE(src)) {
00292 case ENC_CODERANGE_7BIT:
00293 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00294 break;
00295 case ENC_CODERANGE_VALID:
00296 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00297 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00298 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00299 else
00300 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00301 break;
00302 default:
00303 if (RSTRING_LEN(dest) == 0) {
00304 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00305 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00306 else
00307 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00308 }
00309 break;
00310 }
00311 }
00312
00313 static void
00314 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00315 {
00316 str_enc_copy(dest, src);
00317 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00318 }
00319
00320 int
00321 rb_enc_str_coderange(VALUE str)
00322 {
00323 int cr = ENC_CODERANGE(str);
00324
00325 if (cr == ENC_CODERANGE_UNKNOWN) {
00326 rb_encoding *enc = STR_ENC_GET(str);
00327 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00328 ENC_CODERANGE_SET(str, cr);
00329 }
00330 return cr;
00331 }
00332
00333 int
00334 rb_enc_str_asciionly_p(VALUE str)
00335 {
00336 rb_encoding *enc = STR_ENC_GET(str);
00337
00338 if (!rb_enc_asciicompat(enc))
00339 return FALSE;
00340 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00341 return TRUE;
00342 return FALSE;
00343 }
00344
00345 static inline void
00346 str_mod_check(VALUE s, const char *p, long len)
00347 {
00348 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00349 rb_raise(rb_eRuntimeError, "string modified");
00350 }
00351 }
00352
00353 static inline void
00354 str_frozen_check(VALUE s)
00355 {
00356 if (OBJ_FROZEN(s)) {
00357 rb_raise(rb_eRuntimeError, "string frozen");
00358 }
00359 }
00360
00361 size_t
00362 rb_str_capacity(VALUE str)
00363 {
00364 if (STR_EMBED_P(str)) {
00365 return RSTRING_EMBED_LEN_MAX;
00366 }
00367 else if (STR_NOCAPA_P(str)) {
00368 return RSTRING(str)->as.heap.len;
00369 }
00370 else {
00371 return RSTRING(str)->as.heap.aux.capa;
00372 }
00373 }
00374
00375 static inline VALUE
00376 str_alloc(VALUE klass)
00377 {
00378 NEWOBJ(str, struct RString);
00379 OBJSETUP(str, klass, T_STRING);
00380
00381 str->as.heap.ptr = 0;
00382 str->as.heap.len = 0;
00383 str->as.heap.aux.capa = 0;
00384
00385 return (VALUE)str;
00386 }
00387
00388 static VALUE
00389 str_new(VALUE klass, const char *ptr, long len)
00390 {
00391 VALUE str;
00392
00393 if (len < 0) {
00394 rb_raise(rb_eArgError, "negative string size (or size too big)");
00395 }
00396
00397 str = str_alloc(klass);
00398 if (len > RSTRING_EMBED_LEN_MAX) {
00399 RSTRING(str)->as.heap.aux.capa = len;
00400 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00401 STR_SET_NOEMBED(str);
00402 }
00403 else if (len == 0) {
00404 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00405 }
00406 if (ptr) {
00407 memcpy(RSTRING_PTR(str), ptr, len);
00408 }
00409 STR_SET_LEN(str, len);
00410 RSTRING_PTR(str)[len] = '\0';
00411 return str;
00412 }
00413
00414 VALUE
00415 rb_str_new(const char *ptr, long len)
00416 {
00417 return str_new(rb_cString, ptr, len);
00418 }
00419
00420 VALUE
00421 rb_usascii_str_new(const char *ptr, long len)
00422 {
00423 VALUE str = rb_str_new(ptr, len);
00424 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00425 return str;
00426 }
00427
00428 VALUE
00429 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00430 {
00431 VALUE str = rb_str_new(ptr, len);
00432 rb_enc_associate(str, enc);
00433 return str;
00434 }
00435
00436 VALUE
00437 rb_str_new_cstr(const char *ptr)
00438 {
00439 if (!ptr) {
00440 rb_raise(rb_eArgError, "NULL pointer given");
00441 }
00442 return rb_str_new(ptr, strlen(ptr));
00443 }
00444
00445 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00446 #define rb_str_new2 rb_str_new_cstr
00447
00448 VALUE
00449 rb_usascii_str_new_cstr(const char *ptr)
00450 {
00451 VALUE str = rb_str_new2(ptr);
00452 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00453 return str;
00454 }
00455
00456 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00457 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00458
00459 VALUE
00460 rb_tainted_str_new(const char *ptr, long len)
00461 {
00462 VALUE str = rb_str_new(ptr, len);
00463
00464 OBJ_TAINT(str);
00465 return str;
00466 }
00467
00468 VALUE
00469 rb_tainted_str_new_cstr(const char *ptr)
00470 {
00471 VALUE str = rb_str_new2(ptr);
00472
00473 OBJ_TAINT(str);
00474 return str;
00475 }
00476
00477 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00478 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00479
00480 VALUE
00481 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00482 {
00483 rb_econv_t *ec;
00484 rb_econv_result_t ret;
00485 long len;
00486 VALUE newstr;
00487 const unsigned char *sp;
00488 unsigned char *dp;
00489
00490 if (!to) return str;
00491 if (from == to) return str;
00492 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00493 to == rb_ascii8bit_encoding()) {
00494 if (STR_ENC_GET(str) != to) {
00495 str = rb_str_dup(str);
00496 rb_enc_associate(str, to);
00497 }
00498 return str;
00499 }
00500
00501 len = RSTRING_LEN(str);
00502 newstr = rb_str_new(0, len);
00503
00504 retry:
00505 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00506 if (!ec) return str;
00507
00508 sp = (unsigned char*)RSTRING_PTR(str);
00509 dp = (unsigned char*)RSTRING_PTR(newstr);
00510 ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00511 &dp, (unsigned char*)RSTRING_END(newstr), 0);
00512 rb_econv_close(ec);
00513 switch (ret) {
00514 case econv_destination_buffer_full:
00515
00516 len = len < 2 ? 2 : len * 2;
00517 rb_str_resize(newstr, len);
00518 goto retry;
00519
00520 case econv_finished:
00521 len = dp - (unsigned char*)RSTRING_PTR(newstr);
00522 rb_str_set_len(newstr, len);
00523 rb_enc_associate(newstr, to);
00524 return newstr;
00525
00526 default:
00527
00528 return str;
00529 }
00530 }
00531
00532 VALUE
00533 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00534 {
00535 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00536 }
00537
00538 VALUE
00539 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00540 {
00541 VALUE str;
00542
00543 str = rb_tainted_str_new(ptr, len);
00544 if (eenc == rb_usascii_encoding() &&
00545 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00546 rb_enc_associate(str, rb_ascii8bit_encoding());
00547 return str;
00548 }
00549 rb_enc_associate(str, eenc);
00550 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00551 }
00552
00553 VALUE
00554 rb_external_str_new(const char *ptr, long len)
00555 {
00556 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00557 }
00558
00559 VALUE
00560 rb_external_str_new_cstr(const char *ptr)
00561 {
00562 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00563 }
00564
00565 VALUE
00566 rb_locale_str_new(const char *ptr, long len)
00567 {
00568 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00569 }
00570
00571 VALUE
00572 rb_locale_str_new_cstr(const char *ptr)
00573 {
00574 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00575 }
00576
00577 VALUE
00578 rb_filesystem_str_new(const char *ptr, long len)
00579 {
00580 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00581 }
00582
00583 VALUE
00584 rb_filesystem_str_new_cstr(const char *ptr)
00585 {
00586 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00587 }
00588
00589 VALUE
00590 rb_str_export(VALUE str)
00591 {
00592 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00593 }
00594
00595 VALUE
00596 rb_str_export_locale(VALUE str)
00597 {
00598 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00599 }
00600
00601 VALUE
00602 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00603 {
00604 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00605 }
00606
00607 static VALUE
00608 str_replace_shared(VALUE str2, VALUE str)
00609 {
00610 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00611 STR_SET_EMBED(str2);
00612 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00613 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00614 }
00615 else {
00616 str = rb_str_new_frozen(str);
00617 FL_SET(str2, STR_NOEMBED);
00618 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00619 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00620 RSTRING(str2)->as.heap.aux.shared = str;
00621 FL_SET(str2, ELTS_SHARED);
00622 }
00623 rb_enc_cr_str_exact_copy(str2, str);
00624
00625 return str2;
00626 }
00627
00628 static VALUE
00629 str_new_shared(VALUE klass, VALUE str)
00630 {
00631 return str_replace_shared(str_alloc(klass), str);
00632 }
00633
00634 static VALUE
00635 str_new3(VALUE klass, VALUE str)
00636 {
00637 return str_new_shared(klass, str);
00638 }
00639
00640 VALUE
00641 rb_str_new_shared(VALUE str)
00642 {
00643 VALUE str2 = str_new3(rb_obj_class(str), str);
00644
00645 OBJ_INFECT(str2, str);
00646 return str2;
00647 }
00648
00649 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00650 #define rb_str_new3 rb_str_new_shared
00651
00652 static VALUE
00653 str_new4(VALUE klass, VALUE str)
00654 {
00655 VALUE str2;
00656
00657 str2 = str_alloc(klass);
00658 STR_SET_NOEMBED(str2);
00659 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00660 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00661 if (STR_SHARED_P(str)) {
00662 VALUE shared = RSTRING(str)->as.heap.aux.shared;
00663 assert(OBJ_FROZEN(shared));
00664 FL_SET(str2, ELTS_SHARED);
00665 RSTRING(str2)->as.heap.aux.shared = shared;
00666 }
00667 else {
00668 FL_SET(str, ELTS_SHARED);
00669 RSTRING(str)->as.heap.aux.shared = str2;
00670 }
00671 rb_enc_cr_str_exact_copy(str2, str);
00672 OBJ_INFECT(str2, str);
00673 return str2;
00674 }
00675
00676 VALUE
00677 rb_str_new_frozen(VALUE orig)
00678 {
00679 VALUE klass, str;
00680
00681 if (OBJ_FROZEN(orig)) return orig;
00682 klass = rb_obj_class(orig);
00683 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00684 long ofs;
00685 assert(OBJ_FROZEN(str));
00686 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00687 if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00688 (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00689 ENCODING_GET(str) != ENCODING_GET(orig)) {
00690 str = str_new3(klass, str);
00691 RSTRING(str)->as.heap.ptr += ofs;
00692 RSTRING(str)->as.heap.len -= ofs;
00693 rb_enc_cr_str_exact_copy(str, orig);
00694 OBJ_INFECT(str, orig);
00695 }
00696 }
00697 else if (STR_EMBED_P(orig)) {
00698 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00699 rb_enc_cr_str_exact_copy(str, orig);
00700 OBJ_INFECT(str, orig);
00701 }
00702 else if (STR_ASSOC_P(orig)) {
00703 VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00704 FL_UNSET(orig, STR_ASSOC);
00705 str = str_new4(klass, orig);
00706 FL_SET(str, STR_ASSOC);
00707 RSTRING(str)->as.heap.aux.shared = assoc;
00708 }
00709 else {
00710 str = str_new4(klass, orig);
00711 }
00712 OBJ_FREEZE(str);
00713 return str;
00714 }
00715
00716 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00717 #define rb_str_new4 rb_str_new_frozen
00718
00719 VALUE
00720 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00721 {
00722 return str_new(rb_obj_class(obj), ptr, len);
00723 }
00724
00725 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00726 rb_str_new_with_class, (obj, ptr, len))
00727 #define rb_str_new5 rb_str_new_with_class
00728
00729 static VALUE
00730 str_new_empty(VALUE str)
00731 {
00732 VALUE v = rb_str_new5(str, 0, 0);
00733 OBJ_INFECT(v, str);
00734 return v;
00735 }
00736
00737 #define STR_BUF_MIN_SIZE 128
00738
00739 VALUE
00740 rb_str_buf_new(long capa)
00741 {
00742 VALUE str = str_alloc(rb_cString);
00743
00744 if (capa < STR_BUF_MIN_SIZE) {
00745 capa = STR_BUF_MIN_SIZE;
00746 }
00747 FL_SET(str, STR_NOEMBED);
00748 RSTRING(str)->as.heap.aux.capa = capa;
00749 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00750 RSTRING(str)->as.heap.ptr[0] = '\0';
00751
00752 return str;
00753 }
00754
00755 VALUE
00756 rb_str_buf_new_cstr(const char *ptr)
00757 {
00758 VALUE str;
00759 long len = strlen(ptr);
00760
00761 str = rb_str_buf_new(len);
00762 rb_str_buf_cat(str, ptr, len);
00763
00764 return str;
00765 }
00766
00767 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00768 #define rb_str_buf_new2 rb_str_buf_new_cstr
00769
00770 VALUE
00771 rb_str_tmp_new(long len)
00772 {
00773 return str_new(0, 0, len);
00774 }
00775
00776 void
00777 rb_str_free(VALUE str)
00778 {
00779 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00780 xfree(RSTRING(str)->as.heap.ptr);
00781 }
00782 }
00783
00784 size_t
00785 rb_str_memsize(VALUE str)
00786 {
00787 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00788 return RSTRING(str)->as.heap.aux.capa;
00789 }
00790 else {
00791 return 0;
00792 }
00793 }
00794
00795 VALUE
00796 rb_str_to_str(VALUE str)
00797 {
00798 return rb_convert_type(str, T_STRING, "String", "to_str");
00799 }
00800
00801 static inline void str_discard(VALUE str);
00802
00803 void
00804 rb_str_shared_replace(VALUE str, VALUE str2)
00805 {
00806 rb_encoding *enc;
00807 int cr;
00808 if (str == str2) return;
00809 enc = STR_ENC_GET(str2);
00810 cr = ENC_CODERANGE(str2);
00811 str_discard(str);
00812 OBJ_INFECT(str, str2);
00813 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00814 STR_SET_EMBED(str);
00815 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00816 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00817 rb_enc_associate(str, enc);
00818 ENC_CODERANGE_SET(str, cr);
00819 return;
00820 }
00821 STR_SET_NOEMBED(str);
00822 STR_UNSET_NOCAPA(str);
00823 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00824 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00825 if (STR_NOCAPA_P(str2)) {
00826 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00827 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00828 }
00829 else {
00830 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00831 }
00832 STR_SET_EMBED(str2);
00833 RSTRING_PTR(str2)[0] = 0;
00834 STR_SET_EMBED_LEN(str2, 0);
00835 rb_enc_associate(str, enc);
00836 ENC_CODERANGE_SET(str, cr);
00837 }
00838
00839 static ID id_to_s;
00840
00841 VALUE
00842 rb_obj_as_string(VALUE obj)
00843 {
00844 VALUE str;
00845
00846 if (TYPE(obj) == T_STRING) {
00847 return obj;
00848 }
00849 str = rb_funcall(obj, id_to_s, 0);
00850 if (TYPE(str) != T_STRING)
00851 return rb_any_to_s(obj);
00852 if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00853 return str;
00854 }
00855
00856 static VALUE
00857 str_replace(VALUE str, VALUE str2)
00858 {
00859 long len;
00860
00861 len = RSTRING_LEN(str2);
00862 if (STR_ASSOC_P(str2)) {
00863 str2 = rb_str_new4(str2);
00864 }
00865 if (STR_SHARED_P(str2)) {
00866 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00867 assert(OBJ_FROZEN(shared));
00868 STR_SET_NOEMBED(str);
00869 RSTRING(str)->as.heap.len = len;
00870 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00871 FL_SET(str, ELTS_SHARED);
00872 FL_UNSET(str, STR_ASSOC);
00873 RSTRING(str)->as.heap.aux.shared = shared;
00874 }
00875 else {
00876 str_replace_shared(str, str2);
00877 }
00878
00879 OBJ_INFECT(str, str2);
00880 rb_enc_cr_str_exact_copy(str, str2);
00881 return str;
00882 }
00883
00884 static VALUE
00885 str_duplicate(VALUE klass, VALUE str)
00886 {
00887 VALUE dup = str_alloc(klass);
00888 str_replace(dup, str);
00889 return dup;
00890 }
00891
00892 VALUE
00893 rb_str_dup(VALUE str)
00894 {
00895 return str_duplicate(rb_obj_class(str), str);
00896 }
00897
00898 VALUE
00899 rb_str_resurrect(VALUE str)
00900 {
00901 return str_replace(str_alloc(rb_cString), str);
00902 }
00903
00904
00905
00906
00907
00908
00909
00910
00911 static VALUE
00912 rb_str_init(int argc, VALUE *argv, VALUE str)
00913 {
00914 VALUE orig;
00915
00916 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00917 rb_str_replace(str, orig);
00918 return str;
00919 }
00920
00921 static inline long
00922 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00923 {
00924 long c;
00925 const char *q;
00926
00927 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00928 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00929 }
00930 else if (rb_enc_asciicompat(enc)) {
00931 c = 0;
00932 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00933 while (p < e) {
00934 if (ISASCII(*p)) {
00935 q = search_nonascii(p, e);
00936 if (!q)
00937 return c + (e - p);
00938 c += q - p;
00939 p = q;
00940 }
00941 p += rb_enc_fast_mbclen(p, e, enc);
00942 c++;
00943 }
00944 }
00945 else {
00946 while (p < e) {
00947 if (ISASCII(*p)) {
00948 q = search_nonascii(p, e);
00949 if (!q)
00950 return c + (e - p);
00951 c += q - p;
00952 p = q;
00953 }
00954 p += rb_enc_mbclen(p, e, enc);
00955 c++;
00956 }
00957 }
00958 return c;
00959 }
00960
00961 for (c=0; p<e; c++) {
00962 p += rb_enc_mbclen(p, e, enc);
00963 }
00964 return c;
00965 }
00966
00967 long
00968 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00969 {
00970 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00971 }
00972
00973 long
00974 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00975 {
00976 long c;
00977 const char *q;
00978 int ret;
00979
00980 *cr = 0;
00981 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00982 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00983 }
00984 else if (rb_enc_asciicompat(enc)) {
00985 c = 0;
00986 while (p < e) {
00987 if (ISASCII(*p)) {
00988 q = search_nonascii(p, e);
00989 if (!q) {
00990 if (!*cr) *cr = ENC_CODERANGE_7BIT;
00991 return c + (e - p);
00992 }
00993 c += q - p;
00994 p = q;
00995 }
00996 ret = rb_enc_precise_mbclen(p, e, enc);
00997 if (MBCLEN_CHARFOUND_P(ret)) {
00998 *cr |= ENC_CODERANGE_VALID;
00999 p += MBCLEN_CHARFOUND_LEN(ret);
01000 }
01001 else {
01002 *cr = ENC_CODERANGE_BROKEN;
01003 p++;
01004 }
01005 c++;
01006 }
01007 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01008 return c;
01009 }
01010
01011 for (c=0; p<e; c++) {
01012 ret = rb_enc_precise_mbclen(p, e, enc);
01013 if (MBCLEN_CHARFOUND_P(ret)) {
01014 *cr |= ENC_CODERANGE_VALID;
01015 p += MBCLEN_CHARFOUND_LEN(ret);
01016 }
01017 else {
01018 *cr = ENC_CODERANGE_BROKEN;
01019 if (p + rb_enc_mbminlen(enc) <= e)
01020 p += rb_enc_mbminlen(enc);
01021 else
01022 p = e;
01023 }
01024 }
01025 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01026 return c;
01027 }
01028
#ifdef NONASCII_MASK
/* True for every byte that is not of the form 10xxxxxx, i.e. not a UTF-8
 * continuation byte. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts, for the VALUE-sized word at +s+, how many of its bytes satisfy
 * is_utf8_lead_byte(), handling all bytes of the word in parallel.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE bits = *s;

    /* bit 6 of each byte := bit6 | ~bit7, which is 1 exactly for lead bytes */
    bits = bits | ~(bits >> 1);
    /* move that bit down to bit 0 of each byte and drop everything else */
    bits = bits >> 6;
    bits = bits & (NONASCII_MASK >> 7);
    /* add the per-byte 0/1 values together into the lowest byte */
    bits = bits + (bits >> 8);
    bits = bits + (bits >> 16);
#if SIZEOF_VALUE == 8
    bits = bits + (bits >> 32);
#endif
    /* at most sizeof(VALUE) lead bytes, so four bits hold the sum */
    return (bits & 0xF);
}
#endif
01046
01047 static long
01048 str_strlen(VALUE str, rb_encoding *enc)
01049 {
01050 const char *p, *e;
01051 long n;
01052 int cr;
01053
01054 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01055 if (!enc) enc = STR_ENC_GET(str);
01056 p = RSTRING_PTR(str);
01057 e = RSTRING_END(str);
01058 cr = ENC_CODERANGE(str);
01059 #ifdef NONASCII_MASK
01060 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01061 enc == rb_utf8_encoding()) {
01062
01063 VALUE len = 0;
01064 if ((int)sizeof(VALUE) * 2 < e - p) {
01065 const VALUE *s, *t;
01066 const VALUE lowbits = sizeof(VALUE) - 1;
01067 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01068 t = (const VALUE*)(~lowbits & (VALUE)e);
01069 while (p < (const char *)s) {
01070 if (is_utf8_lead_byte(*p)) len++;
01071 p++;
01072 }
01073 while (s < t) {
01074 len += count_utf8_lead_bytes_with_word(s);
01075 s++;
01076 }
01077 p = (const char *)s;
01078 }
01079 while (p < e) {
01080 if (is_utf8_lead_byte(*p)) len++;
01081 p++;
01082 }
01083 return (long)len;
01084 }
01085 #endif
01086 n = rb_enc_strlen_cr(p, e, enc, &cr);
01087 if (cr) {
01088 ENC_CODERANGE_SET(str, cr);
01089 }
01090 return n;
01091 }
01092
01093 long
01094 rb_str_strlen(VALUE str)
01095 {
01096 return str_strlen(str, STR_ENC_GET(str));
01097 }
01098
01099
01100
01101
01102
01103
01104
01105
01106
01107 VALUE
01108 rb_str_length(VALUE str)
01109 {
01110 long len;
01111
01112 len = str_strlen(str, STR_ENC_GET(str));
01113 return LONG2NUM(len);
01114 }
01115
01116
01117
01118
01119
01120
01121
01122
01123 static VALUE
01124 rb_str_bytesize(VALUE str)
01125 {
01126 return INT2NUM(RSTRING_LEN(str));
01127 }
01128
01129
01130
01131
01132
01133
01134
01135
01136
01137
01138
01139 static VALUE
01140 rb_str_empty(VALUE str)
01141 {
01142 if (RSTRING_LEN(str) == 0)
01143 return Qtrue;
01144 return Qfalse;
01145 }
01146
01147
01148
01149
01150
01151
01152
01153
01154
01155
01156
01157 VALUE
01158 rb_str_plus(VALUE str1, VALUE str2)
01159 {
01160 VALUE str3;
01161 rb_encoding *enc;
01162
01163 StringValue(str2);
01164 enc = rb_enc_check(str1, str2);
01165 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01166 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01167 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01168 RSTRING_PTR(str2), RSTRING_LEN(str2));
01169 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01170
01171 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01172 OBJ_TAINT(str3);
01173 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01174 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01175 return str3;
01176 }
01177
01178
01179
01180
01181
01182
01183
01184
01185
01186
01187
01188 VALUE
01189 rb_str_times(VALUE str, VALUE times)
01190 {
01191 VALUE str2;
01192 long n, len;
01193 char *ptr2;
01194
01195 len = NUM2LONG(times);
01196 if (len < 0) {
01197 rb_raise(rb_eArgError, "negative argument");
01198 }
01199 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
01200 rb_raise(rb_eArgError, "argument too big");
01201 }
01202
01203 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01204 ptr2 = RSTRING_PTR(str2);
01205 if (len) {
01206 n = RSTRING_LEN(str);
01207 memcpy(ptr2, RSTRING_PTR(str), n);
01208 while (n <= len/2) {
01209 memcpy(ptr2 + n, ptr2, n);
01210 n *= 2;
01211 }
01212 memcpy(ptr2 + n, ptr2, len-n);
01213 }
01214 ptr2[RSTRING_LEN(str2)] = '\0';
01215 OBJ_INFECT(str2, str);
01216 rb_enc_cr_str_copy_for_substr(str2, str);
01217
01218 return str2;
01219 }
01220
01221
01222
01223
01224
01225
01226
01227
01228
01229
01230
01231
01232
01233
01234
01235
01236 static VALUE
01237 rb_str_format_m(VALUE str, VALUE arg)
01238 {
01239 volatile VALUE tmp = rb_check_array_type(arg);
01240
01241 if (!NIL_P(tmp)) {
01242 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01243 }
01244 return rb_str_format(1, &arg, str);
01245 }
01246
01247 static inline void
01248 str_modifiable(VALUE str)
01249 {
01250 if (FL_TEST(str, STR_TMPLOCK)) {
01251 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01252 }
01253 if (OBJ_FROZEN(str)) rb_error_frozen("string");
01254 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01255 rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01256 }
01257
01258 static inline int
01259 str_independent(VALUE str)
01260 {
01261 str_modifiable(str);
01262 if (!STR_SHARED_P(str)) return 1;
01263 if (STR_EMBED_P(str)) return 1;
01264 return 0;
01265 }
01266
01267 static void
01268 str_make_independent(VALUE str)
01269 {
01270 char *ptr;
01271 long len = RSTRING_LEN(str);
01272
01273 ptr = ALLOC_N(char, len+1);
01274 if (RSTRING_PTR(str)) {
01275 memcpy(ptr, RSTRING_PTR(str), len);
01276 }
01277 STR_SET_NOEMBED(str);
01278 ptr[len] = 0;
01279 RSTRING(str)->as.heap.ptr = ptr;
01280 RSTRING(str)->as.heap.len = len;
01281 RSTRING(str)->as.heap.aux.capa = len;
01282 STR_UNSET_NOCAPA(str);
01283 }
01284
01285 void
01286 rb_str_modify(VALUE str)
01287 {
01288 if (!str_independent(str))
01289 str_make_independent(str);
01290 ENC_CODERANGE_CLEAR(str);
01291 }
01292
01293
01294 static void
01295 str_modify_keep_cr(VALUE str)
01296 {
01297 if (!str_independent(str))
01298 str_make_independent(str);
01299 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01300
01301 ENC_CODERANGE_CLEAR(str);
01302 }
01303
01304 static inline void
01305 str_discard(VALUE str)
01306 {
01307 str_modifiable(str);
01308 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01309 xfree(RSTRING_PTR(str));
01310 RSTRING(str)->as.heap.ptr = 0;
01311 RSTRING(str)->as.heap.len = 0;
01312 }
01313 }
01314
/*
 * Associates the array +add+ with +str+; the array is kept in
 * as.heap.aux.shared under the STR_ASSOC flag.
 *
 * Raises via rb_error_frozen() when +str+ is frozen.  If +str+ already
 * carries an association array, +add+ is concatenated onto it.  Otherwise
 * +str+ is first turned into a heap string whose capacity equals its
 * length, then flagged STR_ASSOC and pointed at +add+.
 */
01315 void
01316 rb_str_associate(VALUE str, VALUE add)
01317 {
01318
01319 if (OBJ_FROZEN(str)) rb_error_frozen("string");
01320 if (STR_ASSOC_P(str)) {
01321
/* already associated: extend the existing array in place */
01322 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01323 }
01324 else {
01325 if (STR_SHARED_P(str)) {
/* fetch the shared source first: str_make_independent() rewrites aux */
01326 VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01327 str_make_independent(str);
/* if the former source had an association array, extend and adopt it */
01328 if (STR_ASSOC_P(assoc)) {
01329 assoc = RSTRING(assoc)->as.heap.aux.shared;
01330 rb_ary_concat(assoc, add);
01331 add = assoc;
01332 }
01333 }
01334 else if (STR_EMBED_P(str)) {
/* embedded strings are moved to the heap representation first */
01335 str_make_independent(str);
01336 }
01337 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
/* NOTE(review): presumably capa is trimmed because aux.shared reuses the
 * aux.capa storage — confirm against the RString definition */
01338 RESIZE_CAPA(str, RSTRING_LEN(str));
01339 }
01340 FL_SET(str, STR_ASSOC);
/* NOTE(review): clearing klass presumably hides the array from Ruby code — confirm */
01341 RBASIC(add)->klass = 0;
01342 RSTRING(str)->as.heap.aux.shared = add;
01343 }
01344 }
01345
01346 VALUE
01347 rb_str_associated(VALUE str)
01348 {
01349 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01350 if (STR_ASSOC_P(str)) {
01351 return RSTRING(str)->as.heap.aux.shared;
01352 }
01353 return Qfalse;
01354 }
01355
01356 VALUE
01357 rb_string_value(volatile VALUE *ptr)
01358 {
01359 VALUE s = *ptr;
01360 if (TYPE(s) != T_STRING) {
01361 s = rb_str_to_str(s);
01362 *ptr = s;
01363 }
01364 return s;
01365 }
01366
01367 char *
01368 rb_string_value_ptr(volatile VALUE *ptr)
01369 {
01370 VALUE str = rb_string_value(ptr);
01371 return RSTRING_PTR(str);
01372 }
01373
01374 char *
01375 rb_string_value_cstr(volatile VALUE *ptr)
01376 {
01377 VALUE str = rb_string_value(ptr);
01378 char *s = RSTRING_PTR(str);
01379 long len = RSTRING_LEN(str);
01380
01381 if (!s || memchr(s, 0, len)) {
01382 rb_raise(rb_eArgError, "string contains null byte");
01383 }
01384 if (s[len]) {
01385 rb_str_modify(str);
01386 s = RSTRING_PTR(str);
01387 s[RSTRING_LEN(str)] = 0;
01388 }
01389 return s;
01390 }
01391
01392 VALUE
01393 rb_check_string_type(VALUE str)
01394 {
01395 str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01396 return str;
01397 }
01398
01399
01400
01401
01402
01403
01404
01405
01406
01407
01408
01409
01410 static VALUE
01411 rb_str_s_try_convert(VALUE dummy, VALUE str)
01412 {
01413 return rb_check_string_type(str);
01414 }
01415
/*
 * Returns a pointer to the +nth+ character of the byte range [p, e)
 * interpreted in encoding +enc+.  The result never exceeds +e+.
 */
01416 char*
01417 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01418 {
/* single-byte encoding: one byte per character */
01419 if (rb_enc_mbmaxlen(enc) == 1) {
01420 p += nth;
01421 }
/* fixed-width encoding: every character has the same byte length */
01422 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01423 p += nth * rb_enc_mbmaxlen(enc);
01424 }
/* ASCII-compatible variable width: skip runs of ASCII bytes in bulk */
01425 else if (rb_enc_asciicompat(enc)) {
01426 const char *p2, *e2;
01427 int n;
01428
01429 while (p < e && 0 < nth) {
/* e2: where the target would be if every remaining character were one byte */
01430 e2 = p + nth;
01431 if (e < e2)
01432 return (char *)e;
01433 if (ISASCII(*p)) {
01434 p2 = search_nonascii(p, e2);
/* no non-ASCII byte before e2: the target is exactly e2 */
01435 if (!p2)
01436 return (char *)e2;
01437 nth -= p2 - p;
01438 p = p2;
01439 }
/* step over one (multibyte) character */
01440 n = rb_enc_mbclen(p, e, enc);
01441 p += n;
01442 nth--;
01443 }
/* ran out of bytes before reaching the nth character */
01444 if (nth != 0)
01445 return (char *)e;
01446 return (char *)p;
01447 }
/* generic variable-width encoding: step character by character */
01448 else {
01449 while (p<e && nth--) {
01450 p += rb_enc_mbclen(p, e, enc);
01451 }
01452 }
01453 if (p > e) p = e;
01454 return (char*)p;
01455 }
01456
01457 static char*
01458 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01459 {
01460 if (singlebyte)
01461 p += nth;
01462 else {
01463 p = rb_enc_nth(p, e, nth, enc);
01464 }
01465 if (!p) return 0;
01466 if (p > e) p = e;
01467 return (char *)p;
01468 }
01469
01470
01471 static long
01472 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01473 {
01474 const char *pp = str_nth(p, e, nth, enc, singlebyte);
01475 if (!pp) return e - p;
01476 return pp - p;
01477 }
01478
01479 long
01480 rb_str_offset(VALUE str, long pos)
01481 {
01482 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01483 STR_ENC_GET(str), single_byte_optimizable(str));
01484 }
01485
01486 #ifdef NONASCII_MASK
/*
 * Returns a pointer to the +nth+ character of the UTF-8 byte range [p, e)
 * by counting lead bytes.  The result is at most +e+.
 */
01487 static char *
01488 str_utf8_nth(const char *p, const char *e, long nth)
01489 {
/* fast path, taken only when both the range and nth are large enough */
01490 if ((int)SIZEOF_VALUE < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01491 const VALUE *s, *t;
01492 const VALUE lowbits = sizeof(VALUE) - 1;
/* s: p rounded up, t: e rounded down, both to a VALUE-sized boundary */
01493 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01494 t = (const VALUE*)(~lowbits & (VALUE)e);
/* consume the unaligned head byte by byte */
01495 while (p < (const char *)s) {
01496 if (is_utf8_lead_byte(*p)) nth--;
01497 p++;
01498 }
/* NOTE(review): presumably counts the lead bytes in one VALUE-sized word —
 * the helper is defined outside this chunk */
01499 do {
01500 nth -= count_utf8_lead_bytes_with_word(s);
01501 s++;
01502 } while (s < t && (int)sizeof(VALUE) <= nth);
01503 p = (char *)s;
01504 }
/* finish byte by byte: stop on the lead byte reached with nth == 0 */
01505 while (p < e) {
01506 if (is_utf8_lead_byte(*p)) {
01507 if (nth == 0) break;
01508 nth--;
01509 }
01510 p++;
01511 }
01512 return (char *)p;
01513 }
01514
01515 static long
01516 str_utf8_offset(const char *p, const char *e, long nth)
01517 {
01518 const char *pp = str_utf8_nth(p, e, nth);
01519 return pp - p;
01520 }
01521 #endif
01522
01523
01524 long
01525 rb_str_sublen(VALUE str, long pos)
01526 {
01527 if (single_byte_optimizable(str) || pos < 0)
01528 return pos;
01529 else {
01530 char *p = RSTRING_PTR(str);
01531 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01532 }
01533 }
01534
01535 VALUE
01536 rb_str_subseq(VALUE str, long beg, long len)
01537 {
01538 VALUE str2;
01539
01540 if (RSTRING_LEN(str) == beg + len &&
01541 RSTRING_EMBED_LEN_MAX < len) {
01542 str2 = rb_str_new_shared(rb_str_new_frozen(str));
01543 rb_str_drop_bytes(str2, beg);
01544 }
01545 else {
01546 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01547 }
01548
01549 rb_enc_cr_str_copy_for_substr(str2, str);
01550 OBJ_INFECT(str2, str);
01551
01552 return str2;
01553 }
01554
/*
 * Returns the substring of +str+ that starts at character index +beg+ and
 * spans up to +len+ characters, or Qnil when +len+ is negative or +beg+
 * lies outside the string.  A negative +beg+ counts from the end.
 */
01555 VALUE
01556 rb_str_substr(VALUE str, long beg, long len)
01557 {
01558 rb_encoding *enc = STR_ENC_GET(str);
01559 VALUE str2;
01560 char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
01561
01562 if (len < 0) return Qnil;
01563 if (!RSTRING_LEN(str)) {
01564 len = 0;
01565 }
/* single-byte strings: character indexes are byte offsets */
01566 if (single_byte_optimizable(str)) {
01567 if (beg > RSTRING_LEN(str)) return Qnil;
01568 if (beg < 0) {
01569 beg += RSTRING_LEN(str);
01570 if (beg < 0) return Qnil;
01571 }
01572 if (beg + len > RSTRING_LEN(str))
01573 len = RSTRING_LEN(str) - beg;
01574 if (len <= 0) {
01575 len = 0;
01576 p = 0;
01577 }
01578 else
01579 p = s + beg;
01580 goto sub;
01581 }
01582 if (beg < 0) {
01583 if (len > -beg) len = -beg;
/* start is close to the end: walk backwards instead of counting the whole string */
01584 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01585 beg = -beg;
/* move e back to the end of the requested span */
01586 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01587 p = e;
01588 if (!p) return Qnil;
/* move p back len characters to the start of the span */
01589 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01590 if (!p) return Qnil;
01591 len = e - p;
01592 goto sub;
01593 }
01594 else {
/* otherwise convert to a non-negative character index */
01595 beg += str_strlen(str, enc);
01596 if (beg < 0) return Qnil;
01597 }
01598 }
01599 else if (beg > 0 && beg > str_strlen(str, enc)) {
01600 return Qnil;
01601 }
/* below: translate (beg, len) from characters to a pointer and a byte length */
01602 if (len == 0) {
01603 p = 0;
01604 }
01605 #ifdef NONASCII_MASK
01606 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01607 enc == rb_utf8_encoding()) {
01608 p = str_utf8_nth(s, e, beg);
01609 len = str_utf8_offset(p, e, len);
01610 }
01611 #endif
/* fixed-width encoding: plain multiplication, clamped to the end */
01612 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01613 int char_sz = rb_enc_mbmaxlen(enc);
01614
01615 p = s + beg * char_sz;
01616 if (p > e) {
01617 p = e;
01618 len = 0;
01619 }
01620 else if (len * char_sz > e - p)
01621 len = e - p;
01622 else
01623 len *= char_sz;
01624 }
01625 else if ((p = str_nth(s, e, beg, enc, 0)) == e) {
01626 len = 0;
01627 }
01628 else {
01629 len = str_offset(p, e, len, enc, 0);
01630 }
01631 sub:
/* a long tail is shared with a frozen copy instead of being copied.
 * NOTE(review): beg is a character index on the multibyte paths, while len
 * and RSTRING_LEN are byte counts here — confirm this comparison is intended */
01632 if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
01633 str2 = rb_str_new4(str);
01634 str2 = str_new3(rb_obj_class(str2), str2);
01635 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01636 RSTRING(str2)->as.heap.len = len;
01637 }
01638 else {
01639 str2 = rb_str_new5(str, p, len);
01640 rb_enc_cr_str_copy_for_substr(str2, str);
01641 OBJ_INFECT(str2, str);
01642 }
01643
01644 return str2;
01645 }
01646
01647 VALUE
01648 rb_str_freeze(VALUE str)
01649 {
01650 if (STR_ASSOC_P(str)) {
01651 VALUE ary = RSTRING(str)->as.heap.aux.shared;
01652 OBJ_FREEZE(ary);
01653 }
01654 return rb_obj_freeze(str);
01655 }
01656
/*
 * rb_str_dup_frozen is an alias of rb_str_new_frozen.
 * NOTE(review): RUBY_ALIAS_FUNCTION presumably emits a real exported symbol
 * that forwards to the target — confirm against its definition.  The macro
 * below makes later uses in this file resolve to rb_str_new_frozen directly.
 */
01657 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01658 #define rb_str_dup_frozen rb_str_new_frozen
01659
01660 VALUE
01661 rb_str_locktmp(VALUE str)
01662 {
01663 if (FL_TEST(str, STR_TMPLOCK)) {
01664 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01665 }
01666 FL_SET(str, STR_TMPLOCK);
01667 return str;
01668 }
01669
01670 VALUE
01671 rb_str_unlocktmp(VALUE str)
01672 {
01673 if (!FL_TEST(str, STR_TMPLOCK)) {
01674 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01675 }
01676 FL_UNSET(str, STR_TMPLOCK);
01677 return str;
01678 }
01679
01680 void
01681 rb_str_set_len(VALUE str, long len)
01682 {
01683 rb_str_modify(str);
01684 STR_SET_LEN(str, len);
01685 RSTRING_PTR(str)[len] = '\0';
01686 }
01687
/*
 * Resizes +str+ to exactly +len+ bytes and returns it.
 *
 * Raises ArgumentError for a negative +len+.  The string is made
 * modifiable first; it is switched between the embedded and heap
 * representations as the new length requires, and a terminating NUL is
 * written at the new end.
 */
01688 VALUE
01689 rb_str_resize(VALUE str, long len)
01690 {
01691 long slen;
01692
01693 if (len < 0) {
01694 rb_raise(rb_eArgError, "negative string size (or size too big)");
01695 }
01696
01697 rb_str_modify(str);
01698 slen = RSTRING_LEN(str);
01699 if (len != slen) {
01700 if (STR_EMBED_P(str)) {
01701 char *ptr;
/* embedded and still fits: only the length field changes */
01702 if (len <= RSTRING_EMBED_LEN_MAX) {
01703 STR_SET_EMBED_LEN(str, len);
01704 RSTRING(str)->as.ary[len] = '\0';
01705 return str;
01706 }
/* embedded -> heap: copy the old bytes out before switching representation */
01707 ptr = ALLOC_N(char,len+1);
01708 MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
01709 RSTRING(str)->as.heap.ptr = ptr;
01710 STR_SET_NOEMBED(str);
01711 }
/* heap -> embedded: copy at most len bytes in, then free the heap buffer */
01712 else if (len <= RSTRING_EMBED_LEN_MAX) {
01713 char *ptr = RSTRING(str)->as.heap.ptr;
01714 STR_SET_EMBED(str);
01715 if (slen > len) slen = len;
01716 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
01717 RSTRING(str)->as.ary[len] = '\0';
01718 STR_SET_EMBED_LEN(str, len);
01719 xfree(ptr);
01720 return str;
01721 }
/* heap -> heap: reallocate when growing, or when shrinking by more than 1024 bytes */
01722 else if (slen < len || slen - len > 1024) {
01723 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01724 }
/* aux holds a capacity only when neither the shared nor the assoc flag is set */
01725 if (!STR_NOCAPA_P(str)) {
01726 RSTRING(str)->as.heap.aux.capa = len;
01727 }
01728 RSTRING(str)->as.heap.len = len;
01729 RSTRING(str)->as.heap.ptr[len] = '\0';
01730 }
01731 return str;
01732 }
01733
01734 static VALUE
01735 str_buf_cat(VALUE str, const char *ptr, long len)
01736 {
01737 long capa, total, off = -1;
01738
01739 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01740 off = ptr - RSTRING_PTR(str);
01741 }
01742 rb_str_modify(str);
01743 if (len == 0) return 0;
01744 if (STR_ASSOC_P(str)) {
01745 FL_UNSET(str, STR_ASSOC);
01746 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01747 }
01748 else if (STR_EMBED_P(str)) {
01749 capa = RSTRING_EMBED_LEN_MAX;
01750 }
01751 else {
01752 capa = RSTRING(str)->as.heap.aux.capa;
01753 }
01754 if (RSTRING_LEN(str) >= LONG_MAX - len) {
01755 rb_raise(rb_eArgError, "string sizes too big");
01756 }
01757 total = RSTRING_LEN(str)+len;
01758 if (capa <= total) {
01759 while (total > capa) {
01760 if (capa + 1 >= LONG_MAX / 2) {
01761 capa = (total + 4095) / 4096;
01762 break;
01763 }
01764 capa = (capa + 1) * 2;
01765 }
01766 RESIZE_CAPA(str, capa);
01767 }
01768 if (off != -1) {
01769 ptr = RSTRING_PTR(str) + off;
01770 }
01771 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01772 STR_SET_LEN(str, total);
01773 RSTRING_PTR(str)[total] = '\0';
01774
01775 return str;
01776 }
01777
/* Appends the NUL-terminated C string +ptr+ to +str+ via str_buf_cat().
 * Note that +ptr+ is expanded twice. */
01778 #define str_buf_cat2(str, ptr) str_buf_cat(str, (ptr), strlen(ptr))
01779
01780 VALUE
01781 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01782 {
01783 if (len == 0) return str;
01784 if (len < 0) {
01785 rb_raise(rb_eArgError, "negative string size (or size too big)");
01786 }
01787 return str_buf_cat(str, ptr, len);
01788 }
01789
01790 VALUE
01791 rb_str_buf_cat2(VALUE str, const char *ptr)
01792 {
01793 return rb_str_buf_cat(str, ptr, strlen(ptr));
01794 }
01795
01796 VALUE
01797 rb_str_cat(VALUE str, const char *ptr, long len)
01798 {
01799 if (len < 0) {
01800 rb_raise(rb_eArgError, "negative string size (or size too big)");
01801 }
01802 if (STR_ASSOC_P(str)) {
01803 rb_str_modify(str);
01804 if (STR_EMBED_P(str)) str_make_independent(str);
01805 REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len+1);
01806 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
01807 RSTRING(str)->as.heap.len += len;
01808 RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0';
01809 return str;
01810 }
01811
01812 return rb_str_buf_cat(str, ptr, len);
01813 }
01814
01815 VALUE
01816 rb_str_cat2(VALUE str, const char *ptr)
01817 {
01818 return rb_str_cat(str, ptr, strlen(ptr));
01819 }
01820
/*
 * Appends +len+ bytes at +ptr+ to +str+, where the appended bytes have
 * encoding index +ptr_encindex+ and code range +ptr_cr+, and sets the
 * resulting encoding index and code range on +str+.
 *
 * If +ptr_cr_ret+ is non-NULL it receives the code range that was
 * determined for the appended bytes.  Raises EncCompatError when the two
 * encodings cannot be combined, and ArgumentError for a negative +len+.
 * Returns +str+.
 */
01821 static VALUE
01822 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01823 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01824 {
01825 int str_encindex = ENCODING_GET(str);
01826 int res_encindex;
01827 int str_cr, res_cr;
01828 int str_a8 = ENCODING_IS_ASCII8BIT(str);
/* NOTE(review): encoding index 0 is presumably ASCII-8BIT — confirm */
01829 int ptr_a8 = ptr_encindex == 0;
01830
01831 str_cr = ENC_CODERANGE(str);
01832
01833 if (str_encindex == ptr_encindex) {
/* same encoding: scan the appended bytes only when the result can use it */
01834 if (str_cr == ENC_CODERANGE_UNKNOWN ||
01835 (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
01836 ptr_cr = ENC_CODERANGE_UNKNOWN;
01837 }
01838 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01839 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
01840 }
01841 }
01842 else {
01843 rb_encoding *str_enc = rb_enc_from_index(str_encindex);
01844 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
/* an ASCII-incompatible side can only be combined with an empty counterpart */
01845 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
01846 if (len == 0)
01847 return str;
01848 if (RSTRING_LEN(str) == 0) {
01849 rb_str_buf_cat(str, ptr, len);
01850 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
01851 return str;
01852 }
01853 goto incompatible;
01854 }
01855 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01856 ptr_cr = coderange_scan(ptr, len, ptr_enc);
01857 }
01858 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01859 if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
01860 str_cr = rb_enc_str_coderange(str);
01861 }
01862 }
01863 }
01864 if (ptr_cr_ret)
01865 *ptr_cr_ret = ptr_cr;
01866
/* different encodings are compatible only if at least one side is 7-bit */
01867 if (str_encindex != ptr_encindex &&
01868 str_cr != ENC_CODERANGE_7BIT &&
01869 ptr_cr != ENC_CODERANGE_7BIT) {
01870 incompatible:
01871 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
01872 rb_enc_name(rb_enc_from_index(str_encindex)),
01873 rb_enc_name(rb_enc_from_index(ptr_encindex)));
01874 }
01875
/* choose the resulting encoding index and code range from str_cr / ptr_cr */
01876 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01877 res_encindex = str_encindex;
01878 res_cr = ENC_CODERANGE_UNKNOWN;
01879 }
01880 else if (str_cr == ENC_CODERANGE_7BIT) {
01881 if (ptr_cr == ENC_CODERANGE_7BIT) {
01882 res_encindex = !str_a8 ? str_encindex : ptr_encindex;
01883 res_cr = ENC_CODERANGE_7BIT;
01884 }
01885 else {
01886 res_encindex = ptr_encindex;
01887 res_cr = ptr_cr;
01888 }
01889 }
01890 else if (str_cr == ENC_CODERANGE_VALID) {
01891 res_encindex = str_encindex;
01892 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
01893 res_cr = str_cr;
01894 else
01895 res_cr = ptr_cr;
01896 }
01897 else {
/* remaining code range of str: appending any bytes makes the result unknown */
01898 res_encindex = str_encindex;
01899 res_cr = str_cr;
01900 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
01901 }
01902
/* NOTE(review): len is validated only here, after the coderange_scan calls above */
01903 if (len < 0) {
01904 rb_raise(rb_eArgError, "negative string size (or size too big)");
01905 }
01906 str_buf_cat(str, ptr, len);
01907 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
01908 return str;
01909 }
01910
01911 VALUE
01912 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
01913 {
01914 return rb_enc_cr_str_buf_cat(str, ptr, len,
01915 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
01916 }
01917
01918 VALUE
01919 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
01920 {
01921
01922 int encindex = ENCODING_GET(str);
01923 rb_encoding *enc = rb_enc_from_index(encindex);
01924 if (rb_enc_asciicompat(enc)) {
01925 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
01926 encindex, ENC_CODERANGE_7BIT, 0);
01927 }
01928 else {
01929 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
01930 while (*ptr) {
01931 unsigned int c = (unsigned char)*ptr;
01932 int len = rb_enc_codelen(c, enc);
01933 rb_enc_mbcput(c, buf, enc);
01934 rb_enc_cr_str_buf_cat(str, buf, len,
01935 encindex, ENC_CODERANGE_VALID, 0);
01936 ptr++;
01937 }
01938 return str;
01939 }
01940 }
01941
01942 VALUE
01943 rb_str_buf_append(VALUE str, VALUE str2)
01944 {
01945 int str2_cr;
01946
01947 str2_cr = ENC_CODERANGE(str2);
01948
01949 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
01950 ENCODING_GET(str2), str2_cr, &str2_cr);
01951
01952 OBJ_INFECT(str, str2);
01953 ENC_CODERANGE_SET(str2, str2_cr);
01954
01955 return str;
01956 }
01957
01958 VALUE
01959 rb_str_append(VALUE str, VALUE str2)
01960 {
01961 rb_encoding *enc;
01962 int cr, cr2;
01963
01964 StringValue(str2);
01965 if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
01966 long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
01967 enc = rb_enc_check(str, str2);
01968 cr = ENC_CODERANGE(str);
01969 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
01970 rb_str_modify(str);
01971 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01972 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
01973 RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
01974 RSTRING(str)->as.heap.len = len;
01975 rb_enc_associate(str, enc);
01976 ENC_CODERANGE_SET(str, cr);
01977 OBJ_INFECT(str, str2);
01978 return str;
01979 }
01980 return rb_str_buf_append(str, str2);
01981 }
01982
01983
01984
01985
01986
01987
01988
01989
01990
01991
01992
01993
01994
01995
01996
01997
01998
01999
02000 VALUE
02001 rb_str_concat(VALUE str1, VALUE str2)
02002 {
02003 unsigned int lc;
02004
02005 if (FIXNUM_P(str2)) {
02006 if ((int)str2 < 0)
02007 rb_raise(rb_eRangeError, "negative argument");
02008 lc = FIX2UINT(str2);
02009 }
02010 else if (TYPE(str2) == T_BIGNUM) {
02011 if (!RBIGNUM_SIGN(str2))
02012 rb_raise(rb_eRangeError, "negative argument");
02013 lc = NUM2UINT(str2);
02014 }
02015 else {
02016 return rb_str_append(str1, str2);
02017 }
02018 #if SIZEOF_INT < SIZEOF_VALUE
02019 if ((VALUE)lc > UINT_MAX) {
02020 rb_raise(rb_eRangeError, "%"PRIuVALUE" out of char range", lc);
02021 }
02022 #endif
02023 {
02024 rb_encoding *enc = STR_ENC_GET(str1);
02025 long pos = RSTRING_LEN(str1);
02026 int cr = ENC_CODERANGE(str1);
02027 int len;
02028
02029 if ((len = rb_enc_codelen(lc, enc)) <= 0) {
02030 rb_raise(rb_eRangeError, "%u invalid char", lc);
02031 }
02032 rb_str_resize(str1, pos+len);
02033 rb_enc_mbcput(lc, RSTRING_PTR(str1)+pos, enc);
02034 if (cr == ENC_CODERANGE_7BIT && lc > 127)
02035 cr = ENC_CODERANGE_VALID;
02036 ENC_CODERANGE_SET(str1, cr);
02037 return str1;
02038 }
02039 }
02040
02041 st_index_t
02042 rb_memhash(const void *ptr, long len)
02043 {
02044 return st_hash(ptr, len, rb_hash_start(0));
02045 }
02046
02047 st_index_t
02048 rb_str_hash(VALUE str)
02049 {
02050 int e = ENCODING_GET(str);
02051 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02052 e = 0;
02053 }
02054 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02055 }
02056
02057 int
02058 rb_str_hash_cmp(VALUE str1, VALUE str2)
02059 {
02060 long len;
02061
02062 if (!rb_str_comparable(str1, str2)) return 1;
02063 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02064 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02065 return 0;
02066 }
02067 return 1;
02068 }
02069
02070
02071
02072
02073
02074
02075
02076
02077 static VALUE
02078 rb_str_hash_m(VALUE str)
02079 {
02080 st_index_t hval = rb_str_hash(str);
02081 return INT2FIX(hval);
02082 }
02083
/* Yields the smaller of +a+ and +b+.  Each argument may be evaluated twice,
 * so pass only expressions without side effects. */
02084 #define lesser(a,b) (((a)>(b))?(b):(a))
02085
02086 int
02087 rb_str_comparable(VALUE str1, VALUE str2)
02088 {
02089 int idx1, idx2;
02090 int rc1, rc2;
02091
02092 if (RSTRING_LEN(str1) == 0) return TRUE;
02093 if (RSTRING_LEN(str2) == 0) return TRUE;
02094 idx1 = ENCODING_GET(str1);
02095 idx2 = ENCODING_GET(str2);
02096 if (idx1 == idx2) return TRUE;
02097 rc1 = rb_enc_str_coderange(str1);
02098 rc2 = rb_enc_str_coderange(str2);
02099 if (rc1 == ENC_CODERANGE_7BIT) {
02100 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02101 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02102 return TRUE;
02103 }
02104 if (rc2 == ENC_CODERANGE_7BIT) {
02105 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02106 return TRUE;
02107 }
02108 return FALSE;
02109 }
02110
02111 int
02112 rb_str_cmp(VALUE str1, VALUE str2)
02113 {
02114 long len;
02115 int retval;
02116
02117 len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
02118 retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
02119 if (retval == 0) {
02120 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
02121 if (!rb_str_comparable(str1, str2)) {
02122 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02123 return 1;
02124 return -1;
02125 }
02126 return 0;
02127 }
02128 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
02129 return -1;
02130 }
02131 if (retval > 0) return 1;
02132 return -1;
02133 }
02134
02135
02136 static VALUE
02137 str_eql(const VALUE str1, const VALUE str2)
02138 {
02139 const long len = RSTRING_LEN(str1);
02140
02141 if (len != RSTRING_LEN(str2)) return Qfalse;
02142 if (!rb_str_comparable(str1, str2)) return Qfalse;
02143 if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0)
02144 return Qtrue;
02145 return Qfalse;
02146 }
02147
02148
02149
02150
02151
02152
02153
02154
02155
02156 VALUE
02157 rb_str_equal(VALUE str1, VALUE str2)
02158 {
02159 if (str1 == str2) return Qtrue;
02160 if (TYPE(str2) != T_STRING) {
02161 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02162 return Qfalse;
02163 }
02164 return rb_equal(str2, str1);
02165 }
02166 return str_eql(str1, str2);
02167 }
02168
02169
02170
02171
02172
02173
02174
02175
02176 static VALUE
02177 rb_str_eql(VALUE str1, VALUE str2)
02178 {
02179 if (TYPE(str2) != T_STRING) return Qfalse;
02180 return str_eql(str1, str2);
02181 }
02182
02183
02184
02185
02186
02187
02188
02189
02190
02191
02192
02193
02194
02195
02196
02197
02198
02199
02200
02201
02202
02203
02204
02205
02206 static VALUE
02207 rb_str_cmp_m(VALUE str1, VALUE str2)
02208 {
02209 long result;
02210
02211 if (TYPE(str2) != T_STRING) {
02212 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02213 return Qnil;
02214 }
02215 else if (!rb_respond_to(str2, rb_intern("<=>"))) {
02216 return Qnil;
02217 }
02218 else {
02219 VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
02220
02221 if (NIL_P(tmp)) return Qnil;
02222 if (!FIXNUM_P(tmp)) {
02223 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
02224 }
02225 result = -FIX2LONG(tmp);
02226 }
02227 }
02228 else {
02229 result = rb_str_cmp(str1, str2);
02230 }
02231 return LONG2NUM(result);
02232 }
02233
02234
02235
02236
02237
02238
02239
02240
02241
02242
02243
02244
02245
02246 static VALUE
02247 rb_str_casecmp(VALUE str1, VALUE str2)
02248 {
02249 long len;
02250 rb_encoding *enc;
02251 char *p1, *p1end, *p2, *p2end;
02252
02253 StringValue(str2);
02254 enc = rb_enc_compatible(str1, str2);
02255 if (!enc) {
02256 return Qnil;
02257 }
02258
02259 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02260 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02261 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02262 while (p1 < p1end && p2 < p2end) {
02263 if (*p1 != *p2) {
02264 unsigned int c1 = TOUPPER(*p1 & 0xff);
02265 unsigned int c2 = TOUPPER(*p2 & 0xff);
02266 if (c1 != c2)
02267 return INT2FIX(c1 < c2 ? -1 : 1);
02268 }
02269 p1++;
02270 p2++;
02271 }
02272 }
02273 else {
02274 while (p1 < p1end && p2 < p2end) {
02275 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02276 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02277
02278 if (0 <= c1 && 0 <= c2) {
02279 c1 = TOUPPER(c1);
02280 c2 = TOUPPER(c2);
02281 if (c1 != c2)
02282 return INT2FIX(c1 < c2 ? -1 : 1);
02283 }
02284 else {
02285 int r;
02286 l1 = rb_enc_mbclen(p1, p1end, enc);
02287 l2 = rb_enc_mbclen(p2, p2end, enc);
02288 len = l1 < l2 ? l1 : l2;
02289 r = memcmp(p1, p2, len);
02290 if (r != 0)
02291 return INT2FIX(r < 0 ? -1 : 1);
02292 if (l1 != l2)
02293 return INT2FIX(l1 < l2 ? -1 : 1);
02294 }
02295 p1 += l1;
02296 p2 += l2;
02297 }
02298 }
02299 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02300 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02301 return INT2FIX(-1);
02302 }
02303
/*
 * Searches +str+ for +sub+ starting at character index +offset+ (negative
 * values count from the end).
 *
 * Returns -1 when nothing is found, when +sub+ is a broken string, or when
 * +offset+ is out of range.  On success the returned position is a byte
 * offset into +str+: +offset+ is converted to bytes by str_offset() before
 * the search, and rb_memsearch() works on bytes.
 */
02304 static long
02305 rb_str_index(VALUE str, VALUE sub, long offset)
02306 {
02307 long pos;
02308 char *s, *sptr, *e;
02309 long len, slen;
02310 rb_encoding *enc;
02311
02312 enc = rb_enc_check(str, sub);
02313 if (is_broken_string(sub)) {
02314 return -1;
02315 }
/* range checks are done in characters */
02316 len = str_strlen(str, enc);
02317 slen = str_strlen(sub, enc);
02318 if (offset < 0) {
02319 offset += len;
02320 if (offset < 0) return -1;
02321 }
02322 if (len - offset < slen) return -1;
02323 s = RSTRING_PTR(str);
02324 e = s + RSTRING_LEN(str);
02325 if (offset) {
/* from here on offset is a byte offset */
02326 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02327 s += offset;
02328 }
02329 if (slen == 0) return offset;
02330
/* switch len and slen to byte counts for the search */
02331 sptr = RSTRING_PTR(sub);
02332 slen = RSTRING_LEN(sub);
02333 len = RSTRING_LEN(str) - offset;
02334 for (;;) {
02335 char *t;
02336 pos = rb_memsearch(sptr, slen, s, len, enc);
02337 if (pos < 0) return pos;
/* accept the hit only if it starts on a character boundary */
02338 t = rb_enc_right_char_head(s, s+pos, e, enc);
02339 if (t == s + pos) break;
/* otherwise resume the search from the next character head */
02340 if ((len -= t - s) <= 0) return -1;
02341 offset += t - s;
02342 s = t;
02343 }
02344 return pos + offset;
02345 }
02346
02347
02348
02349
02350
02351
02352
02353
02354
02355
02356
02357
02358
02359
02360
02361
02362
02363
02364
/*
 * Method body taking (pattern [, start]): returns the character index of
 * the first match of the pattern in +str+ at or after +start+, or Qnil.
 *
 * The pattern may be a Regexp, a String, or an object convertible with
 * rb_check_string_type(); anything else raises TypeError.  A negative
 * +start+ counts from the end of the string.
 */
02365 static VALUE
02366 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02367 {
02368 VALUE sub;
02369 VALUE initpos;
02370 long pos;
02371
02372 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02373 pos = NUM2LONG(initpos);
02374 }
02375 else {
02376 pos = 0;
02377 }
02378 if (pos < 0) {
02379 pos += str_strlen(str, STR_ENC_GET(str));
02380 if (pos < 0) {
/* out of range: for a Regexp pattern the backref is reset as well */
02381 if (TYPE(sub) == T_REGEXP) {
02382 rb_backref_set(Qnil);
02383 }
02384 return Qnil;
02385 }
02386 }
02387
02388 switch (TYPE(sub)) {
02389 case T_REGEXP:
02390 if (pos > str_strlen(str, STR_ENC_GET(str)))
02391 return Qnil;
/* rb_reg_search is given a byte offset; its result is converted back to characters */
02392 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02393 rb_enc_check(str, sub), single_byte_optimizable(str));
02394
02395 pos = rb_reg_search(sub, str, pos, 0);
02396 pos = rb_str_sublen(str, pos);
02397 break;
02398
02399 default: {
02400 VALUE tmp;
02401
02402 tmp = rb_check_string_type(sub);
02403 if (NIL_P(tmp)) {
02404 rb_raise(rb_eTypeError, "type mismatch: %s given",
02405 rb_obj_classname(sub));
02406 }
02407 sub = tmp;
02408 }
02409
/* the default branch deliberately falls through with the converted string */
02410 case T_STRING:
02411 pos = rb_str_index(str, sub, pos);
02412 pos = rb_str_sublen(str, pos);
02413 break;
02414 }
02415
02416 if (pos == -1) return Qnil;
02417 return LONG2NUM(pos);
02418 }
02419
02420 static long
02421 rb_str_rindex(VALUE str, VALUE sub, long pos)
02422 {
02423 long len, slen;
02424 char *s, *sbeg, *e, *t;
02425 rb_encoding *enc;
02426 int singlebyte = single_byte_optimizable(str);
02427
02428 enc = rb_enc_check(str, sub);
02429 if (is_broken_string(sub)) {
02430 return -1;
02431 }
02432 len = str_strlen(str, enc);
02433 slen = str_strlen(sub, enc);
02434
02435 if (len < slen) return -1;
02436 if (len - pos < slen) {
02437 pos = len - slen;
02438 }
02439 if (len == 0) {
02440 return pos;
02441 }
02442 sbeg = RSTRING_PTR(str);
02443 e = RSTRING_END(str);
02444 t = RSTRING_PTR(sub);
02445 slen = RSTRING_LEN(sub);
02446 for (;;) {
02447 s = str_nth(sbeg, e, pos, enc, singlebyte);
02448 if (!s) return -1;
02449 if (memcmp(s, t, slen) == 0) {
02450 return pos;
02451 }
02452 if (pos == 0) break;
02453 pos--;
02454 }
02455 return -1;
02456 }
02457
02458
02459
02460
02461
02462
02463
02464
02465
02466
02467
02468
02469
02470
02471
02472
02473
02474
02475
02476
/*
 * Method body taking (pattern [, start]): returns the character index of
 * the last match of the pattern in +str+ at or before +start+, or Qnil.
 *
 * The pattern may be a Regexp, a String, or an object convertible with
 * rb_check_string_type(); anything else raises TypeError.  A negative
 * +start+ counts from the end; a +start+ beyond the end is clamped to the
 * string length.
 */
02477 static VALUE
02478 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02479 {
02480 VALUE sub;
02481 VALUE vpos;
02482 rb_encoding *enc = STR_ENC_GET(str);
02483 long pos, len = str_strlen(str, enc);
02484
02485 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02486 pos = NUM2LONG(vpos);
02487 if (pos < 0) {
02488 pos += len;
02489 if (pos < 0) {
/* out of range: for a Regexp pattern the backref is reset as well */
02490 if (TYPE(sub) == T_REGEXP) {
02491 rb_backref_set(Qnil);
02492 }
02493 return Qnil;
02494 }
02495 }
02496 if (pos > len) pos = len;
02497 }
02498 else {
/* no start given: search from the end of the string */
02499 pos = len;
02500 }
02501
02502 switch (TYPE(sub)) {
02503 case T_REGEXP:
02504
/* rb_reg_search is given a byte offset */
02505 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02506 STR_ENC_GET(str), single_byte_optimizable(str));
02507
/* NOTE(review): the search is skipped only for a compiled regexp with an
 * empty source — presumably because it matches at pos itself; confirm */
02508 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02509 pos = rb_reg_search(sub, str, pos, 1);
02510 pos = rb_str_sublen(str, pos);
02511 }
02512 if (pos >= 0) return LONG2NUM(pos);
02513 break;
02514
02515 default: {
02516 VALUE tmp;
02517
02518 tmp = rb_check_string_type(sub);
02519 if (NIL_P(tmp)) {
02520 rb_raise(rb_eTypeError, "type mismatch: %s given",
02521 rb_obj_classname(sub));
02522 }
02523 sub = tmp;
02524 }
02525
/* the default branch deliberately falls through with the converted string */
02526 case T_STRING:
02527 pos = rb_str_rindex(str, sub, pos);
02528 if (pos >= 0) return LONG2NUM(pos);
02529 break;
02530 }
02531 return Qnil;
02532 }
02533
02534
02535
02536
02537
02538
02539
02540
02541
02542
02543
02544
02545
02546
02547
02548 static VALUE
02549 rb_str_match(VALUE x, VALUE y)
02550 {
02551 switch (TYPE(y)) {
02552 case T_STRING:
02553 rb_raise(rb_eTypeError, "type mismatch: String given");
02554
02555 case T_REGEXP:
02556 return rb_reg_match(y, x);
02557
02558 default:
02559 return rb_funcall(y, rb_intern("=~"), 1, x);
02560 }
02561 }
02562
02563
/* Forward declaration: get_pat() is used by rb_str_match_m() below and
 * defined elsewhere in this file. */
02564 static VALUE get_pat(VALUE, int);
02565
02566
02567
02568
02569
02570
02571
02572
02573
02574
02575
02576
02577
02578
02579
02580
02581
02582
02583
02584
02585
02586
02587
02588
02589
02590
02591
02592
02593
02594
02595
02596
02597
02598 static VALUE
02599 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02600 {
02601 VALUE re, result;
02602 if (argc < 1)
02603 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
02604 re = argv[0];
02605 argv[0] = str;
02606 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02607 if (!NIL_P(result) && rb_block_given_p()) {
02608 return rb_yield(result);
02609 }
02610 return result;
02611 }
02612
/* Outcome of stepping a character to its neighbour; returned by
 * enc_succ_char(), enc_pred_char() and enc_succ_alnum_char(). */
02613 enum neighbor_char {
/* returned by enc_succ_alnum_char() when no alphanumeric neighbour applies */
02614 NEIGHBOR_NOT_CHAR,
/* a valid neighbouring character of the same byte length was produced */
02615 NEIGHBOR_FOUND,
/* the value wrapped around (every byte overflowed or underflowed) */
02616 NEIGHBOR_WRAPPED
02617 };
02618
/*
 * Replaces the +len+-byte character at +p+, in place, with the next byte
 * sequence that forms a complete character of exactly +len+ bytes in
 * +enc+.
 *
 * Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED when all bytes
 * were 0xff (they are left as zeros).
 */
02619 static enum neighbor_char
02620 enc_succ_char(char *p, long len, rb_encoding *enc)
02621 {
02622 long i;
02623 int l;
02624 while (1) {
/* increment as a big-endian number: trailing 0xff bytes roll over to 0 */
02625 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02626 p[i] = '\0';
02627 if (i < 0)
02628 return NEIGHBOR_WRAPPED;
02629 ++((unsigned char*)p)[i];
02630 l = rb_enc_precise_mbclen(p, p+len, enc);
02631 if (MBCLEN_CHARFOUND_P(l)) {
02632 l = MBCLEN_CHARFOUND_LEN(l);
02633 if (l == len) {
02634 return NEIGHBOR_FOUND;
02635 }
02636 else {
/* a shorter character was found: fill the tail with 0xff so the next
 * round carries into the shorter prefix */
02637 memset(p+l, 0xff, len-l);
02638 }
02639 }
02640 if (MBCLEN_INVALID_P(l) && i < len-1) {
02641 long len2;
02642 int l2;
/* find the longest prefix that is not invalid and fill what follows it
 * with 0xff, skipping byte sequences that share the invalid prefix */
02643 for (len2 = len-1; 0 < len2; len2--) {
02644 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02645 if (!MBCLEN_INVALID_P(l2))
02646 break;
02647 }
02648 memset(p+len2+1, 0xff, len-(len2+1));
02649 }
02650 }
02651 }
02652
/*
 * Replaces the +len+-byte character at +p+, in place, with the previous
 * byte sequence that forms a complete character of exactly +len+ bytes in
 * +enc+.
 *
 * Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED when all bytes
 * were zero (they are left as 0xff).
 */
02653 static enum neighbor_char
02654 enc_pred_char(char *p, long len, rb_encoding *enc)
02655 {
02656 long i;
02657 int l;
02658 while (1) {
/* decrement as a big-endian number: trailing zero bytes roll under to 0xff */
02659 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02660 p[i] = '\xff';
02661 if (i < 0)
02662 return NEIGHBOR_WRAPPED;
02663 --((unsigned char*)p)[i];
02664 l = rb_enc_precise_mbclen(p, p+len, enc);
02665 if (MBCLEN_CHARFOUND_P(l)) {
02666 l = MBCLEN_CHARFOUND_LEN(l);
02667 if (l == len) {
02668 return NEIGHBOR_FOUND;
02669 }
02670 else {
/* a shorter character was found: zero the tail so the next round borrows
 * from the shorter prefix */
02671 memset(p+l, 0, len-l);
02672 }
02673 }
02674 if (MBCLEN_INVALID_P(l) && i < len-1) {
02675 long len2;
02676 int l2;
/* find the longest prefix that is not invalid and zero what follows it,
 * skipping byte sequences that share the invalid prefix */
02677 for (len2 = len-1; 0 < len2; len2--) {
02678 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02679 if (!MBCLEN_INVALID_P(l2))
02680 break;
02681 }
02682 memset(p+len2+1, 0, len-(len2+1));
02683 }
02684 }
02685 }
02686
02687
02688
02689
02690
02691
02692
02693
02694
02695
02696 static enum neighbor_char
02697 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02698 {
02699 enum neighbor_char ret;
02700 unsigned int c;
02701 int ctype;
02702 int range;
02703 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02704
02705 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02706 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02707 ctype = ONIGENC_CTYPE_DIGIT;
02708 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02709 ctype = ONIGENC_CTYPE_ALPHA;
02710 else
02711 return NEIGHBOR_NOT_CHAR;
02712
02713 MEMCPY(save, p, char, len);
02714 ret = enc_succ_char(p, len, enc);
02715 if (ret == NEIGHBOR_FOUND) {
02716 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02717 if (rb_enc_isctype(c, ctype, enc))
02718 return NEIGHBOR_FOUND;
02719 }
02720 MEMCPY(p, save, char, len);
02721 range = 1;
02722 while (1) {
02723 MEMCPY(save, p, char, len);
02724 ret = enc_pred_char(p, len, enc);
02725 if (ret == NEIGHBOR_FOUND) {
02726 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02727 if (!rb_enc_isctype(c, ctype, enc)) {
02728 MEMCPY(p, save, char, len);
02729 break;
02730 }
02731 }
02732 else {
02733 MEMCPY(p, save, char, len);
02734 break;
02735 }
02736 range++;
02737 }
02738 if (range == 1) {
02739 return NEIGHBOR_NOT_CHAR;
02740 }
02741
02742 if (ctype != ONIGENC_CTYPE_DIGIT) {
02743 MEMCPY(carry, p, char, len);
02744 return NEIGHBOR_WRAPPED;
02745 }
02746
02747 MEMCPY(carry, p, char, len);
02748 enc_succ_char(carry, len, enc);
02749 return NEIGHBOR_WRAPPED;
02750 }
02751
02752
02753
02754
02755
02756
02757
02758
02759
02760
02761
02762
02763
02764
02765
02766
02767
02768
02769
02770
02771
02772
02773
02774
02775
02776
02777
02778 VALUE
02779 rb_str_succ(VALUE orig)
02780 {
02781 rb_encoding *enc;
02782 VALUE str;
02783 char *sbeg, *s, *e, *last_alnum = 0;
02784 int c = -1;
02785 long l;
02786 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02787 long carry_pos = 0, carry_len = 1;
02788 enum neighbor_char neighbor = NEIGHBOR_FOUND;
02789
02790 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02791 rb_enc_cr_str_copy_for_substr(str, orig);
02792 OBJ_INFECT(str, orig);
02793 if (RSTRING_LEN(str) == 0) return str;
02794
02795 enc = STR_ENC_GET(orig);
02796 sbeg = RSTRING_PTR(str);
02797 s = e = sbeg + RSTRING_LEN(str);
02798
02799 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02800 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02801 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02802 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
02803 s = last_alnum;
02804 break;
02805 }
02806 }
02807 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02808 neighbor = enc_succ_alnum_char(s, l, enc, carry);
02809 switch (neighbor) {
02810 case NEIGHBOR_NOT_CHAR:
02811 continue;
02812 case NEIGHBOR_FOUND:
02813 return str;
02814 case NEIGHBOR_WRAPPED:
02815 last_alnum = s;
02816 break;
02817 }
02818 c = 1;
02819 carry_pos = s - sbeg;
02820 carry_len = l;
02821 }
02822 if (c == -1) {
02823 s = e;
02824 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02825 enum neighbor_char neighbor;
02826 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02827 neighbor = enc_succ_char(s, l, enc);
02828 if (neighbor == NEIGHBOR_FOUND)
02829 return str;
02830 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
02831
02832 enc_succ_char(s, l, enc);
02833 }
02834 if (!rb_enc_asciicompat(enc)) {
02835 MEMCPY(carry, s, char, l);
02836 carry_len = l;
02837 }
02838 carry_pos = s - sbeg;
02839 }
02840 }
02841 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
02842 s = RSTRING_PTR(str) + carry_pos;
02843 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
02844 memmove(s, carry, carry_len);
02845 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
02846 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
02847 rb_enc_str_coderange(str);
02848 return str;
02849 }
02850
02851
02852
02853
02854
02855
02856
02857
02858
02859
02860
02861 static VALUE
02862 rb_str_succ_bang(VALUE str)
02863 {
02864 rb_str_shared_replace(str, rb_str_succ(str));
02865
02866 return str;
02867 }
02868
02869
02870
02871
02872
02873
02874
02875
02876
02877
02878
02879
02880
02881
02882
02883
02884
02885
02886
02887
02888
02889
02890
02891
02892
02893
02894
02895
02896
02897
02898
02899
02900
02901
02902 static VALUE
02903 rb_str_upto(int argc, VALUE *argv, VALUE beg)
02904 {
02905 VALUE end, exclusive;
02906 VALUE current, after_end;
02907 ID succ;
02908 int n, excl, ascii;
02909 rb_encoding *enc;
02910
02911 rb_scan_args(argc, argv, "11", &end, &exclusive);
02912 RETURN_ENUMERATOR(beg, argc, argv);
02913 excl = RTEST(exclusive);
02914 CONST_ID(succ, "succ");
02915 StringValue(end);
02916 enc = rb_enc_check(beg, end);
02917 ascii = (is_ascii_string(beg) && is_ascii_string(end));
02918
02919 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
02920 char c = RSTRING_PTR(beg)[0];
02921 char e = RSTRING_PTR(end)[0];
02922
02923 if (c > e || (excl && c == e)) return beg;
02924 for (;;) {
02925 rb_yield(rb_enc_str_new(&c, 1, enc));
02926 if (!excl && c == e) break;
02927 c++;
02928 if (excl && c == e) break;
02929 }
02930 return beg;
02931 }
02932
02933 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
02934 char *s, *send;
02935 VALUE b, e;
02936 int width;
02937
02938 s = RSTRING_PTR(beg); send = RSTRING_END(beg);
02939 width = rb_long2int(send - s);
02940 while (s < send) {
02941 if (!ISDIGIT(*s)) goto no_digits;
02942 s++;
02943 }
02944 s = RSTRING_PTR(end); send = RSTRING_END(end);
02945 while (s < send) {
02946 if (!ISDIGIT(*s)) goto no_digits;
02947 s++;
02948 }
02949 b = rb_str_to_inum(beg, 10, FALSE);
02950 e = rb_str_to_inum(end, 10, FALSE);
02951 if (FIXNUM_P(b) && FIXNUM_P(e)) {
02952 long bi = FIX2LONG(b);
02953 long ei = FIX2LONG(e);
02954 rb_encoding *usascii = rb_usascii_encoding();
02955
02956 while (bi <= ei) {
02957 if (excl && bi == ei) break;
02958 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
02959 bi++;
02960 }
02961 }
02962 else {
02963 ID op = excl ? '<' : rb_intern("<=");
02964 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
02965
02966 args[0] = INT2FIX(width);
02967 while (rb_funcall(b, op, 1, e)) {
02968 args[1] = b;
02969 rb_yield(rb_str_format(numberof(args), args, fmt));
02970 b = rb_funcall(b, succ, 0, 0);
02971 }
02972 }
02973 return beg;
02974 }
02975
02976 no_digits:
02977 n = rb_str_cmp(beg, end);
02978 if (n > 0 || (excl && n == 0)) return beg;
02979
02980 after_end = rb_funcall(end, succ, 0, 0);
02981 current = rb_str_dup(beg);
02982 while (!rb_str_equal(current, after_end)) {
02983 VALUE next = Qnil;
02984 if (excl || !rb_str_equal(current, end))
02985 next = rb_funcall(current, succ, 0, 0);
02986 rb_yield(current);
02987 if (NIL_P(next)) break;
02988 current = next;
02989 StringValue(current);
02990 if (excl && rb_str_equal(current, end)) break;
02991 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
02992 break;
02993 }
02994
02995 return beg;
02996 }
02997
02998 static VALUE
02999 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03000 {
03001 if (rb_reg_search(re, str, 0, 0) >= 0) {
03002 VALUE match = rb_backref_get();
03003 int nth = rb_reg_backref_number(match, backref);
03004 return rb_reg_nth_match(nth, match);
03005 }
03006 return Qnil;
03007 }
03008
03009 static VALUE
03010 rb_str_aref(VALUE str, VALUE indx)
03011 {
03012 long idx;
03013
03014 switch (TYPE(indx)) {
03015 case T_FIXNUM:
03016 idx = FIX2LONG(indx);
03017
03018 num_index:
03019 str = rb_str_substr(str, idx, 1);
03020 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03021 return str;
03022
03023 case T_REGEXP:
03024 return rb_str_subpat(str, indx, INT2FIX(0));
03025
03026 case T_STRING:
03027 if (rb_str_index(str, indx, 0) != -1)
03028 return rb_str_dup(indx);
03029 return Qnil;
03030
03031 default:
03032
03033 {
03034 long beg, len;
03035 VALUE tmp;
03036
03037 len = str_strlen(str, STR_ENC_GET(str));
03038 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03039 case Qfalse:
03040 break;
03041 case Qnil:
03042 return Qnil;
03043 default:
03044 tmp = rb_str_substr(str, beg, len);
03045 return tmp;
03046 }
03047 }
03048 idx = NUM2LONG(indx);
03049 goto num_index;
03050 }
03051 return Qnil;
03052 }
03053
03054
03055
03056
03057
03058
03059
03060
03061
03062
03063
03064
03065
03066
03067
03068
03069
03070
03071
03072
03073
03074
03075
03076
03077
03078
03079
03080
03081
03082
03083
03084
03085
03086
03087
03088
03089
03090
03091
03092
03093
03094
03095
03096
03097
03098
03099
03100
03101
03102
03103 static VALUE
03104 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03105 {
03106 if (argc == 2) {
03107 if (TYPE(argv[0]) == T_REGEXP) {
03108 return rb_str_subpat(str, argv[0], argv[1]);
03109 }
03110 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03111 }
03112 if (argc != 1) {
03113 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03114 }
03115 return rb_str_aref(str, argv[0]);
03116 }
03117
03118 VALUE
03119 rb_str_drop_bytes(VALUE str, long len)
03120 {
03121 char *ptr = RSTRING_PTR(str);
03122 long olen = RSTRING_LEN(str), nlen;
03123
03124 str_modifiable(str);
03125 if (len > olen) len = olen;
03126 nlen = olen - len;
03127 if (nlen <= RSTRING_EMBED_LEN_MAX) {
03128 char *oldptr = ptr;
03129 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03130 STR_SET_EMBED(str);
03131 STR_SET_EMBED_LEN(str, nlen);
03132 ptr = RSTRING(str)->as.ary;
03133 memmove(ptr, oldptr + len, nlen);
03134 if (fl == STR_NOEMBED) xfree(oldptr);
03135 }
03136 else {
03137 if (!STR_SHARED_P(str)) rb_str_new4(str);
03138 ptr = RSTRING(str)->as.heap.ptr += len;
03139 RSTRING(str)->as.heap.len = nlen;
03140 }
03141 ptr[nlen] = 0;
03142 ENC_CODERANGE_CLEAR(str);
03143 return str;
03144 }
03145
03146 static void
03147 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03148 {
03149 if (beg == 0 && RSTRING_LEN(val) == 0) {
03150 rb_str_drop_bytes(str, len);
03151 OBJ_INFECT(str, val);
03152 return;
03153 }
03154
03155 rb_str_modify(str);
03156 if (len < RSTRING_LEN(val)) {
03157
03158 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03159 }
03160
03161 if (RSTRING_LEN(val) != len) {
03162 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03163 RSTRING_PTR(str) + beg + len,
03164 RSTRING_LEN(str) - (beg + len));
03165 }
03166 if (RSTRING_LEN(val) < beg && len < 0) {
03167 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03168 }
03169 if (RSTRING_LEN(val) > 0) {
03170 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03171 }
03172 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03173 if (RSTRING_PTR(str)) {
03174 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03175 }
03176 OBJ_INFECT(str, val);
03177 }
03178
03179 static void
03180 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03181 {
03182 long slen;
03183 char *p, *e;
03184 rb_encoding *enc;
03185 int singlebyte = single_byte_optimizable(str);
03186 int cr;
03187
03188 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03189
03190 StringValue(val);
03191 enc = rb_enc_check(str, val);
03192 slen = str_strlen(str, enc);
03193
03194 if (slen < beg) {
03195 out_of_range:
03196 rb_raise(rb_eIndexError, "index %ld out of string", beg);
03197 }
03198 if (beg < 0) {
03199 if (-beg > slen) {
03200 goto out_of_range;
03201 }
03202 beg += slen;
03203 }
03204 if (slen < len || slen < beg + len) {
03205 len = slen - beg;
03206 }
03207 str_modify_keep_cr(str);
03208 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03209 if (!p) p = RSTRING_END(str);
03210 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03211 if (!e) e = RSTRING_END(str);
03212
03213 beg = p - RSTRING_PTR(str);
03214 len = e - p;
03215 rb_str_splice_0(str, beg, len, val);
03216 rb_enc_associate(str, enc);
03217 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03218 if (cr != ENC_CODERANGE_BROKEN)
03219 ENC_CODERANGE_SET(str, cr);
03220 }
03221
03222 void
03223 rb_str_update(VALUE str, long beg, long len, VALUE val)
03224 {
03225 rb_str_splice(str, beg, len, val);
03226 }
03227
03228 static void
03229 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03230 {
03231 int nth;
03232 VALUE match;
03233 long start, end, len;
03234 rb_encoding *enc;
03235 struct re_registers *regs;
03236
03237 if (rb_reg_search(re, str, 0, 0) < 0) {
03238 rb_raise(rb_eIndexError, "regexp not matched");
03239 }
03240 match = rb_backref_get();
03241 nth = rb_reg_backref_number(match, backref);
03242 regs = RMATCH_REGS(match);
03243 if (nth >= regs->num_regs) {
03244 out_of_range:
03245 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03246 }
03247 if (nth < 0) {
03248 if (-nth >= regs->num_regs) {
03249 goto out_of_range;
03250 }
03251 nth += regs->num_regs;
03252 }
03253
03254 start = BEG(nth);
03255 if (start == -1) {
03256 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03257 }
03258 end = END(nth);
03259 len = end - start;
03260 StringValue(val);
03261 enc = rb_enc_check(str, val);
03262 rb_str_splice_0(str, start, len, val);
03263 rb_enc_associate(str, enc);
03264 }
03265
03266 static VALUE
03267 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03268 {
03269 long idx, beg;
03270
03271 switch (TYPE(indx)) {
03272 case T_FIXNUM:
03273 idx = FIX2LONG(indx);
03274 num_index:
03275 rb_str_splice(str, idx, 1, val);
03276 return val;
03277
03278 case T_REGEXP:
03279 rb_str_subpat_set(str, indx, INT2FIX(0), val);
03280 return val;
03281
03282 case T_STRING:
03283 beg = rb_str_index(str, indx, 0);
03284 if (beg < 0) {
03285 rb_raise(rb_eIndexError, "string not matched");
03286 }
03287 beg = rb_str_sublen(str, beg);
03288 rb_str_splice(str, beg, str_strlen(indx, 0), val);
03289 return val;
03290
03291 default:
03292
03293 {
03294 long beg, len;
03295 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03296 rb_str_splice(str, beg, len, val);
03297 return val;
03298 }
03299 }
03300 idx = NUM2LONG(indx);
03301 goto num_index;
03302 }
03303 }
03304
03305
03306
03307
03308
03309
03310
03311
03312
03313
03314
03315
03316
03317
03318
03319
03320
03321
03322
03323
03324
03325
03326
03327
03328
03329
03330 static VALUE
03331 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03332 {
03333 if (argc == 3) {
03334 if (TYPE(argv[0]) == T_REGEXP) {
03335 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03336 }
03337 else {
03338 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03339 }
03340 return argv[2];
03341 }
03342 if (argc != 2) {
03343 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03344 }
03345 return rb_str_aset(str, argv[0], argv[1]);
03346 }
03347
03348
03349
03350
03351
03352
03353
03354
03355
03356
03357
03358
03359
03360
03361
03362
03363
03364
03365 static VALUE
03366 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03367 {
03368 long pos = NUM2LONG(idx);
03369
03370 if (pos == -1) {
03371 return rb_str_append(str, str2);
03372 }
03373 else if (pos < 0) {
03374 pos++;
03375 }
03376 rb_str_splice(str, pos, 0, str2);
03377 return str;
03378 }
03379
03380
03381
03382
03383
03384
03385
03386
03387
03388
03389
03390
03391
03392
03393
03394
03395
03396
03397
03398
03399
03400 static VALUE
03401 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03402 {
03403 VALUE result;
03404 VALUE buf[3];
03405 int i;
03406
03407 if (argc < 1 || 2 < argc) {
03408 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03409 }
03410 for (i=0; i<argc; i++) {
03411 buf[i] = argv[i];
03412 }
03413 str_modify_keep_cr(str);
03414 buf[i] = rb_str_new(0,0);
03415 result = rb_str_aref_m(argc, buf, str);
03416 if (!NIL_P(result)) {
03417 rb_str_aset_m(argc+1, buf, str);
03418 }
03419 return result;
03420 }
03421
03422 static VALUE
03423 get_pat(VALUE pat, int quote)
03424 {
03425 VALUE val;
03426
03427 switch (TYPE(pat)) {
03428 case T_REGEXP:
03429 return pat;
03430
03431 case T_STRING:
03432 break;
03433
03434 default:
03435 val = rb_check_string_type(pat);
03436 if (NIL_P(val)) {
03437 Check_Type(pat, T_REGEXP);
03438 }
03439 pat = val;
03440 }
03441
03442 if (quote) {
03443 pat = rb_reg_quote(pat);
03444 }
03445
03446 return rb_reg_regcomp(pat);
03447 }
03448
03449
03450
03451
03452
03453
03454
03455
03456
03457
03458
03459
03460 static VALUE
03461 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03462 {
03463 VALUE pat, repl, hash = Qnil;
03464 int iter = 0;
03465 int tainted = 0;
03466 int untrusted = 0;
03467 long plen;
03468
03469 if (argc == 1 && rb_block_given_p()) {
03470 iter = 1;
03471 }
03472 else if (argc == 2) {
03473 repl = argv[1];
03474 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03475 if (NIL_P(hash)) {
03476 StringValue(repl);
03477 }
03478 if (OBJ_TAINTED(repl)) tainted = 1;
03479 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03480 }
03481 else {
03482 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03483 }
03484
03485 pat = get_pat(argv[0], 1);
03486 str_modifiable(str);
03487 if (rb_reg_search(pat, str, 0, 0) >= 0) {
03488 rb_encoding *enc;
03489 int cr = ENC_CODERANGE(str);
03490 VALUE match = rb_backref_get();
03491 struct re_registers *regs = RMATCH_REGS(match);
03492 long beg0 = BEG(0);
03493 long end0 = END(0);
03494 char *p, *rp;
03495 long len, rlen;
03496
03497 if (iter || !NIL_P(hash)) {
03498 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03499
03500 if (iter) {
03501 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03502 }
03503 else {
03504 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03505 repl = rb_obj_as_string(repl);
03506 }
03507 str_mod_check(str, p, len);
03508 str_frozen_check(str);
03509 }
03510 else {
03511 repl = rb_reg_regsub(repl, str, regs, pat);
03512 }
03513 enc = rb_enc_compatible(str, repl);
03514 if (!enc) {
03515 rb_encoding *str_enc = STR_ENC_GET(str);
03516 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03517 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03518 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03519 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03520 rb_enc_name(str_enc),
03521 rb_enc_name(STR_ENC_GET(repl)));
03522 }
03523 enc = STR_ENC_GET(repl);
03524 }
03525 rb_str_modify(str);
03526 rb_enc_associate(str, enc);
03527 if (OBJ_TAINTED(repl)) tainted = 1;
03528 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03529 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03530 int cr2 = ENC_CODERANGE(repl);
03531 if (cr2 == ENC_CODERANGE_BROKEN ||
03532 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03533 cr = ENC_CODERANGE_UNKNOWN;
03534 else
03535 cr = cr2;
03536 }
03537 plen = end0 - beg0;
03538 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03539 len = RSTRING_LEN(str);
03540 if (rlen > plen) {
03541 RESIZE_CAPA(str, len + rlen - plen);
03542 }
03543 p = RSTRING_PTR(str);
03544 if (rlen != plen) {
03545 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03546 }
03547 memcpy(p + beg0, rp, rlen);
03548 len += rlen - plen;
03549 STR_SET_LEN(str, len);
03550 RSTRING_PTR(str)[len] = '\0';
03551 ENC_CODERANGE_SET(str, cr);
03552 if (tainted) OBJ_TAINT(str);
03553 if (untrusted) OBJ_UNTRUST(str);
03554
03555 return str;
03556 }
03557 return Qnil;
03558 }
03559
03560
03561
03562
03563
03564
03565
03566
03567
03568
03569
03570
03571
03572
03573
03574
03575
03576
03577
03578
03579
03580
03581
03582
03583
03584
03585
03586
03587
03588
03589
03590
03591
03592
03593
03594
03595
03596
03597
03598
03599
03600
03601 static VALUE
03602 rb_str_sub(int argc, VALUE *argv, VALUE str)
03603 {
03604 str = rb_str_dup(str);
03605 rb_str_sub_bang(argc, argv, str);
03606 return str;
03607 }
03608
03609 static VALUE
03610 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03611 {
03612 VALUE pat, val, repl, match, dest, hash = Qnil;
03613 struct re_registers *regs;
03614 long beg, n;
03615 long beg0, end0;
03616 long offset, blen, slen, len, last;
03617 int iter = 0;
03618 char *sp, *cp;
03619 int tainted = 0;
03620 rb_encoding *str_enc;
03621
03622 switch (argc) {
03623 case 1:
03624 RETURN_ENUMERATOR(str, argc, argv);
03625 iter = 1;
03626 break;
03627 case 2:
03628 repl = argv[1];
03629 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03630 if (NIL_P(hash)) {
03631 StringValue(repl);
03632 }
03633 if (OBJ_TAINTED(repl)) tainted = 1;
03634 break;
03635 default:
03636 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03637 }
03638
03639 pat = get_pat(argv[0], 1);
03640 beg = rb_reg_search(pat, str, 0, 0);
03641 if (beg < 0) {
03642 if (bang) return Qnil;
03643 return rb_str_dup(str);
03644 }
03645
03646 offset = 0;
03647 n = 0;
03648 blen = RSTRING_LEN(str) + 30;
03649 dest = rb_str_buf_new(blen);
03650 sp = RSTRING_PTR(str);
03651 slen = RSTRING_LEN(str);
03652 cp = sp;
03653 str_enc = STR_ENC_GET(str);
03654
03655 do {
03656 n++;
03657 match = rb_backref_get();
03658 regs = RMATCH_REGS(match);
03659 beg0 = BEG(0);
03660 end0 = END(0);
03661 if (iter || !NIL_P(hash)) {
03662 if (iter) {
03663 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03664 }
03665 else {
03666 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03667 val = rb_obj_as_string(val);
03668 }
03669 str_mod_check(str, sp, slen);
03670 if (val == dest) {
03671 rb_raise(rb_eRuntimeError, "block should not cheat");
03672 }
03673 }
03674 else {
03675 val = rb_reg_regsub(repl, str, regs, pat);
03676 }
03677
03678 if (OBJ_TAINTED(val)) tainted = 1;
03679
03680 len = beg - offset;
03681 if (len) {
03682 rb_enc_str_buf_cat(dest, cp, len, str_enc);
03683 }
03684
03685 rb_str_buf_append(dest, val);
03686
03687 last = offset;
03688 offset = end0;
03689 if (beg0 == end0) {
03690
03691
03692
03693
03694 if (RSTRING_LEN(str) <= end0) break;
03695 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03696 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03697 offset = end0 + len;
03698 }
03699 cp = RSTRING_PTR(str) + offset;
03700 if (offset > RSTRING_LEN(str)) break;
03701 beg = rb_reg_search(pat, str, offset, 0);
03702 } while (beg >= 0);
03703 if (RSTRING_LEN(str) > offset) {
03704 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03705 }
03706 rb_reg_search(pat, str, last, 0);
03707 if (bang) {
03708 rb_str_shared_replace(str, dest);
03709 }
03710 else {
03711 RBASIC(dest)->klass = rb_obj_class(str);
03712 OBJ_INFECT(dest, str);
03713 str = dest;
03714 }
03715
03716 if (tainted) OBJ_TAINT(str);
03717 return str;
03718 }
03719
03720
03721
03722
03723
03724
03725
03726
03727
03728
03729
03730
03731
03732 static VALUE
03733 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03734 {
03735 str_modify_keep_cr(str);
03736 return str_gsub(argc, argv, str, 1);
03737 }
03738
03739
03740
03741
03742
03743
03744
03745
03746
03747
03748
03749
03750
03751
03752
03753
03754
03755
03756
03757
03758
03759
03760
03761
03762
03763
03764
03765
03766
03767
03768
03769
03770
03771
03772
03773
03774
03775
03776
03777
03778
03779
03780
03781
03782
03783 static VALUE
03784 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03785 {
03786 return str_gsub(argc, argv, str, 0);
03787 }
03788
03789
03790
03791
03792
03793
03794
03795
03796
03797
03798
03799
03800
03801 VALUE
03802 rb_str_replace(VALUE str, VALUE str2)
03803 {
03804 str_modifiable(str);
03805 if (str == str2) return str;
03806
03807 StringValue(str2);
03808 str_discard(str);
03809 return str_replace(str, str2);
03810 }
03811
03812
03813
03814
03815
03816
03817
03818
03819
03820
03821
03822 static VALUE
03823 rb_str_clear(VALUE str)
03824 {
03825 str_discard(str);
03826 STR_SET_EMBED(str);
03827 STR_SET_EMBED_LEN(str, 0);
03828 RSTRING_PTR(str)[0] = 0;
03829 if (rb_enc_asciicompat(STR_ENC_GET(str)))
03830 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03831 else
03832 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03833 return str;
03834 }
03835
03836
03837
03838
03839
03840
03841
03842
03843
03844
03845
03846 static VALUE
03847 rb_str_chr(VALUE str)
03848 {
03849 return rb_str_substr(str, 0, 1);
03850 }
03851
03852
03853
03854
03855
03856
03857
03858 static VALUE
03859 rb_str_getbyte(VALUE str, VALUE index)
03860 {
03861 long pos = NUM2LONG(index);
03862
03863 if (pos < 0)
03864 pos += RSTRING_LEN(str);
03865 if (pos < 0 || RSTRING_LEN(str) <= pos)
03866 return Qnil;
03867
03868 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
03869 }
03870
03871
03872
03873
03874
03875
03876
03877 static VALUE
03878 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
03879 {
03880 long pos = NUM2LONG(index);
03881 int byte = NUM2INT(value);
03882
03883 rb_str_modify(str);
03884
03885 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
03886 rb_raise(rb_eIndexError, "index %ld out of string", pos);
03887 if (pos < 0)
03888 pos += RSTRING_LEN(str);
03889
03890 RSTRING_PTR(str)[pos] = byte;
03891
03892 return value;
03893 }
03894
03895
03896
03897
03898
03899
03900
03901
03902
03903
03904 static VALUE
03905 rb_str_reverse(VALUE str)
03906 {
03907 rb_encoding *enc;
03908 VALUE rev;
03909 char *s, *e, *p;
03910 int single = 1;
03911
03912 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
03913 enc = STR_ENC_GET(str);
03914 rev = rb_str_new5(str, 0, RSTRING_LEN(str));
03915 s = RSTRING_PTR(str); e = RSTRING_END(str);
03916 p = RSTRING_END(rev);
03917
03918 if (RSTRING_LEN(str) > 1) {
03919 if (single_byte_optimizable(str)) {
03920 while (s < e) {
03921 *--p = *s++;
03922 }
03923 }
03924 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
03925 while (s < e) {
03926 int clen = rb_enc_fast_mbclen(s, e, enc);
03927
03928 if (clen > 1 || (*s & 0x80)) single = 0;
03929 p -= clen;
03930 memcpy(p, s, clen);
03931 s += clen;
03932 }
03933 }
03934 else {
03935 while (s < e) {
03936 int clen = rb_enc_mbclen(s, e, enc);
03937
03938 if (clen > 1 || (*s & 0x80)) single = 0;
03939 p -= clen;
03940 memcpy(p, s, clen);
03941 s += clen;
03942 }
03943 }
03944 }
03945 STR_SET_LEN(rev, RSTRING_LEN(str));
03946 OBJ_INFECT(rev, str);
03947 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
03948 if (single) {
03949 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03950 }
03951 else {
03952 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03953 }
03954 }
03955 rb_enc_cr_str_copy_for_substr(rev, str);
03956
03957 return rev;
03958 }
03959
03960
03961
03962
03963
03964
03965
03966
03967
03968 static VALUE
03969 rb_str_reverse_bang(VALUE str)
03970 {
03971 if (RSTRING_LEN(str) > 1) {
03972 if (single_byte_optimizable(str)) {
03973 char *s, *e, c;
03974
03975 str_modify_keep_cr(str);
03976 s = RSTRING_PTR(str);
03977 e = RSTRING_END(str) - 1;
03978 while (s < e) {
03979 c = *s;
03980 *s++ = *e;
03981 *e-- = c;
03982 }
03983 }
03984 else {
03985 rb_str_shared_replace(str, rb_str_reverse(str));
03986 }
03987 }
03988 else {
03989 str_modify_keep_cr(str);
03990 }
03991 return str;
03992 }
03993
03994
03995
03996
03997
03998
03999
04000
04001
04002
04003
04004
04005
04006
04007 static VALUE
04008 rb_str_include(VALUE str, VALUE arg)
04009 {
04010 long i;
04011
04012 StringValue(arg);
04013 i = rb_str_index(str, arg, 0);
04014
04015 if (i == -1) return Qfalse;
04016 return Qtrue;
04017 }
04018
04019
04020
04021
04022
04023
04024
04025
04026
04027
04028
04029
04030
04031
04032
04033
04034
04035
04036
04037
04038
04039
04040
04041 static VALUE
04042 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04043 {
04044 int base;
04045
04046 if (argc == 0) base = 10;
04047 else {
04048 VALUE b;
04049
04050 rb_scan_args(argc, argv, "01", &b);
04051 base = NUM2INT(b);
04052 }
04053 if (base < 0) {
04054 rb_raise(rb_eArgError, "invalid radix %d", base);
04055 }
04056 return rb_str_to_inum(str, base, FALSE);
04057 }
04058
04059
04060
04061
04062
04063
04064
04065
04066
04067
04068
04069
04070
04071
04072
04073
04074 static VALUE
04075 rb_str_to_f(VALUE str)
04076 {
04077 return DBL2NUM(rb_str_to_dbl(str, FALSE));
04078 }
04079
04080
04081
04082
04083
04084
04085
04086
04087
04088
04089 static VALUE
04090 rb_str_to_s(VALUE str)
04091 {
04092 if (rb_obj_class(str) != rb_cString) {
04093 return str_duplicate(rb_cString, str);
04094 }
04095 return str;
04096 }
04097
#if 0
/*
 * Currently compiled out.  Encodes the code point c in enc and appends the
 * resulting bytes to str.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
04109
/*
 * Buffer size for one escaped character: the longest form emitted below is
 * "\x{" or "\u{" followed by up to 8 hex digits of a 32-bit value and "}",
 * i.e. 12 characters plus the terminating NUL.
 */
#define CHAR_ESC_LEN 13
04111
04112 int
04113 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04114 {
04115 char buf[CHAR_ESC_LEN + 1];
04116 int l;
04117
04118 #if SIZEOF_INT > 4
04119 c &= 0xffffffff;
04120 #endif
04121 if (unicode_p) {
04122 if (c < 0x7F && ISPRINT(c)) {
04123 snprintf(buf, CHAR_ESC_LEN, "%c", c);
04124 }
04125 else if (c < 0x10000) {
04126 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04127 }
04128 else {
04129 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04130 }
04131 }
04132 else {
04133 if (c < 0x100) {
04134 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04135 }
04136 else {
04137 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04138 }
04139 }
04140 l = (int)strlen(buf);
04141 rb_str_buf_cat(result, buf, l);
04142 return l;
04143 }
04144
04145
04146
04147
04148
04149
04150
04151
04152
04153
04154
04155
04156
04157 VALUE
04158 rb_str_inspect(VALUE str)
04159 {
04160 rb_encoding *enc = STR_ENC_GET(str);
04161 const char *p, *pend, *prev;
04162 char buf[CHAR_ESC_LEN + 1];
04163 VALUE result = rb_str_buf_new(0);
04164 rb_encoding *resenc = rb_default_internal_encoding();
04165 int unicode_p = rb_enc_unicode_p(enc);
04166 int asciicompat = rb_enc_asciicompat(enc);
04167
04168 if (resenc == NULL) resenc = rb_default_external_encoding();
04169 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04170 rb_enc_associate(result, resenc);
04171 str_buf_cat2(result, "\"");
04172
04173 p = RSTRING_PTR(str); pend = RSTRING_END(str);
04174 prev = p;
04175 while (p < pend) {
04176 unsigned int c, cc;
04177 int n;
04178
04179 n = rb_enc_precise_mbclen(p, pend, enc);
04180 if (!MBCLEN_CHARFOUND_P(n)) {
04181 if (p > prev) str_buf_cat(result, prev, p - prev);
04182 n = rb_enc_mbminlen(enc);
04183 if (pend < p + n)
04184 n = (int)(pend - p);
04185 while (n--) {
04186 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04187 str_buf_cat(result, buf, strlen(buf));
04188 prev = ++p;
04189 }
04190 continue;
04191 }
04192 n = MBCLEN_CHARFOUND_LEN(n);
04193 c = rb_enc_mbc_to_codepoint(p, pend, enc);
04194 p += n;
04195 if (c == '"'|| c == '\\' ||
04196 (c == '#' &&
04197 p < pend &&
04198 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04199 (cc = rb_enc_codepoint(p,pend,enc),
04200 (cc == '$' || cc == '@' || cc == '{')))) {
04201 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04202 str_buf_cat2(result, "\\");
04203 if (asciicompat || enc == resenc) {
04204 prev = p - n;
04205 continue;
04206 }
04207 }
04208 switch (c) {
04209 case '\n': cc = 'n'; break;
04210 case '\r': cc = 'r'; break;
04211 case '\t': cc = 't'; break;
04212 case '\f': cc = 'f'; break;
04213 case '\013': cc = 'v'; break;
04214 case '\010': cc = 'b'; break;
04215 case '\007': cc = 'a'; break;
04216 case 033: cc = 'e'; break;
04217 default: cc = 0; break;
04218 }
04219 if (cc) {
04220 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04221 buf[0] = '\\';
04222 buf[1] = (char)cc;
04223 str_buf_cat(result, buf, 2);
04224 prev = p;
04225 continue;
04226 }
04227 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04228 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04229 continue;
04230 }
04231 else {
04232 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04233 rb_str_buf_cat_escaped_char(result, c, unicode_p);
04234 prev = p;
04235 continue;
04236 }
04237 }
04238 if (p > prev) str_buf_cat(result, prev, p - prev);
04239 str_buf_cat2(result, "\"");
04240
04241 OBJ_INFECT(result, str);
04242 return result;
04243 }
04244
04245 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04246
04247
04248
04249
04250
04251
04252
04253
04254
04255 VALUE
04256 rb_str_dump(VALUE str)
04257 {
04258 rb_encoding *enc = rb_enc_get(str);
04259 long len;
04260 const char *p, *pend;
04261 char *q, *qend;
04262 VALUE result;
04263 int u8 = (enc == rb_utf8_encoding());
04264
04265 len = 2;
04266 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04267 while (p < pend) {
04268 unsigned char c = *p++;
04269 switch (c) {
04270 case '"': case '\\':
04271 case '\n': case '\r':
04272 case '\t': case '\f':
04273 case '\013': case '\010': case '\007': case '\033':
04274 len += 2;
04275 break;
04276
04277 case '#':
04278 len += IS_EVSTR(p, pend) ? 2 : 1;
04279 break;
04280
04281 default:
04282 if (ISPRINT(c)) {
04283 len++;
04284 }
04285 else {
04286 if (u8) {
04287 int n = rb_enc_precise_mbclen(p-1, pend, enc);
04288 if (MBCLEN_CHARFOUND_P(n-1)) {
04289 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04290 while (cc >>= 4) len++;
04291 len += 5;
04292 p += MBCLEN_CHARFOUND_LEN(n)-1;
04293 break;
04294 }
04295 }
04296 len += 4;
04297 }
04298 break;
04299 }
04300 }
04301 if (!rb_enc_asciicompat(enc)) {
04302 len += 19;
04303 len += strlen(enc->name);
04304 }
04305
04306 result = rb_str_new5(str, 0, len);
04307 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04308 q = RSTRING_PTR(result); qend = q + len + 1;
04309
04310 *q++ = '"';
04311 while (p < pend) {
04312 unsigned char c = *p++;
04313
04314 if (c == '"' || c == '\\') {
04315 *q++ = '\\';
04316 *q++ = c;
04317 }
04318 else if (c == '#') {
04319 if (IS_EVSTR(p, pend)) *q++ = '\\';
04320 *q++ = '#';
04321 }
04322 else if (c == '\n') {
04323 *q++ = '\\';
04324 *q++ = 'n';
04325 }
04326 else if (c == '\r') {
04327 *q++ = '\\';
04328 *q++ = 'r';
04329 }
04330 else if (c == '\t') {
04331 *q++ = '\\';
04332 *q++ = 't';
04333 }
04334 else if (c == '\f') {
04335 *q++ = '\\';
04336 *q++ = 'f';
04337 }
04338 else if (c == '\013') {
04339 *q++ = '\\';
04340 *q++ = 'v';
04341 }
04342 else if (c == '\010') {
04343 *q++ = '\\';
04344 *q++ = 'b';
04345 }
04346 else if (c == '\007') {
04347 *q++ = '\\';
04348 *q++ = 'a';
04349 }
04350 else if (c == '\033') {
04351 *q++ = '\\';
04352 *q++ = 'e';
04353 }
04354 else if (ISPRINT(c)) {
04355 *q++ = c;
04356 }
04357 else {
04358 *q++ = '\\';
04359 if (u8) {
04360 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04361 if (MBCLEN_CHARFOUND_P(n)) {
04362 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04363 p += n;
04364 snprintf(q, qend-q, "u{%x}", cc);
04365 q += strlen(q);
04366 continue;
04367 }
04368 }
04369 snprintf(q, qend-q, "x%02X", c);
04370 q += 3;
04371 }
04372 }
04373 *q++ = '"';
04374 *q = '\0';
04375 if (!rb_enc_asciicompat(enc)) {
04376 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04377 enc = rb_ascii8bit_encoding();
04378 }
04379 OBJ_INFECT(result, str);
04380
04381 rb_enc_associate(result, enc);
04382 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04383 return result;
04384 }
04385
04386
04387 static void
04388 rb_str_check_dummy_enc(rb_encoding *enc)
04389 {
04390 if (rb_enc_dummy_p(enc)) {
04391 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04392 rb_enc_name(enc));
04393 }
04394 }
04395
04396
04397
04398
04399
04400
04401
04402
04403
04404
04405 static VALUE
04406 rb_str_upcase_bang(VALUE str)
04407 {
04408 rb_encoding *enc;
04409 char *s, *send;
04410 int modify = 0;
04411 int n;
04412
04413 str_modify_keep_cr(str);
04414 enc = STR_ENC_GET(str);
04415 rb_str_check_dummy_enc(enc);
04416 s = RSTRING_PTR(str); send = RSTRING_END(str);
04417 if (single_byte_optimizable(str)) {
04418 while (s < send) {
04419 unsigned int c = *(unsigned char*)s;
04420
04421 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04422 *s = 'A' + (c - 'a');
04423 modify = 1;
04424 }
04425 s++;
04426 }
04427 }
04428 else {
04429 int ascompat = rb_enc_asciicompat(enc);
04430
04431 while (s < send) {
04432 unsigned int c;
04433
04434 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04435 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04436 *s = 'A' + (c - 'a');
04437 modify = 1;
04438 }
04439 s++;
04440 }
04441 else {
04442 c = rb_enc_codepoint_len(s, send, &n, enc);
04443 if (rb_enc_islower(c, enc)) {
04444
04445 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04446 modify = 1;
04447 }
04448 s += n;
04449 }
04450 }
04451 }
04452
04453 if (modify) return str;
04454 return Qnil;
04455 }
04456
04457
04458
04459
04460
04461
04462
04463
04464
04465
04466
04467
04468
04469
04470 static VALUE
04471 rb_str_upcase(VALUE str)
04472 {
04473 str = rb_str_dup(str);
04474 rb_str_upcase_bang(str);
04475 return str;
04476 }
04477
04478
04479
04480
04481
04482
04483
04484
04485
04486
04487
04488 static VALUE
04489 rb_str_downcase_bang(VALUE str)
04490 {
04491 rb_encoding *enc;
04492 char *s, *send;
04493 int modify = 0;
04494
04495 str_modify_keep_cr(str);
04496 enc = STR_ENC_GET(str);
04497 rb_str_check_dummy_enc(enc);
04498 s = RSTRING_PTR(str); send = RSTRING_END(str);
04499 if (single_byte_optimizable(str)) {
04500 while (s < send) {
04501 unsigned int c = *(unsigned char*)s;
04502
04503 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04504 *s = 'a' + (c - 'A');
04505 modify = 1;
04506 }
04507 s++;
04508 }
04509 }
04510 else {
04511 int ascompat = rb_enc_asciicompat(enc);
04512
04513 while (s < send) {
04514 unsigned int c;
04515 int n;
04516
04517 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04518 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04519 *s = 'a' + (c - 'A');
04520 modify = 1;
04521 }
04522 s++;
04523 }
04524 else {
04525 c = rb_enc_codepoint_len(s, send, &n, enc);
04526 if (rb_enc_isupper(c, enc)) {
04527
04528 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04529 modify = 1;
04530 }
04531 s += n;
04532 }
04533 }
04534 }
04535
04536 if (modify) return str;
04537 return Qnil;
04538 }
04539
04540
04541
04542
04543
04544
04545
04546
04547
04548
04549
04550
04551
04552
04553 static VALUE
04554 rb_str_downcase(VALUE str)
04555 {
04556 str = rb_str_dup(str);
04557 rb_str_downcase_bang(str);
04558 return str;
04559 }
04560
04561
04562
04563
04564
04565
04566
04567
04568
04569
04570
04571
04572
04573
04574
04575
04576 static VALUE
04577 rb_str_capitalize_bang(VALUE str)
04578 {
04579 rb_encoding *enc;
04580 char *s, *send;
04581 int modify = 0;
04582 unsigned int c;
04583 int n;
04584
04585 str_modify_keep_cr(str);
04586 enc = STR_ENC_GET(str);
04587 rb_str_check_dummy_enc(enc);
04588 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04589 s = RSTRING_PTR(str); send = RSTRING_END(str);
04590
04591 c = rb_enc_codepoint_len(s, send, &n, enc);
04592 if (rb_enc_islower(c, enc)) {
04593 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04594 modify = 1;
04595 }
04596 s += n;
04597 while (s < send) {
04598 c = rb_enc_codepoint_len(s, send, &n, enc);
04599 if (rb_enc_isupper(c, enc)) {
04600 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04601 modify = 1;
04602 }
04603 s += n;
04604 }
04605
04606 if (modify) return str;
04607 return Qnil;
04608 }
04609
04610
04611
04612
04613
04614
04615
04616
04617
04618
04619
04620
04621
04622
04623
04624 static VALUE
04625 rb_str_capitalize(VALUE str)
04626 {
04627 str = rb_str_dup(str);
04628 rb_str_capitalize_bang(str);
04629 return str;
04630 }
04631
04632
04633
04634
04635
04636
04637
04638
04639
04640
04641
04642 static VALUE
04643 rb_str_swapcase_bang(VALUE str)
04644 {
04645 rb_encoding *enc;
04646 char *s, *send;
04647 int modify = 0;
04648 int n;
04649
04650 str_modify_keep_cr(str);
04651 enc = STR_ENC_GET(str);
04652 rb_str_check_dummy_enc(enc);
04653 s = RSTRING_PTR(str); send = RSTRING_END(str);
04654 while (s < send) {
04655 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04656
04657 if (rb_enc_isupper(c, enc)) {
04658
04659 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04660 modify = 1;
04661 }
04662 else if (rb_enc_islower(c, enc)) {
04663
04664 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04665 modify = 1;
04666 }
04667 s += n;
04668 }
04669
04670 if (modify) return str;
04671 return Qnil;
04672 }
04673
04674
04675
04676
04677
04678
04679
04680
04681
04682
04683
04684
04685
04686
04687 static VALUE
04688 rb_str_swapcase(VALUE str)
04689 {
04690 str = rb_str_dup(str);
04691 rb_str_swapcase_bang(str);
04692 return str;
04693 }
04694
/* Shorthand for a pointer to unsigned bytes. */
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification, advanced by trnext().
 */
struct tr {
    int gen;                    /* non-zero while trnext() is expanding a "c1-c2" range */
    unsigned int now, max;      /* code point last produced; last code point of the current range */
    char *p, *pend;             /* read position in the specification and its end */
};
04702
04703 static unsigned int
04704 trnext(struct tr *t, rb_encoding *enc)
04705 {
04706 int n;
04707
04708 for (;;) {
04709 if (!t->gen) {
04710 if (t->p == t->pend) return -1;
04711 if (t->p < t->pend - 1 && *t->p == '\\') {
04712 t->p++;
04713 }
04714 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04715 t->p += n;
04716 if (t->p < t->pend - 1 && *t->p == '-') {
04717 t->p++;
04718 if (t->p < t->pend) {
04719 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04720 t->p += n;
04721 if (t->now > c) {
04722 if (t->now < 0x80 && c < 0x80) {
04723 rb_raise(rb_eArgError,
04724 "invalid range \"%c-%c\" in string transliteration",
04725 t->now, c);
04726 }
04727 else {
04728 rb_raise(rb_eArgError, "invalid range in string transliteration");
04729 }
04730 continue;
04731 }
04732 t->gen = 1;
04733 t->max = c;
04734 }
04735 }
04736 return t->now;
04737 }
04738 else if (++t->now < t->max) {
04739 return t->now;
04740 }
04741 else {
04742 t->gen = 0;
04743 return t->max;
04744 }
04745 }
04746 }
04747
04748 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
04749
04750 static VALUE
04751 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
04752 {
04753 const unsigned int errc = -1;
04754 unsigned int trans[256];
04755 rb_encoding *enc, *e1, *e2;
04756 struct tr trsrc, trrepl;
04757 int cflag = 0;
04758 unsigned int c, c0;
04759 int last = 0, modify = 0, i, l;
04760 char *s, *send;
04761 VALUE hash = 0;
04762 int singlebyte = single_byte_optimizable(str);
04763 int cr;
04764
04765 #define CHECK_IF_ASCII(c) \
04766 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
04767 (cr = ENC_CODERANGE_VALID) : 0)
04768
04769 StringValue(src);
04770 StringValue(repl);
04771 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04772 if (RSTRING_LEN(repl) == 0) {
04773 return rb_str_delete_bang(1, &src, str);
04774 }
04775
04776 cr = ENC_CODERANGE(str);
04777 e1 = rb_enc_check(str, src);
04778 e2 = rb_enc_check(str, repl);
04779 if (e1 == e2) {
04780 enc = e1;
04781 }
04782 else {
04783 enc = rb_enc_check(src, repl);
04784 }
04785 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
04786 if (RSTRING_LEN(src) > 1 &&
04787 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
04788 trsrc.p + l < trsrc.pend) {
04789 cflag = 1;
04790 trsrc.p += l;
04791 }
04792 trrepl.p = RSTRING_PTR(repl);
04793 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
04794 trsrc.gen = trrepl.gen = 0;
04795 trsrc.now = trrepl.now = 0;
04796 trsrc.max = trrepl.max = 0;
04797
04798 if (cflag) {
04799 for (i=0; i<256; i++) {
04800 trans[i] = 1;
04801 }
04802 while ((c = trnext(&trsrc, enc)) != errc) {
04803 if (c < 256) {
04804 trans[c] = errc;
04805 }
04806 else {
04807 if (!hash) hash = rb_hash_new();
04808 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
04809 }
04810 }
04811 while ((c = trnext(&trrepl, enc)) != errc)
04812 ;
04813 last = trrepl.now;
04814 for (i=0; i<256; i++) {
04815 if (trans[i] != errc) {
04816 trans[i] = last;
04817 }
04818 }
04819 }
04820 else {
04821 unsigned int r;
04822
04823 for (i=0; i<256; i++) {
04824 trans[i] = errc;
04825 }
04826 while ((c = trnext(&trsrc, enc)) != errc) {
04827 r = trnext(&trrepl, enc);
04828 if (r == errc) r = trrepl.now;
04829 if (c < 256) {
04830 trans[c] = r;
04831 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
04832 }
04833 else {
04834 if (!hash) hash = rb_hash_new();
04835 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
04836 }
04837 }
04838 }
04839
04840 if (cr == ENC_CODERANGE_VALID)
04841 cr = ENC_CODERANGE_7BIT;
04842 str_modify_keep_cr(str);
04843 s = RSTRING_PTR(str); send = RSTRING_END(str);
04844 if (sflag) {
04845 int clen, tlen;
04846 long offset, max = RSTRING_LEN(str);
04847 unsigned int save = -1;
04848 char *buf = ALLOC_N(char, max), *t = buf;
04849
04850 while (s < send) {
04851 int may_modify = 0;
04852
04853 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
04854 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
04855
04856 s += clen;
04857 if (c < 256) {
04858 c = trans[c];
04859 }
04860 else if (hash) {
04861 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
04862 if (NIL_P(tmp)) {
04863 if (cflag) c = last;
04864 else c = errc;
04865 }
04866 else if (cflag) c = errc;
04867 else c = NUM2INT(tmp);
04868 }
04869 else {
04870 c = errc;
04871 }
04872 if (c != (unsigned int)-1) {
04873 if (save == c) {
04874 CHECK_IF_ASCII(c);
04875 continue;
04876 }
04877 save = c;
04878 tlen = rb_enc_codelen(c, enc);
04879 modify = 1;
04880 }
04881 else {
04882 save = -1;
04883 c = c0;
04884 if (enc != e1) may_modify = 1;
04885 }
04886 while (t - buf + tlen >= max) {
04887 offset = t - buf;
04888 max *= 2;
04889 REALLOC_N(buf, char, max);
04890 t = buf + offset;
04891 }
04892 rb_enc_mbcput(c, t, enc);
04893 if (may_modify && memcmp(s, t, tlen) != 0) {
04894 modify = 1;
04895 }
04896 CHECK_IF_ASCII(c);
04897 t += tlen;
04898 }
04899 *t = '\0';
04900 RSTRING(str)->as.heap.ptr = buf;
04901 RSTRING(str)->as.heap.len = t - buf;
04902 STR_SET_NOEMBED(str);
04903 RSTRING(str)->as.heap.aux.capa = max;
04904 }
04905 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
04906 while (s < send) {
04907 c = (unsigned char)*s;
04908 if (trans[c] != errc) {
04909 if (!cflag) {
04910 c = trans[c];
04911 *s = c;
04912 modify = 1;
04913 }
04914 else {
04915 *s = last;
04916 modify = 1;
04917 }
04918 }
04919 CHECK_IF_ASCII(c);
04920 s++;
04921 }
04922 }
04923 else {
04924 int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
04925 long offset;
04926 char *buf = ALLOC_N(char, max), *t = buf;
04927
04928 while (s < send) {
04929 int may_modify = 0;
04930 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
04931 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
04932
04933 if (c < 256) {
04934 c = trans[c];
04935 }
04936 else if (hash) {
04937 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
04938 if (NIL_P(tmp)) {
04939 if (cflag) c = last;
04940 else c = errc;
04941 }
04942 else if (cflag) c = errc;
04943 else c = NUM2INT(tmp);
04944 }
04945 else {
04946 c = errc;
04947 }
04948 if (c != errc) {
04949 tlen = rb_enc_codelen(c, enc);
04950 modify = 1;
04951 }
04952 else {
04953 c = c0;
04954 if (enc != e1) may_modify = 1;
04955 }
04956 while (t - buf + tlen >= max) {
04957 offset = t - buf;
04958 max *= 2;
04959 REALLOC_N(buf, char, max);
04960 t = buf + offset;
04961 }
04962 if (s != t) {
04963 rb_enc_mbcput(c, t, enc);
04964 if (may_modify && memcmp(s, t, tlen) != 0) {
04965 modify = 1;
04966 }
04967 }
04968 CHECK_IF_ASCII(c);
04969 s += clen;
04970 t += tlen;
04971 }
04972 if (!STR_EMBED_P(str)) {
04973 xfree(RSTRING(str)->as.heap.ptr);
04974 }
04975 *t = '\0';
04976 RSTRING(str)->as.heap.ptr = buf;
04977 RSTRING(str)->as.heap.len = t - buf;
04978 STR_SET_NOEMBED(str);
04979 RSTRING(str)->as.heap.aux.capa = max;
04980 }
04981
04982 if (modify) {
04983 if (cr != ENC_CODERANGE_BROKEN)
04984 ENC_CODERANGE_SET(str, cr);
04985 rb_enc_associate(str, enc);
04986 return str;
04987 }
04988 return Qnil;
04989 }
04990
04991
04992
04993
04994
04995
04996
04997
04998
04999
05000
05001 static VALUE
05002 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05003 {
05004 return tr_trans(str, src, repl, 0);
05005 }
05006
05007
05008
05009
05010
05011
05012
05013
05014
05015
05016
05017
05018
05019
05020
05021
05022
05023
05024
05025 static VALUE
05026 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05027 {
05028 str = rb_str_dup(str);
05029 tr_trans(str, src, repl, 0);
05030 return str;
05031 }
05032
05033 static void
05034 tr_setup_table(VALUE str, char stable[256], int first,
05035 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05036 {
05037 const unsigned int errc = -1;
05038 char buf[256];
05039 struct tr tr;
05040 unsigned int c;
05041 VALUE table = 0, ptable = 0;
05042 int i, l, cflag = 0;
05043
05044 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05045 tr.gen = tr.now = tr.max = 0;
05046
05047 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05048 cflag = 1;
05049 tr.p += l;
05050
05051 table = rb_hash_new();
05052 ptable = *ctablep;
05053 *ctablep = table;
05054 }
05055 else {
05056 table = rb_hash_new();
05057 ptable = *tablep;
05058 *tablep = table;
05059 }
05060 if (first) {
05061 for (i=0; i<256; i++) {
05062 stable[i] = 1;
05063 }
05064 }
05065 for (i=0; i<256; i++) {
05066 buf[i] = cflag;
05067 }
05068
05069 while ((c = trnext(&tr, enc)) != errc) {
05070 if (c < 256) {
05071 buf[c & 0xff] = !cflag;
05072 }
05073 else {
05074 VALUE key = UINT2NUM(c);
05075
05076 if (!table) {
05077 table = rb_hash_new();
05078 ptable = *tablep;
05079 *tablep = table;
05080 }
05081 if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05082 rb_hash_aset(table, key, Qtrue);
05083 }
05084 }
05085 }
05086 for (i=0; i<256; i++) {
05087 stable[i] = stable[i] && buf[i];
05088 }
05089 }
05090
05091
05092 static int
05093 tr_find(unsigned int c, char table[256], VALUE del, VALUE nodel)
05094 {
05095 if (c < 256) {
05096 return table[c] != 0;
05097 }
05098 else {
05099 VALUE v = UINT2NUM(c);
05100
05101 if (del) {
05102 if (!NIL_P(rb_hash_lookup(del, v)) &&
05103 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05104 return TRUE;
05105 }
05106 }
05107 else if (nodel && NIL_P(rb_hash_lookup(nodel, v))) {
05108 return TRUE;
05109 }
05110 return FALSE;
05111 }
05112 }
05113
05114
05115
05116
05117
05118
05119
05120
05121
05122 static VALUE
05123 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05124 {
05125 char squeez[256];
05126 rb_encoding *enc = 0;
05127 char *s, *send, *t;
05128 VALUE del = 0, nodel = 0;
05129 int modify = 0;
05130 int i, ascompat, cr;
05131
05132 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05133 if (argc < 1) {
05134 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05135 }
05136 for (i=0; i<argc; i++) {
05137 VALUE s = argv[i];
05138
05139 StringValue(s);
05140 enc = rb_enc_check(str, s);
05141 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05142 }
05143
05144 str_modify_keep_cr(str);
05145 ascompat = rb_enc_asciicompat(enc);
05146 s = t = RSTRING_PTR(str);
05147 send = RSTRING_END(str);
05148 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05149 while (s < send) {
05150 unsigned int c;
05151 int clen;
05152
05153 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05154 if (squeez[c]) {
05155 modify = 1;
05156 }
05157 else {
05158 if (t != s) *t = c;
05159 t++;
05160 }
05161 s++;
05162 }
05163 else {
05164 c = rb_enc_codepoint_len(s, send, &clen, enc);
05165
05166 if (tr_find(c, squeez, del, nodel)) {
05167 modify = 1;
05168 }
05169 else {
05170 if (t != s) rb_enc_mbcput(c, t, enc);
05171 t += clen;
05172 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05173 }
05174 s += clen;
05175 }
05176 }
05177 *t = '\0';
05178 STR_SET_LEN(str, t - RSTRING_PTR(str));
05179 ENC_CODERANGE_SET(str, cr);
05180
05181 if (modify) return str;
05182 return Qnil;
05183 }
05184
05185
05186
05187
05188
05189
05190
05191
05192
05193
05194
05195
05196
05197
05198
05199
05200 static VALUE
05201 rb_str_delete(int argc, VALUE *argv, VALUE str)
05202 {
05203 str = rb_str_dup(str);
05204 rb_str_delete_bang(argc, argv, str);
05205 return str;
05206 }
05207
05208
05209
05210
05211
05212
05213
05214
05215
05216
05217 static VALUE
05218 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05219 {
05220 char squeez[256];
05221 rb_encoding *enc = 0;
05222 VALUE del = 0, nodel = 0;
05223 char *s, *send, *t;
05224 int i, modify = 0;
05225 int ascompat, singlebyte = single_byte_optimizable(str);
05226 unsigned int save;
05227
05228 if (argc == 0) {
05229 enc = STR_ENC_GET(str);
05230 }
05231 else {
05232 for (i=0; i<argc; i++) {
05233 VALUE s = argv[i];
05234
05235 StringValue(s);
05236 enc = rb_enc_check(str, s);
05237 if (singlebyte && !single_byte_optimizable(s))
05238 singlebyte = 0;
05239 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05240 }
05241 }
05242
05243 str_modify_keep_cr(str);
05244 s = t = RSTRING_PTR(str);
05245 if (!s || RSTRING_LEN(str) == 0) return Qnil;
05246 send = RSTRING_END(str);
05247 save = -1;
05248 ascompat = rb_enc_asciicompat(enc);
05249
05250 if (singlebyte) {
05251 while (s < send) {
05252 unsigned int c = *(unsigned char*)s++;
05253 if (c != save || (argc > 0 && !squeez[c])) {
05254 *t++ = save = c;
05255 }
05256 }
05257 } else {
05258 while (s < send) {
05259 unsigned int c;
05260 int clen;
05261
05262 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05263 if (c != save || (argc > 0 && !squeez[c])) {
05264 *t++ = save = c;
05265 }
05266 s++;
05267 }
05268 else {
05269 c = rb_enc_codepoint_len(s, send, &clen, enc);
05270
05271 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05272 if (t != s) rb_enc_mbcput(c, t, enc);
05273 save = c;
05274 t += clen;
05275 }
05276 s += clen;
05277 }
05278 }
05279 }
05280
05281 *t = '\0';
05282 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05283 STR_SET_LEN(str, t - RSTRING_PTR(str));
05284 modify = 1;
05285 }
05286
05287 if (modify) return str;
05288 return Qnil;
05289 }
05290
05291
05292
05293
05294
05295
05296
05297
05298
05299
05300
05301
05302
05303
05304
05305
05306
05307 static VALUE
05308 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05309 {
05310 str = rb_str_dup(str);
05311 rb_str_squeeze_bang(argc, argv, str);
05312 return str;
05313 }
05314
05315
05316
05317
05318
05319
05320
05321
05322
05323
05324 static VALUE
05325 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05326 {
05327 return tr_trans(str, src, repl, 1);
05328 }
05329
05330
05331
05332
05333
05334
05335
05336
05337
05338
05339
05340
05341
05342
05343
05344 static VALUE
05345 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05346 {
05347 str = rb_str_dup(str);
05348 tr_trans(str, src, repl, 1);
05349 return str;
05350 }
05351
05352
05353
05354
05355
05356
05357
05358
05359
05360
05361
05362
05363
05364
05365
05366
05367
05368
05369 static VALUE
05370 rb_str_count(int argc, VALUE *argv, VALUE str)
05371 {
05372 char table[256];
05373 rb_encoding *enc = 0;
05374 VALUE del = 0, nodel = 0;
05375 char *s, *send;
05376 int i;
05377 int ascompat;
05378
05379 if (argc < 1) {
05380 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05381 }
05382 for (i=0; i<argc; i++) {
05383 VALUE tstr = argv[i];
05384 unsigned char c;
05385
05386 StringValue(tstr);
05387 enc = rb_enc_check(str, tstr);
05388 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05389 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05390 int n = 0;
05391
05392 s = RSTRING_PTR(str);
05393 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05394 send = RSTRING_END(str);
05395 while (s < send) {
05396 if (*(unsigned char*)s++ == c) n++;
05397 }
05398 return INT2NUM(n);
05399 }
05400 tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05401 }
05402
05403 s = RSTRING_PTR(str);
05404 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05405 send = RSTRING_END(str);
05406 ascompat = rb_enc_asciicompat(enc);
05407 i = 0;
05408 while (s < send) {
05409 unsigned int c;
05410
05411 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05412 if (table[c]) {
05413 i++;
05414 }
05415 s++;
05416 }
05417 else {
05418 int clen;
05419 c = rb_enc_codepoint_len(s, send, &clen, enc);
05420 if (tr_find(c, table, del, nodel)) {
05421 i++;
05422 }
05423 s += clen;
05424 }
05425 }
05426
05427 return INT2NUM(i);
05428 }
05429
/*
 * Byte-indexed whitespace lookup table: the only non-zero entries are
 * indexes 9..13 (\t, \n, \v, \f, \r) and 32 (space).
 */
static const char isspacetable[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* Table-driven whitespace test; the argument is reduced to an unsigned byte before indexing. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
05450
05451
05452
05453
05454
05455
05456
05457
05458
05459
05460
05461
05462
05463
05464
05465
05466
05467
05468
05469
05470
05471
05472
05473
05474
05475
05476
05477
05478
05479
05480
05481
05482
05483
05484
05485
05486
05487
05488
05489
05490
05491
05492
05493 static VALUE
05494 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05495 {
05496 rb_encoding *enc;
05497 VALUE spat;
05498 VALUE limit;
05499 enum {awk, string, regexp} split_type;
05500 long beg, end, i = 0;
05501 int lim = 0;
05502 VALUE result, tmp;
05503
05504 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05505 lim = NUM2INT(limit);
05506 if (lim <= 0) limit = Qnil;
05507 else if (lim == 1) {
05508 if (RSTRING_LEN(str) == 0)
05509 return rb_ary_new2(0);
05510 return rb_ary_new3(1, str);
05511 }
05512 i = 1;
05513 }
05514
05515 enc = STR_ENC_GET(str);
05516 if (NIL_P(spat)) {
05517 if (!NIL_P(rb_fs)) {
05518 spat = rb_fs;
05519 goto fs_set;
05520 }
05521 split_type = awk;
05522 }
05523 else {
05524 fs_set:
05525 if (TYPE(spat) == T_STRING) {
05526 rb_encoding *enc2 = STR_ENC_GET(spat);
05527
05528 split_type = string;
05529 if (RSTRING_LEN(spat) == 0) {
05530
05531 spat = rb_reg_regcomp(spat);
05532 split_type = regexp;
05533 }
05534 else if (rb_enc_asciicompat(enc2) == 1) {
05535 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05536 split_type = awk;
05537 }
05538 }
05539 else {
05540 int l;
05541 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05542 RSTRING_LEN(spat) == l) {
05543 split_type = awk;
05544 }
05545 }
05546 }
05547 else {
05548 spat = get_pat(spat, 1);
05549 split_type = regexp;
05550 }
05551 }
05552
05553 result = rb_ary_new();
05554 beg = 0;
05555 if (split_type == awk) {
05556 char *ptr = RSTRING_PTR(str);
05557 char *eptr = RSTRING_END(str);
05558 char *bptr = ptr;
05559 int skip = 1;
05560 unsigned int c;
05561
05562 end = beg;
05563 if (is_ascii_string(str)) {
05564 while (ptr < eptr) {
05565 c = (unsigned char)*ptr++;
05566 if (skip) {
05567 if (ascii_isspace(c)) {
05568 beg = ptr - bptr;
05569 }
05570 else {
05571 end = ptr - bptr;
05572 skip = 0;
05573 if (!NIL_P(limit) && lim <= i) break;
05574 }
05575 }
05576 else if (ascii_isspace(c)) {
05577 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05578 skip = 1;
05579 beg = ptr - bptr;
05580 if (!NIL_P(limit)) ++i;
05581 }
05582 else {
05583 end = ptr - bptr;
05584 }
05585 }
05586 }
05587 else {
05588 while (ptr < eptr) {
05589 int n;
05590
05591 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05592 ptr += n;
05593 if (skip) {
05594 if (rb_isspace(c)) {
05595 beg = ptr - bptr;
05596 }
05597 else {
05598 end = ptr - bptr;
05599 skip = 0;
05600 if (!NIL_P(limit) && lim <= i) break;
05601 }
05602 }
05603 else if (rb_isspace(c)) {
05604 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05605 skip = 1;
05606 beg = ptr - bptr;
05607 if (!NIL_P(limit)) ++i;
05608 }
05609 else {
05610 end = ptr - bptr;
05611 }
05612 }
05613 }
05614 }
05615 else if (split_type == string) {
05616 char *ptr = RSTRING_PTR(str);
05617 char *temp = ptr;
05618 char *eptr = RSTRING_END(str);
05619 char *sptr = RSTRING_PTR(spat);
05620 long slen = RSTRING_LEN(spat);
05621
05622 if (is_broken_string(str)) {
05623 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05624 }
05625 if (is_broken_string(spat)) {
05626 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05627 }
05628 enc = rb_enc_check(str, spat);
05629 while (ptr < eptr &&
05630 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05631
05632 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05633 if (t != ptr + end) {
05634 ptr = t;
05635 continue;
05636 }
05637 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05638 ptr += end + slen;
05639 if (!NIL_P(limit) && lim <= ++i) break;
05640 }
05641 beg = ptr - temp;
05642 }
05643 else {
05644 char *ptr = RSTRING_PTR(str);
05645 long len = RSTRING_LEN(str);
05646 long start = beg;
05647 long idx;
05648 int last_null = 0;
05649 struct re_registers *regs;
05650
05651 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05652 regs = RMATCH_REGS(rb_backref_get());
05653 if (start == end && BEG(0) == END(0)) {
05654 if (!ptr) {
05655 rb_ary_push(result, str_new_empty(str));
05656 break;
05657 }
05658 else if (last_null == 1) {
05659 rb_ary_push(result, rb_str_subseq(str, beg,
05660 rb_enc_fast_mbclen(ptr+beg,
05661 ptr+len,
05662 enc)));
05663 beg = start;
05664 }
05665 else {
05666 if (ptr+start == ptr+len)
05667 start++;
05668 else
05669 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05670 last_null = 1;
05671 continue;
05672 }
05673 }
05674 else {
05675 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05676 beg = start = END(0);
05677 }
05678 last_null = 0;
05679
05680 for (idx=1; idx < regs->num_regs; idx++) {
05681 if (BEG(idx) == -1) continue;
05682 if (BEG(idx) == END(idx))
05683 tmp = str_new_empty(str);
05684 else
05685 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05686 rb_ary_push(result, tmp);
05687 }
05688 if (!NIL_P(limit) && lim <= ++i) break;
05689 }
05690 }
05691 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
05692 if (RSTRING_LEN(str) == beg)
05693 tmp = str_new_empty(str);
05694 else
05695 tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05696 rb_ary_push(result, tmp);
05697 }
05698 if (NIL_P(limit) && lim == 0) {
05699 long len;
05700 while ((len = RARRAY_LEN(result)) > 0 &&
05701 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05702 rb_ary_pop(result);
05703 }
05704
05705 return result;
05706 }
05707
05708 VALUE
05709 rb_str_split(VALUE str, const char *sep0)
05710 {
05711 VALUE sep;
05712
05713 StringValue(str);
05714 sep = rb_str_new2(sep0);
05715 return rb_str_split_m(1, &sep, str);
05716 }
05717
05718
05719
05720
05721
05722
05723
05724
05725
05726
05727
05728
05729
05730
05731
05732
05733
05734
05735
05736
05737
05738
05739
05740
05741
05742
05743
05744
05745
05746
05747
05748
05749
05750
05751
05752
05753
05754
05755
05756 static VALUE
05757 rb_str_each_line(int argc, VALUE *argv, VALUE str)
05758 {
05759 rb_encoding *enc;
05760 VALUE rs;
05761 unsigned int newline;
05762 const char *p, *pend, *s, *ptr;
05763 long len, rslen;
05764 VALUE line;
05765 int n;
05766 VALUE orig = str;
05767
05768 if (argc == 0) {
05769 rs = rb_rs;
05770 }
05771 else {
05772 rb_scan_args(argc, argv, "01", &rs);
05773 }
05774 RETURN_ENUMERATOR(str, argc, argv);
05775 if (NIL_P(rs)) {
05776 rb_yield(str);
05777 return orig;
05778 }
05779 str = rb_str_new4(str);
05780 ptr = p = s = RSTRING_PTR(str);
05781 pend = p + RSTRING_LEN(str);
05782 len = RSTRING_LEN(str);
05783 StringValue(rs);
05784 if (rs == rb_default_rs) {
05785 enc = rb_enc_get(str);
05786 while (p < pend) {
05787 char *p0;
05788
05789 p = memchr(p, '\n', pend - p);
05790 if (!p) break;
05791 p0 = rb_enc_left_char_head(s, p, pend, enc);
05792 if (!rb_enc_is_newline(p0, pend, enc)) {
05793 p++;
05794 continue;
05795 }
05796 p = p0 + rb_enc_mbclen(p0, pend, enc);
05797 line = rb_str_new5(str, s, p - s);
05798 OBJ_INFECT(line, str);
05799 rb_enc_cr_str_copy_for_substr(line, str);
05800 rb_yield(line);
05801 str_mod_check(str, ptr, len);
05802 s = p;
05803 }
05804 goto finish;
05805 }
05806
05807 enc = rb_enc_check(str, rs);
05808 rslen = RSTRING_LEN(rs);
05809 if (rslen == 0) {
05810 newline = '\n';
05811 }
05812 else {
05813 newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
05814 }
05815
05816 while (p < pend) {
05817 unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
05818
05819 again:
05820 if (rslen == 0 && c == newline) {
05821 p += n;
05822 if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
05823 goto again;
05824 }
05825 while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
05826 p += n;
05827 }
05828 p -= n;
05829 }
05830 if (c == newline &&
05831 (rslen <= 1 || memcmp(RSTRING_PTR(rs), p, rslen) == 0)) {
05832 line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
05833 OBJ_INFECT(line, str);
05834 rb_enc_cr_str_copy_for_substr(line, str);
05835 rb_yield(line);
05836 str_mod_check(str, ptr, len);
05837 s = p + (rslen ? rslen : n);
05838 }
05839 p += n;
05840 }
05841
05842 finish:
05843 if (s != pend) {
05844 line = rb_str_new5(str, s, pend - s);
05845 OBJ_INFECT(line, str);
05846 rb_enc_cr_str_copy_for_substr(line, str);
05847 rb_yield(line);
05848 }
05849
05850 return orig;
05851 }
05852
05853
05854
05855
05856
05857
05858
05859
05860
05861
05862
05863
05864
05865
05866
05867
05868
05869
05870
05871
05872 static VALUE
05873 rb_str_each_byte(VALUE str)
05874 {
05875 long i;
05876
05877 RETURN_ENUMERATOR(str, 0, 0);
05878 for (i=0; i<RSTRING_LEN(str); i++) {
05879 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
05880 }
05881 return str;
05882 }
05883
05884
05885
05886
05887
05888
05889
05890
05891
05892
05893
05894
05895
05896
05897
05898
05899
05900
05901
05902
05903 static VALUE
05904 rb_str_each_char(VALUE str)
05905 {
05906 VALUE orig = str;
05907 long i, len, n;
05908 const char *ptr;
05909 rb_encoding *enc;
05910
05911 RETURN_ENUMERATOR(str, 0, 0);
05912 str = rb_str_new4(str);
05913 ptr = RSTRING_PTR(str);
05914 len = RSTRING_LEN(str);
05915 enc = rb_enc_get(str);
05916 switch (ENC_CODERANGE(str)) {
05917 case ENC_CODERANGE_VALID:
05918 case ENC_CODERANGE_7BIT:
05919 for (i = 0; i < len; i += n) {
05920 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
05921 rb_yield(rb_str_subseq(str, i, n));
05922 }
05923 break;
05924 default:
05925 for (i = 0; i < len; i += n) {
05926 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
05927 rb_yield(rb_str_subseq(str, i, n));
05928 }
05929 }
05930 return orig;
05931 }
05932
05933
05934
05935
05936
05937
05938
05939
05940
05941
05942
05943
05944
05945
05946
05947
05948
05949
05950
05951
05952
05953
05954 static VALUE
05955 rb_str_each_codepoint(VALUE str)
05956 {
05957 VALUE orig = str;
05958 long len;
05959 int n;
05960 unsigned int c;
05961 const char *ptr, *end;
05962 rb_encoding *enc;
05963
05964 if (single_byte_optimizable(str)) return rb_str_each_byte(str);
05965 RETURN_ENUMERATOR(str, 0, 0);
05966 str = rb_str_new4(str);
05967 ptr = RSTRING_PTR(str);
05968 len = RSTRING_LEN(str);
05969 end = RSTRING_END(str);
05970 enc = STR_ENC_GET(str);
05971 while (ptr < end) {
05972 c = rb_enc_codepoint_len(ptr, end, &n, enc);
05973 rb_yield(UINT2NUM(c));
05974 ptr += n;
05975 }
05976 return orig;
05977 }
05978
05979 static long
05980 chopped_length(VALUE str)
05981 {
05982 rb_encoding *enc = STR_ENC_GET(str);
05983 const char *p, *p2, *beg, *end;
05984
05985 beg = RSTRING_PTR(str);
05986 end = beg + RSTRING_LEN(str);
05987 if (beg > end) return 0;
05988 p = rb_enc_prev_char(beg, end, end, enc);
05989 if (!p) return 0;
05990 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
05991 p2 = rb_enc_prev_char(beg, p, end, enc);
05992 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
05993 }
05994 return p - beg;
05995 }
05996
05997
05998
05999
06000
06001
06002
06003
06004
06005
06006 static VALUE
06007 rb_str_chop_bang(VALUE str)
06008 {
06009 str_modify_keep_cr(str);
06010 if (RSTRING_LEN(str) > 0) {
06011 long len;
06012 len = chopped_length(str);
06013 STR_SET_LEN(str, len);
06014 RSTRING_PTR(str)[len] = '\0';
06015 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06016 ENC_CODERANGE_CLEAR(str);
06017 }
06018 return str;
06019 }
06020 return Qnil;
06021 }
06022
06023
06024
06025
06026
06027
06028
06029
06030
06031
06032
06033
06034
06035
06036
06037
06038
06039
06040
06041 static VALUE
06042 rb_str_chop(VALUE str)
06043 {
06044 VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06045 rb_enc_cr_str_copy_for_substr(str2, str);
06046 OBJ_INFECT(str2, str);
06047 return str2;
06048 }
06049
06050
06051
06052
06053
06054
06055
06056
06057
06058
06059 static VALUE
06060 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06061 {
06062 rb_encoding *enc;
06063 VALUE rs;
06064 int newline;
06065 char *p, *pp, *e;
06066 long len, rslen;
06067
06068 str_modify_keep_cr(str);
06069 len = RSTRING_LEN(str);
06070 if (len == 0) return Qnil;
06071 p = RSTRING_PTR(str);
06072 e = p + len;
06073 if (argc == 0) {
06074 rs = rb_rs;
06075 if (rs == rb_default_rs) {
06076 smart_chomp:
06077 enc = rb_enc_get(str);
06078 if (rb_enc_mbminlen(enc) > 1) {
06079 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06080 if (rb_enc_is_newline(pp, e, enc)) {
06081 e = pp;
06082 }
06083 pp = e - rb_enc_mbminlen(enc);
06084 if (pp >= p) {
06085 pp = rb_enc_left_char_head(p, pp, e, enc);
06086 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06087 e = pp;
06088 }
06089 }
06090 if (e == RSTRING_END(str)) {
06091 return Qnil;
06092 }
06093 len = e - RSTRING_PTR(str);
06094 STR_SET_LEN(str, len);
06095 }
06096 else {
06097 if (RSTRING_PTR(str)[len-1] == '\n') {
06098 STR_DEC_LEN(str);
06099 if (RSTRING_LEN(str) > 0 &&
06100 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06101 STR_DEC_LEN(str);
06102 }
06103 }
06104 else if (RSTRING_PTR(str)[len-1] == '\r') {
06105 STR_DEC_LEN(str);
06106 }
06107 else {
06108 return Qnil;
06109 }
06110 }
06111 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06112 return str;
06113 }
06114 }
06115 else {
06116 rb_scan_args(argc, argv, "01", &rs);
06117 }
06118 if (NIL_P(rs)) return Qnil;
06119 StringValue(rs);
06120 rslen = RSTRING_LEN(rs);
06121 if (rslen == 0) {
06122 while (len>0 && p[len-1] == '\n') {
06123 len--;
06124 if (len>0 && p[len-1] == '\r')
06125 len--;
06126 }
06127 if (len < RSTRING_LEN(str)) {
06128 STR_SET_LEN(str, len);
06129 RSTRING_PTR(str)[len] = '\0';
06130 return str;
06131 }
06132 return Qnil;
06133 }
06134 if (rslen > len) return Qnil;
06135 newline = RSTRING_PTR(rs)[rslen-1];
06136 if (rslen == 1 && newline == '\n')
06137 goto smart_chomp;
06138
06139 enc = rb_enc_check(str, rs);
06140 if (is_broken_string(rs)) {
06141 return Qnil;
06142 }
06143 pp = e - rslen;
06144 if (p[len-1] == newline &&
06145 (rslen <= 1 ||
06146 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
06147 if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06148 return Qnil;
06149 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06150 ENC_CODERANGE_CLEAR(str);
06151 }
06152 STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06153 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06154 return str;
06155 }
06156 return Qnil;
06157 }
06158
06159
06160
06161
06162
06163
06164
06165
06166
06167
06168
06169
06170
06171
06172
06173
06174
06175
06176
06177
06178
06179 static VALUE
06180 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06181 {
06182 str = rb_str_dup(str);
06183 rb_str_chomp_bang(argc, argv, str);
06184 return str;
06185 }
06186
06187
06188
06189
06190
06191
06192
06193
06194
06195
06196
06197
06198
06199 static VALUE
06200 rb_str_lstrip_bang(VALUE str)
06201 {
06202 rb_encoding *enc;
06203 char *s, *t, *e;
06204
06205 str_modify_keep_cr(str);
06206 enc = STR_ENC_GET(str);
06207 s = RSTRING_PTR(str);
06208 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06209 e = t = RSTRING_END(str);
06210
06211 while (s < e) {
06212 int n;
06213 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06214
06215 if (!rb_isspace(cc)) break;
06216 s += n;
06217 }
06218
06219 if (s > RSTRING_PTR(str)) {
06220 STR_SET_LEN(str, t-s);
06221 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06222 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06223 return str;
06224 }
06225 return Qnil;
06226 }
06227
06228
06229
06230
06231
06232
06233
06234
06235
06236
06237
06238
06239
06240 static VALUE
06241 rb_str_lstrip(VALUE str)
06242 {
06243 str = rb_str_dup(str);
06244 rb_str_lstrip_bang(str);
06245 return str;
06246 }
06247
06248
06249
06250
06251
06252
06253
06254
06255
06256
06257
06258
06259
06260
06261 static VALUE
06262 rb_str_rstrip_bang(VALUE str)
06263 {
06264 rb_encoding *enc;
06265 char *s, *t, *e;
06266
06267 str_modify_keep_cr(str);
06268 enc = STR_ENC_GET(str);
06269 rb_str_check_dummy_enc(enc);
06270 s = RSTRING_PTR(str);
06271 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06272 t = e = RSTRING_END(str);
06273
06274
06275 if (single_byte_optimizable(str)) {
06276 unsigned char c;
06277 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06278 }
06279 else {
06280 char *tp;
06281
06282 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06283 unsigned int c = rb_enc_codepoint(tp, e, enc);
06284 if (c && !rb_isspace(c)) break;
06285 t = tp;
06286 }
06287 }
06288 if (t < e) {
06289 long len = t-RSTRING_PTR(str);
06290
06291 STR_SET_LEN(str, len);
06292 RSTRING_PTR(str)[len] = '\0';
06293 return str;
06294 }
06295 return Qnil;
06296 }
06297
06298
06299
06300
06301
06302
06303
06304
06305
06306
06307
06308
06309
06310 static VALUE
06311 rb_str_rstrip(VALUE str)
06312 {
06313 str = rb_str_dup(str);
06314 rb_str_rstrip_bang(str);
06315 return str;
06316 }
06317
06318
06319
06320
06321
06322
06323
06324
06325
06326
06327 static VALUE
06328 rb_str_strip_bang(VALUE str)
06329 {
06330 VALUE l = rb_str_lstrip_bang(str);
06331 VALUE r = rb_str_rstrip_bang(str);
06332
06333 if (NIL_P(l) && NIL_P(r)) return Qnil;
06334 return str;
06335 }
06336
06337
06338
06339
06340
06341
06342
06343
06344
06345
06346
06347
06348 static VALUE
06349 rb_str_strip(VALUE str)
06350 {
06351 str = rb_str_dup(str);
06352 rb_str_strip_bang(str);
06353 return str;
06354 }
06355
06356 static VALUE
06357 scan_once(VALUE str, VALUE pat, long *start)
06358 {
06359 VALUE result, match;
06360 struct re_registers *regs;
06361 int i;
06362
06363 if (rb_reg_search(pat, str, *start, 0) >= 0) {
06364 match = rb_backref_get();
06365 regs = RMATCH_REGS(match);
06366 if (BEG(0) == END(0)) {
06367 rb_encoding *enc = STR_ENC_GET(str);
06368
06369
06370
06371 if (RSTRING_LEN(str) > END(0))
06372 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
06373 RSTRING_END(str), enc);
06374 else
06375 *start = END(0)+1;
06376 }
06377 else {
06378 *start = END(0);
06379 }
06380 if (regs->num_regs == 1) {
06381 return rb_reg_nth_match(0, match);
06382 }
06383 result = rb_ary_new2(regs->num_regs);
06384 for (i=1; i < regs->num_regs; i++) {
06385 rb_ary_push(result, rb_reg_nth_match(i, match));
06386 }
06387
06388 return result;
06389 }
06390 return Qnil;
06391 }
06392
06393
06394
06395
06396
06397
06398
06399
06400
06401
06402
06403
06404
06405
06406
06407
06408
06409
06410
06411
06412
06413
06414
06415
06416
06417
06418
06419
06420
06421
06422
06423
06424
06425 static VALUE
06426 rb_str_scan(VALUE str, VALUE pat)
06427 {
06428 VALUE result;
06429 long start = 0;
06430 long last = -1, prev = 0;
06431 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
06432
06433 pat = get_pat(pat, 1);
06434 if (!rb_block_given_p()) {
06435 VALUE ary = rb_ary_new();
06436
06437 while (!NIL_P(result = scan_once(str, pat, &start))) {
06438 last = prev;
06439 prev = start;
06440 rb_ary_push(ary, result);
06441 }
06442 if (last >= 0) rb_reg_search(pat, str, last, 0);
06443 return ary;
06444 }
06445
06446 while (!NIL_P(result = scan_once(str, pat, &start))) {
06447 last = prev;
06448 prev = start;
06449 rb_yield(result);
06450 str_mod_check(str, p, len);
06451 }
06452 if (last >= 0) rb_reg_search(pat, str, last, 0);
06453 return str;
06454 }
06455
06456
06457
06458
06459
06460
06461
06462
06463
06464
06465
06466
06467
06468
06469
06470
06471 static VALUE
06472 rb_str_hex(VALUE str)
06473 {
06474 rb_encoding *enc = rb_enc_get(str);
06475
06476 if (!rb_enc_asciicompat(enc)) {
06477 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06478 }
06479 return rb_str_to_inum(str, 16, FALSE);
06480 }
06481
06482
06483
06484
06485
06486
06487
06488
06489
06490
06491
06492
06493
06494
06495
06496
06497 static VALUE
06498 rb_str_oct(VALUE str)
06499 {
06500 rb_encoding *enc = rb_enc_get(str);
06501
06502 if (!rb_enc_asciicompat(enc)) {
06503 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06504 }
06505 return rb_str_to_inum(str, -8, FALSE);
06506 }
06507
06508
06509
06510
06511
06512
06513
06514
06515
06516
06517
06518
06519 static VALUE
06520 rb_str_crypt(VALUE str, VALUE salt)
06521 {
06522 extern char *crypt(const char *, const char *);
06523 VALUE result;
06524 const char *s, *saltp;
06525 #ifdef BROKEN_CRYPT
06526 char salt_8bit_clean[3];
06527 #endif
06528
06529 StringValue(salt);
06530 if (RSTRING_LEN(salt) < 2)
06531 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06532
06533 s = RSTRING_PTR(str);
06534 if (!s) s = "";
06535 saltp = RSTRING_PTR(salt);
06536 #ifdef BROKEN_CRYPT
06537 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06538 salt_8bit_clean[0] = saltp[0] & 0x7f;
06539 salt_8bit_clean[1] = saltp[1] & 0x7f;
06540 salt_8bit_clean[2] = '\0';
06541 saltp = salt_8bit_clean;
06542 }
06543 #endif
06544 result = rb_str_new2(crypt(s, saltp));
06545 OBJ_INFECT(result, str);
06546 OBJ_INFECT(result, salt);
06547 return result;
06548 }
06549
06550
06551
06552
06553
06554
06555
06556
06557
06558
06559
06560
06561
06562
06563
06564
06565
06566
06567
06568
06569
06570
06571 VALUE
06572 rb_str_intern(VALUE s)
06573 {
06574 VALUE str = RB_GC_GUARD(s);
06575 ID id;
06576
06577 id = rb_intern_str(str);
06578 return ID2SYM(id);
06579 }
06580
06581
06582
06583
06584
06585
06586
06587
06588
06589
06590
06591 VALUE
06592 rb_str_ord(VALUE s)
06593 {
06594 unsigned int c;
06595
06596 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06597 return UINT2NUM(c);
06598 }
06599
06600
06601
06602
06603
06604
06605
06606
06607
06608
06609
06610 static VALUE
06611 rb_str_sum(int argc, VALUE *argv, VALUE str)
06612 {
06613 VALUE vbits;
06614 int bits;
06615 char *ptr, *p, *pend;
06616 long len;
06617 VALUE sum = INT2FIX(0);
06618 unsigned long sum0 = 0;
06619
06620 if (argc == 0) {
06621 bits = 16;
06622 }
06623 else {
06624 rb_scan_args(argc, argv, "01", &vbits);
06625 bits = NUM2INT(vbits);
06626 }
06627 ptr = p = RSTRING_PTR(str);
06628 len = RSTRING_LEN(str);
06629 pend = p + len;
06630
06631 while (p < pend) {
06632 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06633 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06634 str_mod_check(str, ptr, len);
06635 sum0 = 0;
06636 }
06637 sum0 += (unsigned char)*p;
06638 p++;
06639 }
06640
06641 if (bits == 0) {
06642 if (sum0) {
06643 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06644 }
06645 }
06646 else {
06647 if (sum == INT2FIX(0)) {
06648 if (bits < (int)sizeof(long)*CHAR_BIT) {
06649 sum0 &= (((unsigned long)1)<<bits)-1;
06650 }
06651 sum = LONG2FIX(sum0);
06652 }
06653 else {
06654 VALUE mod;
06655
06656 if (sum0) {
06657 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06658 }
06659
06660 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06661 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06662 sum = rb_funcall(sum, '&', 1, mod);
06663 }
06664 }
06665 return sum;
06666 }
06667
06668 static VALUE
06669 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06670 {
06671 rb_encoding *enc;
06672 VALUE w;
06673 long width, len, flen = 1, fclen = 1;
06674 VALUE res;
06675 char *p;
06676 const char *f = " ";
06677 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
06678 volatile VALUE pad;
06679 int singlebyte = 1, cr;
06680
06681 rb_scan_args(argc, argv, "11", &w, &pad);
06682 enc = STR_ENC_GET(str);
06683 width = NUM2LONG(w);
06684 if (argc == 2) {
06685 StringValue(pad);
06686 enc = rb_enc_check(str, pad);
06687 f = RSTRING_PTR(pad);
06688 flen = RSTRING_LEN(pad);
06689 fclen = str_strlen(pad, enc);
06690 singlebyte = single_byte_optimizable(pad);
06691 if (flen == 0 || fclen == 0) {
06692 rb_raise(rb_eArgError, "zero width padding");
06693 }
06694 }
06695 len = str_strlen(str, enc);
06696 if (width < 0 || len >= width) return rb_str_dup(str);
06697 n = width - len;
06698 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06699 rlen = n - llen;
06700 cr = ENC_CODERANGE(str);
06701 if (flen > 1) {
06702 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06703 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06704 }
06705 size = RSTRING_LEN(str);
06706 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06707 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06708 (len += llen2 + rlen2) >= LONG_MAX - size) {
06709 rb_raise(rb_eArgError, "argument too big");
06710 }
06711 len += size;
06712 res = rb_str_new5(str, 0, len);
06713 p = RSTRING_PTR(res);
06714 if (flen <= 1) {
06715 memset(p, *f, llen);
06716 p += llen;
06717 }
06718 else {
06719 while (llen >= fclen) {
06720 memcpy(p,f,flen);
06721 p += flen;
06722 llen -= fclen;
06723 }
06724 if (llen > 0) {
06725 memcpy(p, f, llen2);
06726 p += llen2;
06727 }
06728 }
06729 memcpy(p, RSTRING_PTR(str), size);
06730 p += size;
06731 if (flen <= 1) {
06732 memset(p, *f, rlen);
06733 p += rlen;
06734 }
06735 else {
06736 while (rlen >= fclen) {
06737 memcpy(p,f,flen);
06738 p += flen;
06739 rlen -= fclen;
06740 }
06741 if (rlen > 0) {
06742 memcpy(p, f, rlen2);
06743 p += rlen2;
06744 }
06745 }
06746 *p = '\0';
06747 STR_SET_LEN(res, p-RSTRING_PTR(res));
06748 OBJ_INFECT(res, str);
06749 if (!NIL_P(pad)) OBJ_INFECT(res, pad);
06750 rb_enc_associate(res, enc);
06751 if (argc == 2)
06752 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
06753 if (cr != ENC_CODERANGE_BROKEN)
06754 ENC_CODERANGE_SET(res, cr);
06755 return res;
06756 }
06757
06758
06759
06760
06761
06762
06763
06764
06765
06766
06767
06768
06769
06770
06771
06772 static VALUE
06773 rb_str_ljust(int argc, VALUE *argv, VALUE str)
06774 {
06775 return rb_str_justify(argc, argv, str, 'l');
06776 }
06777
06778
06779
06780
06781
06782
06783
06784
06785
06786
06787
06788
06789
06790
06791
06792 static VALUE
06793 rb_str_rjust(int argc, VALUE *argv, VALUE str)
06794 {
06795 return rb_str_justify(argc, argv, str, 'r');
06796 }
06797
06798
06799
06800
06801
06802
06803
06804
06805
06806
06807
06808
06809
06810
06811
06812 static VALUE
06813 rb_str_center(int argc, VALUE *argv, VALUE str)
06814 {
06815 return rb_str_justify(argc, argv, str, 'c');
06816 }
06817
06818
06819
06820
06821
06822
06823
06824
06825
06826
06827
06828
06829
06830
06831
06832
06833 static VALUE
06834 rb_str_partition(VALUE str, VALUE sep)
06835 {
06836 long pos;
06837 int regex = FALSE;
06838
06839 if (TYPE(sep) == T_REGEXP) {
06840 pos = rb_reg_search(sep, str, 0, 0);
06841 regex = TRUE;
06842 }
06843 else {
06844 VALUE tmp;
06845
06846 tmp = rb_check_string_type(sep);
06847 if (NIL_P(tmp)) {
06848 rb_raise(rb_eTypeError, "type mismatch: %s given",
06849 rb_obj_classname(sep));
06850 }
06851 sep = tmp;
06852 pos = rb_str_index(str, sep, 0);
06853 }
06854 if (pos < 0) {
06855 failed:
06856 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
06857 }
06858 if (regex) {
06859 sep = rb_str_subpat(str, sep, INT2FIX(0));
06860 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
06861 }
06862 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
06863 sep,
06864 rb_str_subseq(str, pos+RSTRING_LEN(sep),
06865 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
06866 }
06867
06868
06869
06870
06871
06872
06873
06874
06875
06876
06877
06878
06879
06880
06881
06882
06883 static VALUE
06884 rb_str_rpartition(VALUE str, VALUE sep)
06885 {
06886 long pos = RSTRING_LEN(str);
06887 int regex = FALSE;
06888
06889 if (TYPE(sep) == T_REGEXP) {
06890 pos = rb_reg_search(sep, str, pos, 1);
06891 regex = TRUE;
06892 }
06893 else {
06894 VALUE tmp;
06895
06896 tmp = rb_check_string_type(sep);
06897 if (NIL_P(tmp)) {
06898 rb_raise(rb_eTypeError, "type mismatch: %s given",
06899 rb_obj_classname(sep));
06900 }
06901 sep = tmp;
06902 pos = rb_str_sublen(str, pos);
06903 pos = rb_str_rindex(str, sep, pos);
06904 }
06905 if (pos < 0) {
06906 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
06907 }
06908 if (regex) {
06909 sep = rb_reg_nth_match(0, rb_backref_get());
06910 }
06911 return rb_ary_new3(3, rb_str_substr(str, 0, pos),
06912 sep,
06913 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
06914 }
06915
06916
06917
06918
06919
06920
06921
06922
06923
06924
06925
06926
06927
06928
06929
06930
06931
06932 static VALUE
06933 rb_str_start_with(int argc, VALUE *argv, VALUE str)
06934 {
06935 int i;
06936
06937 for (i=0; i<argc; i++) {
06938 VALUE tmp = rb_check_string_type(argv[i]);
06939 if (NIL_P(tmp)) continue;
06940 rb_enc_check(str, tmp);
06941 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
06942 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
06943 return Qtrue;
06944 }
06945 return Qfalse;
06946 }
06947
06948
06949
06950
06951
06952
06953
06954
06955 static VALUE
06956 rb_str_end_with(int argc, VALUE *argv, VALUE str)
06957 {
06958 int i;
06959 char *p, *s, *e;
06960 rb_encoding *enc;
06961
06962 for (i=0; i<argc; i++) {
06963 VALUE tmp = rb_check_string_type(argv[i]);
06964 if (NIL_P(tmp)) continue;
06965 enc = rb_enc_check(str, tmp);
06966 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
06967 p = RSTRING_PTR(str);
06968 e = p + RSTRING_LEN(str);
06969 s = e - RSTRING_LEN(tmp);
06970 if (rb_enc_left_char_head(p, s, e, enc) != s)
06971 continue;
06972 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
06973 return Qtrue;
06974 }
06975 return Qfalse;
06976 }
06977
06978 void
06979 rb_str_setter(VALUE val, ID id, VALUE *var)
06980 {
06981 if (!NIL_P(val) && TYPE(val) != T_STRING) {
06982 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
06983 }
06984 *var = val;
06985 }
06986
06987
06988
06989
06990
06991
06992
06993
06994
06995 static VALUE
06996 rb_str_force_encoding(VALUE str, VALUE enc)
06997 {
06998 str_modifiable(str);
06999 rb_enc_associate(str, rb_to_encoding(enc));
07000 ENC_CODERANGE_CLEAR(str);
07001 return str;
07002 }
07003
07004
07005
07006
07007
07008
07009
07010
07011
07012
07013
07014
07015 static VALUE
07016 rb_str_valid_encoding_p(VALUE str)
07017 {
07018 int cr = rb_enc_str_coderange(str);
07019
07020 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07021 }
07022
07023
07024
07025
07026
07027
07028
07029
07030
07031
07032
07033 static VALUE
07034 rb_str_is_ascii_only_p(VALUE str)
07035 {
07036 int cr = rb_enc_str_coderange(str);
07037
07038 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07039 }
07040
07041
07042
07043
07044
07045
07046
07047
07048
07049
07050
07051
07052
07053
07054
07055
07056
07057
07058
07059
07060
07061
07062
07063
07064
07065
07066
07067
07068
07069
07070
07071
07072
07073
07074
07075
07076
07077
07078
07079
07080
07081
07082
07083 static VALUE
07084 sym_equal(VALUE sym1, VALUE sym2)
07085 {
07086 if (sym1 == sym2) return Qtrue;
07087 return Qfalse;
07088 }
07089
07090
07091 static int
07092 sym_printable(const char *s, const char *send, rb_encoding *enc)
07093 {
07094 while (s < send) {
07095 int n;
07096 int c = rb_enc_codepoint_len(s, send, &n, enc);
07097
07098 if (!rb_enc_isprint(c, enc)) return FALSE;
07099 s += n;
07100 }
07101 return TRUE;
07102 }
07103
07104
07105
07106
07107
07108
07109
07110
07111
07112
07113 static VALUE
07114 sym_inspect(VALUE sym)
07115 {
07116 VALUE str;
07117 ID id = SYM2ID(sym);
07118 rb_encoding *enc;
07119 const char *ptr;
07120 long len;
07121 char *dest;
07122 rb_encoding *resenc = rb_default_internal_encoding();
07123
07124 if (resenc == NULL) resenc = rb_default_external_encoding();
07125 sym = rb_id2str(id);
07126 enc = STR_ENC_GET(sym);
07127 ptr = RSTRING_PTR(sym);
07128 len = RSTRING_LEN(sym);
07129 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07130 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07131 str = rb_str_inspect(sym);
07132 len = RSTRING_LEN(str);
07133 rb_str_resize(str, len + 1);
07134 dest = RSTRING_PTR(str);
07135 memmove(dest + 1, dest, len);
07136 dest[0] = ':';
07137 }
07138 else {
07139 char *dest;
07140 str = rb_enc_str_new(0, len + 1, enc);
07141 dest = RSTRING_PTR(str);
07142 dest[0] = ':';
07143 memcpy(dest + 1, ptr, len);
07144 }
07145 return str;
07146 }
07147
07148
07149
07150
07151
07152
07153
07154
07155
07156
07157
07158
07159
07160 VALUE
07161 rb_sym_to_s(VALUE sym)
07162 {
07163 ID id = SYM2ID(sym);
07164
07165 return str_new3(rb_cString, rb_id2str(id));
07166 }
07167
07168
07169
07170
07171
07172
07173
07174
07175
07176
07177
07178
07179 static VALUE
07180 sym_to_sym(VALUE sym)
07181 {
07182 return sym;
07183 }
07184
07185 VALUE rb_funcall_passing_block(VALUE recv, ID mid, int argc, const VALUE *argv);
07186
07187 static VALUE
07188 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
07189 {
07190 VALUE obj;
07191
07192 if (argc < 1) {
07193 rb_raise(rb_eArgError, "no receiver given");
07194 }
07195 obj = argv[0];
07196 return rb_funcall_passing_block(obj, (ID)sym, argc - 1, argv + 1);
07197 }
07198
07199
07200
07201
07202
07203
07204
07205
07206
07207
07208 static VALUE
07209 sym_to_proc(VALUE sym)
07210 {
07211 static VALUE sym_proc_cache = Qfalse;
07212 enum {SYM_PROC_CACHE_SIZE = 67};
07213 VALUE proc;
07214 long id, index;
07215 VALUE *aryp;
07216
07217 if (!sym_proc_cache) {
07218 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07219 rb_gc_register_mark_object(sym_proc_cache);
07220 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07221 }
07222
07223 id = SYM2ID(sym);
07224 index = (id % SYM_PROC_CACHE_SIZE) << 1;
07225
07226 aryp = RARRAY_PTR(sym_proc_cache);
07227 if (aryp[index] == sym) {
07228 return aryp[index + 1];
07229 }
07230 else {
07231 proc = rb_proc_new(sym_call, (VALUE)id);
07232 aryp[index] = sym;
07233 aryp[index + 1] = proc;
07234 return proc;
07235 }
07236 }
07237
07238
07239
07240
07241
07242
07243
07244
07245
07246 static VALUE
07247 sym_succ(VALUE sym)
07248 {
07249 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07250 }
07251
07252
07253
07254
07255
07256
07257
07258
07259
07260 static VALUE
07261 sym_cmp(VALUE sym, VALUE other)
07262 {
07263 if (!SYMBOL_P(other)) {
07264 return Qnil;
07265 }
07266 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07267 }
07268
07269
07270
07271
07272
07273
07274
07275
07276
07277 static VALUE
07278 sym_casecmp(VALUE sym, VALUE other)
07279 {
07280 if (!SYMBOL_P(other)) {
07281 return Qnil;
07282 }
07283 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07284 }
07285
07286
07287
07288
07289
07290
07291
07292
07293 static VALUE
07294 sym_match(VALUE sym, VALUE other)
07295 {
07296 return rb_str_match(rb_sym_to_s(sym), other);
07297 }
07298
07299
07300
07301
07302
07303
07304
07305
07306
07307 static VALUE
07308 sym_aref(int argc, VALUE *argv, VALUE sym)
07309 {
07310 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
07311 }
07312
07313
07314
07315
07316
07317
07318
07319
07320 static VALUE
07321 sym_length(VALUE sym)
07322 {
07323 return rb_str_length(rb_id2str(SYM2ID(sym)));
07324 }
07325
07326
07327
07328
07329
07330
07331
07332
07333 static VALUE
07334 sym_empty(VALUE sym)
07335 {
07336 return rb_str_empty(rb_id2str(SYM2ID(sym)));
07337 }
07338
07339
07340
07341
07342
07343
07344
07345
07346 static VALUE
07347 sym_upcase(VALUE sym)
07348 {
07349 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
07350 }
07351
07352
07353
07354
07355
07356
07357
07358
07359 static VALUE
07360 sym_downcase(VALUE sym)
07361 {
07362 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
07363 }
07364
07365
07366
07367
07368
07369
07370
07371
07372 static VALUE
07373 sym_capitalize(VALUE sym)
07374 {
07375 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
07376 }
07377
07378
07379
07380
07381
07382
07383
07384
07385 static VALUE
07386 sym_swapcase(VALUE sym)
07387 {
07388 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
07389 }
07390
07391
07392
07393
07394
07395
07396
07397
07398 static VALUE
07399 sym_encoding(VALUE sym)
07400 {
07401 return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
07402 }
07403
07404 ID
07405 rb_to_id(VALUE name)
07406 {
07407 VALUE tmp;
07408 ID id;
07409
07410 switch (TYPE(name)) {
07411 default:
07412 tmp = rb_check_string_type(name);
07413 if (NIL_P(tmp)) {
07414 tmp = rb_inspect(name);
07415 rb_raise(rb_eTypeError, "%s is not a symbol",
07416 RSTRING_PTR(tmp));
07417 }
07418 name = tmp;
07419
07420 case T_STRING:
07421 name = rb_str_intern(name);
07422
07423 case T_SYMBOL:
07424 return SYM2ID(name);
07425 }
07426 return id;
07427 }
07428
07429
07430
07431
07432
07433
07434
07435
07436
07437
07438
07439
07440
07441
07442 void
07443 Init_String(void)
07444 {
07445 #undef rb_intern
07446 #define rb_intern(str) rb_intern_const(str)
07447
07448 rb_cString = rb_define_class("String", rb_cObject);
07449 rb_include_module(rb_cString, rb_mComparable);
07450 rb_define_alloc_func(rb_cString, str_alloc);
07451 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07452 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
07453 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
07454 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07455 rb_define_method(rb_cString, "==", rb_str_equal, 1);
07456 rb_define_method(rb_cString, "===", rb_str_equal, 1);
07457 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07458 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07459 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07460 rb_define_method(rb_cString, "+", rb_str_plus, 1);
07461 rb_define_method(rb_cString, "*", rb_str_times, 1);
07462 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07463 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07464 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07465 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07466 rb_define_method(rb_cString, "length", rb_str_length, 0);
07467 rb_define_method(rb_cString, "size", rb_str_length, 0);
07468 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07469 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07470 rb_define_method(rb_cString, "=~", rb_str_match, 1);
07471 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07472 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07473 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07474 rb_define_method(rb_cString, "next", rb_str_succ, 0);
07475 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
07476 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07477 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07478 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07479 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07480 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07481 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07482 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07483 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07484
07485 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07486 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07487 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07488 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
07489 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07490 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07491
07492 rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07493 rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07494 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07495 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07496
07497 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07498 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07499 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07500 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07501
07502 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07503 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07504 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07505 rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
07506 rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
07507 rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
07508 rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
07509 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07510 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07511 rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07512 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
07513 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07514 rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07515 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
07516 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07517
07518 rb_define_method(rb_cString, "include?", rb_str_include, 1);
07519 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07520 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07521
07522 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07523
07524 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07525 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07526 rb_define_method(rb_cString, "center", rb_str_center, -1);
07527
07528 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07529 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07530 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07531 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07532 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07533 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07534 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07535
07536 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07537 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07538 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07539 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07540 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07541 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07542 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07543
07544 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07545 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07546 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07547 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07548 rb_define_method(rb_cString, "count", rb_str_count, -1);
07549
07550 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07551 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07552 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07553 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07554
07555 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07556 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07557 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07558 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07559
07560 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07561
07562 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
07563 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07564
07565 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07566 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07567
07568 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0);
07569 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07570 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07571 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07572
07573 id_to_s = rb_intern("to_s");
07574
07575 rb_fs = Qnil;
07576 rb_define_variable("$;", &rb_fs);
07577 rb_define_variable("$-F", &rb_fs);
07578
07579 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07580 rb_include_module(rb_cSymbol, rb_mComparable);
07581 rb_undef_alloc_func(rb_cSymbol);
07582 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07583 rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0);
07584
07585 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07586 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
07587 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07588 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07589 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
07590 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07591 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
07592 rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07593 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07594 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
07595
07596 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07597 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07598 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07599
07600 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07601 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
07602 rb_define_method(rb_cSymbol, "length", sym_length, 0);
07603 rb_define_method(rb_cSymbol, "size", sym_length, 0);
07604 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07605 rb_define_method(rb_cSymbol, "match", sym_match, 1);
07606
07607 rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07608 rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07609 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07610 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07611
07612 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07613 }
07614