Ruby: string.c Source File

00001 /**********************************************************************
00002 
00003   string.c -
00004 
00005   $Author: yugui $
00006   created at: Mon Aug  9 17:12:58 JST 1993
00007 
00008   Copyright (C) 1993-2007 Yukihiro Matsumoto
00009   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
00010   Copyright (C) 2000  Information-technology Promotion Agency, Japan
00011 
00012 **********************************************************************/
00013 
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include <assert.h>
00018 
/* Shorthand for entry `no` of the beg[] / end[] arrays reached through a
 * pointer named `regs`, which must be in scope wherever these are used.
 * NOTE(review): presumably regexp match registers (ruby/re.h) -- confirm. */
00019 #define BEG(no) regs->beg[no]
00020 #define END(no) regs->end[no]
00021 
00022 #include <math.h>
00023 #include <ctype.h>
00024 
00025 #ifdef HAVE_UNISTD_H
00026 #include <unistd.h>
00027 #endif
00028 
/* Number of elements in `array`, cast to int.  Only meaningful for a real
 * array object; a pointer argument would yield sizeof(pointer)/sizeof(elem). */
00029 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00030 
/* Drop any macro definitions of these names so that functions with the
 * same names can be defined (or aliased with RUBY_ALIAS_FUNCTION) below.
 * NOTE(review): presumably the macros come from ruby/ruby.h -- confirm. */
00031 #undef rb_str_new_cstr
00032 #undef rb_tainted_str_new_cstr
00033 #undef rb_usascii_str_new_cstr
00034 #undef rb_external_str_new_cstr
00035 #undef rb_locale_str_new_cstr
00036 #undef rb_str_new2
00037 #undef rb_str_new3
00038 #undef rb_str_new4
00039 #undef rb_str_new5
00040 #undef rb_tainted_str_new2
00041 #undef rb_usascii_str_new2
00042 #undef rb_str_dup_frozen
00043 #undef rb_str_buf_new_cstr
00044 #undef rb_str_buf_new2
00045 #undef rb_str_buf_cat2
00046 #undef rb_str_cat2
00047 
/* rb_cString is passed as the class of newly created strings below; neither
 * global is assigned in the visible part of this file. */
00048 VALUE rb_cString;
00049 VALUE rb_cSymbol;
00050 
/* RUBY_MAX_CHAR_LEN is not referenced in the visible part of this file. */
00051 #define RUBY_MAX_CHAR_LEN 16
/* String-specific flag bits, mapped onto the generic FL_USERn flags. */
00052 #define STR_TMPLOCK FL_USER7
00053 #define STR_NOEMBED FL_USER1
00054 #define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
00055 #define STR_ASSOC   FL_USER3
/* "Shared" and "assoc" are only recognised in combination with STR_NOEMBED.
 * NOTE(review): FL_ALL presumably requires every listed bit -- confirm. */
00056 #define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED)
00057 #define STR_ASSOC_P(s)  FL_ALL(s, STR_NOEMBED|STR_ASSOC)
/* "No capacity": a non-embedded string that is shared or assoc.  For such
 * strings the code below uses heap.aux.shared rather than heap.aux.capa. */
00058 #define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
00059 #define STR_NOCAPA_P(s) (FL_TEST(s,STR_NOEMBED) && FL_ANY(s,ELTS_SHARED|STR_ASSOC))
/* Clear the shared/assoc bits, but only when the string is non-embedded. */
00060 #define STR_UNSET_NOCAPA(s) do {\
00061     if (FL_TEST(s,STR_NOEMBED)) FL_UNSET(s,(ELTS_SHARED|STR_ASSOC));\
00062 } while (0)
00063 
00064 
/* Mark str as non-embedded and zero the embedded-length bits in its flags. */
00065 #define STR_SET_NOEMBED(str) do {\
00066     FL_SET(str, STR_NOEMBED);\
00067     STR_SET_EMBED_LEN(str, 0);\
00068 } while (0)
/* Mark str as embedded (does not touch the embedded-length bits). */
00069 #define STR_SET_EMBED(str) FL_UNSET(str, STR_NOEMBED)
/* True when the STR_NOEMBED flag is not set. */
00070 #define STR_EMBED_P(str) (!FL_TEST(str, STR_NOEMBED))
/* Store n in the embedded-length bit field of the object's flags word.
 * n is evaluated once (into tmp_n); str is evaluated twice. */
00071 #define STR_SET_EMBED_LEN(str, n) do { \
00072     long tmp_n = (n);\
00073     RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
00074     RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
00075 } while (0)
00076 
/* Set the length of str to n: in the flags word for an embedded string,
 * in as.heap.len otherwise.  Does not touch the string's bytes. */
00077 #define STR_SET_LEN(str, n) do { \
00078     if (STR_EMBED_P(str)) {\
00079         STR_SET_EMBED_LEN(str, n);\
00080     }\
00081     else {\
00082         RSTRING(str)->as.heap.len = (n);\
00083     }\
00084 } while (0)
00085 
/* Decrease the length of str by one, using whichever representation
 * (embedded or heap) the string currently has.  No underflow check. */
00086 #define STR_DEC_LEN(str) do {\
00087     if (STR_EMBED_P(str)) {\
00088         long n = RSTRING_LEN(str);\
00089         n--;\
00090         STR_SET_EMBED_LEN(str, n);\
00091     }\
00092     else {\
00093         RSTRING(str)->as.heap.len--;\
00094     }\
00095 } while (0)
00096 
/*
 * RESIZE_CAPA(str, capacity): make room for `capacity` bytes plus one
 * extra byte in str.
 *
 *  - Embedded string: nothing happens unless capacity exceeds
 *    RSTRING_EMBED_LEN_MAX; then a heap buffer is allocated, the current
 *    bytes are copied into it and the string is switched to the heap
 *    representation.  The length is read via RSTRING_LEN before
 *    STR_SET_NOEMBED clears the embedded-length bits.
 *  - Heap string: the buffer is reallocated; the recorded capacity is
 *    updated only when the string is not in a "no capacity" state
 *    (STR_NOCAPA_P).
 *
 * Both arguments are evaluated more than once, so they must be free of
 * side effects.  `capacity` is parenthesized at every use so that an
 * expression argument cannot be re-associated by the expansion.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *tmp = ALLOC_N(char, (capacity)+1);\
            memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00114 
/* Code-range predicates.  Both go through rb_enc_str_coderange(), which
 * scans the string and caches the result when the range is still unknown. */
00115 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00116 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
00117 
/* The rb_encoding for the encoding index stored on str. */
00118 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00119 
00120 static inline int
00121 single_byte_optimizable(VALUE str)
00122 {
00123     rb_encoding *enc;
00124 
00125     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
00126     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00127         return 1;
00128 
00129     enc = STR_ENC_GET(str);
00130     if (rb_enc_mbmaxlen(enc) == 1)
00131         return 1;
00132 
00133     /* Conservative.  Possibly single byte.
00134      * "\xa1" in Shift_JIS for example. */
00135     return 0;
00136 }
00137 
/* Global VALUE; neither read nor written in the visible part of this file.
 * NOTE(review): presumably the default field separator -- confirm. */
00138 VALUE rb_fs;
00139 
/*
 * Return a pointer to the first byte in [p, e) for which ISASCII() is
 * false, or NULL when there is none.
 *
 * When SIZEOF_VALUE is 4 or 8 and the range is longer than two VALUE
 * words, the middle of the range is examined one aligned word at a time:
 * bytes are checked individually until p is word-aligned, then whole words
 * are tested against a mask with the top bit of every byte set.  The word
 * loop only locates the first word containing such a byte (or the aligned
 * end); the trailing byte loop then pinpoints the exact byte.
 */
00140 static inline const char *
00141 search_nonascii(const char *p, const char *e)
00142 {
00143 #if SIZEOF_VALUE == 8
00144 # define NONASCII_MASK 0x8080808080808080ULL
00145 #elif SIZEOF_VALUE == 4
00146 # define NONASCII_MASK 0x80808080UL
00147 #endif
00148 #ifdef NONASCII_MASK
00149     if ((int)sizeof(VALUE) * 2 < e - p) {
00150         const VALUE *s, *t;
00151         const VALUE lowbits = sizeof(VALUE) - 1;
    /* s: p rounded up to the next word boundary */
00152         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
00153         while (p < (const char *)s) {
00154             if (!ISASCII(*p))
00155                 return p;
00156             p++;
00157         }
    /* t: e rounded down to a word boundary */
00158         t = (const VALUE*)(~lowbits & (VALUE)e);
00159         while (s < t) {
00160             if (*s & NONASCII_MASK) {
00161                 t = s;
00162                 break;
00163             }
00164             s++;
00165         }
    /* resume the byte scan at the suspicious word, or at the aligned end */
00166         p = (const char *)t;
00167     }
00168 #endif
00169     while (p < e) {
00170         if (!ISASCII(*p))
00171             return p;
00172         p++;
00173     }
00174     return NULL;
00175 }
00176 
00177 static int
00178 coderange_scan(const char *p, long len, rb_encoding *enc)
00179 {
00180     const char *e = p + len;
00181 
00182     if (rb_enc_to_index(enc) == 0) {
00183         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00184         p = search_nonascii(p, e);
00185         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00186     }
00187 
00188     if (rb_enc_asciicompat(enc)) {
00189         p = search_nonascii(p, e);
00190         if (!p) {
00191             return ENC_CODERANGE_7BIT;
00192         }
00193         while (p < e) {
00194             int ret = rb_enc_precise_mbclen(p, e, enc);
00195             if (!MBCLEN_CHARFOUND_P(ret)) {
00196                 return ENC_CODERANGE_BROKEN;
00197             }
00198             p += MBCLEN_CHARFOUND_LEN(ret);
00199             if (p < e) {
00200                 p = search_nonascii(p, e);
00201                 if (!p) {
00202                     return ENC_CODERANGE_VALID;
00203                 }
00204             }
00205         }
00206         if (e < p) {
00207             return ENC_CODERANGE_BROKEN;
00208         }
00209         return ENC_CODERANGE_VALID;
00210     }
00211 
00212     while (p < e) {
00213         int ret = rb_enc_precise_mbclen(p, e, enc);
00214 
00215         if (!MBCLEN_CHARFOUND_P(ret)) {
00216             return ENC_CODERANGE_BROKEN;
00217         }
00218         p += MBCLEN_CHARFOUND_LEN(ret);
00219     }
00220     if (e < p) {
00221         return ENC_CODERANGE_BROKEN;
00222     }
00223     return ENC_CODERANGE_VALID;
00224 }
00225 
/*
 * Scan [s, e) under encoding enc, updating the code range in *cr, and
 * return the number of bytes (counted from s) that were consumed.
 *
 * *cr is both input and output:
 *  - if it is already ENC_CODERANGE_BROKEN nothing is scanned and the whole
 *    length is reported as consumed;
 *  - an incoming ENC_CODERANGE_VALID is never downgraded to 7BIT by the
 *    ASCII-only outcomes.
 * When a character cannot be decoded the scan stops there: *cr becomes
 * BROKEN if the bytes are invalid and UNKNOWN otherwise, and the returned
 * offset is where that character starts.
 */
00226 long
00227 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00228 {
00229     const char *p = s;
00230 
00231     if (*cr == ENC_CODERANGE_BROKEN)
00232         return e - s;
00233 
00234     if (rb_enc_to_index(enc) == 0) {
00235         /* enc is ASCII-8BIT.  An ASCII-8BIT string can never be broken. */
00236         p = search_nonascii(p, e);
00237         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00238         return e - s;
00239     }
00240     else if (rb_enc_asciicompat(enc)) {
00241         p = search_nonascii(p, e);
00242         if (!p) {
00243             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00244             return e - s;
00245         }
00246         while (p < e) {
00247             int ret = rb_enc_precise_mbclen(p, e, enc);
00248             if (!MBCLEN_CHARFOUND_P(ret)) {
00249                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00250                 return p - s;
00251             }
00252             p += MBCLEN_CHARFOUND_LEN(ret);
00253             if (p < e) {
            /* skip the ASCII run following the character just validated */
00254                 p = search_nonascii(p, e);
00255                 if (!p) {
00256                     *cr = ENC_CODERANGE_VALID;
00257                     return e - s;
00258                 }
00259             }
00260         }
00261         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00262         return p - s;
00263     }
00264     else {
00265         while (p < e) {
00266             int ret = rb_enc_precise_mbclen(p, e, enc);
00267             if (!MBCLEN_CHARFOUND_P(ret)) {
00268                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00269                 return p - s;
00270             }
00271             p += MBCLEN_CHARFOUND_LEN(ret);
00272         }
00273         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00274         return p - s;
00275     }
00276 }
00277 
00278 static inline void
00279 str_enc_copy(VALUE str1, VALUE str2)
00280 {
00281     rb_enc_set_index(str1, ENCODING_GET(str2));
00282 }
00283 
00284 static void
00285 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00286 {
00287     /* this function is designed for copying encoding and coderange
00288      * from src to new string "dest" which is made from the part of src.
00289      */
00290     str_enc_copy(dest, src);
00291     switch (ENC_CODERANGE(src)) {
00292       case ENC_CODERANGE_7BIT:
00293         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00294         break;
00295       case ENC_CODERANGE_VALID:
00296         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00297             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00298             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00299         else
00300             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00301         break;
00302       default:
00303         if (RSTRING_LEN(dest) == 0) {
00304             if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00305                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00306             else
00307                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00308         }
00309         break;
00310     }
00311 }
00312 
00313 static void
00314 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00315 {
00316     str_enc_copy(dest, src);
00317     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00318 }
00319 
00320 int
00321 rb_enc_str_coderange(VALUE str)
00322 {
00323     int cr = ENC_CODERANGE(str);
00324 
00325     if (cr == ENC_CODERANGE_UNKNOWN) {
00326         rb_encoding *enc = STR_ENC_GET(str);
00327         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00328         ENC_CODERANGE_SET(str, cr);
00329     }
00330     return cr;
00331 }
00332 
00333 int
00334 rb_enc_str_asciionly_p(VALUE str)
00335 {
00336     rb_encoding *enc = STR_ENC_GET(str);
00337 
00338     if (!rb_enc_asciicompat(enc))
00339         return FALSE;
00340     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00341         return TRUE;
00342     return FALSE;
00343 }
00344 
00345 static inline void
00346 str_mod_check(VALUE s, const char *p, long len)
00347 {
00348     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00349         rb_raise(rb_eRuntimeError, "string modified");
00350     }
00351 }
00352 
00353 static inline void
00354 str_frozen_check(VALUE s)
00355 {
00356     if (OBJ_FROZEN(s)) {
00357         rb_raise(rb_eRuntimeError, "string frozen");
00358     }
00359 }
00360 
00361 size_t
00362 rb_str_capacity(VALUE str)
00363 {
00364     if (STR_EMBED_P(str)) {
00365         return RSTRING_EMBED_LEN_MAX;
00366     }
00367     else if (STR_NOCAPA_P(str)) {
00368         return RSTRING(str)->as.heap.len;
00369     }
00370     else {
00371         return RSTRING(str)->as.heap.aux.capa;
00372     }
00373 }
00374 
00375 static inline VALUE
00376 str_alloc(VALUE klass)
00377 {
00378     NEWOBJ(str, struct RString);
00379     OBJSETUP(str, klass, T_STRING);
00380 
00381     str->as.heap.ptr = 0;
00382     str->as.heap.len = 0;
00383     str->as.heap.aux.capa = 0;
00384 
00385     return (VALUE)str;
00386 }
00387 
/*
 * Create a string of class klass that is len bytes long.
 *
 * ptr may be NULL, in which case no bytes are copied and only the length
 * and the terminating NUL are set.  Strings longer than
 * RSTRING_EMBED_LEN_MAX get a heap buffer of len+1 bytes; shorter ones
 * stay embedded.  An empty string is marked ENC_CODERANGE_7BIT.
 *
 * Raises ArgumentError when len is negative.
 */
00388 static VALUE
00389 str_new(VALUE klass, const char *ptr, long len)
00390 {
00391     VALUE str;
00392 
00393     if (len < 0) {
00394         rb_raise(rb_eArgError, "negative string size (or size too big)");
00395     }
00396 
00397     str = str_alloc(klass);
00398     if (len > RSTRING_EMBED_LEN_MAX) {
00399         RSTRING(str)->as.heap.aux.capa = len;
00400         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00401         STR_SET_NOEMBED(str);
00402     }
00403     else if (len == 0) {
00404         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00405     }
00406     if (ptr) {
00407         memcpy(RSTRING_PTR(str), ptr, len);
00408     }
    /* the representation (embedded vs. heap) is settled by now */
00409     STR_SET_LEN(str, len);
00410     RSTRING_PTR(str)[len] = '\0';
00411     return str;
00412 }
00413 
00414 VALUE
00415 rb_str_new(const char *ptr, long len)
00416 {
00417     return str_new(rb_cString, ptr, len);
00418 }
00419 
00420 VALUE
00421 rb_usascii_str_new(const char *ptr, long len)
00422 {
00423     VALUE str = rb_str_new(ptr, len);
00424     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00425     return str;
00426 }
00427 
00428 VALUE
00429 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00430 {
00431     VALUE str = rb_str_new(ptr, len);
00432     rb_enc_associate(str, enc);
00433     return str;
00434 }
00435 
00436 VALUE
00437 rb_str_new_cstr(const char *ptr)
00438 {
00439     if (!ptr) {
00440         rb_raise(rb_eArgError, "NULL pointer given");
00441     }
00442     return rb_str_new(ptr, strlen(ptr));
00443 }
00444 
00445 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00446 #define rb_str_new2 rb_str_new_cstr
00447 
00448 VALUE
00449 rb_usascii_str_new_cstr(const char *ptr)
00450 {
00451     VALUE str = rb_str_new2(ptr);
00452     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00453     return str;
00454 }
00455 
00456 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00457 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00458 
00459 VALUE
00460 rb_tainted_str_new(const char *ptr, long len)
00461 {
00462     VALUE str = rb_str_new(ptr, len);
00463 
00464     OBJ_TAINT(str);
00465     return str;
00466 }
00467 
00468 VALUE
00469 rb_tainted_str_new_cstr(const char *ptr)
00470 {
00471     VALUE str = rb_str_new2(ptr);
00472 
00473     OBJ_TAINT(str);
00474     return str;
00475 }
00476 
00477 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00478 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00479 
/*
 * Convert str from encoding `from` to encoding `to`, with the converter
 * flags ecflags and options ecopts.
 *
 * Returns str itself when `to` is NULL, when from == to, when the converter
 * cannot be opened, or when conversion ends in any state other than
 * "finished".  When `to` is ASCII-8BIT, or `to` is ASCII-compatible and
 * str's cached code range is 7BIT, no conversion runs: str is returned,
 * duplicated and re-tagged first if its encoding differs from `to`.
 * Otherwise a new string holding the converted bytes is returned.
 *
 * NOTE(review): `from` is dereferenced (from->name) without a NULL check,
 * unlike `to`.
 */
00480 VALUE
00481 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00482 {
00483     rb_econv_t *ec;
00484     rb_econv_result_t ret;
00485     long len;
00486     VALUE newstr;
00487     const unsigned char *sp;
00488     unsigned char *dp;
00489 
00490     if (!to) return str;
00491     if (from == to) return str;
00492     if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00493         to == rb_ascii8bit_encoding()) {
00494         if (STR_ENC_GET(str) != to) {
00495             str = rb_str_dup(str);
00496             rb_enc_associate(str, to);
00497         }
00498         return str;
00499     }
00500 
    /* first guess: the output is as long as the input */
00501     len = RSTRING_LEN(str);
00502     newstr = rb_str_new(0, len);
00503 
00504   retry:
    /* every attempt opens a fresh converter and starts again from the
     * beginning of str */
00505     ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00506     if (!ec) return str;
00507 
00508     sp = (unsigned char*)RSTRING_PTR(str);
00509     dp = (unsigned char*)RSTRING_PTR(newstr);
00510     ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00511                            &dp, (unsigned char*)RSTRING_END(newstr), 0);
00512     rb_econv_close(ec);
00513     switch (ret) {
00514       case econv_destination_buffer_full:
00515         /* destination buffer too short: double it (at least 2) and retry */
00516         len = len < 2 ? 2 : len * 2;
00517         rb_str_resize(newstr, len);
00518         goto retry;
00519 
00520       case econv_finished:
        /* dp points just past the last byte written */
00521         len = dp - (unsigned char*)RSTRING_PTR(newstr);
00522         rb_str_set_len(newstr, len);
00523         rb_enc_associate(newstr, to);
00524         return newstr;
00525 
00526       default:
00527         /* some error, return original */
00528         return str;
00529     }
00530 }
00531 
00532 VALUE
00533 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00534 {
00535     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00536 }
00537 
00538 VALUE
00539 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00540 {
00541     VALUE str;
00542 
00543     str = rb_tainted_str_new(ptr, len);
00544     if (eenc == rb_usascii_encoding() &&
00545         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00546         rb_enc_associate(str, rb_ascii8bit_encoding());
00547         return str;
00548     }
00549     rb_enc_associate(str, eenc);
00550     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00551 }
00552 
00553 VALUE
00554 rb_external_str_new(const char *ptr, long len)
00555 {
00556     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00557 }
00558 
00559 VALUE
00560 rb_external_str_new_cstr(const char *ptr)
00561 {
00562     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00563 }
00564 
00565 VALUE
00566 rb_locale_str_new(const char *ptr, long len)
00567 {
00568     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00569 }
00570 
00571 VALUE
00572 rb_locale_str_new_cstr(const char *ptr)
00573 {
00574     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00575 }
00576 
00577 VALUE
00578 rb_filesystem_str_new(const char *ptr, long len)
00579 {
00580     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00581 }
00582 
00583 VALUE
00584 rb_filesystem_str_new_cstr(const char *ptr)
00585 {
00586     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00587 }
00588 
00589 VALUE
00590 rb_str_export(VALUE str)
00591 {
00592     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00593 }
00594 
00595 VALUE
00596 rb_str_export_locale(VALUE str)
00597 {
00598     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00599 }
00600 
00601 VALUE
00602 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00603 {
00604     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00605 }
00606 
/*
 * Make str2 hold the contents of str and return str2.
 *
 * Short contents (<= RSTRING_EMBED_LEN_MAX) are copied into str2's
 * embedded buffer, including the byte after the last one.  Longer contents
 * are not copied: a frozen version of str is obtained with
 * rb_str_new_frozen() and str2 is pointed at its buffer, recorded as the
 * shared owner.  Encoding and code range are copied in both cases; in the
 * long case they come from the frozen string.
 *
 * NOTE(review): the previous buffer of str2 is not released here --
 * callers presumably pass a fresh or already discarded str2; confirm.
 */
00607 static VALUE
00608 str_replace_shared(VALUE str2, VALUE str)
00609 {
00610     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00611         STR_SET_EMBED(str2);
00612         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00613         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00614     }
00615     else {
        /* from here on `str` refers to the frozen string */
00616         str = rb_str_new_frozen(str);
00617         FL_SET(str2, STR_NOEMBED);
00618         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00619         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00620         RSTRING(str2)->as.heap.aux.shared = str;
00621         FL_SET(str2, ELTS_SHARED);
00622     }
00623     rb_enc_cr_str_exact_copy(str2, str);
00624 
00625     return str2;
00626 }
00627 
00628 static VALUE
00629 str_new_shared(VALUE klass, VALUE str)
00630 {
00631     return str_replace_shared(str_alloc(klass), str);
00632 }
00633 
00634 static VALUE
00635 str_new3(VALUE klass, VALUE str)
00636 {
00637     return str_new_shared(klass, str);
00638 }
00639 
00640 VALUE
00641 rb_str_new_shared(VALUE str)
00642 {
00643     VALUE str2 = str_new3(rb_obj_class(str), str);
00644 
00645     OBJ_INFECT(str2, str);
00646     return str2;
00647 }
00648 
00649 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00650 #define rb_str_new3 rb_str_new_shared
00651 
/*
 * Create a non-embedded string of class klass that uses the same buffer
 * pointer and length as str.
 *
 *  - If str is already shared, the new string shares the same (frozen)
 *    owner.
 *  - Otherwise str itself is turned into a shared string whose owner is
 *    the new string.
 * Encoding, code range and taint are copied from str.  The result is not
 * frozen here.
 */
00652 static VALUE
00653 str_new4(VALUE klass, VALUE str)
00654 {
00655     VALUE str2;
00656 
00657     str2 = str_alloc(klass);
00658     STR_SET_NOEMBED(str2);
00659     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00660     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00661     if (STR_SHARED_P(str)) {
00662         VALUE shared = RSTRING(str)->as.heap.aux.shared;
00663         assert(OBJ_FROZEN(shared));
00664         FL_SET(str2, ELTS_SHARED);
00665         RSTRING(str2)->as.heap.aux.shared = shared;
00666     }
00667     else {
        /* str now refers to str2 as its shared owner; this overwrites
         * whatever str kept in heap.aux */
00668         FL_SET(str, ELTS_SHARED);
00669         RSTRING(str)->as.heap.aux.shared = str2;
00670     }
00671     rb_enc_cr_str_exact_copy(str2, str);
00672     OBJ_INFECT(str2, str);
00673     return str2;
00674 }
00675 
/*
 * Return a frozen string with the contents of orig.
 *
 *  - orig already frozen: orig itself is returned.
 *  - orig shared: its shared owner is reused when it matches orig in
 *    length, class, encoding and taint; otherwise a new string is derived
 *    from the owner and advanced by the length difference so that it
 *    covers the same trailing bytes as orig.
 *  - orig embedded: a plain copy is made.
 *  - orig assoc: the assoc flag is cleared on orig, the buffer is handed
 *    over via str_new4(), and the assoc value is re-attached to the result.
 *  - otherwise: the buffer is handed over via str_new4().
 * The result is frozen before it is returned.
 * rb_str_new4 is kept as an alias of this function.
 */
00676 VALUE
00677 rb_str_new_frozen(VALUE orig)
00678 {
00679     VALUE klass, str;
00680 
00681     if (OBJ_FROZEN(orig)) return orig;
00682     klass = rb_obj_class(orig);
00683     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00684         long ofs;
00685         assert(OBJ_FROZEN(str));
        /* how much longer the owner is than orig */
00686         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00687         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00688             (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00689             ENCODING_GET(str) != ENCODING_GET(orig)) {
00690             str = str_new3(klass, str);
00691             RSTRING(str)->as.heap.ptr += ofs;
00692             RSTRING(str)->as.heap.len -= ofs;
00693             rb_enc_cr_str_exact_copy(str, orig);
00694             OBJ_INFECT(str, orig);
00695         }
00696     }
00697     else if (STR_EMBED_P(orig)) {
00698         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00699         rb_enc_cr_str_exact_copy(str, orig);
00700         OBJ_INFECT(str, orig);
00701     }
00702     else if (STR_ASSOC_P(orig)) {
        /* save the assoc value: str_new4() overwrites orig's heap.aux */
00703         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00704         FL_UNSET(orig, STR_ASSOC);
00705         str = str_new4(klass, orig);
00706         FL_SET(str, STR_ASSOC);
00707         RSTRING(str)->as.heap.aux.shared = assoc;
00708     }
00709     else {
00710         str = str_new4(klass, orig);
00711     }
00712     OBJ_FREEZE(str);
00713     return str;
00714 }
00715 
00716 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00717 #define rb_str_new4 rb_str_new_frozen
00718 
00719 VALUE
00720 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00721 {
00722     return str_new(rb_obj_class(obj), ptr, len);
00723 }
00724 
00725 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00726            rb_str_new_with_class, (obj, ptr, len))
00727 #define rb_str_new5 rb_str_new_with_class
00728 
00729 static VALUE
00730 str_new_empty(VALUE str)
00731 {
00732     VALUE v = rb_str_new5(str, 0, 0);
00733     OBJ_INFECT(v, str);
00734     return v;
00735 }
00736 
00737 #define STR_BUF_MIN_SIZE 128
00738 
00739 VALUE
00740 rb_str_buf_new(long capa)
00741 {
00742     VALUE str = str_alloc(rb_cString);
00743 
00744     if (capa < STR_BUF_MIN_SIZE) {
00745         capa = STR_BUF_MIN_SIZE;
00746     }
00747     FL_SET(str, STR_NOEMBED);
00748     RSTRING(str)->as.heap.aux.capa = capa;
00749     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00750     RSTRING(str)->as.heap.ptr[0] = '\0';
00751 
00752     return str;
00753 }
00754 
00755 VALUE
00756 rb_str_buf_new_cstr(const char *ptr)
00757 {
00758     VALUE str;
00759     long len = strlen(ptr);
00760 
00761     str = rb_str_buf_new(len);
00762     rb_str_buf_cat(str, ptr, len);
00763 
00764     return str;
00765 }
00766 
00767 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00768 #define rb_str_buf_new2 rb_str_buf_new_cstr
00769 
00770 VALUE
00771 rb_str_tmp_new(long len)
00772 {
00773     return str_new(0, 0, len);
00774 }
00775 
00776 void
00777 rb_str_free(VALUE str)
00778 {
00779     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00780         xfree(RSTRING(str)->as.heap.ptr);
00781     }
00782 }
00783 
00784 size_t
00785 rb_str_memsize(VALUE str)
00786 {
00787     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00788         return RSTRING(str)->as.heap.aux.capa;
00789     }
00790     else {
00791         return 0;
00792     }
00793 }
00794 
00795 VALUE
00796 rb_str_to_str(VALUE str)
00797 {
00798     return rb_convert_type(str, T_STRING, "String", "to_str");
00799 }
00800 
/* Defined later in this file, outside the visible part. */
00801 static inline void str_discard(VALUE str);
00802 
/*
 * Move the contents of str2 into str.
 *
 * str's previous contents are dropped with str_discard(); taint, encoding
 * and code range are taken from str2.  Short contents
 * (<= RSTRING_EMBED_LEN_MAX) are copied into str's embedded buffer and
 * str2 is left untouched.  Longer contents are not copied: str takes over
 * str2's buffer pointer, length and capacity (or shared/assoc value), and
 * str2 is reset to an empty embedded string.  Nothing happens when str
 * and str2 are the same object.
 */
00803 void
00804 rb_str_shared_replace(VALUE str, VALUE str2)
00805 {
00806     rb_encoding *enc;
00807     int cr;
00808     if (str == str2) return;
00809     enc = STR_ENC_GET(str2);
00810     cr = ENC_CODERANGE(str2);
00811     str_discard(str);
00812     OBJ_INFECT(str, str2);
00813     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00814         STR_SET_EMBED(str);
00815         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00816         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00817         rb_enc_associate(str, enc);
00818         ENC_CODERANGE_SET(str, cr);
00819         return;
00820     }
00821     STR_SET_NOEMBED(str);
00822     STR_UNSET_NOCAPA(str);
00823     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00824     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00825     if (STR_NOCAPA_P(str2)) {
        /* carry over str2's shared/assoc flags together with the aux value */
00826         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00827         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00828     }
00829     else {
00830         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00831     }
00832     STR_SET_EMBED(str2);        /* abandon str2 */
00833     RSTRING_PTR(str2)[0] = 0;
00834     STR_SET_EMBED_LEN(str2, 0);
00835     rb_enc_associate(str, enc);
00836     ENC_CODERANGE_SET(str, cr);
00837 }
00838 
00839 static ID id_to_s;
00840 
00841 VALUE
00842 rb_obj_as_string(VALUE obj)
00843 {
00844     VALUE str;
00845 
00846     if (TYPE(obj) == T_STRING) {
00847         return obj;
00848     }
00849     str = rb_funcall(obj, id_to_s, 0);
00850     if (TYPE(str) != T_STRING)
00851         return rb_any_to_s(obj);
00852     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00853     return str;
00854 }
00855 
/*
 * Make str hold the contents of str2 and return str.
 *
 * An assoc str2 is first replaced by its frozen version.  If str2 is (then)
 * a shared string, str is pointed at the same buffer and shared owner;
 * otherwise the work is delegated to str_replace_shared().  Taint,
 * encoding and code range of str2 are copied to str.
 *
 * NOTE(review): the previous buffer of str is not released here --
 * callers presumably pass a fresh or already discarded str; confirm.
 */
00856 static VALUE
00857 str_replace(VALUE str, VALUE str2)
00858 {
00859     long len;
00860 
00861     len = RSTRING_LEN(str2);
00862     if (STR_ASSOC_P(str2)) {
00863         str2 = rb_str_new4(str2);
00864     }
00865     if (STR_SHARED_P(str2)) {
00866         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00867         assert(OBJ_FROZEN(shared));
00868         STR_SET_NOEMBED(str);
00869         RSTRING(str)->as.heap.len = len;
00870         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00871         FL_SET(str, ELTS_SHARED);
00872         FL_UNSET(str, STR_ASSOC);
00873         RSTRING(str)->as.heap.aux.shared = shared;
00874     }
00875     else {
00876         str_replace_shared(str, str2);
00877     }
00878 
00879     OBJ_INFECT(str, str2);
00880     rb_enc_cr_str_exact_copy(str, str2);
00881     return str;
00882 }
00883 
00884 static VALUE
00885 str_duplicate(VALUE klass, VALUE str)
00886 {
00887     VALUE dup = str_alloc(klass);
00888     str_replace(dup, str);
00889     return dup;
00890 }
00891 
00892 VALUE
00893 rb_str_dup(VALUE str)
00894 {
00895     return str_duplicate(rb_obj_class(str), str);
00896 }
00897 
00898 VALUE
00899 rb_str_resurrect(VALUE str)
00900 {
00901     return str_replace(str_alloc(rb_cString), str);
00902 }
00903 
00904 /*
00905  *  call-seq:
00906  *     String.new(str="")   -> new_str
00907  *
00908  *  Returns a new string object containing a copy of <i>str</i>.
00909  */
00910 
00911 static VALUE
00912 rb_str_init(int argc, VALUE *argv, VALUE str)
00913 {
00914     VALUE orig;
00915 
00916     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00917         rb_str_replace(str, orig);
00918     return str;
00919 }
00920 
/*
 * Count the characters in [p, e) under encoding enc.
 *
 *  - Fixed-width encodings (mbmaxlen == mbminlen): the byte length divided
 *    by the character width, rounded up.
 *  - ASCII-compatible encodings: runs of ASCII bytes are counted in bulk
 *    via search_nonascii(); each other character is stepped over with
 *    rb_enc_fast_mbclen() when cr is 7BIT or VALID, and with
 *    rb_enc_mbclen() otherwise.
 *  - Other encodings: one rb_enc_mbclen() step per character.
 * cr is the caller's knowledge of the code range and only selects which
 * stepping function is used.
 */
00921 static inline long
00922 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00923 {
00924     long c;
00925     const char *q;
00926 
00927     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00928         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00929     }
00930     else if (rb_enc_asciicompat(enc)) {
00931         c = 0;
00932         if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00933             while (p < e) {
00934                 if (ISASCII(*p)) {
                    /* q: first non-ASCII byte at or after p, if any */
00935                     q = search_nonascii(p, e);
00936                     if (!q)
00937                         return c + (e - p);
00938                     c += q - p;
00939                     p = q;
00940                 }
00941                 p += rb_enc_fast_mbclen(p, e, enc);
00942                 c++;
00943             }
00944         }
00945         else {
00946             while (p < e) {
00947                 if (ISASCII(*p)) {
00948                     q = search_nonascii(p, e);
00949                     if (!q)
00950                         return c + (e - p);
00951                     c += q - p;
00952                     p = q;
00953                 }
00954                 p += rb_enc_mbclen(p, e, enc);
00955                 c++;
00956             }
00957         }
00958         return c;
00959     }
00960 
00961     for (c=0; p<e; c++) {
00962         p += rb_enc_mbclen(p, e, enc);
00963     }
00964     return c;
00965 }
00966 
00967 long
00968 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00969 {
00970     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00971 }
00972 
/*
 * Count the characters in [p, e) under encoding enc and report the code
 * range found along the way through *cr.
 *
 * *cr is reset to 0 on entry, so any incoming value is ignored.  For
 * fixed-width encodings the count is computed from the byte length and
 * *cr stays 0.  Otherwise every character is validated with
 * rb_enc_precise_mbclen(); an undecodable position counts as one character,
 * sets *cr to BROKEN, and is skipped by one byte (ASCII-compatible
 * encodings) or by mbminlen bytes, clamped to e (other encodings).  A scan
 * that never touches *cr ends with 7BIT.
 *
 * NOTE(review): `*cr |= ENC_CODERANGE_VALID` after BROKEN presumably leaves
 * BROKEN in place; this depends on the flag values, which are not visible
 * here -- confirm.
 */
00973 long
00974 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00975 {
00976     long c;
00977     const char *q;
00978     int ret;
00979 
00980     *cr = 0;
00981     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00982         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00983     }
00984     else if (rb_enc_asciicompat(enc)) {
00985         c = 0;
00986         while (p < e) {
00987             if (ISASCII(*p)) {
                /* count the whole ASCII run at once */
00988                 q = search_nonascii(p, e);
00989                 if (!q) {
00990                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
00991                     return c + (e - p);
00992                 }
00993                 c += q - p;
00994                 p = q;
00995             }
00996             ret = rb_enc_precise_mbclen(p, e, enc);
00997             if (MBCLEN_CHARFOUND_P(ret)) {
00998                 *cr |= ENC_CODERANGE_VALID;
00999                 p += MBCLEN_CHARFOUND_LEN(ret);
01000             }
01001             else {
01002                 *cr = ENC_CODERANGE_BROKEN;
01003                 p++;
01004             }
01005             c++;
01006         }
01007         if (!*cr) *cr = ENC_CODERANGE_7BIT;
01008         return c;
01009     }
01010 
01011     for (c=0; p<e; c++) {
01012         ret = rb_enc_precise_mbclen(p, e, enc);
01013         if (MBCLEN_CHARFOUND_P(ret)) {
01014             *cr |= ENC_CODERANGE_VALID;
01015             p += MBCLEN_CHARFOUND_LEN(ret);
01016         }
01017         else {
01018             *cr = ENC_CODERANGE_BROKEN;
01019             if (p + rb_enc_mbminlen(enc) <= e)
01020                 p += rb_enc_mbminlen(enc);
01021             else
01022                 p = e;
01023         }
01024     }
01025     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01026     return c;
01027 }
01028 
/* Only available when search_nonascii() defined NONASCII_MASK, i.e. when
 * SIZEOF_VALUE is 4 or 8. */
01029 #ifdef NONASCII_MASK
/* True for every byte that is not of the form 10xxxxxx. */
01030 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
/*
 * Return how many bytes of the word *s are not of the form 10xxxxxx, i.e.
 * how many satisfy is_utf8_lead_byte(), handling all bytes of the word at
 * once.
 */
01031 static inline VALUE
01032 count_utf8_lead_bytes_with_word(const VALUE *s)
01033 {
01034     VALUE d = *s;
    /* bit 6 of each byte becomes (bit6 | ~bit7): set unless byte is 10xxxxxx */
01035     d |= ~(d>>1);
    /* move that bit to bit 0 of each byte and drop everything else */
01036     d >>= 6;
01037     d &= NONASCII_MASK >> 7;
    /* add the per-byte 0/1 values together into the low byte */
01038     d += (d>>8);
01039     d += (d>>16);
01040 #if SIZEOF_VALUE == 8
01041     d += (d>>32);
01042 #endif
    /* the total is at most sizeof(VALUE) <= 8, so four bits suffice */
01043     return (d&0xF);
01044 }
01045 #endif
01046 
01047 static long
01048 str_strlen(VALUE str, rb_encoding *enc)
01049 {
01050     const char *p, *e;
01051     long n;
01052     int cr;
01053 
01054     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01055     if (!enc) enc = STR_ENC_GET(str);
01056     p = RSTRING_PTR(str);
01057     e = RSTRING_END(str);
01058     cr = ENC_CODERANGE(str);
01059 #ifdef NONASCII_MASK
01060     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01061         enc == rb_utf8_encoding()) {
01062 
01063         VALUE len = 0;
01064         if ((int)sizeof(VALUE) * 2 < e - p) {
01065             const VALUE *s, *t;
01066             const VALUE lowbits = sizeof(VALUE) - 1;
01067             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01068             t = (const VALUE*)(~lowbits & (VALUE)e);
01069             while (p < (const char *)s) {
01070                 if (is_utf8_lead_byte(*p)) len++;
01071                 p++;
01072             }
01073             while (s < t) {
01074                 len += count_utf8_lead_bytes_with_word(s);
01075                 s++;
01076             }
01077             p = (const char *)s;
01078         }
01079         while (p < e) {
01080             if (is_utf8_lead_byte(*p)) len++;
01081             p++;
01082         }
01083         return (long)len;
01084     }
01085 #endif
01086     n = rb_enc_strlen_cr(p, e, enc, &cr);
01087     if (cr) {
01088         ENC_CODERANGE_SET(str, cr);
01089     }
01090     return n;
01091 }
01092 
01093 long
01094 rb_str_strlen(VALUE str)
01095 {
01096     return str_strlen(str, STR_ENC_GET(str));
01097 }
01098 
01099 /*
01100  *  call-seq:
01101  *     str.length   -> integer
01102  *     str.size     -> integer
01103  *
01104  *  Returns the character length of <i>str</i>.
01105  */
01106 
01107 VALUE
01108 rb_str_length(VALUE str)
01109 {
01110     long len;
01111 
01112     len = str_strlen(str, STR_ENC_GET(str));
01113     return LONG2NUM(len);
01114 }
01115 
01116 /*
01117  *  call-seq:
01118  *     str.bytesize  -> integer
01119  *
01120  *  Returns the length of <i>str</i> in bytes.
01121  */
01122 
01123 static VALUE
01124 rb_str_bytesize(VALUE str)
01125 {
01126     return INT2NUM(RSTRING_LEN(str));
01127 }
01128 
01129 /*
01130  *  call-seq:
01131  *     str.empty?   -> true or false
01132  *
01133  *  Returns <code>true</code> if <i>str</i> has a length of zero.
01134  *
01135  *     "hello".empty?   #=> false
01136  *     "".empty?        #=> true
01137  */
01138 
01139 static VALUE
01140 rb_str_empty(VALUE str)
01141 {
01142     if (RSTRING_LEN(str) == 0)
01143         return Qtrue;
01144     return Qfalse;
01145 }
01146 
01147 /*
01148  *  call-seq:
01149  *     str + other_str   -> new_str
01150  *
01151  *  Concatenation---Returns a new <code>String</code> containing
01152  *  <i>other_str</i> concatenated to <i>str</i>.
01153  *
01154  *     "Hello from " + self.to_s   #=> "Hello from main"
01155  */
01156 
01157 VALUE
01158 rb_str_plus(VALUE str1, VALUE str2)
01159 {
01160     VALUE str3;
01161     rb_encoding *enc;
01162 
01163     StringValue(str2);
01164     enc = rb_enc_check(str1, str2);
01165     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01166     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01167     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01168            RSTRING_PTR(str2), RSTRING_LEN(str2));
01169     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01170 
01171     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01172         OBJ_TAINT(str3);
01173     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01174                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01175     return str3;
01176 }
01177 
01178 /*
01179  *  call-seq:
01180  *     str * integer   -> new_str
01181  *
01182  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
01183  *  the receiver.
01184  *
01185  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
01186  */
01187 
01188 VALUE
01189 rb_str_times(VALUE str, VALUE times)
01190 {
01191     VALUE str2;
01192     long n, len;
01193     char *ptr2;
01194 
01195     len = NUM2LONG(times);
01196     if (len < 0) {
01197         rb_raise(rb_eArgError, "negative argument");
01198     }
01199     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
01200         rb_raise(rb_eArgError, "argument too big");
01201     }
01202 
01203     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01204     ptr2 = RSTRING_PTR(str2);
01205     if (len) {
01206         n = RSTRING_LEN(str);
01207         memcpy(ptr2, RSTRING_PTR(str), n);
01208         while (n <= len/2) {
01209             memcpy(ptr2 + n, ptr2, n);
01210             n *= 2;
01211         }
01212         memcpy(ptr2 + n, ptr2, len-n);
01213     }
01214     ptr2[RSTRING_LEN(str2)] = '\0';
01215     OBJ_INFECT(str2, str);
01216     rb_enc_cr_str_copy_for_substr(str2, str);
01217 
01218     return str2;
01219 }
01220 
01221 /*
01222  *  call-seq:
01223  *     str % arg   -> new_str
01224  *
01225  *  Format---Uses <i>str</i> as a format specification, and returns the result
01226  *  of applying it to <i>arg</i>. If the format specification contains more than
01227  *  one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
01228  *  containing the values to be substituted. See <code>Kernel::sprintf</code> for
01229  *  details of the format string.
01230  *
01231  *     "%05d" % 123                              #=> "00123"
01232  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
01233  *     "foo = %{foo}" % { :foo => 'bar' }        #=> "foo = bar"
01234  */
01235 
01236 static VALUE
01237 rb_str_format_m(VALUE str, VALUE arg)
01238 {
01239     volatile VALUE tmp = rb_check_array_type(arg);
01240 
01241     if (!NIL_P(tmp)) {
01242         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01243     }
01244     return rb_str_format(1, &arg, str);
01245 }
01246 
01247 static inline void
01248 str_modifiable(VALUE str)
01249 {
01250     if (FL_TEST(str, STR_TMPLOCK)) {
01251         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01252     }
01253     if (OBJ_FROZEN(str)) rb_error_frozen("string");
01254     if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01255         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01256 }
01257 
01258 static inline int
01259 str_independent(VALUE str)
01260 {
01261     str_modifiable(str);
01262     if (!STR_SHARED_P(str)) return 1;
01263     if (STR_EMBED_P(str)) return 1;
01264     return 0;
01265 }
01266 
01267 static void
01268 str_make_independent(VALUE str)
01269 {
01270     char *ptr;
01271     long len = RSTRING_LEN(str);
01272 
01273     ptr = ALLOC_N(char, len+1);
01274     if (RSTRING_PTR(str)) {
01275         memcpy(ptr, RSTRING_PTR(str), len);
01276     }
01277     STR_SET_NOEMBED(str);
01278     ptr[len] = 0;
01279     RSTRING(str)->as.heap.ptr = ptr;
01280     RSTRING(str)->as.heap.len = len;
01281     RSTRING(str)->as.heap.aux.capa = len;
01282     STR_UNSET_NOCAPA(str);
01283 }
01284 
01285 void
01286 rb_str_modify(VALUE str)
01287 {
01288     if (!str_independent(str))
01289         str_make_independent(str);
01290     ENC_CODERANGE_CLEAR(str);
01291 }
01292 
01293 /* As rb_str_modify(), but don't clear coderange */
01294 static void
01295 str_modify_keep_cr(VALUE str)
01296 {
01297     if (!str_independent(str))
01298         str_make_independent(str);
01299     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01300         /* Force re-scan later */
01301         ENC_CODERANGE_CLEAR(str);
01302 }
01303 
01304 static inline void
01305 str_discard(VALUE str)
01306 {
01307     str_modifiable(str);
01308     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01309         xfree(RSTRING_PTR(str));
01310         RSTRING(str)->as.heap.ptr = 0;
01311         RSTRING(str)->as.heap.len = 0;
01312     }
01313 }
01314 
01315 void
01316 rb_str_associate(VALUE str, VALUE add)
01317 {
01318     /* sanity check */
01319     if (OBJ_FROZEN(str)) rb_error_frozen("string");
01320     if (STR_ASSOC_P(str)) {
01321         /* already associated */
01322         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01323     }
01324     else {
01325         if (STR_SHARED_P(str)) {
01326             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01327             str_make_independent(str);
01328             if (STR_ASSOC_P(assoc)) {
01329                 assoc = RSTRING(assoc)->as.heap.aux.shared;
01330                 rb_ary_concat(assoc, add);
01331                 add = assoc;
01332             }
01333         }
01334         else if (STR_EMBED_P(str)) {
01335             str_make_independent(str);
01336         }
01337         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
01338             RESIZE_CAPA(str, RSTRING_LEN(str));
01339         }
01340         FL_SET(str, STR_ASSOC);
01341         RBASIC(add)->klass = 0;
01342         RSTRING(str)->as.heap.aux.shared = add;
01343     }
01344 }
01345 
01346 VALUE
01347 rb_str_associated(VALUE str)
01348 {
01349     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01350     if (STR_ASSOC_P(str)) {
01351         return RSTRING(str)->as.heap.aux.shared;
01352     }
01353     return Qfalse;
01354 }
01355 
01356 VALUE
01357 rb_string_value(volatile VALUE *ptr)
01358 {
01359     VALUE s = *ptr;
01360     if (TYPE(s) != T_STRING) {
01361         s = rb_str_to_str(s);
01362         *ptr = s;
01363     }
01364     return s;
01365 }
01366 
01367 char *
01368 rb_string_value_ptr(volatile VALUE *ptr)
01369 {
01370     VALUE str = rb_string_value(ptr);
01371     return RSTRING_PTR(str);
01372 }
01373 
01374 char *
01375 rb_string_value_cstr(volatile VALUE *ptr)
01376 {
01377     VALUE str = rb_string_value(ptr);
01378     char *s = RSTRING_PTR(str);
01379     long len = RSTRING_LEN(str);
01380 
01381     if (!s || memchr(s, 0, len)) {
01382         rb_raise(rb_eArgError, "string contains null byte");
01383     }
01384     if (s[len]) {
01385         rb_str_modify(str);
01386         s = RSTRING_PTR(str);
01387         s[RSTRING_LEN(str)] = 0;
01388     }
01389     return s;
01390 }
01391 
01392 VALUE
01393 rb_check_string_type(VALUE str)
01394 {
01395     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01396     return str;
01397 }
01398 
01399 /*
01400  *  call-seq:
01401  *     String.try_convert(obj) -> string or nil
01402  *
01403  *  Try to convert <i>obj</i> into a String, using to_str method.
01404  *  Returns converted string or nil if <i>obj</i> cannot be converted
01405  *  for any reason.
01406  *
01407  *     String.try_convert("str")     #=> "str"
01408  *     String.try_convert(/re/)      #=> nil
01409  */
01410 static VALUE
01411 rb_str_s_try_convert(VALUE dummy, VALUE str)
01412 {
01413     return rb_check_string_type(str);
01414 }
01415 
01416 char*
01417 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01418 {
01419     if (rb_enc_mbmaxlen(enc) == 1) {
01420         p += nth;
01421     }
01422     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01423         p += nth * rb_enc_mbmaxlen(enc);
01424     }
01425     else if (rb_enc_asciicompat(enc)) {
01426         const char *p2, *e2;
01427         int n;
01428 
01429         while (p < e && 0 < nth) {
01430             e2 = p + nth;
01431             if (e < e2)
01432                 return (char *)e;
01433             if (ISASCII(*p)) {
01434                 p2 = search_nonascii(p, e2);
01435                 if (!p2)
01436                     return (char *)e2;
01437                 nth -= p2 - p;
01438                 p = p2;
01439             }
01440             n = rb_enc_mbclen(p, e, enc);
01441             p += n;
01442             nth--;
01443         }
01444         if (nth != 0)
01445             return (char *)e;
01446         return (char *)p;
01447     }
01448     else {
01449         while (p<e && nth--) {
01450             p += rb_enc_mbclen(p, e, enc);
01451         }
01452     }
01453     if (p > e) p = e;
01454     return (char*)p;
01455 }
01456 
01457 static char*
01458 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01459 {
01460     if (singlebyte)
01461         p += nth;
01462     else {
01463         p = rb_enc_nth(p, e, nth, enc);
01464     }
01465     if (!p) return 0;
01466     if (p > e) p = e;
01467     return (char *)p;
01468 }
01469 
01470 /* char offset to byte offset */
01471 static long
01472 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01473 {
01474     const char *pp = str_nth(p, e, nth, enc, singlebyte);
01475     if (!pp) return e - p;
01476     return pp - p;
01477 }
01478 
01479 long
01480 rb_str_offset(VALUE str, long pos)
01481 {
01482     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01483                       STR_ENC_GET(str), single_byte_optimizable(str));
01484 }
01485 
01486 #ifdef NONASCII_MASK
01487 static char *
01488 str_utf8_nth(const char *p, const char *e, long nth)
01489 {
01490     if ((int)SIZEOF_VALUE < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01491         const VALUE *s, *t;
01492         const VALUE lowbits = sizeof(VALUE) - 1;
01493         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01494         t = (const VALUE*)(~lowbits & (VALUE)e);
01495         while (p < (const char *)s) {
01496             if (is_utf8_lead_byte(*p)) nth--;
01497             p++;
01498         }
01499         do {
01500             nth -= count_utf8_lead_bytes_with_word(s);
01501             s++;
01502         } while (s < t && (int)sizeof(VALUE) <= nth);
01503         p = (char *)s;
01504     }
01505     while (p < e) {
01506         if (is_utf8_lead_byte(*p)) {
01507             if (nth == 0) break;
01508             nth--;
01509         }
01510         p++;
01511     }
01512     return (char *)p;
01513 }
01514 
01515 static long
01516 str_utf8_offset(const char *p, const char *e, long nth)
01517 {
01518     const char *pp = str_utf8_nth(p, e, nth);
01519     return pp - p;
01520 }
01521 #endif
01522 
01523 /* byte offset to char offset */
01524 long
01525 rb_str_sublen(VALUE str, long pos)
01526 {
01527     if (single_byte_optimizable(str) || pos < 0)
01528         return pos;
01529     else {
01530         char *p = RSTRING_PTR(str);
01531         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01532     }
01533 }
01534 
01535 VALUE
01536 rb_str_subseq(VALUE str, long beg, long len)
01537 {
01538     VALUE str2;
01539 
01540     if (RSTRING_LEN(str) == beg + len &&
01541         RSTRING_EMBED_LEN_MAX < len) {
01542         str2 = rb_str_new_shared(rb_str_new_frozen(str));
01543         rb_str_drop_bytes(str2, beg);
01544     }
01545     else {
01546         str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01547     }
01548 
01549     rb_enc_cr_str_copy_for_substr(str2, str);
01550     OBJ_INFECT(str2, str);
01551 
01552     return str2;
01553 }
01554 
/*
 * Returns the substring of str that starts at character index beg and
 * spans at most len characters, or Qnil when len is negative or beg
 * lies outside the string.  A negative beg counts from the end.
 *
 * On entry beg and len are character counts.  By the time control
 * reaches the `sub' label, p points at the first byte of the result
 * (or is 0 for an empty result) and len has been converted to a byte
 * count.
 */
VALUE
rb_str_substr(VALUE str, long beg, long len)
{
    rb_encoding *enc = STR_ENC_GET(str);
    VALUE str2;
    char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);

    if (len < 0) return Qnil;
    if (!RSTRING_LEN(str)) {
        len = 0;
    }
    /* bytes and characters coincide: plain index arithmetic */
    if (single_byte_optimizable(str)) {
        if (beg > RSTRING_LEN(str)) return Qnil;
        if (beg < 0) {
            beg += RSTRING_LEN(str);
            if (beg < 0) return Qnil;
        }
        if (beg + len > RSTRING_LEN(str))
            len = RSTRING_LEN(str) - beg;
        if (len <= 0) {
            len = 0;
            p = 0;
        }
        else
            p = s + beg;
        goto sub;
    }
    if (beg < 0) {
        /* cannot take more characters than lie between beg and the end */
        if (len > -beg) len = -beg;
        /* start is near the end of a long string: walk backwards from
         * the end instead of counting the whole string */
        if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
            beg = -beg;
            /* move e back to the end of the requested span */
            while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
            p = e;
            if (!p) return Qnil;
            /* then move p back len characters to the start of the span */
            while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
            if (!p) return Qnil;
            len = e - p;
            goto sub;
        }
        else {
            beg += str_strlen(str, enc);
            if (beg < 0) return Qnil;
        }
    }
    else if (beg > 0 && beg > str_strlen(str, enc)) {
        return Qnil;
    }
    if (len == 0) {
        p = 0;
    }
#ifdef NONASCII_MASK
    /* known-valid UTF-8: locate both ends by counting lead bytes */
    else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
        enc == rb_utf8_encoding()) {
        p = str_utf8_nth(s, e, beg);
        len = str_utf8_offset(p, e, len);
    }
#endif
    /* fixed-width encoding: scale character counts to bytes */
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        int char_sz = rb_enc_mbmaxlen(enc);

        p = s + beg * char_sz;
        if (p > e) {
            p = e;
            len = 0;
        }
        else if (len * char_sz > e - p)
            len = e - p;
        else
            len *= char_sz;
    }
    /* general case: walk to the start, then measure the span in bytes */
    else if ((p = str_nth(s, e, beg, enc, 0)) == e) {
        len = 0;
    }
    else {
        len = str_offset(p, e, len, enc, 0);
    }
  sub:
    /* a tail too long to embed shares the original buffer instead of
     * copying it.
     * NOTE(review): on the multibyte paths beg is still a character
     * index (or was consumed by the backwards walk) while len is a byte
     * count, so this sum mixes units -- confirm this is intended. */
    if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
        str2 = rb_str_new4(str);
        str2 = str_new3(rb_obj_class(str2), str2);
        RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
        RSTRING(str2)->as.heap.len = len;
    }
    else {
        str2 = rb_str_new5(str, p, len);
        rb_enc_cr_str_copy_for_substr(str2, str);
        OBJ_INFECT(str2, str);
    }

    return str2;
}
01646 
01647 VALUE
01648 rb_str_freeze(VALUE str)
01649 {
01650     if (STR_ASSOC_P(str)) {
01651         VALUE ary = RSTRING(str)->as.heap.aux.shared;
01652         OBJ_FREEZE(ary);
01653     }
01654     return rb_obj_freeze(str);
01655 }
01656 
/*
 * Keeps the old name rb_str_dup_frozen available: RUBY_ALIAS_FUNCTION
 * (defined outside this file) presumably emits rb_str_dup_frozen as an
 * alias of rb_str_new_frozen -- TODO confirm against its definition.
 * The #define then maps later uses of the old name in this file onto
 * rb_str_new_frozen.
 */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
01659 
01660 VALUE
01661 rb_str_locktmp(VALUE str)
01662 {
01663     if (FL_TEST(str, STR_TMPLOCK)) {
01664         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01665     }
01666     FL_SET(str, STR_TMPLOCK);
01667     return str;
01668 }
01669 
01670 VALUE
01671 rb_str_unlocktmp(VALUE str)
01672 {
01673     if (!FL_TEST(str, STR_TMPLOCK)) {
01674         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01675     }
01676     FL_UNSET(str, STR_TMPLOCK);
01677     return str;
01678 }
01679 
01680 void
01681 rb_str_set_len(VALUE str, long len)
01682 {
01683     rb_str_modify(str);
01684     STR_SET_LEN(str, len);
01685     RSTRING_PTR(str)[len] = '\0';
01686 }
01687 
01688 VALUE
01689 rb_str_resize(VALUE str, long len)
01690 {
01691     long slen;
01692 
01693     if (len < 0) {
01694         rb_raise(rb_eArgError, "negative string size (or size too big)");
01695     }
01696 
01697     rb_str_modify(str);
01698     slen = RSTRING_LEN(str);
01699     if (len != slen) {
01700         if (STR_EMBED_P(str)) {
01701             char *ptr;
01702             if (len <= RSTRING_EMBED_LEN_MAX) {
01703                 STR_SET_EMBED_LEN(str, len);
01704                 RSTRING(str)->as.ary[len] = '\0';
01705                 return str;
01706             }
01707             ptr = ALLOC_N(char,len+1);
01708             MEMCPY(ptr, RSTRING(str)->as.ary, char, slen);
01709             RSTRING(str)->as.heap.ptr = ptr;
01710             STR_SET_NOEMBED(str);
01711         }
01712         else if (len <= RSTRING_EMBED_LEN_MAX) {
01713             char *ptr = RSTRING(str)->as.heap.ptr;
01714             STR_SET_EMBED(str);
01715             if (slen > len) slen = len;
01716             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
01717             RSTRING(str)->as.ary[len] = '\0';
01718             STR_SET_EMBED_LEN(str, len);
01719             xfree(ptr);
01720             return str;
01721         }
01722         else if (slen < len || slen - len > 1024) {
01723             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01724         }
01725         if (!STR_NOCAPA_P(str)) {
01726             RSTRING(str)->as.heap.aux.capa = len;
01727         }
01728         RSTRING(str)->as.heap.len = len;
01729         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
01730     }
01731     return str;
01732 }
01733 
01734 static VALUE
01735 str_buf_cat(VALUE str, const char *ptr, long len)
01736 {
01737     long capa, total, off = -1;
01738 
01739     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01740         off = ptr - RSTRING_PTR(str);
01741     }
01742     rb_str_modify(str);
01743     if (len == 0) return 0;
01744     if (STR_ASSOC_P(str)) {
01745         FL_UNSET(str, STR_ASSOC);
01746         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01747     }
01748     else if (STR_EMBED_P(str)) {
01749         capa = RSTRING_EMBED_LEN_MAX;
01750     }
01751     else {
01752         capa = RSTRING(str)->as.heap.aux.capa;
01753     }
01754     if (RSTRING_LEN(str) >= LONG_MAX - len) {
01755         rb_raise(rb_eArgError, "string sizes too big");
01756     }
01757     total = RSTRING_LEN(str)+len;
01758     if (capa <= total) {
01759         while (total > capa) {
01760             if (capa + 1 >= LONG_MAX / 2) {
01761                 capa = (total + 4095) / 4096;
01762                 break;
01763             }
01764             capa = (capa + 1) * 2;
01765         }
01766         RESIZE_CAPA(str, capa);
01767     }
01768     if (off != -1) {
01769         ptr = RSTRING_PTR(str) + off;
01770     }
01771     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01772     STR_SET_LEN(str, total);
01773     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
01774 
01775     return str;
01776 }
01777 
01778 #define str_buf_cat2(str, ptr) str_buf_cat(str, (ptr), strlen(ptr))
01779 
01780 VALUE
01781 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01782 {
01783     if (len == 0) return str;
01784     if (len < 0) {
01785         rb_raise(rb_eArgError, "negative string size (or size too big)");
01786     }
01787     return str_buf_cat(str, ptr, len);
01788 }
01789 
01790 VALUE
01791 rb_str_buf_cat2(VALUE str, const char *ptr)
01792 {
01793     return rb_str_buf_cat(str, ptr, strlen(ptr));
01794 }
01795 
01796 VALUE
01797 rb_str_cat(VALUE str, const char *ptr, long len)
01798 {
01799     if (len < 0) {
01800         rb_raise(rb_eArgError, "negative string size (or size too big)");
01801     }
01802     if (STR_ASSOC_P(str)) {
01803         rb_str_modify(str);
01804         if (STR_EMBED_P(str)) str_make_independent(str);
01805         REALLOC_N(RSTRING(str)->as.heap.ptr, char, RSTRING(str)->as.heap.len+len+1);
01806         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, ptr, len);
01807         RSTRING(str)->as.heap.len += len;
01808         RSTRING(str)->as.heap.ptr[RSTRING(str)->as.heap.len] = '\0'; /* sentinel */
01809         return str;
01810     }
01811 
01812     return rb_str_buf_cat(str, ptr, len);
01813 }
01814 
01815 VALUE
01816 rb_str_cat2(VALUE str, const char *ptr)
01817 {
01818     return rb_str_cat(str, ptr, strlen(ptr));
01819 }
01820 
/*
 * Appends len bytes at ptr, tagged with encoding index ptr_encindex and
 * code range ptr_cr, to str, then sets str's encoding and code range
 * from the combination of both sides.  Returns str.
 *
 * ptr_cr may be ENC_CODERANGE_UNKNOWN; it is scanned here when the
 * decision needs it.  If ptr_cr_ret is non-NULL it receives the code
 * range finally used for the appended bytes.
 *
 * Raises EncCompatError when the encodings differ and cannot be
 * combined, and ArgumentError when len is negative.
 */
static VALUE
rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
    int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
{
    int str_encindex = ENCODING_GET(str);
    int res_encindex;
    int str_cr, res_cr;
    int str_a8 = ENCODING_IS_ASCII8BIT(str);
    /* presumably encoding index 0 is ASCII-8BIT -- TODO confirm */
    int ptr_a8 = ptr_encindex == 0;

    str_cr = ENC_CODERANGE(str);

    if (str_encindex == ptr_encindex) {
        /* same encoding: scan the new bytes only if the result's code
         * range can still be derived from the two parts */
        if (str_cr == ENC_CODERANGE_UNKNOWN ||
            (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
            ptr_cr = ENC_CODERANGE_UNKNOWN;
        }
        else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
        }
    }
    else {
        rb_encoding *str_enc = rb_enc_from_index(str_encindex);
        rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
        if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* an ASCII-incompatible side only combines when one side is empty */
            if (len == 0)
                return str;
            if (RSTRING_LEN(str) == 0) {
                rb_str_buf_cat(str, ptr, len);
                ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
                return str;
            }
            goto incompatible;
        }
        if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, ptr_enc);
        }
        if (str_cr == ENC_CODERANGE_UNKNOWN) {
            if (str_a8 || ptr_cr != ENC_CODERANGE_7BIT) {
                str_cr = rb_enc_str_coderange(str);
            }
        }
    }
    if (ptr_cr_ret)
        *ptr_cr_ret = ptr_cr;

    /* different encodings are only acceptable when at least one side
     * is 7-bit */
    if (str_encindex != ptr_encindex &&
        str_cr != ENC_CODERANGE_7BIT &&
        ptr_cr != ENC_CODERANGE_7BIT) {
      incompatible:
        rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
            rb_enc_name(rb_enc_from_index(str_encindex)),
            rb_enc_name(rb_enc_from_index(ptr_encindex)));
    }

    /* choose the encoding and code range of the concatenation */
    if (str_cr == ENC_CODERANGE_UNKNOWN) {
        res_encindex = str_encindex;
        res_cr = ENC_CODERANGE_UNKNOWN;
    }
    else if (str_cr == ENC_CODERANGE_7BIT) {
        if (ptr_cr == ENC_CODERANGE_7BIT) {
            res_encindex = !str_a8 ? str_encindex : ptr_encindex;
            res_cr = ENC_CODERANGE_7BIT;
        }
        else {
            /* receiver is 7-bit: the appended part decides */
            res_encindex = ptr_encindex;
            res_cr = ptr_cr;
        }
    }
    else if (str_cr == ENC_CODERANGE_VALID) {
        res_encindex = str_encindex;
        if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
            res_cr = str_cr;
        else
            res_cr = ptr_cr;
    }
    else { /* str_cr == ENC_CODERANGE_BROKEN */
        res_encindex = str_encindex;
        res_cr = str_cr;
        /* appended bytes may change the broken status; force a re-scan */
        if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
    }

    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    str_buf_cat(str, ptr, len);
    ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
    return str;
}
01910 
01911 VALUE
01912 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
01913 {
01914     return rb_enc_cr_str_buf_cat(str, ptr, len,
01915         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
01916 }
01917 
01918 VALUE
01919 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
01920 {
01921     /* ptr must reference NUL terminated ASCII string. */
01922     int encindex = ENCODING_GET(str);
01923     rb_encoding *enc = rb_enc_from_index(encindex);
01924     if (rb_enc_asciicompat(enc)) {
01925         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
01926             encindex, ENC_CODERANGE_7BIT, 0);
01927     }
01928     else {
01929         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
01930         while (*ptr) {
01931             unsigned int c = (unsigned char)*ptr;
01932             int len = rb_enc_codelen(c, enc);
01933             rb_enc_mbcput(c, buf, enc);
01934             rb_enc_cr_str_buf_cat(str, buf, len,
01935                 encindex, ENC_CODERANGE_VALID, 0);
01936             ptr++;
01937         }
01938         return str;
01939     }
01940 }
01941 
01942 VALUE
01943 rb_str_buf_append(VALUE str, VALUE str2)
01944 {
01945     int str2_cr;
01946 
01947     str2_cr = ENC_CODERANGE(str2);
01948 
01949     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
01950         ENCODING_GET(str2), str2_cr, &str2_cr);
01951 
01952     OBJ_INFECT(str, str2);
01953     ENC_CODERANGE_SET(str2, str2_cr);
01954 
01955     return str;
01956 }
01957 
01958 VALUE
01959 rb_str_append(VALUE str, VALUE str2)
01960 {
01961     rb_encoding *enc;
01962     int cr, cr2;
01963 
01964     StringValue(str2);
01965     if (RSTRING_LEN(str2) > 0 && STR_ASSOC_P(str)) {
01966         long len = RSTRING_LEN(str)+RSTRING_LEN(str2);
01967         enc = rb_enc_check(str, str2);
01968         cr = ENC_CODERANGE(str);
01969         if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
01970         rb_str_modify(str);
01971         REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01972         memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
01973                RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
01974         RSTRING(str)->as.heap.len = len;
01975         rb_enc_associate(str, enc);
01976         ENC_CODERANGE_SET(str, cr);
01977         OBJ_INFECT(str, str2);
01978         return str;
01979     }
01980     return rb_str_buf_append(str, str2);
01981 }
01982 
01983 
01984 /*
01985  *  call-seq:
01986  *     str << integer       -> str
01987  *     str.concat(integer)  -> str
01988  *     str << obj           -> str
01989  *     str.concat(obj)      -> str
01990  *
01991  *  Append---Concatenates the given object to <i>str</i>. If the object is a
01992  *  <code>Integer</code>, it is considered as a codepoint, and is converted
01993  *  to a character before concatenation.
01994  *
01995  *     a = "hello "
01996  *     a << "world"   #=> "hello world"
01997  *     a.concat(33)   #=> "hello world!"
01998  */
01999 
02000 VALUE
02001 rb_str_concat(VALUE str1, VALUE str2)
02002 {
02003     unsigned int lc;
02004 
02005     if (FIXNUM_P(str2)) {
02006         if ((int)str2 < 0)
02007             rb_raise(rb_eRangeError, "negative argument");
02008         lc = FIX2UINT(str2);
02009     }
02010     else if (TYPE(str2) == T_BIGNUM) {
02011         if (!RBIGNUM_SIGN(str2))
02012             rb_raise(rb_eRangeError, "negative argument");
02013         lc = NUM2UINT(str2);
02014     }
02015     else {
02016         return rb_str_append(str1, str2);
02017     }
02018 #if SIZEOF_INT < SIZEOF_VALUE
02019     if ((VALUE)lc > UINT_MAX) {
02020         rb_raise(rb_eRangeError, "%"PRIuVALUE" out of char range", lc);
02021     }
02022 #endif
02023     {
02024         rb_encoding *enc = STR_ENC_GET(str1);
02025         long pos = RSTRING_LEN(str1);
02026         int cr = ENC_CODERANGE(str1);
02027         int len;
02028 
02029         if ((len = rb_enc_codelen(lc, enc)) <= 0) {
02030             rb_raise(rb_eRangeError, "%u invalid char", lc);
02031         }
02032         rb_str_resize(str1, pos+len);
02033         rb_enc_mbcput(lc, RSTRING_PTR(str1)+pos, enc);
02034         if (cr == ENC_CODERANGE_7BIT && lc > 127)
02035             cr = ENC_CODERANGE_VALID;
02036         ENC_CODERANGE_SET(str1, cr);
02037         return str1;
02038     }
02039 }
02040 
02041 st_index_t
02042 rb_memhash(const void *ptr, long len)
02043 {
02044     return st_hash(ptr, len, rb_hash_start(0));
02045 }
02046 
02047 st_index_t
02048 rb_str_hash(VALUE str)
02049 {
02050     int e = ENCODING_GET(str);
02051     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02052         e = 0;
02053     }
02054     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02055 }
02056 
02057 int
02058 rb_str_hash_cmp(VALUE str1, VALUE str2)
02059 {
02060     long len;
02061 
02062     if (!rb_str_comparable(str1, str2)) return 1;
02063     if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02064         memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02065         return 0;
02066     }
02067     return 1;
02068 }
02069 
02070 /*
02071  * call-seq:
02072  *    str.hash   -> fixnum
02073  *
02074  * Returns a hash based on the string's length, content and encoding.
02075  */
02076 
02077 static VALUE
02078 rb_str_hash_m(VALUE str)
02079 {
02080     st_index_t hval = rb_str_hash(str);
02081     return INT2FIX(hval);
02082 }
02083 
/* Smaller of a and b.  Being a macro, it expands each argument twice. */
02084 #define lesser(a,b) (((a)>(b))?(b):(a))
02085 
02086 int
02087 rb_str_comparable(VALUE str1, VALUE str2)
02088 {
02089     int idx1, idx2;
02090     int rc1, rc2;
02091 
02092     if (RSTRING_LEN(str1) == 0) return TRUE;
02093     if (RSTRING_LEN(str2) == 0) return TRUE;
02094     idx1 = ENCODING_GET(str1);
02095     idx2 = ENCODING_GET(str2);
02096     if (idx1 == idx2) return TRUE;
02097     rc1 = rb_enc_str_coderange(str1);
02098     rc2 = rb_enc_str_coderange(str2);
02099     if (rc1 == ENC_CODERANGE_7BIT) {
02100         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02101         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02102             return TRUE;
02103     }
02104     if (rc2 == ENC_CODERANGE_7BIT) {
02105         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02106             return TRUE;
02107     }
02108     return FALSE;
02109 }
02110 
02111 int
02112 rb_str_cmp(VALUE str1, VALUE str2)
02113 {
02114     long len;
02115     int retval;
02116 
02117     len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2));
02118     retval = memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len);
02119     if (retval == 0) {
02120         if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) {
02121             if (!rb_str_comparable(str1, str2)) {
02122                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02123                     return 1;
02124                 return -1;
02125             }
02126             return 0;
02127         }
02128         if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return 1;
02129         return -1;
02130     }
02131     if (retval > 0) return 1;
02132     return -1;
02133 }
02134 
02135 /* expect tail call optimization */
02136 static VALUE
02137 str_eql(const VALUE str1, const VALUE str2)
02138 {
02139     const long len = RSTRING_LEN(str1);
02140 
02141     if (len != RSTRING_LEN(str2)) return Qfalse;
02142     if (!rb_str_comparable(str1, str2)) return Qfalse;
02143     if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0)
02144         return Qtrue;
02145     return Qfalse;
02146 }
02147 /*
02148  *  call-seq:
02149  *     str == obj   -> true or false
02150  *
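02151  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
02152  *  <code>false</code>, unless <i>obj</i> responds to <code>to_str</code>, in
02153  *  which case the result of <code>obj == str</code> is returned. Otherwise,
       *  returns <code>true</code> if <i>str</i> <code><=></code> <i>obj</i>
       *  returns zero.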
02154  */
02155 
02156 VALUE
02157 rb_str_equal(VALUE str1, VALUE str2)
02158 {
02159     if (str1 == str2) return Qtrue;
02160     if (TYPE(str2) != T_STRING) {
02161         if (!rb_respond_to(str2, rb_intern("to_str"))) {
02162             return Qfalse;
02163         }
02164         return rb_equal(str2, str1);
02165     }
02166     return str_eql(str1, str2);
02167 }
02168 
02169 /*
02170  * call-seq:
02171  *   str.eql?(other)   -> true or false
02172  *
02173  * Two strings are equal if they have the same length and content.
02174  */
02175 
02176 static VALUE
02177 rb_str_eql(VALUE str1, VALUE str2)
02178 {
02179     if (TYPE(str2) != T_STRING) return Qfalse;
02180     return str_eql(str1, str2);
02181 }
02182 
02183 /*
02184  *  call-seq:
02185  *     str <=> other_str   -> -1, 0, +1 or nil
02186  *
02187  *  Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
02188  *  <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
02189  *  <i>str</i>. If the strings are of different lengths, and the strings are
02190  *  equal when compared up to the shortest length, then the longer string is
02191  *  considered greater than the shorter one. In older versions of Ruby, setting
02192  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
02193  *  in favor of using <code>String#casecmp</code>.
02194  *
02195  *  <code><=></code> is the basis for the methods <code><</code>,
02196  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
02197  *  included from module <code>Comparable</code>.  The method
02198  *  <code>String#==</code> does not use <code>Comparable#==</code>.
02199  *
02200  *     "abcdef" <=> "abcde"     #=> 1
02201  *     "abcdef" <=> "abcdef"    #=> 0
02202  *     "abcdef" <=> "abcdefg"   #=> -1
02203  *     "abcdef" <=> "ABCDEF"    #=> 1
02204  */
02205 
02206 static VALUE
02207 rb_str_cmp_m(VALUE str1, VALUE str2)
02208 {
02209     long result;
02210 
02211     if (TYPE(str2) != T_STRING) {
02212         if (!rb_respond_to(str2, rb_intern("to_str"))) {
02213             return Qnil;
02214         }
02215         else if (!rb_respond_to(str2, rb_intern("<=>"))) {
02216             return Qnil;
02217         }
02218         else {
02219             VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
02220 
02221             if (NIL_P(tmp)) return Qnil;
02222             if (!FIXNUM_P(tmp)) {
02223                 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
02224             }
02225             result = -FIX2LONG(tmp);
02226         }
02227     }
02228     else {
02229         result = rb_str_cmp(str1, str2);
02230     }
02231     return LONG2NUM(result);
02232 }
02233 
02234 /*
02235  *  call-seq:
02236  *     str.casecmp(other_str)   -> -1, 0, +1 or nil
02237  *
02238  *  Case-insensitive version of <code>String#<=></code>.
02239  *
02240  *     "abcdef".casecmp("abcde")     #=> 1
02241  *     "aBcDeF".casecmp("abcdef")    #=> 0
02242  *     "abcdef".casecmp("abcdefg")   #=> -1
02243  *     "abcdef".casecmp("ABCDEF")    #=> 0
02244  */
02245 
02246 static VALUE
02247 rb_str_casecmp(VALUE str1, VALUE str2)
02248 {
02249     long len;
02250     rb_encoding *enc;
02251     char *p1, *p1end, *p2, *p2end;
02252 
02253     StringValue(str2);
02254     enc = rb_enc_compatible(str1, str2);
02255     if (!enc) {
02256         return Qnil;
02257     }
02258 
02259     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02260     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02261     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02262         while (p1 < p1end && p2 < p2end) {
02263             if (*p1 != *p2) {
02264                 unsigned int c1 = TOUPPER(*p1 & 0xff);
02265                 unsigned int c2 = TOUPPER(*p2 & 0xff);
02266                 if (c1 != c2)
02267                     return INT2FIX(c1 < c2 ? -1 : 1);
02268             }
02269             p1++;
02270             p2++;
02271         }
02272     }
02273     else {
02274         while (p1 < p1end && p2 < p2end) {
02275             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02276             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02277 
02278             if (0 <= c1 && 0 <= c2) {
02279                 c1 = TOUPPER(c1);
02280                 c2 = TOUPPER(c2);
02281                 if (c1 != c2)
02282                     return INT2FIX(c1 < c2 ? -1 : 1);
02283             }
02284             else {
02285                 int r;
02286                 l1 = rb_enc_mbclen(p1, p1end, enc);
02287                 l2 = rb_enc_mbclen(p2, p2end, enc);
02288                 len = l1 < l2 ? l1 : l2;
02289                 r = memcmp(p1, p2, len);
02290                 if (r != 0)
02291                     return INT2FIX(r < 0 ? -1 : 1);
02292                 if (l1 != l2)
02293                     return INT2FIX(l1 < l2 ? -1 : 1);
02294             }
02295             p1 += l1;
02296             p2 += l2;
02297         }
02298     }
02299     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02300     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02301     return INT2FIX(-1);
02302 }
02303 
/*
 * Searches str for sub, starting at position offset (a negative offset has
 * the str_strlen() length of str added to it first).  Returns -1 when sub
 * is a broken string, when the offset is out of range, when sub does not
 * fit in the remainder, or when nothing is found.
 * NOTE(review): the value returned on success is built from str_offset()
 * and rb_memsearch() results, so it looks like a byte offset; the caller
 * rb_str_index_m passes it through rb_str_sublen() -- confirm the unit
 * before relying on it.
 */
02304 static long
02305 rb_str_index(VALUE str, VALUE sub, long offset)
02306 {
02307     long pos;
02308     char *s, *sptr, *e;
02309     long len, slen;
02310     rb_encoding *enc;
02311 
02312     enc = rb_enc_check(str, sub);
02313     if (is_broken_string(sub)) {
02314         return -1;
02315     }
          /* len and slen hold str_strlen() results here; further down both
             are overwritten with values derived from RSTRING_LEN() */
02316     len = str_strlen(str, enc);
02317     slen = str_strlen(sub, enc);
02318     if (offset < 0) {
02319         offset += len;
02320         if (offset < 0) return -1;
02321     }
02322     if (len - offset < slen) return -1;
02323     s = RSTRING_PTR(str);
02324     e = s + RSTRING_LEN(str);
02325     if (offset) {
              /* from here on offset is whatever str_offset() returned, and
                 s is advanced by that amount */
02326         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02327         s += offset;
02328     }
          /* an empty sub matches right at the starting point */
02329     if (slen == 0) return offset;
02330     /* need proceed one character at a time */
02331     sptr = RSTRING_PTR(sub);
02332     slen = RSTRING_LEN(sub);
02333     len = RSTRING_LEN(str) - offset;
02334     for (;;) {
02335         char *t;
02336         pos = rb_memsearch(sptr, slen, s, len, enc);
02337         if (pos < 0) return pos;
              /* accept the hit only if rb_enc_right_char_head() maps it to
                 itself (presumably: the hit starts on a character head) */
02338         t = rb_enc_right_char_head(s, s+pos, e, enc);
02339         if (t == s + pos) break;
              /* otherwise restart the search at t, shrinking the remaining
                 length and growing offset by the distance skipped */
02340         if ((len -= t - s) <= 0) return -1;
02341         offset += t - s;
02342         s = t;
02343     }
02344     return pos + offset;
02345 }
02346 
02347 
02348 /*
02349  *  call-seq:
02350  *     str.index(substring [, offset])   -> fixnum or nil
02351  *     str.index(regexp [, offset])      -> fixnum or nil
02352  *
02353  *  Returns the index of the first occurrence of the given <i>substring</i> or
02354  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02355  *  found. If the second parameter is present, it specifies the position in the
02356  *  string to begin the search.
02357  *
02358  *     "hello".index('e')             #=> 1
02359  *     "hello".index('lo')            #=> 3
02360  *     "hello".index('a')             #=> nil
02361  *     "hello".index(?e)              #=> 1
02362  *     "hello".index(/[aeiou]/, -3)   #=> 4
02363  */
02364 
02365 static VALUE
02366 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02367 {
02368     VALUE sub;
02369     VALUE initpos;
02370     long pos;
02371 
02372     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02373         pos = NUM2LONG(initpos);
02374     }
02375     else {
02376         pos = 0;
02377     }
02378     if (pos < 0) {
02379         pos += str_strlen(str, STR_ENC_GET(str));
02380         if (pos < 0) {
02381             if (TYPE(sub) == T_REGEXP) {
02382                 rb_backref_set(Qnil);
02383             }
02384             return Qnil;
02385         }
02386     }
02387 
02388     switch (TYPE(sub)) {
02389       case T_REGEXP:
02390         if (pos > str_strlen(str, STR_ENC_GET(str)))
02391             return Qnil;
02392         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02393                          rb_enc_check(str, sub), single_byte_optimizable(str));
02394 
02395         pos = rb_reg_search(sub, str, pos, 0);
02396         pos = rb_str_sublen(str, pos);
02397         break;
02398 
02399       default: {
02400         VALUE tmp;
02401 
02402         tmp = rb_check_string_type(sub);
02403         if (NIL_P(tmp)) {
02404             rb_raise(rb_eTypeError, "type mismatch: %s given",
02405                      rb_obj_classname(sub));
02406         }
02407         sub = tmp;
02408       }
02409         /* fall through */
02410       case T_STRING:
02411         pos = rb_str_index(str, sub, pos);
02412         pos = rb_str_sublen(str, pos);
02413         break;
02414     }
02415 
02416     if (pos == -1) return Qnil;
02417     return LONG2NUM(pos);
02418 }
02419 
/*
 * Searches str backwards for sub, trying position pos first and then every
 * smaller position down to 0.  Returns the position at which the bytes of
 * sub were found, or -1 when sub is a broken string, is longer than str,
 * or is not found.
 * NOTE(review): pos is handed to str_nth() as an index and compared with
 * str_strlen() results, so it appears to count characters -- confirm.
 */
02420 static long
02421 rb_str_rindex(VALUE str, VALUE sub, long pos)
02422 {
02423     long len, slen;
02424     char *s, *sbeg, *e, *t;
02425     rb_encoding *enc;
02426     int singlebyte = single_byte_optimizable(str);
02427 
02428     enc = rb_enc_check(str, sub);
02429     if (is_broken_string(sub)) {
02430         return -1;
02431     }
02432     len = str_strlen(str, enc);
02433     slen = str_strlen(sub, enc);
02434     /* substring longer than string */
02435     if (len < slen) return -1;
          /* pull pos back so that sub still fits between pos and the end */
02436     if (len - pos < slen) {
02437         pos = len - slen;
02438     }
          /* an empty str got past the length check only with an empty sub */
02439     if (len == 0) {
02440         return pos;
02441     }
02442     sbeg = RSTRING_PTR(str);
02443     e = RSTRING_END(str);
02444     t = RSTRING_PTR(sub);
          /* from here on slen is the RSTRING_LEN() of sub, as memcmp() needs */
02445     slen = RSTRING_LEN(sub);
02446     for (;;) {
              /* s is what str_nth() returns for index pos in str */
02447         s = str_nth(sbeg, e, pos, enc, singlebyte);
02448         if (!s) return -1;
02449         if (memcmp(s, t, slen) == 0) {
02450             return pos;
02451         }
02452         if (pos == 0) break;
02453         pos--;
02454     }
02455     return -1;
02456 }
02457 
02458 
02459 /*
02460  *  call-seq:
02461  *     str.rindex(substring [, fixnum])   -> fixnum or nil
02462  *     str.rindex(regexp [, fixnum])   -> fixnum or nil
02463  *
02464  *  Returns the index of the last occurrence of the given <i>substring</i> or
02465  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02466  *  found. If the second parameter is present, it specifies the position in the
02467  *  string to end the search---characters beyond this point will not be
02468  *  considered.
02469  *
02470  *     "hello".rindex('e')             #=> 1
02471  *     "hello".rindex('l')             #=> 3
02472  *     "hello".rindex('a')             #=> nil
02473  *     "hello".rindex(?e)              #=> 1
02474  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
02475  */
02476 
02477 static VALUE
02478 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02479 {
02480     VALUE sub;
02481     VALUE vpos;
02482     rb_encoding *enc = STR_ENC_GET(str);
02483     long pos, len = str_strlen(str, enc);
02484 
02485     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02486         pos = NUM2LONG(vpos);
02487         if (pos < 0) {
02488             pos += len;
02489             if (pos < 0) {
02490                 if (TYPE(sub) == T_REGEXP) {
02491                     rb_backref_set(Qnil);
02492                 }
02493                 return Qnil;
02494             }
02495         }
02496         if (pos > len) pos = len;
02497     }
02498     else {
02499         pos = len;
02500     }
02501 
02502     switch (TYPE(sub)) {
02503       case T_REGEXP:
02504         /* enc = rb_get_check(str, sub); */
02505         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02506                          STR_ENC_GET(str), single_byte_optimizable(str));
02507 
02508         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02509             pos = rb_reg_search(sub, str, pos, 1);
02510             pos = rb_str_sublen(str, pos);
02511         }
02512         if (pos >= 0) return LONG2NUM(pos);
02513         break;
02514 
02515       default: {
02516         VALUE tmp;
02517 
02518         tmp = rb_check_string_type(sub);
02519         if (NIL_P(tmp)) {
02520             rb_raise(rb_eTypeError, "type mismatch: %s given",
02521                      rb_obj_classname(sub));
02522         }
02523         sub = tmp;
02524       }
02525         /* fall through */
02526       case T_STRING:
02527         pos = rb_str_rindex(str, sub, pos);
02528         if (pos >= 0) return LONG2NUM(pos);
02529         break;
02530     }
02531     return Qnil;
02532 }
02533 
02534 /*
02535  *  call-seq:
02536  *     str =~ obj   -> fixnum or nil
02537  *
02538  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
02539  *  against <i>str</i>, and returns the position the match starts, or
02540  *  <code>nil</code> if there is no match. Otherwise, invokes
02541  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
02542  *  <code>=~</code> in <code>Object</code> returns <code>nil</code>.
02543  *
02544  *     "cat o' 9 tails" =~ /\d/   #=> 7
02545  *     "cat o' 9 tails" =~ 9      #=> nil
02546  */
02547 
02548 static VALUE
02549 rb_str_match(VALUE x, VALUE y)
02550 {
02551     switch (TYPE(y)) {
02552       case T_STRING:
02553         rb_raise(rb_eTypeError, "type mismatch: String given");
02554 
02555       case T_REGEXP:
02556         return rb_reg_match(y, x);
02557 
02558       default:
02559         return rb_funcall(y, rb_intern("=~"), 1, x);
02560     }
02561 }
02562 
02563 
02564 static VALUE get_pat(VALUE, int);
02565 
02566 
02567 /*
02568  *  call-seq:
02569  *     str.match(pattern)        -> matchdata or nil
02570  *     str.match(pattern, pos)   -> matchdata or nil
02571  *
02572  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
02573  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
02574  *  parameter is present, it specifies the position in the string to begin the
02575  *  search.
02578  *
02579  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
02580  *     'hello'.match('(.)\1')[0]   #=> "ll"
02581  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
02582  *     'hello'.match('xx')         #=> nil
02583  *
02584  *  If a block is given, invokes the block with the MatchData if the match succeeds, so
02585  *  that you can write
02586  *
02587  *     str.match(pat) {|m| ...}
02588  *
02589  *  instead of
02590  *
02591  *     if m = str.match(pat)
02592  *       ...
02593  *     end
02594  *
02595  *  The return value is a value from block execution in this case.
02596  */
02597 
02598 static VALUE
02599 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02600 {
02601     VALUE re, result;
02602     if (argc < 1)
02603        rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
02604     re = argv[0];
02605     argv[0] = str;
02606     result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02607     if (!NIL_P(result) && rb_block_given_p()) {
02608         return rb_yield(result);
02609     }
02610     return result;
02611 }
02612 
/* Result codes of enc_succ_char(), enc_pred_char() and enc_succ_alnum_char(). */
02613 enum neighbor_char {
02614     NEIGHBOR_NOT_CHAR,  /* enc_succ_alnum_char(): not a digit/letter, or a range of one */
02615     NEIGHBOR_FOUND,     /* the bytes now hold a neighboring character of the same length */
02616     NEIGHBOR_WRAPPED    /* ran off the end: all bytes, or the digit/letter range, wrapped */
02617 };
02618 
/*
 * Steps the len bytes at p, in place, to a following byte sequence for
 * which rb_enc_precise_mbclen() reports a character of exactly len bytes.
 * The bytes are incremented like a counter whose last byte is the least
 * significant one.  Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED
 * when every byte was 0xff (they are all left as 0 in that case).
 */
02619 static enum neighbor_char
02620 enc_succ_char(char *p, long len, rb_encoding *enc)
02621 {
02622     long i;
02623     int l;
02624     while (1) {
              /* trailing 0xff bytes roll over to 0; i stops at the byte
                 that receives the carry */
02625         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02626             p[i] = '\0';
02627         if (i < 0)
02628             return NEIGHBOR_WRAPPED;
02629         ++((unsigned char*)p)[i];
02630         l = rb_enc_precise_mbclen(p, p+len, enc);
02631         if (MBCLEN_CHARFOUND_P(l)) {
02632             l = MBCLEN_CHARFOUND_LEN(l);
02633             if (l == len) {
02634                 return NEIGHBOR_FOUND;
02635             }
02636             else {
                      /* a character of another length was found: fill the
                         bytes after its first l bytes with 0xff, so the
                         next pass carries into byte l-1 */
02637                 memset(p+l, 0xff, len-l);
02638             }
02639         }
02640         if (MBCLEN_INVALID_P(l) && i < len-1) {
02641             long len2;
02642             int l2;
                  /* look for the longest proper prefix that is not
                     reported as invalid ... */
02643             for (len2 = len-1; 0 < len2; len2--) {
02644                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02645                 if (!MBCLEN_INVALID_P(l2))
02646                     break;
02647             }
                  /* ... and set every byte from index len2+1 on to 0xff
                     (presumably to skip sequences sharing the bad prefix) */
02648             memset(p+len2+1, 0xff, len-(len2+1));
02649         }
02650     }
02651 }
02652 
/*
 * Counterpart of enc_succ_char(): steps the len bytes at p, in place, to a
 * preceding byte sequence for which rb_enc_precise_mbclen() reports a
 * character of exactly len bytes.  The bytes are decremented like a
 * counter whose last byte is the least significant one.  Returns
 * NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED when every byte was 0
 * (they are all left as 0xff in that case).
 */
02653 static enum neighbor_char
02654 enc_pred_char(char *p, long len, rb_encoding *enc)
02655 {
02656     long i;
02657     int l;
02658     while (1) {
              /* trailing 0 bytes roll under to 0xff; i stops at the byte
                 that gives up the borrow */
02659         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02660             p[i] = '\xff';
02661         if (i < 0)
02662             return NEIGHBOR_WRAPPED;
02663         --((unsigned char*)p)[i];
02664         l = rb_enc_precise_mbclen(p, p+len, enc);
02665         if (MBCLEN_CHARFOUND_P(l)) {
02666             l = MBCLEN_CHARFOUND_LEN(l);
02667             if (l == len) {
02668                 return NEIGHBOR_FOUND;
02669             }
02670             else {
                      /* a character of another length was found: zero the
                         bytes after its first l bytes, so the next pass
                         borrows from byte l-1 */
02671                 memset(p+l, 0, len-l);
02672             }
02673         }
02674         if (MBCLEN_INVALID_P(l) && i < len-1) {
02675             long len2;
02676             int l2;
                  /* look for the longest proper prefix that is not
                     reported as invalid ... */
02677             for (len2 = len-1; 0 < len2; len2--) {
02678                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02679                 if (!MBCLEN_INVALID_P(l2))
02680                     break;
02681             }
                  /* ... and zero every byte from index len2+1 on
                     (presumably to skip sequences sharing the bad prefix) */
02682             memset(p+len2+1, 0, len-(len2+1));
02683         }
02684     }
02685 }
02686 
02687 /*
02688   Overwrites +p+ (+len+ bytes) with the succeeding letter in +enc+ and
02689   returns NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
02690   When NEIGHBOR_WRAPPED, the carried-out letter is stored into +carry+
        and +p+ is left holding the first letter of its range.
02691   This assumes that each range is contiguous and that mbclen never
02692   changes within a range.
02693   NEIGHBOR_NOT_CHAR is returned if the character is neither a digit nor
02694   alphabetic, or if its range has only one character.
02695  */
02696 static enum neighbor_char
02697 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02698 {
02699     enum neighbor_char ret;
02700     unsigned int c;
02701     int ctype;
02702     int range;
02703     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02704 
          /* classify the character: digit is tested first, then alpha */
02705     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02706     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02707         ctype = ONIGENC_CTYPE_DIGIT;
02708     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02709         ctype = ONIGENC_CTYPE_ALPHA;
02710     else
02711         return NEIGHBOR_NOT_CHAR;
02712 
          /* easy case: the plain successor is still of the same ctype */
02713     MEMCPY(save, p, char, len);
02714     ret = enc_succ_char(p, len, enc);
02715     if (ret == NEIGHBOR_FOUND) {
02716         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02717         if (rb_enc_isctype(c, ctype, enc))
02718             return NEIGHBOR_FOUND;
02719     }
          /* otherwise restore p and walk backwards with enc_pred_char()
             until the ctype changes; p ends on the first character of the
             range and range counts the characters visited */
02720     MEMCPY(p, save, char, len);
02721     range = 1;
02722     while (1) {
02723         MEMCPY(save, p, char, len);
02724         ret = enc_pred_char(p, len, enc);
02725         if (ret == NEIGHBOR_FOUND) {
02726             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02727             if (!rb_enc_isctype(c, ctype, enc)) {
02728                 MEMCPY(p, save, char, len);
02729                 break;
02730             }
02731         }
02732         else {
02733             MEMCPY(p, save, char, len);
02734             break;
02735         }
02736         range++;
02737     }
02738     if (range == 1) {
02739         return NEIGHBOR_NOT_CHAR;
02740     }
02741 
          /* for a letter the carry is the first character of the range */
02742     if (ctype != ONIGENC_CTYPE_DIGIT) {
02743         MEMCPY(carry, p, char, len);
02744         return NEIGHBOR_WRAPPED;
02745     }
02746 
          /* for a digit the carry is the successor of the first character
             of the range */
02747     MEMCPY(carry, p, char, len);
02748     enc_succ_char(carry, len, enc);
02749     return NEIGHBOR_WRAPPED;
02750 }
02751 
02752 
02753 /*
02754  *  call-seq:
02755  *     str.succ   -> new_str
02756  *     str.next   -> new_str
02757  *
02758  *  Returns the successor to <i>str</i>. The successor is calculated by
02759  *  incrementing characters starting from the rightmost alphanumeric (or
02760  *  the rightmost character if there are no alphanumerics) in the
02761  *  string. Incrementing a digit always results in another digit, and
02762  *  incrementing a letter results in another letter of the same case.
02763  *  Incrementing nonalphanumerics uses the underlying character set's
02764  *  collating sequence.
02765  *
02766  *  If the increment generates a ``carry,'' the character to the left of
02767  *  it is incremented. This process repeats until there is no carry,
02768  *  adding an additional character if necessary.
02769  *
02770  *     "abcd".succ        #=> "abce"
02771  *     "THX1138".succ     #=> "THX1139"
02772  *     "<<koala>>".succ   #=> "<<koalb>>"
02773  *     "1999zzz".succ     #=> "2000aaa"
02774  *     "ZZZ9999".succ     #=> "AAAA0000"
02775  *     "***".succ         #=> "**+"
02776  */
02777 
02778 VALUE
02779 rb_str_succ(VALUE orig)
02780 {
          /*
           * Works on a copy of orig.  Phase 1 scans the characters from
           * the end, stepping digits/letters with enc_succ_alnum_char()
           * until one of them does not wrap.  Phase 2 runs only if phase 1
           * never saw a wrapping character and steps characters with
           * enc_succ_char() instead.  If the scan runs out of characters
           * (or is cut short), the carry is inserted at carry_pos.
           */
02781     rb_encoding *enc;
02782     VALUE str;
02783     char *sbeg, *s, *e, *last_alnum = 0;
02784     int c = -1;
02785     long l;
          /* default carry is the single byte \1, used unless a loop below
             stores another one */
02786     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02787     long carry_pos = 0, carry_len = 1;
02788     enum neighbor_char neighbor = NEIGHBOR_FOUND;
02789 
02790     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02791     rb_enc_cr_str_copy_for_substr(str, orig);
02792     OBJ_INFECT(str, orig);
02793     if (RSTRING_LEN(str) == 0) return str;
02794 
02795     enc = STR_ENC_GET(orig);
02796     sbeg = RSTRING_PTR(str);
02797     s = e = sbeg + RSTRING_LEN(str);
02798 
02799     while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
              /* the previous character was skipped as NEIGHBOR_NOT_CHAR and
                 an earlier one wrapped: stop if this character is of the
                 other kind (digit vs. letter) than the one that wrapped,
                 and insert the carry at that wrapped character */
02800         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02801             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02802                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
02803                 s = last_alnum;
02804                 break;
02805             }
02806         }
02807         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02808         neighbor = enc_succ_alnum_char(s, l, enc, carry);
02809         switch (neighbor) {
02810           case NEIGHBOR_NOT_CHAR:
02811             continue;
02812           case NEIGHBOR_FOUND:
02813             return str;
02814           case NEIGHBOR_WRAPPED:
02815             last_alnum = s;
02816             break;
02817         }
              /* only reached after a wrap: remember where a carry would go */
02818         c = 1;
02819         carry_pos = s - sbeg;
02820         carry_len = l;
02821     }
02822     if (c == -1) {              /* str contains no alnum */
02823         s = e;
02824         while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02825             enum neighbor_char neighbor;
02826             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02827             neighbor = enc_succ_char(s, l, enc);
02828             if (neighbor == NEIGHBOR_FOUND)
02829                 return str;
02830             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
02831                 /* wrapped to \0...\0.  search next valid char. */
02832                 enc_succ_char(s, l, enc);
02833             }
                  /* for encodings that are not ASCII compatible the carry
                     is a copy of the wrapped character instead of \1 */
02834             if (!rb_enc_asciicompat(enc)) {
02835                 MEMCPY(carry, s, char, l);
02836                 carry_len = l;
02837             }
02838             carry_pos = s - sbeg;
02839         }
02840     }
          /* make room for carry_len bytes at carry_pos, shift the tail to
             the right and copy the carry into the gap */
02841     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
02842     s = RSTRING_PTR(str) + carry_pos;
02843     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
02844     memmove(s, carry, carry_len);
02845     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
02846     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
02847     rb_enc_str_coderange(str);
02848     return str;
02849 }
02850 
02851 
02852 /*
02853  *  call-seq:
02854  *     str.succ!   -> str
02855  *     str.next!   -> str
02856  *
02857  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
02858  *  place.
02859  */
02860 
02861 static VALUE
02862 rb_str_succ_bang(VALUE str)
02863 {
02864     rb_str_shared_replace(str, rb_str_succ(str));
02865 
02866     return str;
02867 }
02868 
02869 
02870 /*
02871  *  call-seq:
02872  *     str.upto(other_str, exclusive=false) {|s| block }   -> str
02873  *     str.upto(other_str, exclusive=false)                -> an_enumerator
02874  *
02875  *  Iterates through successive values, starting at <i>str</i> and
02876  *  ending at <i>other_str</i> inclusive, passing each value in turn to
02877  *  the block. The <code>String#succ</code> method is used to generate
02878  *  each value.  If optional second argument exclusive is omitted or is false,
02879  *  the last value will be included; otherwise it will be excluded.
02880  *
02881  *  If no block is given, an enumerator is returned instead.
02882  *
02883  *     "a8".upto("b6") {|s| print s, ' ' }
02884  *     for s in "a8".."b6"
02885  *       print s, ' '
02886  *     end
02887  *
02888  *  <em>produces:</em>
02889  *
02890  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
02891  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
02892  *
02893  *  If <i>str</i> and <i>other_str</i> contain only ascii numeric characters,
02894  *  both are recognized as decimal numbers. In addition, the width of
02895  *  string (e.g. leading zeros) is handled appropriately.
02896  *
02897  *     "9".upto("11").to_a   #=> ["9", "10", "11"]
02898  *     "25".upto("5").to_a   #=> []
02899  *     "07".upto("11").to_a  #=> ["07", "08", "09", "10", "11"]
02900  */
02901 
02902 static VALUE
02903 rb_str_upto(int argc, VALUE *argv, VALUE beg)
02904 {
02905     VALUE end, exclusive;
02906     VALUE current, after_end;
02907     ID succ;
02908     int n, excl, ascii;
02909     rb_encoding *enc;
02910 
02911     rb_scan_args(argc, argv, "11", &end, &exclusive);
02912     RETURN_ENUMERATOR(beg, argc, argv);
02913     excl = RTEST(exclusive);
02914     CONST_ID(succ, "succ");
02915     StringValue(end);
02916     enc = rb_enc_check(beg, end);
02917     ascii = (is_ascii_string(beg) && is_ascii_string(end));
02918     /* single character */
02919     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
02920         char c = RSTRING_PTR(beg)[0];
02921         char e = RSTRING_PTR(end)[0];
02922 
02923         if (c > e || (excl && c == e)) return beg;
02924         for (;;) {
02925             rb_yield(rb_enc_str_new(&c, 1, enc));
02926             if (!excl && c == e) break;
02927             c++;
02928             if (excl && c == e) break;
02929         }
02930         return beg;
02931     }
02932     /* both edges are all digits */
02933     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
02934         char *s, *send;
02935         VALUE b, e;
02936         int width;
02937 
02938         s = RSTRING_PTR(beg); send = RSTRING_END(beg);
02939         width = rb_long2int(send - s);
02940         while (s < send) {
02941             if (!ISDIGIT(*s)) goto no_digits;
02942             s++;
02943         }
02944         s = RSTRING_PTR(end); send = RSTRING_END(end);
02945         while (s < send) {
02946             if (!ISDIGIT(*s)) goto no_digits;
02947             s++;
02948         }
02949         b = rb_str_to_inum(beg, 10, FALSE);
02950         e = rb_str_to_inum(end, 10, FALSE);
02951         if (FIXNUM_P(b) && FIXNUM_P(e)) {
02952             long bi = FIX2LONG(b);
02953             long ei = FIX2LONG(e);
02954             rb_encoding *usascii = rb_usascii_encoding();
02955 
02956             while (bi <= ei) {
02957                 if (excl && bi == ei) break;
02958                 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
02959                 bi++;
02960             }
02961         }
02962         else {
02963             ID op = excl ? '<' : rb_intern("<=");
02964             VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
02965 
02966             args[0] = INT2FIX(width);
02967             while (rb_funcall(b, op, 1, e)) {
02968                 args[1] = b;
02969                 rb_yield(rb_str_format(numberof(args), args, fmt));
02970                 b = rb_funcall(b, succ, 0, 0);
02971             }
02972         }
02973         return beg;
02974     }
02975     /* normal case */
02976   no_digits:
02977     n = rb_str_cmp(beg, end);
02978     if (n > 0 || (excl && n == 0)) return beg;
02979 
02980     after_end = rb_funcall(end, succ, 0, 0);
02981     current = rb_str_dup(beg);
02982     while (!rb_str_equal(current, after_end)) {
02983         VALUE next = Qnil;
02984         if (excl || !rb_str_equal(current, end))
02985             next = rb_funcall(current, succ, 0, 0);
02986         rb_yield(current);
02987         if (NIL_P(next)) break;
02988         current = next;
02989         StringValue(current);
02990         if (excl && rb_str_equal(current, end)) break;
02991         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
02992             break;
02993     }
02994 
02995     return beg;
02996 }
02997 
02998 static VALUE
02999 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03000 {
03001     if (rb_reg_search(re, str, 0, 0) >= 0) {
03002         VALUE match = rb_backref_get();
03003         int nth = rb_reg_backref_number(match, backref);
03004         return rb_reg_nth_match(nth, match);
03005     }
03006     return Qnil;
03007 }
03008 
03009 static VALUE
03010 rb_str_aref(VALUE str, VALUE indx)
03011 {
03012     long idx;
03013 
03014     switch (TYPE(indx)) {
03015       case T_FIXNUM:
03016         idx = FIX2LONG(indx);
03017 
03018       num_index:
03019         str = rb_str_substr(str, idx, 1);
03020         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03021         return str;
03022 
03023       case T_REGEXP:
03024         return rb_str_subpat(str, indx, INT2FIX(0));
03025 
03026       case T_STRING:
03027         if (rb_str_index(str, indx, 0) != -1)
03028             return rb_str_dup(indx);
03029         return Qnil;
03030 
03031       default:
03032         /* check if indx is Range */
03033         {
03034             long beg, len;
03035             VALUE tmp;
03036 
03037             len = str_strlen(str, STR_ENC_GET(str));
03038             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03039               case Qfalse:
03040                 break;
03041               case Qnil:
03042                 return Qnil;
03043               default:
03044                 tmp = rb_str_substr(str, beg, len);
03045                 return tmp;
03046             }
03047         }
03048         idx = NUM2LONG(indx);
03049         goto num_index;
03050     }
03051     return Qnil;                /* not reached */
03052 }
03053 
03054 
03055 /*
03056  *  call-seq:
03057  *     str[fixnum]                 -> new_str or nil
03058  *     str[fixnum, fixnum]         -> new_str or nil
03059  *     str[range]                  -> new_str or nil
03060  *     str[regexp]                 -> new_str or nil
03061  *     str[regexp, fixnum]         -> new_str or nil
03062  *     str[other_str]              -> new_str or nil
03063  *     str.slice(fixnum)           -> new_str or nil
03064  *     str.slice(fixnum, fixnum)   -> new_str or nil
03065  *     str.slice(range)            -> new_str or nil
03066  *     str.slice(regexp)           -> new_str or nil
03067  *     str.slice(regexp, fixnum)   -> new_str or nil
03068  *     str.slice(regexp, capname)  -> new_str or nil
03069  *     str.slice(other_str)        -> new_str or nil
03070  *
03071  *  Element Reference---If passed a single <code>Fixnum</code>, returns a
03072  *  substring of one character at that position. If passed two <code>Fixnum</code>
03073  *  objects, returns a substring starting at the offset given by the first, and
03074  *  a length given by the second. If given a range, a substring containing
03075  *  characters at offsets given by the range is returned. In all three cases, if
03076  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
03077  *  <code>nil</code> if the initial offset falls outside the string, the length
03078  *  is negative, or the beginning of the range is greater than the end.
03079  *
03080  *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
03081  *  returned. If a numeric or name parameter follows the regular expression, that
03082  *  component of the <code>MatchData</code> is returned instead. If a
03083  *  <code>String</code> is given, that string is returned if it occurs in
03084  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
03085  *  match.
03086  *
03087  *     a = "hello there"
03088  *     a[1]                   #=> "e"
03089  *     a[1,3]                 #=> "ell"
03090  *     a[1..3]                #=> "ell"
03091  *     a[-3,2]                #=> "er"
03092  *     a[-4..-2]              #=> "her"
03093  *     a[12..-1]              #=> nil
03094  *     a[-2..-4]              #=> ""
03095  *     a[/[aeiou](.)\1/]      #=> "ell"
03096  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
03097  *     a[/[aeiou](.)\1/, 1]   #=> "l"
03098  *     a[/[aeiou](.)\1/, 2]   #=> nil
03099  *     a["lo"]                #=> "lo"
03100  *     a["bye"]               #=> nil
03101  */
03102 
03103 static VALUE
03104 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03105 {
03106     if (argc == 2) {
03107         if (TYPE(argv[0]) == T_REGEXP) {
03108             return rb_str_subpat(str, argv[0], argv[1]);
03109         }
03110         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03111     }
03112     if (argc != 1) {
03113         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03114     }
03115     return rb_str_aref(str, argv[0]);
03116 }
03117 
03118 VALUE
03119 rb_str_drop_bytes(VALUE str, long len)
03120 {
03121     char *ptr = RSTRING_PTR(str);
03122     long olen = RSTRING_LEN(str), nlen;
03123 
03124     str_modifiable(str);
03125     if (len > olen) len = olen;
03126     nlen = olen - len;
03127     if (nlen <= RSTRING_EMBED_LEN_MAX) {
03128         char *oldptr = ptr;
03129         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03130         STR_SET_EMBED(str);
03131         STR_SET_EMBED_LEN(str, nlen);
03132         ptr = RSTRING(str)->as.ary;
03133         memmove(ptr, oldptr + len, nlen);
03134         if (fl == STR_NOEMBED) xfree(oldptr);
03135     }
03136     else {
03137         if (!STR_SHARED_P(str)) rb_str_new4(str);
03138         ptr = RSTRING(str)->as.heap.ptr += len;
03139         RSTRING(str)->as.heap.len = nlen;
03140     }
03141     ptr[nlen] = 0;
03142     ENC_CODERANGE_CLEAR(str);
03143     return str;
03144 }
03145 
03146 static void
03147 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03148 {
03149     if (beg == 0 && RSTRING_LEN(val) == 0) {
03150         rb_str_drop_bytes(str, len);
03151         OBJ_INFECT(str, val);
03152         return;
03153     }
03154 
03155     rb_str_modify(str);
03156     if (len < RSTRING_LEN(val)) {
03157         /* expand string */
03158         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03159     }
03160 
03161     if (RSTRING_LEN(val) != len) {
03162         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03163                 RSTRING_PTR(str) + beg + len,
03164                 RSTRING_LEN(str) - (beg + len));
03165     }
03166     if (RSTRING_LEN(val) < beg && len < 0) {
03167         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03168     }
03169     if (RSTRING_LEN(val) > 0) {
03170         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03171     }
03172     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03173     if (RSTRING_PTR(str)) {
03174         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03175     }
03176     OBJ_INFECT(str, val);
03177 }
03178 
03179 static void
03180 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03181 {
03182     long slen;
03183     char *p, *e;
03184     rb_encoding *enc;
03185     int singlebyte = single_byte_optimizable(str);
03186     int cr;
03187 
03188     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03189 
03190     StringValue(val);
03191     enc = rb_enc_check(str, val);
03192     slen = str_strlen(str, enc);
03193 
03194     if (slen < beg) {
03195       out_of_range:
03196         rb_raise(rb_eIndexError, "index %ld out of string", beg);
03197     }
03198     if (beg < 0) {
03199         if (-beg > slen) {
03200             goto out_of_range;
03201         }
03202         beg += slen;
03203     }
03204     if (slen < len || slen < beg + len) {
03205         len = slen - beg;
03206     }
03207     str_modify_keep_cr(str);
03208     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03209     if (!p) p = RSTRING_END(str);
03210     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03211     if (!e) e = RSTRING_END(str);
03212     /* error check */
03213     beg = p - RSTRING_PTR(str); /* physical position */
03214     len = e - p;                /* physical length */
03215     rb_str_splice_0(str, beg, len, val);
03216     rb_enc_associate(str, enc);
03217     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03218     if (cr != ENC_CODERANGE_BROKEN)
03219         ENC_CODERANGE_SET(str, cr);
03220 }
03221 
03222 void
03223 rb_str_update(VALUE str, long beg, long len, VALUE val)
03224 {
03225     rb_str_splice(str, beg, len, val);
03226 }
03227 
03228 static void
03229 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03230 {
03231     int nth;
03232     VALUE match;
03233     long start, end, len;
03234     rb_encoding *enc;
03235     struct re_registers *regs;
03236 
03237     if (rb_reg_search(re, str, 0, 0) < 0) {
03238         rb_raise(rb_eIndexError, "regexp not matched");
03239     }
03240     match = rb_backref_get();
03241     nth = rb_reg_backref_number(match, backref);
03242     regs = RMATCH_REGS(match);
03243     if (nth >= regs->num_regs) {
03244       out_of_range:
03245         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03246     }
03247     if (nth < 0) {
03248         if (-nth >= regs->num_regs) {
03249             goto out_of_range;
03250         }
03251         nth += regs->num_regs;
03252     }
03253 
03254     start = BEG(nth);
03255     if (start == -1) {
03256         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03257     }
03258     end = END(nth);
03259     len = end - start;
03260     StringValue(val);
03261     enc = rb_enc_check(str, val);
03262     rb_str_splice_0(str, start, len, val);
03263     rb_enc_associate(str, enc);
03264 }
03265 
03266 static VALUE
03267 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03268 {
03269     long idx, beg;
03270 
03271     switch (TYPE(indx)) {
03272       case T_FIXNUM:
03273         idx = FIX2LONG(indx);
03274       num_index:
03275         rb_str_splice(str, idx, 1, val);
03276         return val;
03277 
03278       case T_REGEXP:
03279         rb_str_subpat_set(str, indx, INT2FIX(0), val);
03280         return val;
03281 
03282       case T_STRING:
03283         beg = rb_str_index(str, indx, 0);
03284         if (beg < 0) {
03285             rb_raise(rb_eIndexError, "string not matched");
03286         }
03287         beg = rb_str_sublen(str, beg);
03288         rb_str_splice(str, beg, str_strlen(indx, 0), val);
03289         return val;
03290 
03291       default:
03292         /* check if indx is Range */
03293         {
03294             long beg, len;
03295             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03296                 rb_str_splice(str, beg, len, val);
03297                 return val;
03298             }
03299         }
03300         idx = NUM2LONG(indx);
03301         goto num_index;
03302     }
03303 }
03304 
03305 /*
03306  *  call-seq:
03307  *     str[fixnum] = new_str
03308  *     str[fixnum, fixnum] = new_str
03309  *     str[range] = aString
03310  *     str[regexp] = new_str
03311  *     str[regexp, fixnum] = new_str
03312  *     str[regexp, name] = new_str
03313  *     str[other_str] = new_str
03314  *
03315  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
03316  *  portion of the string affected is determined using the same criteria as
03317  *  <code>String#[]</code>. If the replacement string is not the same length as
03318  *  the text it is replacing, the string will be adjusted accordingly. If the
03319  *  regular expression or string used as the index doesn't match a position
03320  *  in the string, <code>IndexError</code> is raised. If the regular expression
03321  *  form is used, the optional second <code>Fixnum</code> allows you to specify
03322  *  which portion of the match to replace (effectively using the
03323  *  <code>MatchData</code> indexing rules). The forms that take a
03324  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
03325  *  out of range; the <code>Range</code> form will raise a
03326  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
03327  *  forms will raise an <code>IndexError</code> when there is no match.
03328  */
03329 
03330 static VALUE
03331 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03332 {
03333     if (argc == 3) {
03334         if (TYPE(argv[0]) == T_REGEXP) {
03335             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03336         }
03337         else {
03338             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03339         }
03340         return argv[2];
03341     }
03342     if (argc != 2) {
03343         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03344     }
03345     return rb_str_aset(str, argv[0], argv[1]);
03346 }
03347 
03348 /*
03349  *  call-seq:
03350  *     str.insert(index, other_str)   -> str
03351  *
03352  *  Inserts <i>other_str</i> before the character at the given
03353  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
03354  *  end of the string, and insert <em>after</em> the given character.
03355  *  The intent is to insert <i>other_str</i> so that it starts at the given
03356  *  <i>index</i>.
03357  *
03358  *     "abcd".insert(0, 'X')    #=> "Xabcd"
03359  *     "abcd".insert(3, 'X')    #=> "abcXd"
03360  *     "abcd".insert(4, 'X')    #=> "abcdX"
03361  *     "abcd".insert(-3, 'X')   #=> "abXcd"
03362  *     "abcd".insert(-1, 'X')   #=> "abcdX"
03363  */
03364 
03365 static VALUE
03366 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03367 {
03368     long pos = NUM2LONG(idx);
03369 
03370     if (pos == -1) {
03371         return rb_str_append(str, str2);
03372     }
03373     else if (pos < 0) {
03374         pos++;
03375     }
03376     rb_str_splice(str, pos, 0, str2);
03377     return str;
03378 }
03379 
03380 
03381 /*
03382  *  call-seq:
03383  *     str.slice!(fixnum)           -> new_str or nil
03384  *     str.slice!(fixnum, fixnum)   -> new_str or nil
03385  *     str.slice!(range)            -> new_str or nil
03386  *     str.slice!(regexp)           -> new_str or nil
03387  *     str.slice!(other_str)        -> new_str or nil
03388  *
03389  *  Deletes the specified portion from <i>str</i>, and returns the portion
03390  *  deleted.
03391  *
03392  *     string = "this is a string"
03393  *     string.slice!(2)        #=> "i"
03394  *     string.slice!(3..6)     #=> " is "
03395  *     string.slice!(/s.*t/)   #=> "sa st"
03396  *     string.slice!("r")      #=> "r"
03397  *     string                  #=> "thing"
03398  */
03399 
03400 static VALUE
03401 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03402 {
03403     VALUE result;
03404     VALUE buf[3];
03405     int i;
03406 
03407     if (argc < 1 || 2 < argc) {
03408         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03409     }
03410     for (i=0; i<argc; i++) {
03411         buf[i] = argv[i];
03412     }
03413     str_modify_keep_cr(str);
03414     buf[i] = rb_str_new(0,0);
03415     result = rb_str_aref_m(argc, buf, str);
03416     if (!NIL_P(result)) {
03417         rb_str_aset_m(argc+1, buf, str);
03418     }
03419     return result;
03420 }
03421 
03422 static VALUE
03423 get_pat(VALUE pat, int quote)
03424 {
03425     VALUE val;
03426 
03427     switch (TYPE(pat)) {
03428       case T_REGEXP:
03429         return pat;
03430 
03431       case T_STRING:
03432         break;
03433 
03434       default:
03435         val = rb_check_string_type(pat);
03436         if (NIL_P(val)) {
03437             Check_Type(pat, T_REGEXP);
03438         }
03439         pat = val;
03440     }
03441 
03442     if (quote) {
03443         pat = rb_reg_quote(pat);
03444     }
03445 
03446     return rb_reg_regcomp(pat);
03447 }
03448 
03449 
03450 /*
03451  *  call-seq:
03452  *     str.sub!(pattern, replacement)          -> str or nil
03453  *     str.sub!(pattern) {|match| block }      -> str or nil
03454  *
03455  *  Performs the substitutions of <code>String#sub</code> in place,
03456  *  returning <i>str</i>, or <code>nil</code> if no substitutions were
03457  *  performed.
03458  */
03459 
03460 static VALUE
03461 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03462 {
03463     VALUE pat, repl, hash = Qnil;
03464     int iter = 0;
03465     int tainted = 0;
03466     int untrusted = 0;
03467     long plen;
03468 
03469     if (argc == 1 && rb_block_given_p()) {
03470         iter = 1;
03471     }
03472     else if (argc == 2) {
03473         repl = argv[1];
03474         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03475         if (NIL_P(hash)) {
03476             StringValue(repl);
03477         }
03478         if (OBJ_TAINTED(repl)) tainted = 1;
03479         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03480     }
03481     else {
03482         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03483     }
03484 
03485     pat = get_pat(argv[0], 1);
03486     str_modifiable(str);
03487     if (rb_reg_search(pat, str, 0, 0) >= 0) {
03488         rb_encoding *enc;
03489         int cr = ENC_CODERANGE(str);
03490         VALUE match = rb_backref_get();
03491         struct re_registers *regs = RMATCH_REGS(match);
03492         long beg0 = BEG(0);
03493         long end0 = END(0);
03494         char *p, *rp;
03495         long len, rlen;
03496 
03497         if (iter || !NIL_P(hash)) {
03498             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03499 
03500             if (iter) {
03501                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03502             }
03503             else {
03504                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03505                 repl = rb_obj_as_string(repl);
03506             }
03507             str_mod_check(str, p, len);
03508             str_frozen_check(str);
03509         }
03510         else {
03511             repl = rb_reg_regsub(repl, str, regs, pat);
03512         }
03513         enc = rb_enc_compatible(str, repl);
03514         if (!enc) {
03515             rb_encoding *str_enc = STR_ENC_GET(str);
03516             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03517             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03518                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03519                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03520                          rb_enc_name(str_enc),
03521                          rb_enc_name(STR_ENC_GET(repl)));
03522             }
03523             enc = STR_ENC_GET(repl);
03524         }
03525         rb_str_modify(str);
03526         rb_enc_associate(str, enc);
03527         if (OBJ_TAINTED(repl)) tainted = 1;
03528         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03529         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03530             int cr2 = ENC_CODERANGE(repl);
03531             if (cr2 == ENC_CODERANGE_BROKEN ||
03532                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03533                 cr = ENC_CODERANGE_UNKNOWN;
03534             else
03535                 cr = cr2;
03536         }
03537         plen = end0 - beg0;
03538         rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03539         len = RSTRING_LEN(str);
03540         if (rlen > plen) {
03541             RESIZE_CAPA(str, len + rlen - plen);
03542         }
03543         p = RSTRING_PTR(str);
03544         if (rlen != plen) {
03545             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03546         }
03547         memcpy(p + beg0, rp, rlen);
03548         len += rlen - plen;
03549         STR_SET_LEN(str, len);
03550         RSTRING_PTR(str)[len] = '\0';
03551         ENC_CODERANGE_SET(str, cr);
03552         if (tainted) OBJ_TAINT(str);
03553         if (untrusted) OBJ_UNTRUST(str);
03554 
03555         return str;
03556     }
03557     return Qnil;
03558 }
03559 
03560 
03561 /*
03562  *  call-seq:
03563  *     str.sub(pattern, replacement)         -> new_str
03564  *     str.sub(pattern, hash)                -> new_str
03565  *     str.sub(pattern) {|match| block }     -> new_str
03566  *
03567  *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
03568  *  <i>pattern</i> replaced by the second argument. The <i>pattern</i> is
03569  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03570  *  regular expression metacharacters it contains will be interpreted
03571  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03572  *  instead of a digit.
03573  *
03574  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03575  *  the matched text. It may contain back-references to the pattern's capture
03576  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03577  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03578  *  double-quoted string, both back-references must be preceded by an
03579  *  additional backslash. However, within <i>replacement</i> the special match
03580  *  variables, such as <code>$&</code>, will not refer to the current match.
03581  *
03582  *  If the second argument is a <code>Hash</code>, and the matched text is one
03583  *  of its keys, the corresponding value is the replacement string.
03584  *
03585  *  In the block form, the current match string is passed in as a parameter,
03586  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03587  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03588  *  returned by the block will be substituted for the match on each call.
03589  *
03590  *  The result inherits any tainting in the original string or any supplied
03591  *  replacement string.
03592  *
03593  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
03594  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
03595  *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
03596  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
03597  *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
03598  *      #=> "Is /bin/bash your preferred shell?"
03599  */
03600 
03601 static VALUE
03602 rb_str_sub(int argc, VALUE *argv, VALUE str)
03603 {
03604     str = rb_str_dup(str);
03605     rb_str_sub_bang(argc, argv, str);
03606     return str;
03607 }
03608 
03609 static VALUE
03610 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03611 {
03612     VALUE pat, val, repl, match, dest, hash = Qnil;
03613     struct re_registers *regs;
03614     long beg, n;
03615     long beg0, end0;
03616     long offset, blen, slen, len, last;
03617     int iter = 0;
03618     char *sp, *cp;
03619     int tainted = 0;
03620     rb_encoding *str_enc;
03621 
03622     switch (argc) {
03623       case 1:
03624         RETURN_ENUMERATOR(str, argc, argv);
03625         iter = 1;
03626         break;
03627       case 2:
03628         repl = argv[1];
03629         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03630         if (NIL_P(hash)) {
03631             StringValue(repl);
03632         }
03633         if (OBJ_TAINTED(repl)) tainted = 1;
03634         break;
03635       default:
03636         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03637     }
03638 
03639     pat = get_pat(argv[0], 1);
03640     beg = rb_reg_search(pat, str, 0, 0);
03641     if (beg < 0) {
03642         if (bang) return Qnil;  /* no match, no substitution */
03643         return rb_str_dup(str);
03644     }
03645 
03646     offset = 0;
03647     n = 0;
03648     blen = RSTRING_LEN(str) + 30; /* len + margin */
03649     dest = rb_str_buf_new(blen);
03650     sp = RSTRING_PTR(str);
03651     slen = RSTRING_LEN(str);
03652     cp = sp;
03653     str_enc = STR_ENC_GET(str);
03654 
03655     do {
03656         n++;
03657         match = rb_backref_get();
03658         regs = RMATCH_REGS(match);
03659         beg0 = BEG(0);
03660         end0 = END(0);
03661         if (iter || !NIL_P(hash)) {
03662             if (iter) {
03663                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03664             }
03665             else {
03666                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03667                 val = rb_obj_as_string(val);
03668             }
03669             str_mod_check(str, sp, slen);
03670             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
03671                 rb_raise(rb_eRuntimeError, "block should not cheat");
03672             }
03673         }
03674         else {
03675             val = rb_reg_regsub(repl, str, regs, pat);
03676         }
03677 
03678         if (OBJ_TAINTED(val)) tainted = 1;
03679 
03680         len = beg - offset;     /* copy pre-match substr */
03681         if (len) {
03682             rb_enc_str_buf_cat(dest, cp, len, str_enc);
03683         }
03684 
03685         rb_str_buf_append(dest, val);
03686 
03687         last = offset;
03688         offset = end0;
03689         if (beg0 == end0) {
03690             /*
03691              * Always consume at least one character of the input string
03692              * in order to prevent infinite loops.
03693              */
03694             if (RSTRING_LEN(str) <= end0) break;
03695             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03696             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03697             offset = end0 + len;
03698         }
03699         cp = RSTRING_PTR(str) + offset;
03700         if (offset > RSTRING_LEN(str)) break;
03701         beg = rb_reg_search(pat, str, offset, 0);
03702     } while (beg >= 0);
03703     if (RSTRING_LEN(str) > offset) {
03704         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03705     }
03706     rb_reg_search(pat, str, last, 0);
03707     if (bang) {
03708         rb_str_shared_replace(str, dest);
03709     }
03710     else {
03711         RBASIC(dest)->klass = rb_obj_class(str);
03712         OBJ_INFECT(dest, str);
03713         str = dest;
03714     }
03715 
03716     if (tainted) OBJ_TAINT(str);
03717     return str;
03718 }
03719 
03720 
03721 /*
03722  *  call-seq:
03723  *     str.gsub!(pattern, replacement)        -> str or nil
03724  *     str.gsub!(pattern) {|match| block }    -> str or nil
03725  *     str.gsub!(pattern)                     -> an_enumerator
03726  *
03727  *  Performs the substitutions of <code>String#gsub</code> in place, returning
03728  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
03729  *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
03730  */
03731 
03732 static VALUE
03733 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03734 {
03735     str_modify_keep_cr(str);
03736     return str_gsub(argc, argv, str, 1);
03737 }
03738 
03739 
03740 /*
03741  *  call-seq:
03742  *     str.gsub(pattern, replacement)       -> new_str
03743  *     str.gsub(pattern, hash)              -> new_str
03744  *     str.gsub(pattern) {|match| block }   -> new_str
03745  *     str.gsub(pattern)                    -> enumerator
03746  *
03747  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
03748  *  <i>pattern</i> replaced by the second argument. The <i>pattern</i> is
03749  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03750  *  regular expression metacharacters it contains will be interpreted
03751  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03752  *  instead of a digit.
03753  *
03754  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03755  *  the matched text. It may contain back-references to the pattern's capture
03756  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03757  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03758  *  double-quoted string, both back-references must be preceded by an
03759  *  additional backslash. However, within <i>replacement</i> the special match
03760  *  variables, such as <code>$&</code>, will not refer to the current match.
03761  *
03762  *  If the second argument is a <code>Hash</code>, and the matched text is one
03763  *  of its keys, the corresponding value is the replacement string.
03764  *
03765  *  In the block form, the current match string is passed in as a parameter,
03766  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03767  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03768  *  returned by the block will be substituted for the match on each call.
03769  *
03770  *  The result inherits any tainting in the original string or any supplied
03771  *  replacement string.
03772  *
03773  *  When neither a block nor a second argument is supplied, an
03774  *  <code>Enumerator</code> is returned.
03775  *
03776  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
03777  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
03778  *     "hello".gsub(/./) {|s| s.ord.to_s + ' '}      #=> "104 101 108 108 111 "
03779  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
03780  *     'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*')    #=> "h3ll*"
03781  */
03782 
03783 static VALUE
03784 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03785 {
03786     return str_gsub(argc, argv, str, 0);
03787 }
03788 
03789 
03790 /*
03791  *  call-seq:
03792  *     str.replace(other_str)   -> str
03793  *
03794  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
03795  *  values in <i>other_str</i>.
03796  *
03797  *     s = "hello"         #=> "hello"
03798  *     s.replace "world"   #=> "world"
03799  */
03800 
03801 VALUE
03802 rb_str_replace(VALUE str, VALUE str2)
03803 {
03804     str_modifiable(str);
03805     if (str == str2) return str;
03806 
03807     StringValue(str2);
03808     str_discard(str);
03809     return str_replace(str, str2);
03810 }
03811 
03812 /*
03813  *  call-seq:
03814  *     string.clear    ->  string
03815  *
03816  *  Makes string empty.
03817  *
03818  *     a = "abcde"
03819  *     a.clear    #=> ""
03820  */
03821 
03822 static VALUE
03823 rb_str_clear(VALUE str)
03824 {
03825     str_discard(str);
03826     STR_SET_EMBED(str);
03827     STR_SET_EMBED_LEN(str, 0);
03828     RSTRING_PTR(str)[0] = 0;
03829     if (rb_enc_asciicompat(STR_ENC_GET(str)))
03830         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03831     else
03832         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03833     return str;
03834 }
03835 
03836 /*
03837  *  call-seq:
03838  *     string.chr    ->  string
03839  *
03840  *  Returns a one-character string at the beginning of the string.
03841  *
03842  *     a = "abcde"
03843  *     a.chr    #=> "a"
03844  */
03845 
03846 static VALUE
03847 rb_str_chr(VALUE str)
03848 {
03849     return rb_str_substr(str, 0, 1);
03850 }
03851 
03852 /*
03853  *  call-seq:
03854  *     str.getbyte(index)          -> 0 .. 255
03855  *
03856  *  Returns the <i>index</i>th byte as an integer, or <code>nil</code> if out of range.
03857  */
03858 static VALUE
03859 rb_str_getbyte(VALUE str, VALUE index)
03860 {
03861     long pos = NUM2LONG(index);
03862 
03863     if (pos < 0)
03864         pos += RSTRING_LEN(str);
03865     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
03866         return Qnil;
03867 
03868     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
03869 }
03870 
03871 /*
03872  *  call-seq:
03873  *     str.setbyte(index, int) -> int
03874  *
03875  *  Sets the <i>index</i>th byte to <i>int</i>.
03876  */
03877 static VALUE
03878 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
03879 {
03880     long pos = NUM2LONG(index);
03881     int byte = NUM2INT(value);
03882 
03883     rb_str_modify(str);
03884 
03885     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
03886         rb_raise(rb_eIndexError, "index %ld out of string", pos);
03887     if (pos < 0)
03888         pos += RSTRING_LEN(str);
03889 
03890     RSTRING_PTR(str)[pos] = byte;
03891 
03892     return value;
03893 }
03894 
03895 /*
03896  *  call-seq:
03897  *     str.reverse   -> new_str
03898  *
03899  *  Returns a new string with the characters from <i>str</i> in reverse order.
03900  *
03901  *     "stressed".reverse   #=> "desserts"
03902  */
03903 
03904 static VALUE
03905 rb_str_reverse(VALUE str)
03906 {
03907     rb_encoding *enc;
03908     VALUE rev;
03909     char *s, *e, *p;
03910     int single = 1;
03911 
03912     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
03913     enc = STR_ENC_GET(str);
03914     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
03915     s = RSTRING_PTR(str); e = RSTRING_END(str);
03916     p = RSTRING_END(rev);
03917 
03918     if (RSTRING_LEN(str) > 1) {
03919         if (single_byte_optimizable(str)) {
03920             while (s < e) {
03921                 *--p = *s++;
03922             }
03923         }
03924         else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
03925             while (s < e) {
03926                 int clen = rb_enc_fast_mbclen(s, e, enc);
03927 
03928                 if (clen > 1 || (*s & 0x80)) single = 0;
03929                 p -= clen;
03930                 memcpy(p, s, clen);
03931                 s += clen;
03932             }
03933         }
03934         else {
03935             while (s < e) {
03936                 int clen = rb_enc_mbclen(s, e, enc);
03937 
03938                 if (clen > 1 || (*s & 0x80)) single = 0;
03939                 p -= clen;
03940                 memcpy(p, s, clen);
03941                 s += clen;
03942             }
03943         }
03944     }
03945     STR_SET_LEN(rev, RSTRING_LEN(str));
03946     OBJ_INFECT(rev, str);
03947     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
03948         if (single) {
03949             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03950         }
03951         else {
03952             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03953         }
03954     }
03955     rb_enc_cr_str_copy_for_substr(rev, str);
03956 
03957     return rev;
03958 }
03959 
03960 
03961 /*
03962  *  call-seq:
03963  *     str.reverse!   -> str
03964  *
03965  *  Reverses <i>str</i> in place.
03966  */
03967 
03968 static VALUE
03969 rb_str_reverse_bang(VALUE str)
03970 {
03971     if (RSTRING_LEN(str) > 1) {
03972         if (single_byte_optimizable(str)) {
03973             char *s, *e, c;
03974 
03975             str_modify_keep_cr(str);
03976             s = RSTRING_PTR(str);
03977             e = RSTRING_END(str) - 1;
03978             while (s < e) {
03979                 c = *s;
03980                 *s++ = *e;
03981                 *e-- = c;
03982             }
03983         }
03984         else {
03985             rb_str_shared_replace(str, rb_str_reverse(str));
03986         }
03987     }
03988     else {
03989         str_modify_keep_cr(str);
03990     }
03991     return str;
03992 }
03993 
03994 
03995 /*
03996  *  call-seq:
03997  *     str.include? other_str   -> true or false
03998  *
03999  *  Returns <code>true</code> if <i>str</i> contains the given string or
04000  *  character.
04001  *
04002  *     "hello".include? "lo"   #=> true
04003  *     "hello".include? "ol"   #=> false
04004  *     "hello".include? ?h     #=> true
04005  */
04006 
04007 static VALUE
04008 rb_str_include(VALUE str, VALUE arg)
04009 {
04010     long i;
04011 
04012     StringValue(arg);
04013     i = rb_str_index(str, arg, 0);
04014 
04015     if (i == -1) return Qfalse;
04016     return Qtrue;
04017 }
04018 
04019 
04020 /*
04021  *  call-seq:
04022  *     str.to_i(base=10)   -> integer
04023  *
04024  *  Returns the result of interpreting leading characters in <i>str</i> as an
04025  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
04026  *  end of a valid number are ignored. If there is not a valid number at the
04027  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
04028  *  exception when <i>base</i> is valid.
04029  *
04030  *     "12345".to_i             #=> 12345
04031  *     "99 red balloons".to_i   #=> 99
04032  *     "0a".to_i                #=> 0
04033  *     "0a".to_i(16)            #=> 10
04034  *     "hello".to_i             #=> 0
04035  *     "1100101".to_i(2)        #=> 101
04036  *     "1100101".to_i(8)        #=> 294977
04037  *     "1100101".to_i(10)       #=> 1100101
04038  *     "1100101".to_i(16)       #=> 17826049
04039  */
04040 
04041 static VALUE
04042 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04043 {
04044     int base;
04045 
04046     if (argc == 0) base = 10;
04047     else {
04048         VALUE b;
04049 
04050         rb_scan_args(argc, argv, "01", &b);
04051         base = NUM2INT(b);
04052     }
04053     if (base < 0) {
04054         rb_raise(rb_eArgError, "invalid radix %d", base);
04055     }
04056     return rb_str_to_inum(str, base, FALSE);
04057 }
04058 
04059 
04060 /*
04061  *  call-seq:
04062  *     str.to_f   -> float
04063  *
04064  *  Returns the result of interpreting leading characters in <i>str</i> as a
04065  *  floating point number. Extraneous characters past the end of a valid number
04066  *  are ignored. If there is not a valid number at the start of <i>str</i>,
04067  *  <code>0.0</code> is returned. This method never raises an exception.
04068  *
04069  *     "123.45e1".to_f        #=> 1234.5
04070  *     "45.67 degrees".to_f   #=> 45.67
04071  *     "thx1138".to_f         #=> 0.0
04072  */
04073 
04074 static VALUE
04075 rb_str_to_f(VALUE str)
04076 {
04077     return DBL2NUM(rb_str_to_dbl(str, FALSE));
04078 }
04079 
04080 
04081 /*
04082  *  call-seq:
04083  *     str.to_s     -> str
04084  *     str.to_str   -> str
04085  *
04086  *  Returns the receiver.
04087  */
04088 
04089 static VALUE
04090 rb_str_to_s(VALUE str)
04091 {
04092     if (rb_obj_class(str) != rb_cString) {
04093         return str_duplicate(rb_cString, str);
04094     }
04095     return str;
04096 }
04097 
04098 #if 0
      /*
       * Disabled by the surrounding "#if 0", so this helper is never compiled.
       * As written it would encode the codepoint c in enc into a local buffer
       * and append the resulting bytes to str.
       */
04099 static void
04100 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
04101 {
04102     char s[RUBY_MAX_CHAR_LEN];
04103     int n = rb_enc_codelen(c, enc);     /* number of bytes c occupies in enc */
04104 
04105     rb_enc_mbcput(c, s, enc);
04106     rb_enc_str_buf_cat(str, s, n, enc);
04107 }
04108 #endif
04109 
04110 #define CHAR_ESC_LEN 13 /* room for the longest escape, "\x{FFFFFFFF}" (12 bytes), plus the terminating NUL */
04111 
04112 int
04113 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04114 {
04115     char buf[CHAR_ESC_LEN + 1];
04116     int l;
04117 
04118 #if SIZEOF_INT > 4
04119     c &= 0xffffffff;
04120 #endif
04121     if (unicode_p) {
04122         if (c < 0x7F && ISPRINT(c)) {
04123             snprintf(buf, CHAR_ESC_LEN, "%c", c);
04124         }
04125         else if (c < 0x10000) {
04126             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04127         }
04128         else {
04129             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04130         }
04131     }
04132     else {
04133         if (c < 0x100) {
04134             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04135         }
04136         else {
04137             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04138         }
04139     }
04140     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
04141     rb_str_buf_cat(result, buf, l);
04142     return l;
04143 }
04144 
04145 /*
04146  * call-seq:
04147  *   str.inspect   -> string
04148  *
04149  * Returns a printable version of _str_, surrounded by quote marks,
04150  * with special characters escaped.
04151  *
04152  *    str = "hello"
04153  *    str[3] = "\b"
04154  *    str.inspect       #=> "\"hel\\bo\""
04155  */
04156 
04157 VALUE
04158 rb_str_inspect(VALUE str)
04159 {
          /*
           * Builds a new string holding str wrapped in double quotes, with
           * quotes, backslashes, interpolation starters, control characters,
           * unprintable characters and invalid bytes written as escapes.
           * Runs of characters that need no escaping are copied in bulk:
           * prev marks the start of the run not yet appended to result.
           */
04160     rb_encoding *enc = STR_ENC_GET(str);
04161     const char *p, *pend, *prev;
04162     char buf[CHAR_ESC_LEN + 1];
04163     VALUE result = rb_str_buf_new(0);
04164     rb_encoding *resenc = rb_default_internal_encoding();
04165     int unicode_p = rb_enc_unicode_p(enc);
04166     int asciicompat = rb_enc_asciicompat(enc);
04167 
          /* Result encoding: default internal, else default external; US-ASCII
           * is substituted when the chosen one is not ASCII-compatible. */
04168     if (resenc == NULL) resenc = rb_default_external_encoding();
04169     if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04170     rb_enc_associate(result, resenc);
04171     str_buf_cat2(result, "\"");
04172 
04173     p = RSTRING_PTR(str); pend = RSTRING_END(str);
04174     prev = p;
04175     while (p < pend) {
04176         unsigned int c, cc;
04177         int n;
04178 
04179         n = rb_enc_precise_mbclen(p, pend, enc);
04180         if (!MBCLEN_CHARFOUND_P(n)) {
                  /* No complete character at p: flush the pending run, then
                   * emit up to mbminlen bytes (bounded by the end of the
                   * string) as \xNN escapes. */
04181             if (p > prev) str_buf_cat(result, prev, p - prev);
04182             n = rb_enc_mbminlen(enc);
04183             if (pend < p + n)
04184                 n = (int)(pend - p);
04185             while (n--) {
04186                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04187                 str_buf_cat(result, buf, strlen(buf));
04188                 prev = ++p;
04189             }
04190             continue;
04191         }
04192         n = MBCLEN_CHARFOUND_LEN(n);
04193         c = rb_enc_mbc_to_codepoint(p, pend, enc);
04194         p += n;         /* p now points past the character; p - n is its start */
              /* '"' and '\\' always get a backslash; '#' only when the next
               * character is '$', '@' or '{'. */
04195         if (c == '"'|| c == '\\' ||
04196             (c == '#' &&
04197              p < pend &&
04198              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04199              (cc = rb_enc_codepoint(p,pend,enc),
04200               (cc == '$' || cc == '@' || cc == '{')))) {
04201             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04202             str_buf_cat2(result, "\\");
04203             if (asciicompat || enc == resenc) {
                      /* The character itself starts the next pending run, so
                       * it is copied verbatim after the backslash. */
04204                 prev = p - n;
04205                 continue;
04206             }
04207         }
              /* Control characters that have a single-letter escape. */
04208         switch (c) {
04209           case '\n': cc = 'n'; break;
04210           case '\r': cc = 'r'; break;
04211           case '\t': cc = 't'; break;
04212           case '\f': cc = 'f'; break;
04213           case '\013': cc = 'v'; break;
04214           case '\010': cc = 'b'; break;
04215           case '\007': cc = 'a'; break;
04216           case 033: cc = 'e'; break;
04217           default: cc = 0; break;
04218         }
04219         if (cc) {
04220             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04221             buf[0] = '\\';
04222             buf[1] = (char)cc;
04223             str_buf_cat(result, buf, 2);
04224             prev = p;
04225             continue;
04226         }
              /* Printable in the result encoding, or printable ASCII in an
               * ASCII-compatible source: leave it in the pending run. */
04227         if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04228             (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04229             continue;
04230         }
04231         else {
                  /* Everything else is written as a numeric escape. */
04232             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04233             rb_str_buf_cat_escaped_char(result, c, unicode_p);
04234             prev = p;
04235             continue;
04236         }
04237     }
          /* Flush whatever run is still pending, then close the quote. */
04238     if (p > prev) str_buf_cat(result, prev, p - prev);
04239     str_buf_cat2(result, "\"");
04240 
04241     OBJ_INFECT(result, str);
04242     return result;
04243 }
04244 
04245 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{')) /* true when p is before e and points at '$', '@' or '{' */
04246 
04247 /*
04248  *  call-seq:
04249  *     str.dump   -> new_str
04250  *
04251  *  Produces a version of <i>str</i> with all nonprinting characters replaced by
04252  *  <code>\nnn</code> notation and all special characters escaped.
04253  */
04254 
04255 VALUE
04256 rb_str_dump(VALUE str)
04257 {
          /*
           * Works in two passes over the bytes of str: the first computes the
           * exact length of the escaped output, the second writes it into a
           * string allocated with that length.  Both passes must classify
           * every byte identically, otherwise the buffer size is wrong.
           */
04258     rb_encoding *enc = rb_enc_get(str);
04259     long len;
04260     const char *p, *pend;
04261     char *q, *qend;
04262     VALUE result;
04263     int u8 = (enc == rb_utf8_encoding());
04264 
          /* Pass 1: measure. */
04265     len = 2;                    /* "" */
04266     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04267     while (p < pend) {
04268         unsigned char c = *p++;
04269         switch (c) {
04270           case '"':  case '\\':
04271           case '\n': case '\r':
04272           case '\t': case '\f':
04273           case '\013': case '\010': case '\007': case '\033':
04274             len += 2;
04275             break;
04276 
04277           case '#':
                  /* '#' needs a backslash only in front of '$', '@' or '{'. */
04278             len += IS_EVSTR(p, pend) ? 2 : 1;
04279             break;
04280 
04281           default:
04282             if (ISPRINT(c)) {
04283                 len++;
04284             }
04285             else {
04286                 if (u8) {       /* \u{NN} */
04287                     int n = rb_enc_precise_mbclen(p-1, pend, enc);
                          /* NOTE(review): the CHARFOUND test is applied to
                           * n-1, matching pass 2; this appears to send
                           * one-byte characters to the \xNN form below --
                           * confirm against the macro's definition. */
04288                     if (MBCLEN_CHARFOUND_P(n-1)) {
04289                         unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                              /* One byte per hex digit beyond the first, plus
                               * five for "\u{", the first digit and "}". */
04290                         while (cc >>= 4) len++;
04291                         len += 5;
04292                         p += MBCLEN_CHARFOUND_LEN(n)-1;
04293                         break;
04294                     }
04295                 }
04296                 len += 4;       /* \xNN */
04297             }
04298             break;
04299         }
04300     }
04301     if (!rb_enc_asciicompat(enc)) {
04302         len += 19;              /* ".force_encoding('')" */
04303         len += strlen(enc->name);
04304     }
04305 
          /* Pass 2: write.  qend leaves room for the terminating NUL. */
04306     result = rb_str_new5(str, 0, len);
04307     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04308     q = RSTRING_PTR(result); qend = q + len + 1;
04309 
04310     *q++ = '"';
04311     while (p < pend) {
04312         unsigned char c = *p++;
04313 
04314         if (c == '"' || c == '\\') {
04315             *q++ = '\\';
04316             *q++ = c;
04317         }
04318         else if (c == '#') {
04319             if (IS_EVSTR(p, pend)) *q++ = '\\';
04320             *q++ = '#';
04321         }
04322         else if (c == '\n') {
04323             *q++ = '\\';
04324             *q++ = 'n';
04325         }
04326         else if (c == '\r') {
04327             *q++ = '\\';
04328             *q++ = 'r';
04329         }
04330         else if (c == '\t') {
04331             *q++ = '\\';
04332             *q++ = 't';
04333         }
04334         else if (c == '\f') {
04335             *q++ = '\\';
04336             *q++ = 'f';
04337         }
04338         else if (c == '\013') {
04339             *q++ = '\\';
04340             *q++ = 'v';
04341         }
04342         else if (c == '\010') {
04343             *q++ = '\\';
04344             *q++ = 'b';
04345         }
04346         else if (c == '\007') {
04347             *q++ = '\\';
04348             *q++ = 'a';
04349         }
04350         else if (c == '\033') {
04351             *q++ = '\\';
04352             *q++ = 'e';
04353         }
04354         else if (ISPRINT(c)) {
04355             *q++ = c;
04356         }
04357         else {
                  /* The backslash is shared by the \u{...} and \xNN forms. */
04358             *q++ = '\\';
04359             if (u8) {
04360                 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04361                 if (MBCLEN_CHARFOUND_P(n)) {
04362                     int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04363                     p += n;     /* skip the remaining bytes of the character */
04364                     snprintf(q, qend-q, "u{%x}", cc);
04365                     q += strlen(q);
04366                     continue;
04367                 }
04368             }
04369             snprintf(q, qend-q, "x%02X", c);
04370             q += 3;             /* 'x' plus two hex digits */
04371         }
04372     }
04373     *q++ = '"';
04374     *q = '\0';
04375     if (!rb_enc_asciicompat(enc)) {
              /* The encoding name is appended as a force_encoding call and
               * the result itself is tagged with the ASCII-8BIT encoding. */
04376         snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04377         enc = rb_ascii8bit_encoding();
04378     }
04379     OBJ_INFECT(result, str);
04380     /* result from dump is ASCII */
04381     rb_enc_associate(result, enc);
04382     ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04383     return result;
04384 }
04385 
04386 
04387 static void
04388 rb_str_check_dummy_enc(rb_encoding *enc)
04389 {
04390     if (rb_enc_dummy_p(enc)) {
04391         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04392                  rb_enc_name(enc));
04393     }
04394 }
04395 
04396 /*
04397  *  call-seq:
04398  *     str.upcase!   -> str or nil
04399  *
04400  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
04401  *  were made.
04402  *  Note: case replacement is effective only in ASCII region.
04403  */
04404 
04405 static VALUE
04406 rb_str_upcase_bang(VALUE str)
04407 {
04408     rb_encoding *enc;
04409     char *s, *send;
04410     int modify = 0;
04411     int n;
04412 
04413     str_modify_keep_cr(str);
04414     enc = STR_ENC_GET(str);
04415     rb_str_check_dummy_enc(enc);
04416     s = RSTRING_PTR(str); send = RSTRING_END(str);
04417     if (single_byte_optimizable(str)) {
04418         while (s < send) {
04419             unsigned int c = *(unsigned char*)s;
04420 
04421             if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04422                 *s = 'A' + (c - 'a');
04423                 modify = 1;
04424             }
04425             s++;
04426         }
04427     }
04428     else {
04429         int ascompat = rb_enc_asciicompat(enc);
04430 
04431         while (s < send) {
04432             unsigned int c;
04433 
04434             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04435                 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04436                     *s = 'A' + (c - 'a');
04437                     modify = 1;
04438                 }
04439                 s++;
04440             }
04441             else {
04442                 c = rb_enc_codepoint_len(s, send, &n, enc);
04443                 if (rb_enc_islower(c, enc)) {
04444                     /* assuming toupper returns codepoint with same size */
04445                     rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04446                     modify = 1;
04447                 }
04448                 s += n;
04449             }
04450         }
04451     }
04452 
04453     if (modify) return str;
04454     return Qnil;
04455 }
04456 
04457 
04458 /*
04459  *  call-seq:
04460  *     str.upcase   -> new_str
04461  *
04462  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
04463  *  uppercase counterparts. The operation is locale insensitive---only
04464  *  characters ``a'' to ``z'' are affected.
04465  *  Note: case replacement is effective only in ASCII region.
04466  *
04467  *     "hEllO".upcase   #=> "HELLO"
04468  */
04469 
04470 static VALUE
04471 rb_str_upcase(VALUE str)
04472 {
04473     str = rb_str_dup(str);
04474     rb_str_upcase_bang(str);
04475     return str;
04476 }
04477 
04478 
04479 /*
04480  *  call-seq:
04481  *     str.downcase!   -> str or nil
04482  *
04483  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
04484  *  changes were made.
04485  *  Note: case replacement is effective only in ASCII region.
04486  */
04487 
04488 static VALUE
04489 rb_str_downcase_bang(VALUE str)
04490 {
04491     rb_encoding *enc;
04492     char *s, *send;
04493     int modify = 0;
04494 
04495     str_modify_keep_cr(str);
04496     enc = STR_ENC_GET(str);
04497     rb_str_check_dummy_enc(enc);
04498     s = RSTRING_PTR(str); send = RSTRING_END(str);
04499     if (single_byte_optimizable(str)) {
04500         while (s < send) {
04501             unsigned int c = *(unsigned char*)s;
04502 
04503             if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04504                 *s = 'a' + (c - 'A');
04505                 modify = 1;
04506             }
04507             s++;
04508         }
04509     }
04510     else {
04511         int ascompat = rb_enc_asciicompat(enc);
04512 
04513         while (s < send) {
04514             unsigned int c;
04515             int n;
04516 
04517             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04518                 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04519                     *s = 'a' + (c - 'A');
04520                     modify = 1;
04521                 }
04522                 s++;
04523             }
04524             else {
04525                 c = rb_enc_codepoint_len(s, send, &n, enc);
04526                 if (rb_enc_isupper(c, enc)) {
04527                     /* assuming toupper returns codepoint with same size */
04528                     rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04529                     modify = 1;
04530                 }
04531                 s += n;
04532             }
04533         }
04534     }
04535 
04536     if (modify) return str;
04537     return Qnil;
04538 }
04539 
04540 
04541 /*
04542  *  call-seq:
04543  *     str.downcase   -> new_str
04544  *
04545  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
04546  *  lowercase counterparts. The operation is locale insensitive---only
04547  *  characters ``A'' to ``Z'' are affected.
04548  *  Note: case replacement is effective only in ASCII region.
04549  *
04550  *     "hEllO".downcase   #=> "hello"
04551  */
04552 
04553 static VALUE
04554 rb_str_downcase(VALUE str)
04555 {
04556     str = rb_str_dup(str);
04557     rb_str_downcase_bang(str);
04558     return str;
04559 }
04560 
04561 
04562 /*
04563  *  call-seq:
04564  *     str.capitalize!   -> str or nil
04565  *
04566  *  Modifies <i>str</i> by converting the first character to uppercase and the
04567  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
04568  *  Note: case conversion is effective only in ASCII region.
04569  *
04570  *     a = "hello"
04571  *     a.capitalize!   #=> "Hello"
04572  *     a               #=> "Hello"
04573  *     a.capitalize!   #=> nil
04574  */
04575 
04576 static VALUE
04577 rb_str_capitalize_bang(VALUE str)
04578 {
04579     rb_encoding *enc;
04580     char *s, *send;
04581     int modify = 0;
04582     unsigned int c;
04583     int n;
04584 
04585     str_modify_keep_cr(str);
04586     enc = STR_ENC_GET(str);
04587     rb_str_check_dummy_enc(enc);
04588     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04589     s = RSTRING_PTR(str); send = RSTRING_END(str);
04590 
04591     c = rb_enc_codepoint_len(s, send, &n, enc);
04592     if (rb_enc_islower(c, enc)) {
04593         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04594         modify = 1;
04595     }
04596     s += n;
04597     while (s < send) {
04598         c = rb_enc_codepoint_len(s, send, &n, enc);
04599         if (rb_enc_isupper(c, enc)) {
04600             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04601             modify = 1;
04602         }
04603         s += n;
04604     }
04605 
04606     if (modify) return str;
04607     return Qnil;
04608 }
04609 
04610 
04611 /*
04612  *  call-seq:
04613  *     str.capitalize   -> new_str
04614  *
04615  *  Returns a copy of <i>str</i> with the first character converted to uppercase
04616  *  and the remainder to lowercase.
04617  *  Note: case conversion is effective only in ASCII region.
04618  *
04619  *     "hello".capitalize    #=> "Hello"
04620  *     "HELLO".capitalize    #=> "Hello"
04621  *     "123ABC".capitalize   #=> "123abc"
04622  */
04623 
04624 static VALUE
04625 rb_str_capitalize(VALUE str)
04626 {
04627     str = rb_str_dup(str);
04628     rb_str_capitalize_bang(str);
04629     return str;
04630 }
04631 
04632 
04633 /*
04634  *  call-seq:
04635  *     str.swapcase!   -> str or nil
04636  *
04637  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
04638  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
04639  *  Note: case conversion is effective only in ASCII region.
04640  */
04641 
04642 static VALUE
04643 rb_str_swapcase_bang(VALUE str)
04644 {
04645     rb_encoding *enc;
04646     char *s, *send;
04647     int modify = 0;
04648     int n;
04649 
04650     str_modify_keep_cr(str);
04651     enc = STR_ENC_GET(str);
04652     rb_str_check_dummy_enc(enc);
04653     s = RSTRING_PTR(str); send = RSTRING_END(str);
04654     while (s < send) {
04655         unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04656 
04657         if (rb_enc_isupper(c, enc)) {
04658             /* assuming toupper returns codepoint with same size */
04659             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04660             modify = 1;
04661         }
04662         else if (rb_enc_islower(c, enc)) {
04663             /* assuming tolower returns codepoint with same size */
04664             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04665             modify = 1;
04666         }
04667         s += n;
04668     }
04669 
04670     if (modify) return str;
04671     return Qnil;
04672 }
04673 
04674 
04675 /*
04676  *  call-seq:
04677  *     str.swapcase   -> new_str
04678  *
04679  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
04680  *  to lowercase and lowercase characters converted to uppercase.
04681  *  Note: case conversion is effective only in ASCII region.
04682  *
04683  *     "Hello".swapcase          #=> "hELLO"
04684  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
04685  */
04686 
04687 static VALUE
04688 rb_str_swapcase(VALUE str)
04689 {
04690     str = rb_str_dup(str);
04691     rb_str_swapcase_bang(str);
04692     return str;
04693 }
04694 
04695 typedef unsigned char *USTR;    /* pointer to unsigned bytes */
04696 
      /* Cursor over a tr-style pattern string; advanced one codepoint at a
       * time by trnext(). */
04697 struct tr {
04698     int gen;                    /* nonzero while a "lo-hi" range is being expanded */
04699     unsigned int now, max;      /* now: codepoint most recently produced; max: upper bound of the current range */
04700     char *p, *pend;             /* read position in the pattern, and the end of the pattern */
04701 };
04702 
04703 static unsigned int
04704 trnext(struct tr *t, rb_encoding *enc)
04705 {
          /*
           * Returns the next codepoint described by the pattern behind t,
           * expanding a "lo-hi" range one codepoint per call.  Returns
           * (unsigned int)-1 once the pattern is exhausted, and raises
           * ArgumentError when a range's lower bound exceeds its upper bound.
           */
04706     int n;
04707 
04708     for (;;) {
04709         if (!t->gen) {
                  /* Not inside a range: read the next item from the pattern. */
04710             if (t->p == t->pend) return -1;
                  /* A backslash that is not the last byte is skipped, so the
                   * character after it is taken literally. */
04711             if (t->p < t->pend - 1 && *t->p == '\\') {
04712                 t->p++;
04713             }
04714             t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04715             t->p += n;
                  /* A '-' that is not the last byte introduces a range. */
04716             if (t->p < t->pend - 1 && *t->p == '-') {
04717                 t->p++;
04718                 if (t->p < t->pend) {
04719                     unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04720                     t->p += n;
04721                     if (t->now > c) {
                              /* The bounds are only echoed in the message
                               * when both are below 0x80. */
04722                         if (t->now < 0x80 && c < 0x80) {
04723                             rb_raise(rb_eArgError,
04724                                      "invalid range \"%c-%c\" in string transliteration",
04725                                      t->now, c);
04726                         }
04727                         else {
04728                             rb_raise(rb_eArgError, "invalid range in string transliteration");
04729                         }
04730                         continue; /* not reached */
04731                     }
                          /* Remember the upper bound; later calls take the
                           * range branch below. */
04732                     t->gen = 1;
04733                     t->max = c;
04734                 }
04735             }
04736             return t->now;
04737         }
              /* Inside a range: step towards max.
               * NOTE(review): when both bounds are equal the codepoint is
               * returned once above and once more from the final branch. */
04738         else if (++t->now < t->max) {
04739             return t->now;
04740         }
04741         else {
                  /* Upper bound reached: leave range mode and return it. */
04742             t->gen = 0;
04743             return t->max;
04744         }
04745     }
04746 }
04747 
04748 static VALUE rb_str_delete_bang(int,VALUE*,VALUE); /* forward declaration; tr_trans below calls it when the replacement string is empty */
04749 
04750 static VALUE
04751 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
04752 {
04753     const unsigned int errc = -1;
04754     unsigned int trans[256];
04755     rb_encoding *enc, *e1, *e2;
04756     struct tr trsrc, trrepl;
04757     int cflag = 0;
04758     unsigned int c, c0;
04759     int last = 0, modify = 0, i, l;
04760     char *s, *send;
04761     VALUE hash = 0;
04762     int singlebyte = single_byte_optimizable(str);
04763     int cr;
04764 
04765 #define CHECK_IF_ASCII(c) \
04766     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
04767            (cr = ENC_CODERANGE_VALID) : 0)
04768 
04769     StringValue(src);
04770     StringValue(repl);
04771     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04772     if (RSTRING_LEN(repl) == 0) {
04773         return rb_str_delete_bang(1, &src, str);
04774     }
04775 
04776     cr = ENC_CODERANGE(str);
04777     e1 = rb_enc_check(str, src);
04778     e2 = rb_enc_check(str, repl);
04779     if (e1 == e2) {
04780         enc = e1;
04781     }
04782     else {
04783         enc = rb_enc_check(src, repl);
04784     }
04785     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
04786     if (RSTRING_LEN(src) > 1 &&
04787         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
04788         trsrc.p + l < trsrc.pend) {
04789         cflag = 1;
04790         trsrc.p += l;
04791     }
04792     trrepl.p = RSTRING_PTR(repl);
04793     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
04794     trsrc.gen = trrepl.gen = 0;
04795     trsrc.now = trrepl.now = 0;
04796     trsrc.max = trrepl.max = 0;
04797 
04798     if (cflag) {
04799         for (i=0; i<256; i++) {
04800             trans[i] = 1;
04801         }
04802         while ((c = trnext(&trsrc, enc)) != errc) {
04803             if (c < 256) {
04804                 trans[c] = errc;
04805             }
04806             else {
04807                 if (!hash) hash = rb_hash_new();
04808                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
04809             }
04810         }
04811         while ((c = trnext(&trrepl, enc)) != errc)
04812             /* retrieve last replacer */;
04813         last = trrepl.now;
04814         for (i=0; i<256; i++) {
04815             if (trans[i] != errc) {
04816                 trans[i] = last;
04817             }
04818         }
04819     }
04820     else {
04821         unsigned int r;
04822 
04823         for (i=0; i<256; i++) {
04824             trans[i] = errc;
04825         }
04826         while ((c = trnext(&trsrc, enc)) != errc) {
04827             r = trnext(&trrepl, enc);
04828             if (r == errc) r = trrepl.now;
04829             if (c < 256) {
04830                 trans[c] = r;
04831                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
04832             }
04833             else {
04834                 if (!hash) hash = rb_hash_new();
04835                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
04836             }
04837         }
04838     }
04839 
04840     if (cr == ENC_CODERANGE_VALID)
04841         cr = ENC_CODERANGE_7BIT;
04842     str_modify_keep_cr(str);
04843     s = RSTRING_PTR(str); send = RSTRING_END(str);
04844     if (sflag) {
04845         int clen, tlen;
04846         long offset, max = RSTRING_LEN(str);
04847         unsigned int save = -1;
04848         char *buf = ALLOC_N(char, max), *t = buf;
04849 
04850         while (s < send) {
04851             int may_modify = 0;
04852 
04853             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
04854             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
04855 
04856             s += clen;
04857             if (c < 256) {
04858                 c = trans[c];
04859             }
04860             else if (hash) {
04861                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
04862                 if (NIL_P(tmp)) {
04863                     if (cflag) c = last;
04864                     else c = errc;
04865                 }
04866                 else if (cflag) c = errc;
04867                 else c = NUM2INT(tmp);
04868             }
04869             else {
04870                 c = errc;
04871             }
04872             if (c != (unsigned int)-1) {
04873                 if (save == c) {
04874                     CHECK_IF_ASCII(c);
04875                     continue;
04876                 }
04877                 save = c;
04878                 tlen = rb_enc_codelen(c, enc);
04879                 modify = 1;
04880             }
04881             else {
04882                 save = -1;
04883                 c = c0;
04884                 if (enc != e1) may_modify = 1;
04885             }
04886             while (t - buf + tlen >= max) {
04887                 offset = t - buf;
04888                 max *= 2;
04889                 REALLOC_N(buf, char, max);
04890                 t = buf + offset;
04891             }
04892             rb_enc_mbcput(c, t, enc);
04893             if (may_modify && memcmp(s, t, tlen) != 0) {
04894                 modify = 1;
04895             }
04896             CHECK_IF_ASCII(c);
04897             t += tlen;
04898         }
04899         *t = '\0';
04900         RSTRING(str)->as.heap.ptr = buf;
04901         RSTRING(str)->as.heap.len = t - buf;
04902         STR_SET_NOEMBED(str);
04903         RSTRING(str)->as.heap.aux.capa = max;
04904     }
04905     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
04906         while (s < send) {
04907             c = (unsigned char)*s;
04908             if (trans[c] != errc) {
04909                 if (!cflag) {
04910                     c = trans[c];
04911                     *s = c;
04912                     modify = 1;
04913                 }
04914                 else {
04915                     *s = last;
04916                     modify = 1;
04917                 }
04918             }
04919             CHECK_IF_ASCII(c);
04920             s++;
04921         }
04922     }
04923     else {
04924         int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
04925         long offset;
04926         char *buf = ALLOC_N(char, max), *t = buf;
04927 
04928         while (s < send) {
04929             int may_modify = 0;
04930             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
04931             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
04932 
04933             if (c < 256) {
04934                 c = trans[c];
04935             }
04936             else if (hash) {
04937                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
04938                 if (NIL_P(tmp)) {
04939                     if (cflag) c = last;
04940                     else c = errc;
04941                 }
04942                 else if (cflag) c = errc;
04943                 else c = NUM2INT(tmp);
04944             }
04945             else {
04946                 c = errc;
04947             }
04948             if (c != errc) {
04949                 tlen = rb_enc_codelen(c, enc);
04950                 modify = 1;
04951             }
04952             else {
04953                 c = c0;
04954                 if (enc != e1) may_modify = 1;
04955             }
04956             while (t - buf + tlen >= max) {
04957                 offset = t - buf;
04958                 max *= 2;
04959                 REALLOC_N(buf, char, max);
04960                 t = buf + offset;
04961             }
04962             if (s != t) {
04963                 rb_enc_mbcput(c, t, enc);
04964                 if (may_modify && memcmp(s, t, tlen) != 0) {
04965                     modify = 1;
04966                 }
04967             }
04968             CHECK_IF_ASCII(c);
04969             s += clen;
04970             t += tlen;
04971         }
04972         if (!STR_EMBED_P(str)) {
04973             xfree(RSTRING(str)->as.heap.ptr);
04974         }
04975         *t = '\0';
04976         RSTRING(str)->as.heap.ptr = buf;
04977         RSTRING(str)->as.heap.len = t - buf;
04978         STR_SET_NOEMBED(str);
04979         RSTRING(str)->as.heap.aux.capa = max;
04980     }
04981 
04982     if (modify) {
04983         if (cr != ENC_CODERANGE_BROKEN)
04984             ENC_CODERANGE_SET(str, cr);
04985         rb_enc_associate(str, enc);
04986         return str;
04987     }
04988     return Qnil;
04989 }
04990 
04991 
04992 /*
04993  *  call-seq:
04994  *     str.tr!(from_str, to_str)   -> str or nil
04995  *
04996  *  Translates <i>str</i> in place, using the same rules as
04997  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
04998  *  changes were made.
04999  */
05000 
05001 static VALUE
05002 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05003 {
05004     return tr_trans(str, src, repl, 0);
05005 }
05006 
05007 
05008 /*
05009  *  call-seq:
05010  *     str.tr(from_str, to_str)   -> new_str
05011  *
05012  *  Returns a copy of <i>str</i> with the characters in <i>from_str</i> replaced
05013  *  by the corresponding characters in <i>to_str</i>. If <i>to_str</i> is
05014  *  shorter than <i>from_str</i>, it is padded with its last character. Both
05015  *  strings may use the c1-c2 notation to denote ranges of characters, and
05016  *  <i>from_str</i> may start with a <code>^</code>, which denotes all
05017  *  characters except those listed.
05018  *
05019  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
05020  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
05021  *     "hello".tr('el', 'ip')      #=> "hippo"
05022  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
05023  */
05024 
05025 static VALUE
05026 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05027 {
05028     str = rb_str_dup(str);
05029     tr_trans(str, src, repl, 0);
05030     return str;
05031 }
05032 
05033 static void
05034 tr_setup_table(VALUE str, char stable[256], int first,
05035                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05036 {
05037     const unsigned int errc = -1;
05038     char buf[256];
05039     struct tr tr;
05040     unsigned int c;
05041     VALUE table = 0, ptable = 0;
05042     int i, l, cflag = 0;
05043 
05044     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05045     tr.gen = tr.now = tr.max = 0;
05046 
05047     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05048         cflag = 1;
05049         tr.p += l;
05050 
05051         table = rb_hash_new();
05052         ptable = *ctablep;
05053         *ctablep = table;
05054     }
05055     else {
05056         table = rb_hash_new();
05057         ptable = *tablep;
05058         *tablep = table;
05059     }
05060     if (first) {
05061         for (i=0; i<256; i++) {
05062             stable[i] = 1;
05063         }
05064     }
05065     for (i=0; i<256; i++) {
05066         buf[i] = cflag;
05067     }
05068 
05069     while ((c = trnext(&tr, enc)) != errc) {
05070         if (c < 256) {
05071             buf[c & 0xff] = !cflag;
05072         }
05073         else {
05074             VALUE key = UINT2NUM(c);
05075 
05076             if (!table) {
05077                 table = rb_hash_new();
05078                 ptable = *tablep;
05079                 *tablep = table;
05080             }
05081             if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05082                 rb_hash_aset(table, key, Qtrue);
05083             }
05084         }
05085     }
05086     for (i=0; i<256; i++) {
05087         stable[i] = stable[i] && buf[i];
05088     }
05089 }
05090 
05091 
05092 static int
05093 tr_find(unsigned int c, char table[256], VALUE del, VALUE nodel)
05094 {
05095     if (c < 256) {
05096         return table[c] != 0;
05097     }
05098     else {
05099         VALUE v = UINT2NUM(c);
05100 
05101         if (del) {
05102             if (!NIL_P(rb_hash_lookup(del, v)) &&
05103                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05104                 return TRUE;
05105             }
05106         }
05107         else if (nodel && NIL_P(rb_hash_lookup(nodel, v))) {
05108             return TRUE;
05109         }
05110         return FALSE;
05111     }
05112 }
05113 
05114 /*
05115  *  call-seq:
05116  *     str.delete!([other_str]+)   -> str or nil
05117  *
05118  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
05119  *  <code>nil</code> if <i>str</i> was not modified.
05120  */
05121 
05122 static VALUE
05123 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05124 {
05125     char squeez[256];
05126     rb_encoding *enc = 0;
05127     char *s, *send, *t;
05128     VALUE del = 0, nodel = 0;
05129     int modify = 0;
05130     int i, ascompat, cr;
05131 
05132     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05133     if (argc < 1) {
05134         rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05135     }
05136     for (i=0; i<argc; i++) {
05137         VALUE s = argv[i];
05138 
05139         StringValue(s);
05140         enc = rb_enc_check(str, s);
05141         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05142     }
05143 
05144     str_modify_keep_cr(str);
05145     ascompat = rb_enc_asciicompat(enc);
05146     s = t = RSTRING_PTR(str);
05147     send = RSTRING_END(str);
05148     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05149     while (s < send) {
05150         unsigned int c;
05151         int clen;
05152 
05153         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05154             if (squeez[c]) {
05155                 modify = 1;
05156             }
05157             else {
05158                 if (t != s) *t = c;
05159                 t++;
05160             }
05161             s++;
05162         }
05163         else {
05164             c = rb_enc_codepoint_len(s, send, &clen, enc);
05165 
05166             if (tr_find(c, squeez, del, nodel)) {
05167                 modify = 1;
05168             }
05169             else {
05170                 if (t != s) rb_enc_mbcput(c, t, enc);
05171                 t += clen;
05172                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05173             }
05174             s += clen;
05175         }
05176     }
05177     *t = '\0';
05178     STR_SET_LEN(str, t - RSTRING_PTR(str));
05179     ENC_CODERANGE_SET(str, cr);
05180 
05181     if (modify) return str;
05182     return Qnil;
05183 }
05184 
05185 
05186 /*
05187  *  call-seq:
05188  *     str.delete([other_str]+)   -> new_str
05189  *
05190  *  Returns a copy of <i>str</i> with all characters in the intersection of its
05191  *  arguments deleted. Uses the same rules for building the set of characters as
05192  *  <code>String#count</code>.
05193  *
05194  *     "hello".delete "l","lo"        #=> "heo"
05195  *     "hello".delete "lo"            #=> "he"
05196  *     "hello".delete "aeiou", "^e"   #=> "hell"
05197  *     "hello".delete "ej-m"          #=> "ho"
05198  */
05199 
05200 static VALUE
05201 rb_str_delete(int argc, VALUE *argv, VALUE str)
05202 {
05203     str = rb_str_dup(str);
05204     rb_str_delete_bang(argc, argv, str);
05205     return str;
05206 }
05207 
05208 
05209 /*
05210  *  call-seq:
05211  *     str.squeeze!([other_str]*)   -> str or nil
05212  *
05213  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
05214  *  <code>nil</code> if no changes were made.
05215  */
05216 
05217 static VALUE
05218 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05219 {
05220     char squeez[256];
05221     rb_encoding *enc = 0;
05222     VALUE del = 0, nodel = 0;
05223     char *s, *send, *t;
05224     int i, modify = 0;
05225     int ascompat, singlebyte = single_byte_optimizable(str);
05226     unsigned int save;
05227 
05228     if (argc == 0) {
05229         enc = STR_ENC_GET(str);
05230     }
05231     else {
05232         for (i=0; i<argc; i++) {
05233             VALUE s = argv[i];
05234 
05235             StringValue(s);
05236             enc = rb_enc_check(str, s);
05237             if (singlebyte && !single_byte_optimizable(s))
05238                 singlebyte = 0;
05239             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05240         }
05241     }
05242 
05243     str_modify_keep_cr(str);
05244     s = t = RSTRING_PTR(str);
05245     if (!s || RSTRING_LEN(str) == 0) return Qnil;
05246     send = RSTRING_END(str);
05247     save = -1;
05248     ascompat = rb_enc_asciicompat(enc);
05249 
05250     if (singlebyte) {
05251         while (s < send) {
05252             unsigned int c = *(unsigned char*)s++;
05253             if (c != save || (argc > 0 && !squeez[c])) {
05254                 *t++ = save = c;
05255             }
05256         }
05257     } else {
05258         while (s < send) {
05259             unsigned int c;
05260             int clen;
05261 
05262             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05263                 if (c != save || (argc > 0 && !squeez[c])) {
05264                     *t++ = save = c;
05265                 }
05266                 s++;
05267             }
05268             else {
05269                 c = rb_enc_codepoint_len(s, send, &clen, enc);
05270 
05271                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05272                     if (t != s) rb_enc_mbcput(c, t, enc);
05273                     save = c;
05274                     t += clen;
05275                 }
05276                 s += clen;
05277             }
05278         }
05279     }
05280 
05281     *t = '\0';
05282     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05283         STR_SET_LEN(str, t - RSTRING_PTR(str));
05284         modify = 1;
05285     }
05286 
05287     if (modify) return str;
05288     return Qnil;
05289 }
05290 
05291 
05292 /*
05293  *  call-seq:
05294  *     str.squeeze([other_str]*)    -> new_str
05295  *
05296  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
05297  *  procedure described for <code>String#count</code>. Returns a new string
05298  *  where runs of the same character that occur in this set are replaced by a
05299  *  single character. If no arguments are given, all runs of identical
05300  *  characters are replaced by a single character.
05301  *
05302  *     "yellow moon".squeeze                  #=> "yelow mon"
05303  *     "  now   is  the".squeeze(" ")         #=> " now is the"
05304  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
05305  */
05306 
05307 static VALUE
05308 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05309 {
05310     str = rb_str_dup(str);
05311     rb_str_squeeze_bang(argc, argv, str);
05312     return str;
05313 }
05314 
05315 
05316 /*
05317  *  call-seq:
05318  *     str.tr_s!(from_str, to_str)   -> str or nil
05319  *
05320  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
05321  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
05322  */
05323 
05324 static VALUE
05325 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05326 {
05327     return tr_trans(str, src, repl, 1);
05328 }
05329 
05330 
05331 /*
05332  *  call-seq:
05333  *     str.tr_s(from_str, to_str)   -> new_str
05334  *
05335  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
05336  *  then removes duplicate characters in regions that were affected by the
05337  *  translation.
05338  *
05339  *     "hello".tr_s('l', 'r')     #=> "hero"
05340  *     "hello".tr_s('el', '*')    #=> "h*o"
05341  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
05342  */
05343 
05344 static VALUE
05345 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05346 {
05347     str = rb_str_dup(str);
05348     tr_trans(str, src, repl, 1);
05349     return str;
05350 }
05351 
05352 
05353 /*
05354  *  call-seq:
05355  *     str.count([other_str]+)   -> fixnum
05356  *
05357  *  Each <i>other_str</i> parameter defines a set of characters to count.  The
05358  *  intersection of these sets defines the characters to count in
05359  *  <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
05360  *  negated. The sequence c1-c2 means all characters between c1 and c2.
05361  *
05362  *     a = "hello world"
05363  *     a.count "lo"            #=> 5
05364  *     a.count "lo", "o"       #=> 2
05365  *     a.count "hello", "^l"   #=> 4
05366  *     a.count "ej-m"          #=> 4
05367  */
05368 
05369 static VALUE
05370 rb_str_count(int argc, VALUE *argv, VALUE str)
05371 {
05372     char table[256];
05373     rb_encoding *enc = 0;
05374     VALUE del = 0, nodel = 0;
05375     char *s, *send;
05376     int i;
05377     int ascompat;
05378 
05379     if (argc < 1) {
05380         rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05381     }
05382     for (i=0; i<argc; i++) {
05383         VALUE tstr = argv[i];
05384         unsigned char c;
05385 
05386         StringValue(tstr);
05387         enc = rb_enc_check(str, tstr);
05388         if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05389             (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05390             int n = 0;
05391 
05392             s = RSTRING_PTR(str);
05393             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05394             send = RSTRING_END(str);
05395             while (s < send) {
05396                 if (*(unsigned char*)s++ == c) n++;
05397             }
05398             return INT2NUM(n);
05399         }
05400         tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05401     }
05402 
05403     s = RSTRING_PTR(str);
05404     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05405     send = RSTRING_END(str);
05406     ascompat = rb_enc_asciicompat(enc);
05407     i = 0;
05408     while (s < send) {
05409         unsigned int c;
05410 
05411         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05412             if (table[c]) {
05413                 i++;
05414             }
05415             s++;
05416         }
05417         else {
05418             int clen;
05419             c = rb_enc_codepoint_len(s, send, &clen, enc);
05420             if (tr_find(c, table, del, nodel)) {
05421                 i++;
05422             }
05423             s += clen;
05424         }
05425     }
05426 
05427     return INT2NUM(i);
05428 }
05429 
/*
 * Byte classification table behind ascii_isspace(): entry is 1 for
 * 0x09..0x0d (TAB, LF, VT, FF, CR) and 0x20 (space), 0 for every other
 * byte value, including all bytes >= 0x80.
 */
static const char isspacetable[256] = {
    /* 0x00-0x0f */ 0,0,0,0, 0,0,0,0, 0,1,1,1, 1,1,0,0,
    /* 0x10-0x1f */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x20-0x2f */ 1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x30-0x3f */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x40-0x4f */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x50-0x5f */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x60-0x6f */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x70-0x7f */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x80-0x8f */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x90-0x9f */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xa0-0xaf */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xb0-0xbf */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xc0-0xcf */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xd0-0xdf */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xe0-0xef */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xf0-0xff */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
};

/* The cast keeps the index in 0..255 even where plain char is signed. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
05450 
05451 /*
05452  *  call-seq:
05453  *     str.split(pattern=$;, [limit])   -> anArray
05454  *
05455  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
05456  *  of these substrings.
05457  *
05458  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
05459  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
05460  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
05461  *  of contiguous whitespace characters ignored.
05462  *
05463  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
05464  *  pattern matches. Whenever the pattern matches a zero-length string,
05465  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
05466  *  groups, the respective matches will be returned in the array as well.
05467  *
05468  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
05469  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
05470  *  split on whitespace as if ` ' were specified.
05471  *
05472  *  If the <i>limit</i> parameter is omitted, trailing null fields are
05473  *  suppressed. If <i>limit</i> is a positive number, at most that number of
05474  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
05475  *  string is returned as the only entry in an array). If negative, there is no
05476  *  limit to the number of fields returned, and trailing null fields are not
05477  *  suppressed.
05478  *
05479  *     " now's  the time".split        #=> ["now's", "the", "time"]
05480  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
05481  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
05482  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
05483  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
05484  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
05485  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
05486  *
05487  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
05488  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
05489  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
05490  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
05491  */
05492 
05493 static VALUE
05494 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05495 {
05496     rb_encoding *enc;
05497     VALUE spat;
05498     VALUE limit;
05499     enum {awk, string, regexp} split_type;
05500     long beg, end, i = 0;
05501     int lim = 0;
05502     VALUE result, tmp;
05503 
05504     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05505         lim = NUM2INT(limit);
05506         if (lim <= 0) limit = Qnil;
05507         else if (lim == 1) {
05508             if (RSTRING_LEN(str) == 0)
05509                 return rb_ary_new2(0);
05510             return rb_ary_new3(1, str);
05511         }
05512         i = 1;
05513     }
05514 
05515     enc = STR_ENC_GET(str);
05516     if (NIL_P(spat)) {
05517         if (!NIL_P(rb_fs)) {
05518             spat = rb_fs;
05519             goto fs_set;
05520         }
05521         split_type = awk;
05522     }
05523     else {
05524       fs_set:
05525         if (TYPE(spat) == T_STRING) {
05526             rb_encoding *enc2 = STR_ENC_GET(spat);
05527 
05528             split_type = string;
05529             if (RSTRING_LEN(spat) == 0) {
05530                 /* Special case - split into chars */
05531                 spat = rb_reg_regcomp(spat);
05532                 split_type = regexp;
05533             }
05534             else if (rb_enc_asciicompat(enc2) == 1) {
05535                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05536                     split_type = awk;
05537                 }
05538             }
05539             else {
05540                 int l;
05541                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05542                     RSTRING_LEN(spat) == l) {
05543                     split_type = awk;
05544                 }
05545             }
05546         }
05547         else {
05548             spat = get_pat(spat, 1);
05549             split_type = regexp;
05550         }
05551     }
05552 
05553     result = rb_ary_new();
05554     beg = 0;
05555     if (split_type == awk) {
05556         char *ptr = RSTRING_PTR(str);
05557         char *eptr = RSTRING_END(str);
05558         char *bptr = ptr;
05559         int skip = 1;
05560         unsigned int c;
05561 
05562         end = beg;
05563         if (is_ascii_string(str)) {
05564             while (ptr < eptr) {
05565                 c = (unsigned char)*ptr++;
05566                 if (skip) {
05567                     if (ascii_isspace(c)) {
05568                         beg = ptr - bptr;
05569                     }
05570                     else {
05571                         end = ptr - bptr;
05572                         skip = 0;
05573                         if (!NIL_P(limit) && lim <= i) break;
05574                     }
05575                 }
05576                 else if (ascii_isspace(c)) {
05577                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05578                     skip = 1;
05579                     beg = ptr - bptr;
05580                     if (!NIL_P(limit)) ++i;
05581                 }
05582                 else {
05583                     end = ptr - bptr;
05584                 }
05585             }
05586         }
05587         else {
05588             while (ptr < eptr) {
05589                 int n;
05590 
05591                 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05592                 ptr += n;
05593                 if (skip) {
05594                     if (rb_isspace(c)) {
05595                         beg = ptr - bptr;
05596                     }
05597                     else {
05598                         end = ptr - bptr;
05599                         skip = 0;
05600                         if (!NIL_P(limit) && lim <= i) break;
05601                     }
05602                 }
05603                 else if (rb_isspace(c)) {
05604                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05605                     skip = 1;
05606                     beg = ptr - bptr;
05607                     if (!NIL_P(limit)) ++i;
05608                 }
05609                 else {
05610                     end = ptr - bptr;
05611                 }
05612             }
05613         }
05614     }
05615     else if (split_type == string) {
05616         char *ptr = RSTRING_PTR(str);
05617         char *temp = ptr;
05618         char *eptr = RSTRING_END(str);
05619         char *sptr = RSTRING_PTR(spat);
05620         long slen = RSTRING_LEN(spat);
05621 
05622         if (is_broken_string(str)) {
05623             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05624         }
05625         if (is_broken_string(spat)) {
05626             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05627         }
05628         enc = rb_enc_check(str, spat);
05629         while (ptr < eptr &&
05630                (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05631             /* Check we are at the start of a char */
05632             char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05633             if (t != ptr + end) {
05634                 ptr = t;
05635                 continue;
05636             }
05637             rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05638             ptr += end + slen;
05639             if (!NIL_P(limit) && lim <= ++i) break;
05640         }
05641         beg = ptr - temp;
05642     }
05643     else {
05644         char *ptr = RSTRING_PTR(str);
05645         long len = RSTRING_LEN(str);
05646         long start = beg;
05647         long idx;
05648         int last_null = 0;
05649         struct re_registers *regs;
05650 
05651         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05652             regs = RMATCH_REGS(rb_backref_get());
05653             if (start == end && BEG(0) == END(0)) {
05654                 if (!ptr) {
05655                     rb_ary_push(result, str_new_empty(str));
05656                     break;
05657                 }
05658                 else if (last_null == 1) {
05659                     rb_ary_push(result, rb_str_subseq(str, beg,
05660                                                       rb_enc_fast_mbclen(ptr+beg,
05661                                                                          ptr+len,
05662                                                                          enc)));
05663                     beg = start;
05664                 }
05665                 else {
05666                     if (ptr+start == ptr+len)
05667                         start++;
05668                     else
05669                         start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05670                     last_null = 1;
05671                     continue;
05672                 }
05673             }
05674             else {
05675                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05676                 beg = start = END(0);
05677             }
05678             last_null = 0;
05679 
05680             for (idx=1; idx < regs->num_regs; idx++) {
05681                 if (BEG(idx) == -1) continue;
05682                 if (BEG(idx) == END(idx))
05683                     tmp = str_new_empty(str);
05684                 else
05685                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05686                 rb_ary_push(result, tmp);
05687             }
05688             if (!NIL_P(limit) && lim <= ++i) break;
05689         }
05690     }
05691     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
05692         if (RSTRING_LEN(str) == beg)
05693             tmp = str_new_empty(str);
05694         else
05695             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05696         rb_ary_push(result, tmp);
05697     }
05698     if (NIL_P(limit) && lim == 0) {
05699         long len;
05700         while ((len = RARRAY_LEN(result)) > 0 &&
05701                (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05702             rb_ary_pop(result);
05703     }
05704 
05705     return result;
05706 }
05707 
05708 VALUE
05709 rb_str_split(VALUE str, const char *sep0)
05710 {
05711     VALUE sep;
05712 
05713     StringValue(str);
05714     sep = rb_str_new2(sep0);
05715     return rb_str_split_m(1, &sep, str);
05716 }
05717 
05718 
05719 /*
05720  *  call-seq:
05721  *     str.each_line(separator=$/) {|substr| block }   -> str
05722  *     str.each_line(separator=$/)                     -> an_enumerator
05723  *
05724  *     str.lines(separator=$/) {|substr| block }       -> str
05725  *     str.lines(separator=$/)                         -> an_enumerator
05726  *
05727  *  Splits <i>str</i> using the supplied parameter as the record separator
05728  *  (<code>$/</code> by default), passing each substring in turn to the supplied
05729  *  block. If a zero-length record separator is supplied, the string is split
05730  *  into paragraphs delimited by multiple successive newlines.
05731  *
05732  *  If no block is given, an enumerator is returned instead.
05733  *
05734  *     print "Example one\n"
05735  *     "hello\nworld".each_line {|s| p s}
05736  *     print "Example two\n"
05737  *     "hello\nworld".each_line('l') {|s| p s}
05738  *     print "Example three\n"
05739  *     "hello\n\n\nworld".each_line('') {|s| p s}
05740  *
05741  *  <em>produces:</em>
05742  *
05743  *     Example one
05744  *     "hello\n"
05745  *     "world"
05746  *     Example two
05747  *     "hel"
05748  *     "l"
05749  *     "o\nworl"
05750  *     "d"
05751  *     Example three
05752  *     "hello\n\n\n"
05753  *     "world"
05754  */
05755 
05756 static VALUE
05757 rb_str_each_line(int argc, VALUE *argv, VALUE str)
05758 {
05759     rb_encoding *enc;
05760     VALUE rs;
05761     unsigned int newline;
05762     const char *p, *pend, *s, *ptr;
05763     long len, rslen;
05764     VALUE line;
05765     int n;
05766     VALUE orig = str;
05767 
05768     if (argc == 0) {
05769         rs = rb_rs;
05770     }
05771     else {
05772         rb_scan_args(argc, argv, "01", &rs);
05773     }
05774     RETURN_ENUMERATOR(str, argc, argv);
05775     if (NIL_P(rs)) {
05776         rb_yield(str);
05777         return orig;
05778     }
05779     str = rb_str_new4(str);
05780     ptr = p = s = RSTRING_PTR(str);
05781     pend = p + RSTRING_LEN(str);
05782     len = RSTRING_LEN(str);
05783     StringValue(rs);
05784     if (rs == rb_default_rs) {
05785         enc = rb_enc_get(str);
05786         while (p < pend) {
05787             char *p0;
05788 
05789             p = memchr(p, '\n', pend - p);
05790             if (!p) break;
05791             p0 = rb_enc_left_char_head(s, p, pend, enc);
05792             if (!rb_enc_is_newline(p0, pend, enc)) {
05793                 p++;
05794                 continue;
05795             }
05796             p = p0 + rb_enc_mbclen(p0, pend, enc);
05797             line = rb_str_new5(str, s, p - s);
05798             OBJ_INFECT(line, str);
05799             rb_enc_cr_str_copy_for_substr(line, str);
05800             rb_yield(line);
05801             str_mod_check(str, ptr, len);
05802             s = p;
05803         }
05804         goto finish;
05805     }
05806 
05807     enc = rb_enc_check(str, rs);
05808     rslen = RSTRING_LEN(rs);
05809     if (rslen == 0) {
05810         newline = '\n';
05811     }
05812     else {
05813         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
05814     }
05815 
05816     while (p < pend) {
05817         unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
05818 
05819       again:
05820         if (rslen == 0 && c == newline) {
05821             p += n;
05822             if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
05823                 goto again;
05824             }
05825             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
05826                 p += n;
05827             }
05828             p -= n;
05829         }
05830         if (c == newline &&
05831             (rslen <= 1 || memcmp(RSTRING_PTR(rs), p, rslen) == 0)) {
05832             line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
05833             OBJ_INFECT(line, str);
05834             rb_enc_cr_str_copy_for_substr(line, str);
05835             rb_yield(line);
05836             str_mod_check(str, ptr, len);
05837             s = p + (rslen ? rslen : n);
05838         }
05839         p += n;
05840     }
05841 
05842   finish:
05843     if (s != pend) {
05844         line = rb_str_new5(str, s, pend - s);
05845         OBJ_INFECT(line, str);
05846         rb_enc_cr_str_copy_for_substr(line, str);
05847         rb_yield(line);
05848     }
05849 
05850     return orig;
05851 }
05852 
05853 
05854 /*
05855  *  call-seq:
05856  *     str.bytes {|fixnum| block }        -> str
05857  *     str.bytes                          -> an_enumerator
05858  *
05859  *     str.each_byte {|fixnum| block }    -> str
05860  *     str.each_byte                      -> an_enumerator
05861  *
05862  *  Passes each byte in <i>str</i> to the given block, or returns
05863  *  an enumerator if no block is given.
05864  *
05865  *     "hello".each_byte {|c| print c, ' ' }
05866  *
05867  *  <em>produces:</em>
05868  *
05869  *     104 101 108 108 111
05870  */
05871 
05872 static VALUE
05873 rb_str_each_byte(VALUE str)
05874 {
05875     long i;
05876 
05877     RETURN_ENUMERATOR(str, 0, 0);
05878     for (i=0; i<RSTRING_LEN(str); i++) {
05879         rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
05880     }
05881     return str;
05882 }
05883 
05884 
05885 /*
05886  *  call-seq:
05887  *     str.chars {|cstr| block }        -> str
05888  *     str.chars                        -> an_enumerator
05889  *
05890  *     str.each_char {|cstr| block }    -> str
05891  *     str.each_char                    -> an_enumerator
05892  *
05893  *  Passes each character in <i>str</i> to the given block, or returns
05894  *  an enumerator if no block is given.
05895  *
05896  *     "hello".each_char {|c| print c, ' ' }
05897  *
05898  *  <em>produces:</em>
05899  *
05900  *     h e l l o
05901  */
05902 
05903 static VALUE
05904 rb_str_each_char(VALUE str)
05905 {
05906     VALUE orig = str;
05907     long i, len, n;
05908     const char *ptr;
05909     rb_encoding *enc;
05910 
05911     RETURN_ENUMERATOR(str, 0, 0);
05912     str = rb_str_new4(str);
05913     ptr = RSTRING_PTR(str);
05914     len = RSTRING_LEN(str);
05915     enc = rb_enc_get(str);
05916     switch (ENC_CODERANGE(str)) {
05917       case ENC_CODERANGE_VALID:
05918       case ENC_CODERANGE_7BIT:
05919         for (i = 0; i < len; i += n) {
05920             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
05921             rb_yield(rb_str_subseq(str, i, n));
05922         }
05923         break;
05924       default:
05925         for (i = 0; i < len; i += n) {
05926             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
05927             rb_yield(rb_str_subseq(str, i, n));
05928         }
05929     }
05930     return orig;
05931 }
05932 
05933 /*
05934  *  call-seq:
05935  *     str.codepoints {|integer| block }        -> str
05936  *     str.codepoints                           -> an_enumerator
05937  *
05938  *     str.each_codepoint {|integer| block }    -> str
05939  *     str.each_codepoint                       -> an_enumerator
05940  *
05941  *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>
05942  *  (also known as a <i>codepoint</i> when applied to Unicode strings) to the
05943  *  given block.
05944  *
05945  *  If no block is given, an enumerator is returned instead.
05946  *
05947  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
05948  *
05949  *  <em>produces:</em>
05950  *
05951  *     104 101 108 108 111 1593
05952  */
05953 
05954 static VALUE
05955 rb_str_each_codepoint(VALUE str)
05956 {
05957     VALUE orig = str;
05958     long len;
05959     int n;
05960     unsigned int c;
05961     const char *ptr, *end;
05962     rb_encoding *enc;
05963 
05964     if (single_byte_optimizable(str)) return rb_str_each_byte(str);
05965     RETURN_ENUMERATOR(str, 0, 0);
05966     str = rb_str_new4(str);
05967     ptr = RSTRING_PTR(str);
05968     len = RSTRING_LEN(str);
05969     end = RSTRING_END(str);
05970     enc = STR_ENC_GET(str);
05971     while (ptr < end) {
05972         c = rb_enc_codepoint_len(ptr, end, &n, enc);
05973         rb_yield(UINT2NUM(c));
05974         ptr += n;
05975     }
05976     return orig;
05977 }
05978 
05979 static long
05980 chopped_length(VALUE str)
05981 {
05982     rb_encoding *enc = STR_ENC_GET(str);
05983     const char *p, *p2, *beg, *end;
05984 
05985     beg = RSTRING_PTR(str);
05986     end = beg + RSTRING_LEN(str);
05987     if (beg > end) return 0;
05988     p = rb_enc_prev_char(beg, end, end, enc);
05989     if (!p) return 0;
05990     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
05991         p2 = rb_enc_prev_char(beg, p, end, enc);
05992         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
05993     }
05994     return p - beg;
05995 }
05996 
05997 /*
05998  *  call-seq:
05999  *     str.chop!   -> str or nil
06000  *
06001  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
06002  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
06003  *  <code>String#chomp!</code>.
06004  */
06005 
06006 static VALUE
06007 rb_str_chop_bang(VALUE str)
06008 {
06009     str_modify_keep_cr(str);
06010     if (RSTRING_LEN(str) > 0) {
06011         long len;
06012         len = chopped_length(str);
06013         STR_SET_LEN(str, len);
06014         RSTRING_PTR(str)[len] = '\0';
06015         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06016             ENC_CODERANGE_CLEAR(str);
06017         }
06018         return str;
06019     }
06020     return Qnil;
06021 }
06022 
06023 
06024 /*
06025  *  call-seq:
06026  *     str.chop   -> new_str
06027  *
06028  *  Returns a new <code>String</code> with the last character removed.  If the
06029  *  string ends with <code>\r\n</code>, both characters are removed. Applying
06030  *  <code>chop</code> to an empty string returns an empty
06031  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
06032  *  the string unchanged if it doesn't end in a record separator.
06033  *
06034  *     "string\r\n".chop   #=> "string"
06035  *     "string\n\r".chop   #=> "string\n"
06036  *     "string\n".chop     #=> "string"
06037  *     "string".chop       #=> "strin"
06038  *     "x".chop.chop       #=> ""
06039  */
06040 
06041 static VALUE
06042 rb_str_chop(VALUE str)
06043 {
06044     VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06045     rb_enc_cr_str_copy_for_substr(str2, str);
06046     OBJ_INFECT(str2, str);
06047     return str2;
06048 }
06049 
06050 
06051 /*
06052  *  call-seq:
06053  *     str.chomp!(separator=$/)   -> str or nil
06054  *
06055  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
06056  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
06057  */
06058 
06059 static VALUE
06060 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06061 {
      /*
       * Removes a trailing record separator from str in place.
       *
       * The separator is rb_rs when no argument is given, otherwise the
       * optional argument.  Returns str when bytes were removed and Qnil
       * otherwise (empty receiver, nil separator, separator longer than
       * the receiver, broken separator string, or no match at the end).
       *
       * Three strategies are used:
       *   - smart_chomp: default separator, or a one-byte "\n" separator;
       *     strips a trailing "\n", "\r" or "\r\n".
       *   - empty separator: strips every trailing "\n", each optionally
       *     preceded by "\r".
       *   - anything else: strips one exact copy of the separator.
       */
06062     rb_encoding *enc;
06063     VALUE rs;
06064     int newline;              /* last byte of the separator */
06065     char *p, *pp, *e;         /* p: start of str, e: end (moved back while chomping), pp: scratch */
06066     long len, rslen;
06067 
06068     str_modify_keep_cr(str);
06069     len = RSTRING_LEN(str);
06070     if (len == 0) return Qnil;
06071     p = RSTRING_PTR(str);
06072     e = p + len;
06073     if (argc == 0) {
06074         rs = rb_rs;
06075         if (rs == rb_default_rs) {
06076           smart_chomp:        /* also entered via goto below for a one-byte "\n" separator */
06077             enc = rb_enc_get(str);
06078             if (rb_enc_mbminlen(enc) > 1) {
                      /* Encoding whose minimum character length exceeds one byte:
                       * work on whole characters. */
06079                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06080                 if (rb_enc_is_newline(pp, e, enc)) {
06081                     e = pp;
06082                 }
                      /* Then drop a '\r' that now sits at the end, if any. */
06083                 pp = e - rb_enc_mbminlen(enc);
06084                 if (pp >= p) {
06085                     pp = rb_enc_left_char_head(p, pp, e, enc);
06086                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06087                         e = pp;
06088                     }
06089                 }
06090                 if (e == RSTRING_END(str)) {
06091                     return Qnil;      /* e never moved: nothing was removed */
06092                 }
06093                 len = e - RSTRING_PTR(str);
06094                 STR_SET_LEN(str, len);
06095             }
06096             else {
                      /* Byte-wise: "\n" (optionally preceded by "\r") or a lone "\r". */
06097                 if (RSTRING_PTR(str)[len-1] == '\n') {
06098                     STR_DEC_LEN(str);
06099                     if (RSTRING_LEN(str) > 0 &&
06100                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06101                         STR_DEC_LEN(str);
06102                     }
06103                 }
06104                 else if (RSTRING_PTR(str)[len-1] == '\r') {
06105                     STR_DEC_LEN(str);
06106                 }
06107                 else {
06108                     return Qnil;
06109                 }
06110             }
06111             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';    /* re-terminate at the new length */
06112             return str;
06113         }
06114     }
06115     else {
06116         rb_scan_args(argc, argv, "01", &rs);
06117     }
06118     if (NIL_P(rs)) return Qnil;
06119     StringValue(rs);
06120     rslen = RSTRING_LEN(rs);
06121     if (rslen == 0) {
              /* Empty separator: strip all trailing "\n" / "\r\n" sequences. */
06122         while (len>0 && p[len-1] == '\n') {
06123             len--;
06124             if (len>0 && p[len-1] == '\r')
06125                 len--;
06126         }
06127         if (len < RSTRING_LEN(str)) {
06128             STR_SET_LEN(str, len);
06129             RSTRING_PTR(str)[len] = '\0';
06130             return str;
06131         }
06132         return Qnil;
06133     }
06134     if (rslen > len) return Qnil;
06135     newline = RSTRING_PTR(rs)[rslen-1];
06136     if (rslen == 1 && newline == '\n')
06137         goto smart_chomp;
06138 
06139     enc = rb_enc_check(str, rs);
06140     if (is_broken_string(rs)) {
06141         return Qnil;
06142     }
06143     pp = e - rslen;           /* where the separator would have to start */
          /* Compare the last byte first, then the whole separator. */
06144     if (p[len-1] == newline &&
06145         (rslen <= 1 ||
06146          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
              /* Give up when rb_enc_left_char_head() does not report pp itself
               * as a character head. */
06147         if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06148             return Qnil;
              /* A 7bit code range is kept; any other is cleared. */
06149         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06150             ENC_CODERANGE_CLEAR(str);
06151         }
06152         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06153         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06154         return str;
06155     }
06156     return Qnil;
06157 }
06158 
06159 
06160 /*
06161  *  call-seq:
06162  *     str.chomp(separator=$/)   -> new_str
06163  *
06164  *  Returns a new <code>String</code> with the given record separator removed
06165  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
06166  *  changed from the default Ruby record separator, then <code>chomp</code> also
06167  *  removes carriage return characters (that is it will remove <code>\n</code>,
06168  *  <code>\r</code>, and <code>\r\n</code>).
06169  *
06170  *     "hello".chomp            #=> "hello"
06171  *     "hello\n".chomp          #=> "hello"
06172  *     "hello\r\n".chomp        #=> "hello"
06173  *     "hello\n\r".chomp        #=> "hello\n"
06174  *     "hello\r".chomp          #=> "hello"
06175  *     "hello \n there".chomp   #=> "hello \n there"
06176  *     "hello".chomp("llo")     #=> "he"
06177  */
06178 
06179 static VALUE
06180 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06181 {
06182     str = rb_str_dup(str);
06183     rb_str_chomp_bang(argc, argv, str);
06184     return str;
06185 }
06186 
06187 /*
06188  *  call-seq:
06189  *     str.lstrip!   -> self or nil
06190  *
06191  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
06192  *  change was made. See also <code>String#rstrip!</code> and
06193  *  <code>String#strip!</code>.
06194  *
06195  *     "  hello  ".lstrip   #=> "hello  "
06196  *     "hello".lstrip!      #=> nil
06197  */
06198 
06199 static VALUE
06200 rb_str_lstrip_bang(VALUE str)
06201 {
06202     rb_encoding *enc;
06203     char *s, *t, *e;
06204 
06205     str_modify_keep_cr(str);
06206     enc = STR_ENC_GET(str);
06207     s = RSTRING_PTR(str);
06208     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06209     e = t = RSTRING_END(str);
06210     /* remove spaces at head */
06211     while (s < e) {
06212         int n;
06213         unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06214 
06215         if (!rb_isspace(cc)) break;
06216         s += n;
06217     }
06218 
06219     if (s > RSTRING_PTR(str)) {
06220         STR_SET_LEN(str, t-s);
06221         memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06222         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06223         return str;
06224     }
06225     return Qnil;
06226 }
06227 
06228 
06229 /*
06230  *  call-seq:
06231  *     str.lstrip   -> new_str
06232  *
06233  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
06234  *  <code>String#rstrip</code> and <code>String#strip</code>.
06235  *
06236  *     "  hello  ".lstrip   #=> "hello  "
06237  *     "hello".lstrip       #=> "hello"
06238  */
06239 
06240 static VALUE
06241 rb_str_lstrip(VALUE str)
06242 {
06243     str = rb_str_dup(str);
06244     rb_str_lstrip_bang(str);
06245     return str;
06246 }
06247 
06248 
06249 /*
06250  *  call-seq:
06251  *     str.rstrip!   -> self or nil
06252  *
06253  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
06254  *  no change was made. See also <code>String#lstrip!</code> and
06255  *  <code>String#strip!</code>.
06256  *
06257  *     "  hello  ".rstrip   #=> "  hello"
06258  *     "hello".rstrip!      #=> nil
06259  */
06260 
06261 static VALUE
06262 rb_str_rstrip_bang(VALUE str)
06263 {
06264     rb_encoding *enc;
06265     char *s, *t, *e;
06266 
06267     str_modify_keep_cr(str);
06268     enc = STR_ENC_GET(str);
06269     rb_str_check_dummy_enc(enc);
06270     s = RSTRING_PTR(str);
06271     if (!s || RSTRING_LEN(str) == 0) return Qnil;
06272     t = e = RSTRING_END(str);
06273 
06274     /* remove trailing spaces or '\0's */
06275     if (single_byte_optimizable(str)) {
06276         unsigned char c;
06277         while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06278     }
06279     else {
06280         char *tp;
06281 
06282         while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06283             unsigned int c = rb_enc_codepoint(tp, e, enc);
06284             if (c && !rb_isspace(c)) break;
06285             t = tp;
06286         }
06287     }
06288     if (t < e) {
06289         long len = t-RSTRING_PTR(str);
06290 
06291         STR_SET_LEN(str, len);
06292         RSTRING_PTR(str)[len] = '\0';
06293         return str;
06294     }
06295     return Qnil;
06296 }
06297 
06298 
06299 /*
06300  *  call-seq:
06301  *     str.rstrip   -> new_str
06302  *
06303  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
06304  *  <code>String#lstrip</code> and <code>String#strip</code>.
06305  *
06306  *     "  hello  ".rstrip   #=> "  hello"
06307  *     "hello".rstrip       #=> "hello"
06308  */
06309 
06310 static VALUE
06311 rb_str_rstrip(VALUE str)
06312 {
06313     str = rb_str_dup(str);
06314     rb_str_rstrip_bang(str);
06315     return str;
06316 }
06317 
06318 
06319 /*
06320  *  call-seq:
06321  *     str.strip!   -> str or nil
06322  *
06323  *  Removes leading and trailing whitespace from <i>str</i>. Returns
06324  *  <code>nil</code> if <i>str</i> was not altered.
06325  */
06326 
06327 static VALUE
06328 rb_str_strip_bang(VALUE str)
06329 {
06330     VALUE l = rb_str_lstrip_bang(str);
06331     VALUE r = rb_str_rstrip_bang(str);
06332 
06333     if (NIL_P(l) && NIL_P(r)) return Qnil;
06334     return str;
06335 }
06336 
06337 
06338 /*
06339  *  call-seq:
06340  *     str.strip   -> new_str
06341  *
06342  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
06343  *
06344  *     "    hello    ".strip   #=> "hello"
06345  *     "\tgoodbye\r\n".strip   #=> "goodbye"
06346  */
06347 
06348 static VALUE
06349 rb_str_strip(VALUE str)
06350 {
06351     str = rb_str_dup(str);
06352     rb_str_strip_bang(str);
06353     return str;
06354 }
06355 
06356 static VALUE
06357 scan_once(VALUE str, VALUE pat, long *start)
06358 {
06359     VALUE result, match;
06360     struct re_registers *regs;
06361     int i;
06362 
06363     if (rb_reg_search(pat, str, *start, 0) >= 0) {
06364         match = rb_backref_get();
06365         regs = RMATCH_REGS(match);
06366         if (BEG(0) == END(0)) {
06367             rb_encoding *enc = STR_ENC_GET(str);
06368             /*
06369              * Always consume at least one character of the input string
06370              */
06371             if (RSTRING_LEN(str) > END(0))
06372                 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
06373                                                    RSTRING_END(str), enc);
06374             else
06375                 *start = END(0)+1;
06376         }
06377         else {
06378             *start = END(0);
06379         }
06380         if (regs->num_regs == 1) {
06381             return rb_reg_nth_match(0, match);
06382         }
06383         result = rb_ary_new2(regs->num_regs);
06384         for (i=1; i < regs->num_regs; i++) {
06385             rb_ary_push(result, rb_reg_nth_match(i, match));
06386         }
06387 
06388         return result;
06389     }
06390     return Qnil;
06391 }
06392 
06393 
06394 /*
06395  *  call-seq:
06396  *     str.scan(pattern)                         -> array
06397  *     str.scan(pattern) {|match, ...| block }   -> str
06398  *
06399  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
06400  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
06401  *  generated and either added to the result array or passed to the block. If
06402  *  the pattern contains no groups, each individual result consists of the
06403  *  matched string, <code>$&</code>.  If the pattern contains groups, each
06404  *  individual result is itself an array containing one entry per group.
06405  *
06406  *     a = "cruel world"
06407  *     a.scan(/\w+/)        #=> ["cruel", "world"]
06408  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
06409  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
06410  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
06411  *
06412  *  And the block form:
06413  *
06414  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
06415  *     print "\n"
06416  *     a.scan(/(.)(.)/) {|x,y| print y, x }
06417  *     print "\n"
06418  *
06419  *  <em>produces:</em>
06420  *
06421  *     <<cruel>> <<world>>
06422  *     rceu lowlr
06423  */
06424 
06425 static VALUE
06426 rb_str_scan(VALUE str, VALUE pat)
06427 {
06428     VALUE result;
06429     long start = 0;
06430     long last = -1, prev = 0;
06431     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
06432 
06433     pat = get_pat(pat, 1);
06434     if (!rb_block_given_p()) {
06435         VALUE ary = rb_ary_new();
06436 
06437         while (!NIL_P(result = scan_once(str, pat, &start))) {
06438             last = prev;
06439             prev = start;
06440             rb_ary_push(ary, result);
06441         }
06442         if (last >= 0) rb_reg_search(pat, str, last, 0);
06443         return ary;
06444     }
06445 
06446     while (!NIL_P(result = scan_once(str, pat, &start))) {
06447         last = prev;
06448         prev = start;
06449         rb_yield(result);
06450         str_mod_check(str, p, len);
06451     }
06452     if (last >= 0) rb_reg_search(pat, str, last, 0);
06453     return str;
06454 }
06455 
06456 
06457 /*
06458  *  call-seq:
06459  *     str.hex   -> integer
06460  *
06461  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
06462  *  (with an optional sign and an optional <code>0x</code>) and returns the
06463  *  corresponding number. Zero is returned on error.
06464  *
06465  *     "0x0a".hex     #=> 10
06466  *     "-1234".hex    #=> -4660
06467  *     "0".hex        #=> 0
06468  *     "wombat".hex   #=> 0
06469  */
06470 
06471 static VALUE
06472 rb_str_hex(VALUE str)
06473 {
06474     rb_encoding *enc = rb_enc_get(str);
06475 
06476     if (!rb_enc_asciicompat(enc)) {
06477         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06478     }
06479     return rb_str_to_inum(str, 16, FALSE);
06480 }
06481 
06482 
06483 /*
06484  *  call-seq:
06485  *     str.oct   -> integer
06486  *
06487  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
06488  *  optional sign) and returns the corresponding number.  Returns 0 if the
06489  *  conversion fails.
06490  *
06491  *     "123".oct       #=> 83
06492  *     "-377".oct      #=> -255
06493  *     "bad".oct       #=> 0
06494  *     "0377bad".oct   #=> 255
06495  */
06496 
06497 static VALUE
06498 rb_str_oct(VALUE str)
06499 {
06500     rb_encoding *enc = rb_enc_get(str);
06501 
06502     if (!rb_enc_asciicompat(enc)) {
06503         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06504     }
06505     return rb_str_to_inum(str, -8, FALSE);
06506 }
06507 
06508 
06509 /*
06510  *  call-seq:
06511  *     str.crypt(other_str)   -> new_str
06512  *
06513  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
06514  *  library function <code>crypt</code>. The argument is the salt string, which
06515  *  should be two characters long, each character drawn from
06516  *  <code>[a-zA-Z0-9./]</code>.
06517  */
06518 
06519 static VALUE
06520 rb_str_crypt(VALUE str, VALUE salt)
06521 {
06522     extern char *crypt(const char *, const char *);
06523     VALUE result;
06524     const char *s, *saltp;
06525 #ifdef BROKEN_CRYPT
06526     char salt_8bit_clean[3];
06527 #endif
06528 
06529     StringValue(salt);
06530     if (RSTRING_LEN(salt) < 2)
06531         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06532 
06533     s = RSTRING_PTR(str);
06534     if (!s) s = "";
06535     saltp = RSTRING_PTR(salt);
06536 #ifdef BROKEN_CRYPT
06537     if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06538         salt_8bit_clean[0] = saltp[0] & 0x7f;
06539         salt_8bit_clean[1] = saltp[1] & 0x7f;
06540         salt_8bit_clean[2] = '\0';
06541         saltp = salt_8bit_clean;
06542     }
06543 #endif
06544     result = rb_str_new2(crypt(s, saltp));
06545     OBJ_INFECT(result, str);
06546     OBJ_INFECT(result, salt);
06547     return result;
06548 }
06549 
06550 
06551 /*
06552  *  call-seq:
06553  *     str.intern   -> symbol
06554  *     str.to_sym   -> symbol
06555  *
06556  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
06557  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
06558  *
06559  *     "Koala".intern         #=> :Koala
06560  *     s = 'cat'.to_sym       #=> :cat
06561  *     s == :cat              #=> true
06562  *     s = '@cat'.to_sym      #=> :@cat
06563  *     s == :@cat             #=> true
06564  *
06565  *  This can also be used to create symbols that cannot be represented using the
06566  *  <code>:xxx</code> notation.
06567  *
06568  *     'cat and dog'.to_sym   #=> :"cat and dog"
06569  */
06570 
06571 VALUE
06572 rb_str_intern(VALUE s)
06573 {
06574     VALUE str = RB_GC_GUARD(s);
06575     ID id;
06576 
06577     id = rb_intern_str(str);
06578     return ID2SYM(id);
06579 }
06580 
06581 
06582 /*
06583  *  call-seq:
06584  *     str.ord   -> integer
06585  *
06586  *  Return the <code>Integer</code> ordinal of a one-character string.
06587  *
06588  *     "a".ord         #=> 97
06589  */
06590 
06591 VALUE
06592 rb_str_ord(VALUE s)
06593 {
06594     unsigned int c;
06595 
06596     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06597     return UINT2NUM(c);
06598 }
06599 /*
06600  *  call-seq:
06601  *     str.sum(n=16)   -> integer
06602  *
06603  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
06604  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
06605  *  to 16. The result is simply the sum of the binary value of each byte in
06606  *  <i>str</i> modulo <code>2**n</code> (that is, masked with
06607  *  <code>2**n - 1</code>). This is not a particularly good checksum.
06608  */
06609 
06610 static VALUE
06611 rb_str_sum(int argc, VALUE *argv, VALUE str)
06612 {
06613     VALUE vbits;
06614     int bits;
06615     char *ptr, *p, *pend;
06616     long len;
06617     VALUE sum = INT2FIX(0);
06618     unsigned long sum0 = 0;
06619 
06620     if (argc == 0) {
06621         bits = 16;
06622     }
06623     else {
06624         rb_scan_args(argc, argv, "01", &vbits);
06625         bits = NUM2INT(vbits);
06626     }
06627     ptr = p = RSTRING_PTR(str);
06628     len = RSTRING_LEN(str);
06629     pend = p + len;
06630 
06631     while (p < pend) {
06632         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06633             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06634             str_mod_check(str, ptr, len);
06635             sum0 = 0;
06636         }
06637         sum0 += (unsigned char)*p;
06638         p++;
06639     }
06640 
06641     if (bits == 0) {
06642         if (sum0) {
06643             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06644         }
06645     }
06646     else {
06647         if (sum == INT2FIX(0)) {
06648             if (bits < (int)sizeof(long)*CHAR_BIT) {
06649                 sum0 &= (((unsigned long)1)<<bits)-1;
06650             }
06651             sum = LONG2FIX(sum0);
06652         }
06653         else {
06654             VALUE mod;
06655 
06656             if (sum0) {
06657                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06658             }
06659 
06660             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06661             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06662             sum = rb_funcall(sum, '&', 1, mod);
06663         }
06664     }
06665     return sum;
06666 }
06667 
/*
 * Shared implementation of the ljust / rjust / center wrappers.
 *
 * argv holds the target width and an optional pad string (default " ").
 * jflag selects where the padding goes: 'l' puts all of it on the right,
 * 'r' puts all of it on the left, and any other value splits it, giving
 * the left side n/2 characters and the right side the rest.
 *
 * Returns rb_str_dup(str) when width is negative or not larger than the
 * character length of str; otherwise returns a new padded string.
 * Raises ArgumentError for an empty pad string ("zero width padding") or
 * when the result size would overflow ("argument too big").
 */
06668 static VALUE
06669 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06670 {
06671     rb_encoding *enc;
06672     VALUE w;
06673     long width, len, flen = 1, fclen = 1;     /* flen: pad length in bytes, fclen: in characters */
06674     VALUE res;
06675     char *p;                                  /* write cursor into res */
06676     const char *f = " ";                      /* pad bytes; a single space by default */
06677     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;   /* llen/rlen: pad characters per side;
                                                           llen2/rlen2: bytes of the partial pad copy */
06678     volatile VALUE pad;
06679     int singlebyte = 1, cr;
06680 
06681     rb_scan_args(argc, argv, "11", &w, &pad);
06682     enc = STR_ENC_GET(str);
06683     width = NUM2LONG(w);
06684     if (argc == 2) {
06685         StringValue(pad);
06686         enc = rb_enc_check(str, pad);
06687         f = RSTRING_PTR(pad);
06688         flen = RSTRING_LEN(pad);
06689         fclen = str_strlen(pad, enc);
06690         singlebyte = single_byte_optimizable(pad);
06691         if (flen == 0 || fclen == 0) {
06692             rb_raise(rb_eArgError, "zero width padding");
06693         }
06694     }
06695     len = str_strlen(str, enc);
06696     if (width < 0 || len >= width) return rb_str_dup(str);
06697     n = width - len;          /* total number of pad characters needed */
06698     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06699     rlen = n - llen;
06700     cr = ENC_CODERANGE(str);
06701     if (flen > 1) {
             /* Left-over characters after whole pad copies, passed through
              * str_offset(); the result is used below as a byte count. */
06702        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06703        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06704     }
06705     size = RSTRING_LEN(str);
          /* Compute the pad size in bytes step by step, checking each step
           * against LONG_MAX; len is reused as the accumulator. */
06706     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06707        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06708        (len += llen2 + rlen2) >= LONG_MAX - size) {
06709        rb_raise(rb_eArgError, "argument too big");
06710     }
06711     len += size;              /* total byte length of the result */
06712     res = rb_str_new5(str, 0, len);
06713     p = RSTRING_PTR(res);
          /* Left padding. */
06714     if (flen <= 1) {
06715        memset(p, *f, llen);
06716        p += llen;
06717     }
06718     else {
06719        while (llen >= fclen) {
06720             memcpy(p,f,flen);
06721             p += flen;
06722             llen -= fclen;
06723         }
06724        if (llen > 0) {
06725            memcpy(p, f, llen2);
06726            p += llen2;
06727         }
06728     }
          /* The original contents. */
06729     memcpy(p, RSTRING_PTR(str), size);
06730     p += size;
          /* Right padding, built the same way as the left. */
06731     if (flen <= 1) {
06732        memset(p, *f, rlen);
06733        p += rlen;
06734     }
06735     else {
06736        while (rlen >= fclen) {
06737             memcpy(p,f,flen);
06738             p += flen;
06739             rlen -= fclen;
06740         }
06741        if (rlen > 0) {
06742            memcpy(p, f, rlen2);
06743            p += rlen2;
06744         }
06745     }
06746     *p = '\0';
06747     STR_SET_LEN(res, p-RSTRING_PTR(res));
06748     OBJ_INFECT(res, str);
06749     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
06750     rb_enc_associate(res, enc);
          /* Combine the code ranges of str and pad; a broken result is left unset. */
06751     if (argc == 2)
06752         cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
06753     if (cr != ENC_CODERANGE_BROKEN)
06754         ENC_CODERANGE_SET(res, cr);
06755     return res;
06756 }
06757 
06758 
06759 /*
06760  *  call-seq:
06761  *     str.ljust(integer, padstr=' ')   -> new_str
06762  *
06763  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
06764  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
06765  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
06766  *
06767  *     "hello".ljust(4)            #=> "hello"
06768  *     "hello".ljust(20)           #=> "hello               "
06769  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
06770  */
06771 
06772 static VALUE
06773 rb_str_ljust(int argc, VALUE *argv, VALUE str)
06774 {
06775     return rb_str_justify(argc, argv, str, 'l');
06776 }
06777 
06778 
06779 /*
06780  *  call-seq:
06781  *     str.rjust(integer, padstr=' ')   -> new_str
06782  *
06783  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
06784  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
06785  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
06786  *
06787  *     "hello".rjust(4)            #=> "hello"
06788  *     "hello".rjust(20)           #=> "               hello"
06789  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
06790  */
06791 
06792 static VALUE
06793 rb_str_rjust(int argc, VALUE *argv, VALUE str)
06794 {
06795     return rb_str_justify(argc, argv, str, 'r');
06796 }
06797 
06798 
06799 /*
06800  *  call-seq:
06801  *     str.center(integer, padstr=' ')   -> new_str
06802  *
06803  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
06804  *  <code>String</code> of length <i>integer</i> with <i>str</i> centered and
06805  *  padded with <i>padstr</i>; otherwise, returns <i>str</i>.
06806  *
06807  *     "hello".center(4)         #=> "hello"
06808  *     "hello".center(20)        #=> "       hello        "
06809  *     "hello".center(20, '123') #=> "1231231hello12312312"
06810  */
06811 
06812 static VALUE
06813 rb_str_center(int argc, VALUE *argv, VALUE str)
06814 {
06815     return rb_str_justify(argc, argv, str, 'c');
06816 }
06817 
06818 /*
06819  *  call-seq:
06820  *     str.partition(sep)              -> [head, sep, tail]
06821  *     str.partition(regexp)           -> [head, match, tail]
06822  *
06823  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
06824  *  and returns the part before it, the match, and the part
06825  *  after it.
06826  *  If it is not found, returns two empty strings and <i>str</i>.
06827  *
06828  *     "hello".partition("l")         #=> ["he", "l", "lo"]
06829  *     "hello".partition("x")         #=> ["hello", "", ""]
06830  *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
06831  */
06832 
06833 static VALUE
06834 rb_str_partition(VALUE str, VALUE sep)
06835 {
06836     long pos;
06837     int regex = FALSE;
06838 
06839     if (TYPE(sep) == T_REGEXP) {
06840         pos = rb_reg_search(sep, str, 0, 0);
06841         regex = TRUE;
06842     }
06843     else {
06844         VALUE tmp;
06845 
06846         tmp = rb_check_string_type(sep);
06847         if (NIL_P(tmp)) {
06848             rb_raise(rb_eTypeError, "type mismatch: %s given",
06849                      rb_obj_classname(sep));
06850         }
06851         sep = tmp;
06852         pos = rb_str_index(str, sep, 0);
06853     }
06854     if (pos < 0) {
06855       failed:
06856         return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
06857     }
06858     if (regex) {
06859         sep = rb_str_subpat(str, sep, INT2FIX(0));
06860         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
06861     }
06862     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
06863                           sep,
06864                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
06865                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
06866 }
06867 
06868 /*
06869  *  call-seq:
06870  *     str.rpartition(sep)             -> [head, sep, tail]
06871  *     str.rpartition(regexp)          -> [head, match, tail]
06872  *
06873  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
06874  *  of the string, and returns the part before it, the match, and the part
06875  *  after it.
06876  *  If it is not found, returns two empty strings and <i>str</i>.
06877  *
06878  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
06879  *     "hello".rpartition("x")         #=> ["", "", "hello"]
06880  *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
06881  */
06882 
06883 static VALUE
06884 rb_str_rpartition(VALUE str, VALUE sep)
06885 {
06886     long pos = RSTRING_LEN(str);
06887     int regex = FALSE;
06888 
06889     if (TYPE(sep) == T_REGEXP) {
06890         pos = rb_reg_search(sep, str, pos, 1);
06891         regex = TRUE;
06892     }
06893     else {
06894         VALUE tmp;
06895 
06896         tmp = rb_check_string_type(sep);
06897         if (NIL_P(tmp)) {
06898             rb_raise(rb_eTypeError, "type mismatch: %s given",
06899                      rb_obj_classname(sep));
06900         }
06901         sep = tmp;
06902         pos = rb_str_sublen(str, pos);
06903         pos = rb_str_rindex(str, sep, pos);
06904     }
06905     if (pos < 0) {
06906         return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
06907     }
06908     if (regex) {
06909         sep = rb_reg_nth_match(0, rb_backref_get());
06910     }
06911     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
06912                           sep,
06913                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
06914 }
06915 
06916 /*
06917  *  call-seq:
06918  *     str.start_with?([prefix]+)   -> true or false
06919  *
06920  *  Returns true if <i>str</i> starts with a prefix given.
06921  *
06922  *    p "hello".start_with?("hell")               #=> true
06923  *
06924  *    # returns true if one of prefix matches.
06925  *    p "hello".start_with?("heaven", "hell")     #=> true
06926  *    p "hello".start_with?("heaven", "paradice") #=> false
06927  *
06928  *
06929  *
06930  */
06931 
06932 static VALUE
06933 rb_str_start_with(int argc, VALUE *argv, VALUE str)
06934 {
06935     int i;
06936 
06937     for (i=0; i<argc; i++) {
06938         VALUE tmp = rb_check_string_type(argv[i]);
06939         if (NIL_P(tmp)) continue;
06940         rb_enc_check(str, tmp);
06941         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
06942         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
06943             return Qtrue;
06944     }
06945     return Qfalse;
06946 }
06947 
06948 /*
06949  *  call-seq:
06950  *     str.end_with?([suffix]+)   -> true or false
06951  *
06952  *  Returns true if <i>str</i> ends with a suffix given.
06953  */
06954 
06955 static VALUE
06956 rb_str_end_with(int argc, VALUE *argv, VALUE str)
06957 {
06958     int i;
06959     char *p, *s, *e;
06960     rb_encoding *enc;
06961 
06962     for (i=0; i<argc; i++) {
06963         VALUE tmp = rb_check_string_type(argv[i]);
06964         if (NIL_P(tmp)) continue;
06965         enc = rb_enc_check(str, tmp);
06966         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
06967         p = RSTRING_PTR(str);
06968         e = p + RSTRING_LEN(str);
06969         s = e - RSTRING_LEN(tmp);
06970         if (rb_enc_left_char_head(p, s, e, enc) != s)
06971             continue;
06972         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
06973             return Qtrue;
06974     }
06975     return Qfalse;
06976 }
06977 
06978 void
06979 rb_str_setter(VALUE val, ID id, VALUE *var)
06980 {
06981     if (!NIL_P(val) && TYPE(val) != T_STRING) {
06982         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
06983     }
06984     *var = val;
06985 }
06986 
06987 
06988 /*
06989  *  call-seq:
06990  *     str.force_encoding(encoding)   -> str
06991  *
06992  *  Changes the encoding to +encoding+ and returns self.
06993  */
06994 
06995 static VALUE
06996 rb_str_force_encoding(VALUE str, VALUE enc)
06997 {
06998     str_modifiable(str);
06999     rb_enc_associate(str, rb_to_encoding(enc));
07000     ENC_CODERANGE_CLEAR(str);
07001     return str;
07002 }
07003 
07004 /*
07005  *  call-seq:
07006  *     str.valid_encoding?  -> true or false
07007  *
07008  *  Returns true for a string which encoded correctly.
07009  *
07010  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
07011  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
07012  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
07013  */
07014 
07015 static VALUE
07016 rb_str_valid_encoding_p(VALUE str)
07017 {
07018     int cr = rb_enc_str_coderange(str);
07019 
07020     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07021 }
07022 
07023 /*
07024  *  call-seq:
07025  *     str.ascii_only?  -> true or false
07026  *
07027  *  Returns true for a string which has only ASCII characters.
07028  *
07029  *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
07030  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
07031  */
07032 
07033 static VALUE
07034 rb_str_is_ascii_only_p(VALUE str)
07035 {
07036     int cr = rb_enc_str_coderange(str);
07037 
07038     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07039 }
07040 
07041 /**********************************************************************
07042  * Document-class: Symbol
07043  *
07044  *  <code>Symbol</code> objects represent names and some strings
07045  *  inside the Ruby
07046  *  interpreter. They are generated using the <code>:name</code> and
07047  *  <code>:"string"</code> literals
07048  *  syntax, and by the various <code>to_sym</code> methods. The same
07049  *  <code>Symbol</code> object will be created for a given name or string
07050  *  for the duration of a program's execution, regardless of the context
07051  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
07052  *  one context, a method in another, and a class in a third, the
07053  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
07054  *  all three contexts.
07055  *
07056  *     module One
07057  *       class Fred
07058  *       end
07059  *       $f1 = :Fred
07060  *     end
07061  *     module Two
07062  *       Fred = 1
07063  *       $f2 = :Fred
07064  *     end
07065  *     def Fred()
07066  *     end
07067  *     $f3 = :Fred
07068  *     $f1.object_id   #=> 2514190
07069  *     $f2.object_id   #=> 2514190
07070  *     $f3.object_id   #=> 2514190
07071  *
07072  */
07073 
07074 
07075 /*
07076  *  call-seq:
07077  *     sym == obj   -> true or false
07078  *
07079  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
07080  *  symbol, returns <code>true</code>.
07081  */
07082 
07083 static VALUE
07084 sym_equal(VALUE sym1, VALUE sym2)
07085 {
07086     if (sym1 == sym2) return Qtrue;
07087     return Qfalse;
07088 }
07089 
07090 
07091 static int
07092 sym_printable(const char *s, const char *send, rb_encoding *enc)
07093 {
07094     while (s < send) {
07095         int n;
07096         int c = rb_enc_codepoint_len(s, send, &n, enc);
07097 
07098         if (!rb_enc_isprint(c, enc)) return FALSE;
07099         s += n;
07100     }
07101     return TRUE;
07102 }
07103 
07104 /*
07105  *  call-seq:
07106  *     sym.inspect    -> string
07107  *
07108  *  Returns the representation of <i>sym</i> as a symbol literal.
07109  *
07110  *     :fred.inspect   #=> ":fred"
07111  */
07112 
07113 static VALUE
07114 sym_inspect(VALUE sym)
07115 {
07116     VALUE str;
07117     ID id = SYM2ID(sym);
07118     rb_encoding *enc;
07119     const char *ptr;
07120     long len;
07121     char *dest;
07122     rb_encoding *resenc = rb_default_internal_encoding();
07123 
07124     if (resenc == NULL) resenc = rb_default_external_encoding();
07125     sym = rb_id2str(id);
07126     enc = STR_ENC_GET(sym);
07127     ptr = RSTRING_PTR(sym);
07128     len = RSTRING_LEN(sym);
07129     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07130         !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07131         str = rb_str_inspect(sym);
07132         len = RSTRING_LEN(str);
07133         rb_str_resize(str, len + 1);
07134         dest = RSTRING_PTR(str);
07135         memmove(dest + 1, dest, len);
07136         dest[0] = ':';
07137     }
07138     else {
07139         char *dest;
07140         str = rb_enc_str_new(0, len + 1, enc);
07141         dest = RSTRING_PTR(str);
07142         dest[0] = ':';
07143         memcpy(dest + 1, ptr, len);
07144     }
07145     return str;
07146 }
07147 
07148 
07149 /*
07150  *  call-seq:
07151  *     sym.id2name   -> string
07152  *     sym.to_s      -> string
07153  *
07154  *  Returns the name or string corresponding to <i>sym</i>.
07155  *
07156  *     :fred.id2name   #=> "fred"
07157  */
07158 
07159 
07160 VALUE
07161 rb_sym_to_s(VALUE sym)
07162 {
07163     ID id = SYM2ID(sym);
07164 
07165     return str_new3(rb_cString, rb_id2str(id));
07166 }
07167 
07168 
07169 /*
07170  * call-seq:
07171  *   sym.to_sym   -> sym
07172  *   sym.intern   -> sym
07173  *
07174  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
07175  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
07176  * in this case.
07177  */
07178 
07179 static VALUE
07180 sym_to_sym(VALUE sym)
07181 {
07182     return sym;
07183 }
07184 
07185 VALUE rb_funcall_passing_block(VALUE recv, ID mid, int argc, const VALUE *argv);
07186 
07187 static VALUE
07188 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
07189 {
07190     VALUE obj;
07191 
07192     if (argc < 1) {
07193         rb_raise(rb_eArgError, "no receiver given");
07194     }
07195     obj = argv[0];
07196     return rb_funcall_passing_block(obj, (ID)sym, argc - 1, argv + 1);
07197 }
07198 
07199 /*
07200  * call-seq:
07201  *   sym.to_proc
07202  *
07203  * Returns a _Proc_ object which respond to the given method by _sym_.
07204  *
07205  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
07206  */
07207 
07208 static VALUE
07209 sym_to_proc(VALUE sym)
07210 {
07211     static VALUE sym_proc_cache = Qfalse;
07212     enum {SYM_PROC_CACHE_SIZE = 67};
07213     VALUE proc;
07214     long id, index;
07215     VALUE *aryp;
07216 
07217     if (!sym_proc_cache) {
07218         sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07219         rb_gc_register_mark_object(sym_proc_cache);
07220         rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07221     }
07222 
07223     id = SYM2ID(sym);
07224     index = (id % SYM_PROC_CACHE_SIZE) << 1;
07225 
07226     aryp = RARRAY_PTR(sym_proc_cache);
07227     if (aryp[index] == sym) {
07228         return aryp[index + 1];
07229     }
07230     else {
07231         proc = rb_proc_new(sym_call, (VALUE)id);
07232         aryp[index] = sym;
07233         aryp[index + 1] = proc;
07234         return proc;
07235     }
07236 }
07237 
07238 /*
07239  * call-seq:
07240  *
07241  *   sym.succ
07242  *
07243  * Same as <code>sym.to_s.succ.intern</code>.
07244  */
07245 
07246 static VALUE
07247 sym_succ(VALUE sym)
07248 {
07249     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07250 }
07251 
07252 /*
07253  * call-seq:
07254  *
07255  *   str <=> other       -> -1, 0, +1 or nil
07256  *
07257  * Compares _sym_ with _other_ in string form.
07258  */
07259 
07260 static VALUE
07261 sym_cmp(VALUE sym, VALUE other)
07262 {
07263     if (!SYMBOL_P(other)) {
07264         return Qnil;
07265     }
07266     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07267 }
07268 
07269 /*
07270  * call-seq:
07271  *
07272  *   sym.casecmp(other)  -> -1, 0, +1 or nil
07273  *
07274  * Case-insensitive version of <code>Symbol#<=></code>.
07275  */
07276 
07277 static VALUE
07278 sym_casecmp(VALUE sym, VALUE other)
07279 {
07280     if (!SYMBOL_P(other)) {
07281         return Qnil;
07282     }
07283     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07284 }
07285 
07286 /*
07287  * call-seq:
07288  *   sym =~ obj   -> fixnum or nil
07289  *
07290  * Returns <code>sym.to_s =~ obj</code>.
07291  */
07292 
07293 static VALUE
07294 sym_match(VALUE sym, VALUE other)
07295 {
07296     return rb_str_match(rb_sym_to_s(sym), other);
07297 }
07298 
07299 /*
07300  * call-seq:
07301  *   sym[idx]      -> char
07302  *   sym[b, n]     -> char
07303  *
07304  * Returns <code>sym.to_s[]</code>.
07305  */
07306 
07307 static VALUE
07308 sym_aref(int argc, VALUE *argv, VALUE sym)
07309 {
07310     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
07311 }
07312 
07313 /*
07314  * call-seq:
07315  *   sym.length    -> integer
07316  *
07317  * Same as <code>sym.to_s.length</code>.
07318  */
07319 
07320 static VALUE
07321 sym_length(VALUE sym)
07322 {
07323     return rb_str_length(rb_id2str(SYM2ID(sym)));
07324 }
07325 
07326 /*
07327  * call-seq:
07328  *   sym.empty?   -> true or false
07329  *
07330  * Returns that _sym_ is :"" or not.
07331  */
07332 
07333 static VALUE
07334 sym_empty(VALUE sym)
07335 {
07336     return rb_str_empty(rb_id2str(SYM2ID(sym)));
07337 }
07338 
07339 /*
07340  * call-seq:
07341  *   sym.upcase    -> symbol
07342  *
07343  * Same as <code>sym.to_s.upcase.intern</code>.
07344  */
07345 
07346 static VALUE
07347 sym_upcase(VALUE sym)
07348 {
07349     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
07350 }
07351 
07352 /*
07353  * call-seq:
07354  *   sym.downcase  -> symbol
07355  *
07356  * Same as <code>sym.to_s.downcase.intern</code>.
07357  */
07358 
07359 static VALUE
07360 sym_downcase(VALUE sym)
07361 {
07362     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
07363 }
07364 
07365 /*
07366  * call-seq:
07367  *   sym.capitalize  -> symbol
07368  *
07369  * Same as <code>sym.to_s.capitalize.intern</code>.
07370  */
07371 
07372 static VALUE
07373 sym_capitalize(VALUE sym)
07374 {
07375     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
07376 }
07377 
07378 /*
07379  * call-seq:
07380  *   sym.swapcase  -> symbol
07381  *
07382  * Same as <code>sym.to_s.swapcase.intern</code>.
07383  */
07384 
07385 static VALUE
07386 sym_swapcase(VALUE sym)
07387 {
07388     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
07389 }
07390 
07391 /*
07392  * call-seq:
07393  *   sym.encoding   -> encoding
07394  *
07395  * Returns the Encoding object that represents the encoding of _sym_.
07396  */
07397 
07398 static VALUE
07399 sym_encoding(VALUE sym)
07400 {
07401     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
07402 }
07403 
07404 ID
07405 rb_to_id(VALUE name)
07406 {
07407     VALUE tmp;
07408     ID id;
07409 
07410     switch (TYPE(name)) {
07411       default:
07412         tmp = rb_check_string_type(name);
07413         if (NIL_P(tmp)) {
07414             tmp = rb_inspect(name);
07415             rb_raise(rb_eTypeError, "%s is not a symbol",
07416                      RSTRING_PTR(tmp));
07417         }
07418         name = tmp;
07419         /* fall through */
07420       case T_STRING:
07421         name = rb_str_intern(name);
07422         /* fall through */
07423       case T_SYMBOL:
07424         return SYM2ID(name);
07425     }
07426     return id;
07427 }
07428 
07429 /*
07430  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
07431  *  bytes, typically representing characters. String objects may be created
07432  *  using <code>String::new</code> or as literals.
07433  *
07434  *  Because of aliasing issues, users of strings should be aware of the methods
07435  *  that modify the contents of a <code>String</code> object.  Typically,
07436  *  methods with names ending in ``!'' modify their receiver, while those
07437  *  without a ``!'' return a new <code>String</code>.  However, there are
07438  *  exceptions, such as <code>String#[]=</code>.
07439  *
07440  */
07441 
07442 void
07443 Init_String(void)
07444 {
07445 #undef rb_intern
07446 #define rb_intern(str) rb_intern_const(str)
07447 
07448     rb_cString  = rb_define_class("String", rb_cObject);
07449     rb_include_module(rb_cString, rb_mComparable);
07450     rb_define_alloc_func(rb_cString, str_alloc);
07451     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07452     rb_define_method(rb_cString, "initialize", rb_str_init, -1);
07453     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
07454     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07455     rb_define_method(rb_cString, "==", rb_str_equal, 1);
07456     rb_define_method(rb_cString, "===", rb_str_equal, 1);
07457     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07458     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07459     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07460     rb_define_method(rb_cString, "+", rb_str_plus, 1);
07461     rb_define_method(rb_cString, "*", rb_str_times, 1);
07462     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07463     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07464     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07465     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07466     rb_define_method(rb_cString, "length", rb_str_length, 0);
07467     rb_define_method(rb_cString, "size", rb_str_length, 0);
07468     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07469     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07470     rb_define_method(rb_cString, "=~", rb_str_match, 1);
07471     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07472     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07473     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07474     rb_define_method(rb_cString, "next", rb_str_succ, 0);
07475     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
07476     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07477     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07478     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07479     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07480     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07481     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07482     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07483     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07484 
07485     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07486     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07487     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07488     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
07489     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07490     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07491 
07492     rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07493     rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07494     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07495     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07496 
07497     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07498     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07499     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07500     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07501 
07502     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07503     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07504     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07505     rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
07506     rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
07507     rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
07508     rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
07509     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07510     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07511     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07512     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
07513     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07514     rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07515     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
07516     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07517 
07518     rb_define_method(rb_cString, "include?", rb_str_include, 1);
07519     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07520     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07521 
07522     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07523 
07524     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07525     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07526     rb_define_method(rb_cString, "center", rb_str_center, -1);
07527 
07528     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07529     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07530     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07531     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07532     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07533     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07534     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07535 
07536     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07537     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07538     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07539     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07540     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07541     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07542     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07543 
07544     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07545     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07546     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07547     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07548     rb_define_method(rb_cString, "count", rb_str_count, -1);
07549 
07550     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07551     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07552     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07553     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07554 
07555     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07556     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07557     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07558     rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07559 
07560     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07561 
07562     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
07563     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07564 
07565     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07566     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07567 
07568     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
07569     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07570     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07571     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07572 
07573     id_to_s = rb_intern("to_s");
07574 
07575     rb_fs = Qnil;
07576     rb_define_variable("$;", &rb_fs);
07577     rb_define_variable("$-F", &rb_fs);
07578 
07579     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07580     rb_include_module(rb_cSymbol, rb_mComparable);
07581     rb_undef_alloc_func(rb_cSymbol);
07582     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07583     rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
07584 
07585     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07586     rb_define_method(rb_cSymbol, "===", sym_equal, 1);
07587     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07588     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07589     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
07590     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07591     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
07592     rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07593     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07594     rb_define_method(rb_cSymbol, "next", sym_succ, 0);
07595 
07596     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07597     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07598     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07599 
07600     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07601     rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
07602     rb_define_method(rb_cSymbol, "length", sym_length, 0);
07603     rb_define_method(rb_cSymbol, "size", sym_length, 0);
07604     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07605     rb_define_method(rb_cSymbol, "match", sym_match, 1);
07606 
07607     rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07608     rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07609     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07610     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07611 
07612     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07613 }
07614