re.c

Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   re.c -
00004 
00005   $Author: yugui $
00006   created at: Mon Aug  9 18:24:49 JST 1993
00007 
00008   Copyright (C) 1993-2007 Yukihiro Matsumoto
00009 
00010 **********************************************************************/
00011 
00012 #include "ruby/ruby.h"
00013 #include "ruby/re.h"
00014 #include "ruby/encoding.h"
00015 #include "ruby/util.h"
00016 #include "regint.h"
00017 #include <ctype.h>
00018 
00019 VALUE rb_eRegexpError;
00020 
00021 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
00022 #define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
00023 
00024 #define BEG(no) regs->beg[no]
00025 #define END(no) regs->end[no]
00026 
#if 'a' == 97   /* it's ascii */
/*
 * Case-folding lookup table indexed by byte value.  Every byte maps to
 * itself except 'A'..'Z' (0x41..0x5A), which map to 'a'..'z'
 * (0x61..0x7A).  Each row below covers eight consecutive byte values,
 * starting at the value named in the leading comment.
 */
static const char casetable[] = {
        /* 0x00 */ '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
        /* 0x08 */ '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
        /* 0x10 */ '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
        /* 0x18 */ '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
        /* 0x20 */ '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
        /* 0x28 */ '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
        /* 0x30 */ '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
        /* 0x38 */ '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
        /* 0x40: '@' then 'A'..'G' folded to lower case */
        '\x40', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
        /* 0x48: 'H'..'O' folded */
        '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
        /* 0x50: 'P'..'W' folded */
        '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
        /* 0x58: 'X'..'Z' folded, then '[' '\' ']' '^' '_' unchanged */
        '\x78', '\x79', '\x7a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
        /* 0x60 */ '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
        /* 0x68 */ '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
        /* 0x70 */ '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
        /* 0x78 */ '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
        /* 0x80 */ '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
        /* 0x88 */ '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
        /* 0x90 */ '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
        /* 0x98 */ '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
        /* 0xa0 */ '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
        /* 0xa8 */ '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
        /* 0xb0 */ '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
        /* 0xb8 */ '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
        /* 0xc0 */ '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
        /* 0xc8 */ '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
        /* 0xd0 */ '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
        /* 0xd8 */ '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
        /* 0xe0 */ '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
        /* 0xe8 */ '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
        /* 0xf0 */ '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
        /* 0xf8 */ '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
};
#else
# error >>> "You lose. You will need a translation table for your character set." <<<
#endif
00077 
00078 int
00079 rb_memcicmp(const void *x, const void *y, long len)
00080 {
00081     const unsigned char *p1 = x, *p2 = y;
00082     int tmp;
00083 
00084     while (len--) {
00085         if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++]))
00086             return tmp;
00087     }
00088     return 0;
00089 }
00090 
#undef rb_memcmp

/*
 * Out-of-line rb_memcmp: the #undef above drops any macro of the same
 * name so that a real function can be defined.  Forwards to memcmp()
 * and returns its result unchanged.
 */
int
rb_memcmp(const void *p1, const void *p2, long len)
{
    const int order = memcmp(p1, p2, (size_t)len);

    return order;
}
00098 
00099 static inline long
00100 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
00101 {
00102     const unsigned char *x = xs, *xe = xs + m;
00103     const unsigned char *y = ys, *ye = ys + n;
00104 #ifndef VALUE_MAX
00105 # if SIZEOF_VALUE == 8
00106 #  define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
00107 # elif SIZEOF_VALUE == 4
00108 #  define VALUE_MAX 0xFFFFFFFFUL
00109 # endif
00110 #endif
00111     VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
00112 
00113     if (m > SIZEOF_VALUE)
00114         rb_bug("!!too long pattern string!!");
00115 
00116     /* Prepare hash value */
00117     for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
00118         hx <<= CHAR_BIT;
00119         hy <<= CHAR_BIT;
00120         hx |= *x;
00121         hy |= *y;
00122     }
00123     /* Searching */
00124     while (hx != hy) {
00125         if (y == ye)
00126             return -1;
00127         hy <<= CHAR_BIT;
00128         hy |= *y;
00129         hy &= mask;
00130         y++;
00131     }
00132     return y - ys - m;
00133 }
00134 
/*
 * Search for the m-byte needle `xs` in the n-byte haystack `ys` using a
 * 256-entry shift table keyed by the byte that follows the current
 * window.
 *
 * Returns the byte offset of the first match, or -1 if there is none.
 *
 * The byte after the window is only inspected while it still lies
 * inside the haystack, so nothing at or beyond ys[n] is ever read, and
 * positions are tracked as offsets so no out-of-range pointer is formed.
 */
static inline long
rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *x = xs, *xe = xs + m;
    const long last = n - m;        /* last offset at which a window fits */
    long qstable[256];
    long pos = 0;
    int i;

    /* Preprocessing: bytes absent from the needle skip the whole window. */
    for (i = 0; i < 256; ++i)
        qstable[i] = m + 1;
    /* Later occurrences overwrite earlier ones, giving the shortest shift. */
    for (; x < xe; ++x)
        qstable[*x] = xe - x;

    /* Searching */
    while (pos <= last) {
        const unsigned char *y = ys + pos;

        if (*xs == *y && memcmp(xs, y, (size_t)m) == 0)
            return pos;
        if (pos == last)
            break;              /* no byte follows the final window */
        pos += qstable[y[m]];
    }
    return -1;
}
00154 
/*
 * Map the byte sequence starting at `x` to a slot in a 512-entry table.
 *
 * Bytes below 0xC0 and bytes from 0xF5 upwards map to 256 + byte.
 * Lead bytes in 0xC0..0xDF, 0xE0..0xEF and 0xF0..0xF4 are mixed with
 * the following 1, 2 or 3 bytes respectively and the result is reduced
 * to 0..255.  The function reads those following bytes without any
 * bounds check.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    unsigned int h = x[0];
    int trailing;
    int k;

    if (h < 0xC0 || h >= 0xF5) {
        return h + 256;
    }

    if (h < 0xE0) {
        trailing = 1;
    }
    else if (h < 0xF0) {
        trailing = 2;
    }
    else {
        trailing = 3;
    }

    for (k = 1; k <= trailing; ++k) {
        h = h * mix + x[k];
    }
    return (unsigned char)h;
}
00186 
/*
 * Variant of the shift-table search that keys its 512-entry table by
 * rb_memsearch_qs_utf8_hash() of the bytes following the current window
 * instead of by a single byte.
 *
 * Returns the byte offset of the first match of the m-byte needle `xs`
 * in the n-byte haystack `ys`, or -1 if there is none.
 *
 * The position after the final window (ys + n) is never hashed.
 * NOTE(review): the hash helper may still read up to three bytes after
 * a multi-byte lead byte that sits at the very end of the needle or
 * haystack; this appears to rely on the caller's buffers being padded
 * or terminated -- confirm against callers.
 */
static inline long
rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const unsigned char *x = xs, *xe = xs + m;
    const long last = n - m;        /* last offset at which a window fits */
    long qstable[512];
    long pos = 0;
    int i;

    /* Preprocessing */
    for (i = 0; i < 512; ++i) {
        qstable[i] = m + 1;
    }
    for (; x < xe; ++x) {
        qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
    }

    /* Searching */
    while (pos <= last) {
        const unsigned char *y = ys + pos;

        if (*xs == *y && memcmp(xs, y, (size_t)m) == 0)
            return pos;
        if (pos == last)
            break;              /* nothing follows the final window */
        pos += qstable[rb_memsearch_qs_utf8_hash(y + m)];
    }
    return -1;
}
00208 
00209 long
00210 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
00211 {
00212     const unsigned char *x = x0, *y = y0;
00213 
00214     if (m > n) return -1;
00215     else if (m == n) {
00216         return memcmp(x0, y0, m) == 0 ? 0 : -1;
00217     }
00218     else if (m < 1) {
00219         return 0;
00220     }
00221     else if (m == 1) {
00222         const unsigned char *ys = y, *ye = ys + n;
00223         for (; y < ye; ++y) {
00224             if (*x == *y)
00225                 return y - ys;
00226         }
00227         return -1;
00228     }
00229     else if (m <= SIZEOF_VALUE) {
00230         return rb_memsearch_ss(x0, m, y0, n);
00231     }
00232     else if (enc == rb_utf8_encoding()){
00233         return rb_memsearch_qs_utf8(x0, m, y0, n);
00234     }
00235     else {
00236         return rb_memsearch_qs(x0, m, y0, n);
00237     }
00238 }
00239 
/* Per-object flag bits, expressed in terms of the FL_USER* flags. */
#define REG_LITERAL FL_USER5
#define REG_ENCODING_NONE FL_USER6

#define KCODE_FIXED FL_USER4

/* Option bits that map directly onto Oniguruma options (i, m, x). */
#define ARG_REG_OPTION_MASK \
    (ONIG_OPTION_IGNORECASE | ONIG_OPTION_MULTILINE | ONIG_OPTION_EXTEND)
/* Extra option bits reported by rb_char_to_option_kcode(). */
#define ARG_ENCODING_FIXED    0x10
#define ARG_ENCODING_NONE     0x20
00249 
00250 static int
00251 char_to_option(int c)
00252 {
00253     int val;
00254 
00255     switch (c) {
00256       case 'i':
00257         val = ONIG_OPTION_IGNORECASE;
00258         break;
00259       case 'x':
00260         val = ONIG_OPTION_EXTEND;
00261         break;
00262       case 'm':
00263         val = ONIG_OPTION_MULTILINE;
00264         break;
00265       default:
00266         val = 0;
00267         break;
00268     }
00269     return val;
00270 }
00271 
00272 static char *
00273 option_to_str(char str[4], int options)
00274 {
00275     char *p = str;
00276     if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
00277     if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
00278     if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
00279     *p = 0;
00280     return str;
00281 }
00282 
00283 extern int
00284 rb_char_to_option_kcode(int c, int *option, int *kcode)
00285 {
00286     *option = 0;
00287 
00288     switch (c) {
00289       case 'n':
00290         *kcode = rb_ascii8bit_encindex();
00291         return (*option = ARG_ENCODING_NONE);
00292       case 'e':
00293         *kcode = rb_enc_find_index("EUC-JP");
00294         break;
00295       case 's':
00296         *kcode = rb_enc_find_index("Windows-31J");
00297         break;
00298       case 'u':
00299         *kcode = rb_utf8_encindex();
00300         break;
00301       default:
00302         *kcode = -1;
00303         return (*option = char_to_option(c));
00304     }
00305     *option = ARG_ENCODING_FIXED;
00306     return 1;
00307 }
00308 
00309 static void
00310 rb_reg_check(VALUE re)
00311 {
00312     if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00313         rb_raise(rb_eTypeError, "uninitialized Regexp");
00314     }
00315 }
00316 
00317 int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p);
00318 
00319 static void
00320 rb_reg_expr_str(VALUE str, const char *s, long len,
00321         rb_encoding *enc, rb_encoding *resenc)
00322 {
00323     const char *p, *pend;
00324     int need_escape = 0;
00325     int c, clen;
00326 
00327     p = s; pend = p + len;
00328     if (rb_enc_asciicompat(enc)) {
00329         while (p < pend) {
00330             c = rb_enc_ascget(p, pend, &clen, enc);
00331             if (c == -1) {
00332                 if (enc == resenc) {
00333                     p += mbclen(p, pend, enc);
00334                 }
00335                 else {
00336                     need_escape = 1;
00337                     break;
00338                 }
00339             }
00340             else if (c != '/' && rb_enc_isprint(c, enc)) {
00341                 p += clen;
00342             }
00343             else {
00344                 need_escape = 1;
00345                 break;
00346             }
00347         }
00348     }
00349     else {
00350         need_escape = 1;
00351     }
00352 
00353     if (!need_escape) {
00354         rb_str_buf_cat(str, s, len);
00355     }
00356     else {
00357         int unicode_p = rb_enc_unicode_p(enc);
00358         p = s;
00359         while (p<pend) {
00360             c = rb_enc_ascget(p, pend, &clen, enc);
00361             if (c == '\\' && p+clen < pend) {
00362                 int n = clen + mbclen(p+clen, pend, enc);
00363                 rb_str_buf_cat(str, p, n);
00364                 p += n;
00365                 continue;
00366             }
00367             else if (c == '/') {
00368                 char c = '\\';
00369                 rb_str_buf_cat(str, &c, 1);
00370                 rb_str_buf_cat(str, p, clen);
00371             }
00372             else if (c == -1) {
00373                 clen = rb_enc_precise_mbclen(p, pend, enc);
00374                 if (!MBCLEN_CHARFOUND_P(clen)) {
00375                     c = (unsigned char)*p;
00376                     clen = 1;
00377                     goto hex;
00378                 }
00379                 if (resenc) {
00380                     unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
00381                     rb_str_buf_cat_escaped_char(str, c, unicode_p);
00382                 }
00383                 else {
00384                     clen = MBCLEN_CHARFOUND_LEN(clen);
00385                     rb_str_buf_cat(str, p, clen);
00386                 }
00387             }
00388             else if (rb_enc_isprint(c, enc)) {
00389                 rb_str_buf_cat(str, p, clen);
00390             }
00391             else if (!rb_enc_isspace(c, enc)) {
00392                 char b[8];
00393 
00394               hex:
00395                 snprintf(b, sizeof(b), "\\x%02X", c);
00396                 rb_str_buf_cat(str, b, 4);
00397             }
00398             else {
00399                 rb_str_buf_cat(str, p, clen);
00400             }
00401             p += clen;
00402         }
00403     }
00404 }
00405 
00406 static VALUE
00407 rb_reg_desc(const char *s, long len, VALUE re)
00408 {
00409     rb_encoding *enc = rb_enc_get(re);
00410     VALUE str = rb_str_buf_new2("/");
00411     rb_encoding *resenc = rb_default_internal_encoding();
00412     if (resenc == NULL) resenc = rb_default_external_encoding();
00413 
00414     if (re && rb_enc_asciicompat(enc)) {
00415         rb_enc_copy(str, re);
00416     }
00417     else {
00418         rb_enc_associate(str, rb_usascii_encoding());
00419     }
00420     rb_reg_expr_str(str, s, len, enc, resenc);
00421     rb_str_buf_cat2(str, "/");
00422     if (re) {
00423         char opts[4];
00424         rb_reg_check(re);
00425         if (*option_to_str(opts, RREGEXP(re)->ptr->options))
00426             rb_str_buf_cat2(str, opts);
00427         if (RBASIC(re)->flags & REG_ENCODING_NONE)
00428             rb_str_buf_cat2(str, "n");
00429     }
00430     OBJ_INFECT(str, re);
00431     return str;
00432 }
00433 
00434 
00435 /*
00436  *  call-seq:
00437  *      rxp.source   -> str
00438  *
00439  *  Returns the original string of the pattern.
00440  *
00441  *      /ab+c/ix.source #=> "ab+c"
00442  *
00443  *  Note that escape sequences are retained as is.
00444  *
00445  *     /\x20\+/.source  #=> "\\x20\\+"
00446  *
00447  */
00448 
00449 static VALUE
00450 rb_reg_source(VALUE re)
00451 {
00452     VALUE str;
00453 
00454     rb_reg_check(re);
00455     str = rb_enc_str_new(RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), rb_enc_get(re));
00456     if (OBJ_TAINTED(re)) OBJ_TAINT(str);
00457     return str;
00458 }
00459 
00460 /*
00461  * call-seq:
00462  *    rxp.inspect   -> string
00463  *
00464  * Produces a nicely formatted string version of _rxp_. Perhaps surprisingly,
00465  * <code>#inspect</code> actually produces a more natural version of
00466  * the string than <code>#to_s</code>.
00467  *
00468  *      /ab+c/ix.inspect        #=> "/ab+c/ix"
00469  *
00470  */
00471 
00472 static VALUE
00473 rb_reg_inspect(VALUE re)
00474 {
00475     if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00476         return rb_any_to_s(re);
00477     }
00478     return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
00479 }
00480 
00481 
00482 /*
00483  *  call-seq:
00484  *     rxp.to_s   -> str
00485  *
00486  *  Returns a string containing the regular expression and its options (using the
00487  *  <code>(?opts:source)</code> notation). This string can be fed back in to
00488  *  <code>Regexp::new</code> to create a regular expression with the same semantics as
00489  *  the original. (However, <code>Regexp#==</code> may not return true when
00490  *  comparing the two, as the source of the regular expression itself may
00491  *  differ, as the example shows).  <code>Regexp#inspect</code> produces a
00492  *  generally more readable version of <i>rxp</i>.
00493  *
00494  *      r1 = /ab+c/ix           #=> /ab+c/ix
00495  *      s1 = r1.to_s            #=> "(?ix-m:ab+c)"
00496  *      r2 = Regexp.new(s1)     #=> /(?ix-m:ab+c)/
00497  *      r1 == r2                #=> false
00498  *      r1.source               #=> "ab+c"
00499  *      r2.source               #=> "(?ix-m:ab+c)"
00500  */
00501 
00502 static VALUE
00503 rb_reg_to_s(VALUE re)
00504 {
00505     int options, opt;
00506     const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
00507     long len;
00508     const UChar* ptr;
00509     VALUE str = rb_str_buf_new2("(?");
00510     char optbuf[5];
00511     rb_encoding *enc = rb_enc_get(re);
00512 
00513     rb_reg_check(re);
00514 
00515     rb_enc_copy(str, re);
00516     options = RREGEXP(re)->ptr->options;
00517     ptr = (UChar*)RREGEXP_SRC_PTR(re);
00518     len = RREGEXP_SRC_LEN(re);
00519   again:
00520     if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
00521         int err = 1;
00522         ptr += 2;
00523         if ((len -= 2) > 0) {
00524             do {
00525                 opt = char_to_option((int )*ptr);
00526                 if (opt != 0) {
00527                     options |= opt;
00528                 }
00529                 else {
00530                     break;
00531                 }
00532                 ++ptr;
00533             } while (--len > 0);
00534         }
00535         if (len > 1 && *ptr == '-') {
00536             ++ptr;
00537             --len;
00538             do {
00539                 opt = char_to_option((int )*ptr);
00540                 if (opt != 0) {
00541                     options &= ~opt;
00542                 }
00543                 else {
00544                     break;
00545                 }
00546                 ++ptr;
00547             } while (--len > 0);
00548         }
00549         if (*ptr == ')') {
00550             --len;
00551             ++ptr;
00552             goto again;
00553         }
00554         if (*ptr == ':' && ptr[len-1] == ')') {
00555             Regexp *rp;
00556 
00557             ++ptr;
00558             len -= 2;
00559             err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT,
00560                            enc, OnigDefaultSyntax, NULL);
00561             onig_free(rp);
00562         }
00563         if (err) {
00564             options = RREGEXP(re)->ptr->options;
00565             ptr = (UChar*)RREGEXP_SRC_PTR(re);
00566             len = RREGEXP_SRC_LEN(re);
00567         }
00568     }
00569 
00570     if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
00571 
00572     if ((options & embeddable) != embeddable) {
00573         optbuf[0] = '-';
00574         option_to_str(optbuf + 1, ~options);
00575         rb_str_buf_cat2(str, optbuf);
00576     }
00577 
00578     rb_str_buf_cat2(str, ":");
00579     rb_reg_expr_str(str, (char*)ptr, len, enc, NULL);
00580     rb_str_buf_cat2(str, ")");
00581     rb_enc_copy(str, re);
00582 
00583     OBJ_INFECT(str, re);
00584     return str;
00585 }
00586 
00587 static void
00588 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
00589 {
00590     volatile VALUE desc = rb_reg_desc(s, len, re);
00591 
00592     rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
00593 }
00594 
00595 static VALUE
00596 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
00597 {
00598     char opts[6];
00599     VALUE desc = rb_str_buf_new2(err);
00600     rb_encoding *resenc = rb_default_internal_encoding();
00601     if (resenc == NULL) resenc = rb_default_external_encoding();
00602 
00603     rb_enc_associate(desc, enc);
00604     rb_str_buf_cat2(desc, ": /");
00605     rb_reg_expr_str(desc, s, len, enc, resenc);
00606     opts[0] = '/';
00607     option_to_str(opts + 1, options);
00608     rb_str_buf_cat2(desc, opts);
00609     return rb_exc_new3(rb_eRegexpError, desc);
00610 }
00611 
00612 static void
00613 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
00614 {
00615     rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
00616 }
00617 
00618 static VALUE
00619 rb_reg_error_desc(VALUE str, int options, const char *err)
00620 {
00621     return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
00622                                  rb_enc_get(str), options, err);
00623 }
00624 
00625 static void
00626 rb_reg_raise_str(VALUE str, int options, const char *err)
00627 {
00628     rb_exc_raise(rb_reg_error_desc(str, options, err));
00629 }
00630 
00631 
00632 /*
00633  *  call-seq:
00634  *     rxp.casefold?   -> true or false
00635  *
00636  *  Returns the value of the case-insensitive flag.
00637  *
00638  *      /a/.casefold?           #=> false
00639  *      /a/i.casefold?          #=> true
00640  *      /(?i:a)/.casefold?      #=> false
00641  */
00642 
00643 static VALUE
00644 rb_reg_casefold_p(VALUE re)
00645 {
00646     rb_reg_check(re);
00647     if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
00648     return Qfalse;
00649 }
00650 
00651 
00652 /*
00653  *  call-seq:
00654  *     rxp.options   -> fixnum
00655  *
00656  *  Returns the set of bits corresponding to the options used when creating this
00657  *  Regexp (see <code>Regexp::new</code> for details). Note that additional bits
00658  *  may be set in the returned options: these are used internally by the regular
00659  *  expression code. These extra bits are ignored if the options are passed to
00660  *  <code>Regexp::new</code>.
00661  *
00662  *     Regexp::IGNORECASE                  #=> 1
00663  *     Regexp::EXTENDED                    #=> 2
00664  *     Regexp::MULTILINE                   #=> 4
00665  *
00666  *     /cat/.options                       #=> 0
00667  *     /cat/ix.options                     #=> 3
00668  *     Regexp.new('cat', true).options     #=> 1
00669  *     /\xa1\xa2/e.options                 #=> 16
00670  *
00671  *     r = /cat/ix
00672  *     Regexp.new(r.source, r.options)     #=> /cat/ix
00673  */
00674 
00675 static VALUE
00676 rb_reg_options_m(VALUE re)
00677 {
00678     int options = rb_reg_options(re);
00679     return INT2NUM(options);
00680 }
00681 
00682 static int
00683 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
00684           int back_num, int *back_refs, OnigRegex regex, void *arg)
00685 {
00686     VALUE ary = (VALUE)arg;
00687     rb_ary_push(ary, rb_str_new((const char *)name, name_end-name));
00688     return 0;
00689 }
00690 
00691 /*
00692  * call-seq:
00693  *    rxp.names   -> [name1, name2, ...]
00694  *
00695  * Returns a list of names of captures as an array of strings.
00696  *
00697  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.names
00698  *     #=> ["foo", "bar", "baz"]
00699  *
00700  *     /(?<foo>.)(?<foo>.)/.names
00701  *     #=> ["foo"]
00702  *
00703  *     /(.)(.)/.names
00704  *     #=> []
00705  */
00706 
00707 static VALUE
00708 rb_reg_names(VALUE re)
00709 {
00710     VALUE ary = rb_ary_new();
00711     rb_reg_check(re);
00712     onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
00713     return ary;
00714 }
00715 
00716 static int
00717 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
00718           int back_num, int *back_refs, OnigRegex regex, void *arg)
00719 {
00720     VALUE hash = (VALUE)arg;
00721     VALUE ary = rb_ary_new2(back_num);
00722     int i;
00723 
00724     for(i = 0; i < back_num; i++)
00725         rb_ary_store(ary, i, INT2NUM(back_refs[i]));
00726 
00727     rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
00728 
00729     return 0;
00730 }
00731 
00732 /*
00733  * call-seq:
00734  *    rxp.named_captures  -> hash
00735  *
00736  * Returns a hash representing information about named captures of <i>rxp</i>.
00737  *
00738  * Each key of the hash is the name of a named capture.
00739  * Each value is an array listing the indexes of the capture groups
00740  * that carry that name.
00741  *
00742  *    /(?<foo>.)(?<bar>.)/.named_captures
00743  *    #=> {"foo"=>[1], "bar"=>[2]}
00744  *
00745  *    /(?<foo>.)(?<foo>.)/.named_captures
00746  *    #=> {"foo"=>[1, 2]}
00747  *
00748  * If there are no named captures, an empty hash is returned.
00749  *
00750  *    /(.)(.)/.named_captures
00751  *    #=> {}
00752  */
00753 
00754 static VALUE
00755 rb_reg_named_captures(VALUE re)
00756 {
00757     VALUE hash = rb_hash_new();
00758     rb_reg_check(re);
00759     onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
00760     return hash;
00761 }
00762 
00763 static int
00764 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
00765           OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
00766           OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
00767 {
00768   int r;
00769 
00770   *reg = (regex_t* )xmalloc(sizeof(regex_t));
00771   if (IS_NULL(*reg)) return ONIGERR_MEMORY;
00772 
00773   r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
00774   if (r) goto err;
00775 
00776   r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
00777   if (r) {
00778   err:
00779     onig_free(*reg);
00780     *reg = NULL;
00781   }
00782   return r;
00783 }
00784 
00785 static Regexp*
00786 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err,
00787         const char *sourcefile, int sourceline)
00788 {
00789     Regexp *rp;
00790     int r;
00791     OnigErrorInfo einfo;
00792 
00793     /* Handle escaped characters first. */
00794 
00795     /* Build a copy of the string (in dest) with the
00796        escaped characters translated,  and generate the regex
00797        from that.
00798     */
00799 
00800     r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags,
00801                  enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
00802     if (r) {
00803         onig_error_code_to_str((UChar*)err, r, &einfo);
00804         return 0;
00805     }
00806     return rp;
00807 }
00808 
00809 
00810 /*
00811  *  Document-class: MatchData
00812  *
00813  *  <code>MatchData</code> is the type of the special variable <code>$~</code>,
00814  *  and is the type of the object returned by <code>Regexp#match</code> and
00815  *  <code>Regexp.last_match</code>. It encapsulates all the results of a pattern
00816  *  match, results normally accessed through the special variables
00817  *  <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>,
00818  *  <code>$2</code>, and so on.
00819  *
00820  */
00821 
00822 VALUE rb_cMatch;
00823 
00824 static VALUE
00825 match_alloc(VALUE klass)
00826 {
00827     NEWOBJ(match, struct RMatch);
00828     OBJSETUP(match, klass, T_MATCH);
00829 
00830     match->str = 0;
00831     match->rmatch = 0;
00832     match->regexp = 0;
00833     match->rmatch = ALLOC(struct rmatch);
00834     MEMZERO(match->rmatch, struct rmatch, 1);
00835 
00836     return (VALUE)match;
00837 }
00838 
/* A byte offset into a string paired with the matching character offset. */
typedef struct {
    long byte_pos;
    long char_pos;
} pair_t;

/*
 * qsort()/bsearch() comparator ordering pair_t entries by byte_pos.
 *
 * Returns -1, 0 or 1.  Comparing the two values directly, instead of
 * subtracting them, cannot overflow and needs no platform-specific
 * narrowing from long to int.
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    const long lhs = ((const pair_t *)pair1)->byte_pos;
    const long rhs = ((const pair_t *)pair2)->byte_pos;

    return (lhs > rhs) - (lhs < rhs);
}
00854 
00855 static void
00856 update_char_offset(VALUE match)
00857 {
00858     struct rmatch *rm = RMATCH(match)->rmatch;
00859     struct re_registers *regs;
00860     int i, num_regs, num_pos;
00861     long c;
00862     char *s, *p, *q, *e;
00863     rb_encoding *enc;
00864     pair_t *pairs;
00865 
00866     if (rm->char_offset_updated)
00867         return;
00868 
00869     regs = &rm->regs;
00870     num_regs = rm->regs.num_regs;
00871 
00872     if (rm->char_offset_num_allocated < num_regs) {
00873         REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
00874         rm->char_offset_num_allocated = num_regs;
00875     }
00876 
00877     enc = rb_enc_get(RMATCH(match)->str);
00878     if (rb_enc_mbmaxlen(enc) == 1) {
00879         for (i = 0; i < num_regs; i++) {
00880             rm->char_offset[i].beg = BEG(i);
00881             rm->char_offset[i].end = END(i);
00882         }
00883         rm->char_offset_updated = 1;
00884         return;
00885     }
00886 
00887     pairs = ALLOCA_N(pair_t, num_regs*2);
00888     num_pos = 0;
00889     for (i = 0; i < num_regs; i++) {
00890         if (BEG(i) < 0)
00891             continue;
00892         pairs[num_pos++].byte_pos = BEG(i);
00893         pairs[num_pos++].byte_pos = END(i);
00894     }
00895     qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00896 
00897     s = p = RSTRING_PTR(RMATCH(match)->str);
00898     e = s + RSTRING_LEN(RMATCH(match)->str);
00899     c = 0;
00900     for (i = 0; i < num_pos; i++) {
00901         q = s + pairs[i].byte_pos;
00902         c += rb_enc_strlen(p, q, enc);
00903         pairs[i].char_pos = c;
00904         p = q;
00905     }
00906 
00907     for (i = 0; i < num_regs; i++) {
00908         pair_t key, *found;
00909         if (BEG(i) < 0) {
00910             rm->char_offset[i].beg = -1;
00911             rm->char_offset[i].end = -1;
00912             continue;
00913         }
00914 
00915         key.byte_pos = BEG(i);
00916         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00917         rm->char_offset[i].beg = found->char_pos;
00918 
00919         key.byte_pos = END(i);
00920         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00921         rm->char_offset[i].end = found->char_pos;
00922     }
00923 
00924     rm->char_offset_updated = 1;
00925 }
00926 
00927 static void
00928 match_check(VALUE match)
00929 {
00930     if (!RMATCH(match)->regexp) {
00931         rb_raise(rb_eTypeError, "uninitialized Match");
00932     }
00933 }
00934 
00935 /* :nodoc: */
00936 static VALUE
00937 match_init_copy(VALUE obj, VALUE orig)
00938 {
00939     struct rmatch *rm;
00940 
00941     if (obj == orig) return obj;
00942 
00943     if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
00944         rb_raise(rb_eTypeError, "wrong argument class");
00945     }
00946     RMATCH(obj)->str = RMATCH(orig)->str;
00947     RMATCH(obj)->regexp = RMATCH(orig)->regexp;
00948 
00949     rm = RMATCH(obj)->rmatch;
00950     onig_region_copy(&rm->regs, RMATCH_REGS(orig));
00951 
00952     if (!RMATCH(orig)->rmatch->char_offset_updated) {
00953         rm->char_offset_updated = 0;
00954     }
00955     else {
00956         if (rm->char_offset_num_allocated < rm->regs.num_regs) {
00957             REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
00958             rm->char_offset_num_allocated = rm->regs.num_regs;
00959         }
00960         MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
00961                struct rmatch_offset, rm->regs.num_regs);
00962         rm->char_offset_updated = 1;
00963     }
00964 
00965     return obj;
00966 }
00967 
00968 
00969 /*
00970  * call-seq:
00971  *    mtch.regexp   -> regexp
00972  *
00973  * Returns the regexp.
00974  *
00975  *     m = /a.*b/.match("abc")
00976  *     m.regexp #=> /a.*b/
00977  */
00978 
00979 static VALUE
00980 match_regexp(VALUE match)
00981 {
00982     match_check(match);
00983     return RMATCH(match)->regexp;
00984 }
00985 
00986 /*
00987  * call-seq:
00988  *    mtch.names   -> [name1, name2, ...]
00989  *
00990  * Returns a list of names of captures as an array of strings.
00991  * It is the same as mtch.regexp.names.
00992  *
00993  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
00994  *     #=> ["foo", "bar", "baz"]
00995  *
00996  *     m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
00997  *     m.names                          #=> ["x", "y"]
00998  */
00999 
01000 static VALUE
01001 match_names(VALUE match)
01002 {
01003     match_check(match);
01004     return rb_reg_names(RMATCH(match)->regexp);
01005 }
01006 
01007 /*
01008  *  call-seq:
01009  *     mtch.length   -> integer
01010  *     mtch.size     -> integer
01011  *
01012  *  Returns the number of elements in the match array.
01013  *
01014  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01015  *     m.length   #=> 5
01016  *     m.size     #=> 5
01017  */
01018 
01019 static VALUE
01020 match_size(VALUE match)
01021 {
01022     match_check(match);
01023     return INT2FIX(RMATCH_REGS(match)->num_regs);
01024 }
01025 
01026 static int
01027 match_backref_number(VALUE match, VALUE backref)
01028 {
01029     const char *name;
01030     int num;
01031 
01032     struct re_registers *regs = RMATCH_REGS(match);
01033     VALUE regexp = RMATCH(match)->regexp;
01034 
01035     match_check(match);
01036     switch(TYPE(backref)) {
01037       default:
01038         return NUM2INT(backref);
01039 
01040       case T_SYMBOL:
01041         name = rb_id2name(SYM2ID(backref));
01042         break;
01043 
01044       case T_STRING:
01045         name = StringValueCStr(backref);
01046         break;
01047     }
01048 
01049     num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01050               (const unsigned char*)name,
01051               (const unsigned char*)name + strlen(name),
01052               regs);
01053 
01054     if (num < 1) {
01055         rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
01056     }
01057 
01058     return num;
01059 }
01060 
01061 int
01062 rb_reg_backref_number(VALUE match, VALUE backref)
01063 {
01064     return match_backref_number(match, backref);
01065 }
01066 
01067 /*
01068  *  call-seq:
01069  *     mtch.offset(n)   -> array
01070  *
01071  *  Returns a two-element array containing the beginning and ending offsets of
01072  *  the <em>n</em>th match.
01073  *  <em>n</em> can be a string or symbol to reference a named capture.
01074  *
01075  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01076  *     m.offset(0)      #=> [1, 7]
01077  *     m.offset(4)      #=> [6, 7]
01078  *
01079  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
01080  *     p m.offset(:foo) #=> [0, 1]
01081  *     p m.offset(:bar) #=> [2, 3]
01082  *
01083  */
01084 
01085 static VALUE
01086 match_offset(VALUE match, VALUE n)
01087 {
01088     int i = match_backref_number(match, n);
01089     struct re_registers *regs = RMATCH_REGS(match);
01090 
01091     match_check(match);
01092     if (i < 0 || regs->num_regs <= i)
01093         rb_raise(rb_eIndexError, "index %d out of matches", i);
01094 
01095     if (BEG(i) < 0)
01096         return rb_assoc_new(Qnil, Qnil);
01097 
01098     update_char_offset(match);
01099     return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
01100                         INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
01101 }
01102 
01103 
01104 /*
01105  *  call-seq:
01106  *     mtch.begin(n)   -> integer
01107  *
01108  *  Returns the offset of the start of the <em>n</em>th element of the match
01109  *  array in the string.
01110  *  <em>n</em> can be a string or symbol to reference a named capture.
01111  *
01112  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01113  *     m.begin(0)       #=> 1
01114  *     m.begin(2)       #=> 2
01115  *
01116  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
01117  *     p m.begin(:foo)  #=> 0
01118  *     p m.begin(:bar)  #=> 2
01119  */
01120 
01121 static VALUE
01122 match_begin(VALUE match, VALUE n)
01123 {
01124     int i = match_backref_number(match, n);
01125     struct re_registers *regs = RMATCH_REGS(match);
01126 
01127     match_check(match);
01128     if (i < 0 || regs->num_regs <= i)
01129         rb_raise(rb_eIndexError, "index %d out of matches", i);
01130 
01131     if (BEG(i) < 0)
01132         return Qnil;
01133 
01134     update_char_offset(match);
01135     return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
01136 }
01137 
01138 
01139 /*
01140  *  call-seq:
01141  *     mtch.end(n)   -> integer
01142  *
01143  *  Returns the offset of the character immediately following the end of the
01144  *  <em>n</em>th element of the match array in the string.
01145  *  <em>n</em> can be a string or symbol to reference a named capture.
01146  *
01147  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01148  *     m.end(0)         #=> 7
01149  *     m.end(2)         #=> 3
01150  *
01151  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
01152  *     p m.end(:foo)    #=> 1
01153  *     p m.end(:bar)    #=> 3
01154  */
01155 
01156 static VALUE
01157 match_end(VALUE match, VALUE n)
01158 {
01159     int i = match_backref_number(match, n);
01160     struct re_registers *regs = RMATCH_REGS(match);
01161 
01162     match_check(match);
01163     if (i < 0 || regs->num_regs <= i)
01164         rb_raise(rb_eIndexError, "index %d out of matches", i);
01165 
01166     if (BEG(i) < 0)
01167         return Qnil;
01168 
01169     update_char_offset(match);
01170     return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
01171 }
01172 
01173 #define MATCH_BUSY FL_USER2
01174 
01175 void
01176 rb_match_busy(VALUE match)
01177 {
01178     FL_SET(match, MATCH_BUSY);
01179 }
01180 
01181 /*
01182  *  call-seq:
01183  *     rxp.fixed_encoding?   -> true or false
01184  *
01185  *  Returns false if rxp is applicable to
01186  *  a string with any ASCII compatible encoding.
01187  *  Returns true otherwise.
01188  *
01189  *      r = /a/
01190  *      r.fixed_encoding?                               #=> false
01191  *      r =~ "\u{6666} a"                               #=> 2
01192  *      r =~ "\xa1\xa2 a".force_encoding("euc-jp")      #=> 2
01193  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
01194  *
01195  *      r = /a/u
01196  *      r.fixed_encoding?                               #=> true
01197  *      r.encoding                                      #=> #<Encoding:UTF-8>
01198  *      r =~ "\u{6666} a"                               #=> 2
01199  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> ArgumentError
01200  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
01201  *
01202  *      r = /\u{6666}/
01203  *      r.fixed_encoding?                               #=> true
01204  *      r.encoding                                      #=> #<Encoding:UTF-8>
01205  *      r =~ "\u{6666} a"                               #=> 0
01206  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> ArgumentError
01207  *      r =~ "abc".force_encoding("euc-jp")             #=> nil
01208  */
01209 
01210 static VALUE
01211 rb_reg_fixed_encoding_p(VALUE re)
01212 {
01213     if (FL_TEST(re, KCODE_FIXED))
01214         return Qtrue;
01215     else
01216         return Qfalse;
01217 }
01218 
01219 static VALUE
01220 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
01221         rb_encoding **fixed_enc, onig_errmsg_buffer err);
01222 
01223 
01224 static void
01225 reg_enc_error(VALUE re, VALUE str)
01226 {
01227     rb_raise(rb_eEncCompatError,
01228              "incompatible encoding regexp match (%s regexp with %s string)",
01229              rb_enc_name(rb_enc_get(re)),
01230              rb_enc_name(rb_enc_get(str)));
01231 }
01232 
01233 static rb_encoding*
01234 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
01235 {
01236     rb_encoding *enc = 0;
01237 
01238     if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
01239         rb_raise(rb_eArgError,
01240             "invalid byte sequence in %s",
01241             rb_enc_name(rb_enc_get(str)));
01242     }
01243 
01244     rb_reg_check(re);
01245     enc = rb_enc_get(str);
01246     if (!rb_enc_str_asciicompat_p(str)) {
01247         if (RREGEXP(re)->ptr->enc != enc) {
01248             reg_enc_error(re, str);
01249         }
01250     }
01251     else if (rb_reg_fixed_encoding_p(re)) {
01252         if (RREGEXP(re)->ptr->enc != enc &&
01253             (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
01254              rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) {
01255             reg_enc_error(re, str);
01256         }
01257         enc = RREGEXP(re)->ptr->enc;
01258     }
01259     if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
01260         enc != rb_ascii8bit_encoding() &&
01261         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
01262         rb_warn("regexp match /.../n against to %s string",
01263                 rb_enc_name(enc));
01264     }
01265     return enc;
01266 }
01267 
01268 regex_t *
01269 rb_reg_prepare_re(VALUE re, VALUE str)
01270 {
01271     regex_t *reg = RREGEXP(re)->ptr;
01272     onig_errmsg_buffer err = "";
01273     int r;
01274     OnigErrorInfo einfo;
01275     const char *pattern;
01276     VALUE unescaped;
01277     rb_encoding *fixed_enc = 0;
01278     rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
01279 
01280     if (reg->enc == enc) return reg;
01281 
01282     rb_reg_check(re);
01283     reg = RREGEXP(re)->ptr;
01284     pattern = RREGEXP_SRC_PTR(re);
01285 
01286     unescaped = rb_reg_preprocess(
01287         pattern, pattern + RREGEXP_SRC_LEN(re), enc,
01288         &fixed_enc, err);
01289 
01290     if (unescaped == Qnil) {
01291         rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
01292     }
01293 
01294     r = onig_new(&reg, (UChar* )RSTRING_PTR(unescaped),
01295                  (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
01296                  reg->options, enc,
01297                  OnigDefaultSyntax, &einfo);
01298     if (r) {
01299         onig_error_code_to_str((UChar*)err, r, &einfo);
01300         rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
01301     }
01302 
01303     RB_GC_GUARD(unescaped);
01304     return reg;
01305 }
01306 
01307 long
01308 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
01309 {
01310     long range;
01311     rb_encoding *enc;
01312     UChar *p, *string;
01313 
01314     enc = rb_reg_prepare_enc(re, str, 0);
01315 
01316     if (reverse) {
01317         range = -pos;
01318     }
01319     else {
01320         range = RSTRING_LEN(str) - pos;
01321     }
01322 
01323     if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
01324          string = (UChar*)RSTRING_PTR(str);
01325 
01326          if (range > 0) {
01327               p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str));
01328          }
01329          else {
01330               p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str));
01331          }
01332          return p - string;
01333     }
01334 
01335     return pos;
01336 }
01337 
01338 long
01339 rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
01340 {
01341     long result;
01342     VALUE match;
01343     struct re_registers regi, *regs = &regi;
01344     char *range = RSTRING_PTR(str);
01345     regex_t *reg;
01346     int tmpreg;
01347 
01348     if (pos > RSTRING_LEN(str) || pos < 0) {
01349         rb_backref_set(Qnil);
01350         return -1;
01351     }
01352 
01353     reg = rb_reg_prepare_re(re, str);
01354     tmpreg = reg != RREGEXP(re)->ptr;
01355     if (!tmpreg) RREGEXP(re)->usecnt++;
01356 
01357     match = rb_backref_get();
01358     if (!NIL_P(match)) {
01359         if (FL_TEST(match, MATCH_BUSY)) {
01360             match = Qnil;
01361         }
01362         else {
01363             regs = RMATCH_REGS(match);
01364         }
01365     }
01366     if (NIL_P(match)) {
01367         MEMZERO(regs, struct re_registers, 1);
01368     }
01369     if (!reverse) {
01370         range += RSTRING_LEN(str);
01371     }
01372     result = onig_search(reg,
01373                          (UChar*)(RSTRING_PTR(str)),
01374                          ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
01375                          ((UChar*)(RSTRING_PTR(str)) + pos),
01376                          ((UChar*)range),
01377                          regs, ONIG_OPTION_NONE);
01378     if (!tmpreg) RREGEXP(re)->usecnt--;
01379     if (tmpreg) {
01380         if (RREGEXP(re)->usecnt) {
01381             onig_free(reg);
01382         }
01383         else {
01384             onig_free(RREGEXP(re)->ptr);
01385             RREGEXP(re)->ptr = reg;
01386         }
01387     }
01388     if (result < 0) {
01389         if (regs == &regi)
01390             onig_region_free(regs, 0);
01391         if (result == ONIG_MISMATCH) {
01392             rb_backref_set(Qnil);
01393             return result;
01394         }
01395         else {
01396             onig_errmsg_buffer err = "";
01397             onig_error_code_to_str((UChar*)err, (int)result);
01398             rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
01399         }
01400     }
01401 
01402     if (NIL_P(match)) {
01403         match = match_alloc(rb_cMatch);
01404         onig_region_copy(RMATCH_REGS(match), regs);
01405         onig_region_free(regs, 0);
01406     }
01407     else {
01408         if (rb_safe_level() >= 3)
01409             OBJ_TAINT(match);
01410         else
01411             FL_UNSET(match, FL_TAINT);
01412     }
01413 
01414     RMATCH(match)->str = rb_str_new4(str);
01415     RMATCH(match)->regexp = re;
01416     RMATCH(match)->rmatch->char_offset_updated = 0;
01417     rb_backref_set(match);
01418 
01419     OBJ_INFECT(match, re);
01420     OBJ_INFECT(match, str);
01421 
01422     return result;
01423 }
01424 
01425 VALUE
01426 rb_reg_nth_defined(int nth, VALUE match)
01427 {
01428     struct re_registers *regs;
01429     if (NIL_P(match)) return Qnil;
01430     match_check(match);
01431     regs = RMATCH_REGS(match);
01432     if (nth >= regs->num_regs) {
01433         return Qnil;
01434     }
01435     if (nth < 0) {
01436         nth += regs->num_regs;
01437         if (nth <= 0) return Qnil;
01438     }
01439     if (BEG(nth) == -1) return Qfalse;
01440     return Qtrue;
01441 }
01442 
01443 VALUE
01444 rb_reg_nth_match(int nth, VALUE match)
01445 {
01446     VALUE str;
01447     long start, end, len;
01448     struct re_registers *regs;
01449 
01450     if (NIL_P(match)) return Qnil;
01451     match_check(match);
01452     regs = RMATCH_REGS(match);
01453     if (nth >= regs->num_regs) {
01454         return Qnil;
01455     }
01456     if (nth < 0) {
01457         nth += regs->num_regs;
01458         if (nth <= 0) return Qnil;
01459     }
01460     start = BEG(nth);
01461     if (start == -1) return Qnil;
01462     end = END(nth);
01463     len = end - start;
01464     str = rb_str_subseq(RMATCH(match)->str, start, len);
01465     OBJ_INFECT(str, match);
01466     return str;
01467 }
01468 
01469 VALUE
01470 rb_reg_last_match(VALUE match)
01471 {
01472     return rb_reg_nth_match(0, match);
01473 }
01474 
01475 
01476 /*
01477  *  call-seq:
01478  *     mtch.pre_match   -> str
01479  *
01480  *  Returns the portion of the original string before the current match.
01481  *  Equivalent to the special variable <code>$`</code>.
01482  *
01483  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01484  *     m.pre_match   #=> "T"
01485  */
01486 
01487 VALUE
01488 rb_reg_match_pre(VALUE match)
01489 {
01490     VALUE str;
01491     struct re_registers *regs;
01492 
01493     if (NIL_P(match)) return Qnil;
01494     match_check(match);
01495     regs = RMATCH_REGS(match);
01496     if (BEG(0) == -1) return Qnil;
01497     str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
01498     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01499     return str;
01500 }
01501 
01502 
01503 /*
01504  *  call-seq:
01505  *     mtch.post_match   -> str
01506  *
01507  *  Returns the portion of the original string after the current match.
01508  *  Equivalent to the special variable <code>$'</code>.
01509  *
01510  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
01511  *     m.post_match   #=> ": The Movie"
01512  */
01513 
01514 VALUE
01515 rb_reg_match_post(VALUE match)
01516 {
01517     VALUE str;
01518     long pos;
01519     struct re_registers *regs;
01520 
01521     if (NIL_P(match)) return Qnil;
01522     match_check(match);
01523     regs = RMATCH_REGS(match);
01524     if (BEG(0) == -1) return Qnil;
01525     str = RMATCH(match)->str;
01526     pos = END(0);
01527     str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
01528     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01529     return str;
01530 }
01531 
01532 VALUE
01533 rb_reg_match_last(VALUE match)
01534 {
01535     int i;
01536     struct re_registers *regs;
01537 
01538     if (NIL_P(match)) return Qnil;
01539     match_check(match);
01540     regs = RMATCH_REGS(match);
01541     if (BEG(0) == -1) return Qnil;
01542 
01543     for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
01544         ;
01545     if (i == 0) return Qnil;
01546     return rb_reg_nth_match(i, match);
01547 }
01548 
01549 static VALUE
01550 last_match_getter(void)
01551 {
01552     return rb_reg_last_match(rb_backref_get());
01553 }
01554 
01555 static VALUE
01556 prematch_getter(void)
01557 {
01558     return rb_reg_match_pre(rb_backref_get());
01559 }
01560 
01561 static VALUE
01562 postmatch_getter(void)
01563 {
01564     return rb_reg_match_post(rb_backref_get());
01565 }
01566 
01567 static VALUE
01568 last_paren_match_getter(void)
01569 {
01570     return rb_reg_match_last(rb_backref_get());
01571 }
01572 
01573 static VALUE
01574 match_array(VALUE match, int start)
01575 {
01576     struct re_registers *regs;
01577     VALUE ary;
01578     VALUE target;
01579     int i;
01580     int taint = OBJ_TAINTED(match);
01581 
01582     match_check(match);
01583     regs = RMATCH_REGS(match);
01584     ary = rb_ary_new2(regs->num_regs);
01585     target = RMATCH(match)->str;
01586 
01587     for (i=start; i<regs->num_regs; i++) {
01588         if (regs->beg[i] == -1) {
01589             rb_ary_push(ary, Qnil);
01590         }
01591         else {
01592             VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01593             if (taint) OBJ_TAINT(str);
01594             rb_ary_push(ary, str);
01595         }
01596     }
01597     return ary;
01598 }
01599 
01600 
01601 /* [MG]:FIXME: I put parens around the /.../.match() in the first line of the
01602    second example to prevent the '*' followed by a '/' from ending the
01603    comment. */
01604 
01605 /*
01606  *  call-seq:
01607  *     mtch.to_a   -> anArray
01608  *
01609  *  Returns the array of matches.
01610  *
01611  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01612  *     m.to_a   #=> ["HX1138", "H", "X", "113", "8"]
01613  *
01614  *  Because <code>to_a</code> is called when expanding
01615  *  <code>*</code><em>variable</em>, there's a useful assignment
01616  *  shortcut for extracting matched fields. This is slightly slower than
01617  *  accessing the fields directly (as an intermediate array is
01618  *  generated).
01619  *
01620  *     all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138."))
01621  *     all   #=> "HX1138"
01622  *     f1    #=> "H"
01623  *     f2    #=> "X"
01624  *     f3    #=> "113"
01625  */
01626 
01627 static VALUE
01628 match_to_a(VALUE match)
01629 {
01630     return match_array(match, 0);
01631 }
01632 
01633 
01634 /*
01635  *  call-seq:
01636  *     mtch.captures   -> array
01637  *
01638  *  Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
01639  *
01640  *     f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
01641  *     f1    #=> "H"
01642  *     f2    #=> "X"
01643  *     f3    #=> "113"
01644  *     f4    #=> "8"
01645  */
01646 static VALUE
01647 match_captures(VALUE match)
01648 {
01649     return match_array(match, 1);
01650 }
01651 
01652 static int
01653 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
01654 {
01655     int num;
01656 
01657     num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01658         (const unsigned char* )name, (const unsigned char* )name_end, regs);
01659     if (num >= 1) {
01660         return num;
01661     }
01662     else {
01663         VALUE s = rb_str_new(name, (long )(name_end - name));
01664         rb_raise(rb_eIndexError, "undefined group name reference: %s",
01665                                  StringValuePtr(s));
01666     }
01667 }
01668 
01669 /*
01670  *  call-seq:
01671  *     mtch[i]               -> str or nil
01672  *     mtch[start, length]   -> array
01673  *     mtch[range]           -> array
01674  *     mtch[name]            -> str or nil
01675  *
01676  *  Match Reference---<code>MatchData</code> acts as an array, and may be
01677  *  accessed using the normal array indexing techniques.  <i>mtch</i>[0] is
01678  *  equivalent to the special variable <code>$&</code>, and returns the entire
01679  *  matched string.  <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
01680  *  of the matched backreferences (portions of the pattern between parentheses).
01681  *
01682  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01683  *     m          #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
01684  *     m[0]       #=> "HX1138"
01685  *     m[1, 2]    #=> ["H", "X"]
01686  *     m[1..3]    #=> ["H", "X", "113"]
01687  *     m[-3, 2]   #=> ["X", "113"]
01688  *
01689  *     m = /(?<foo>a+)b/.match("ccaaab")
01690  *     m          #=> #<MatchData "aaab" foo:"aaa">
01691  *     m["foo"]   #=> "aaa"
01692  *     m[:foo]    #=> "aaa"
01693  */
01694 
01695 static VALUE
01696 match_aref(int argc, VALUE *argv, VALUE match)
01697 {
01698     VALUE idx, rest;
01699 
01700     match_check(match);
01701     rb_scan_args(argc, argv, "11", &idx, &rest);
01702 
01703     if (NIL_P(rest)) {
01704         if (FIXNUM_P(idx)) {
01705             if (FIX2INT(idx) >= 0) {
01706                 return rb_reg_nth_match(FIX2INT(idx), match);
01707             }
01708         }
01709         else {
01710             const char *p;
01711             int num;
01712 
01713             switch (TYPE(idx)) {
01714               case T_SYMBOL:
01715                 p = rb_id2name(SYM2ID(idx));
01716                 goto name_to_backref;
01717                 break;
01718               case T_STRING:
01719                 p = StringValuePtr(idx);
01720 
01721               name_to_backref:
01722                 num = name_to_backref_number(RMATCH_REGS(match),
01723                                              RMATCH(match)->regexp, p, p + strlen(p));
01724                 return rb_reg_nth_match(num, match);
01725                 break;
01726 
01727               default:
01728                 break;
01729             }
01730         }
01731     }
01732 
01733     return rb_ary_aref(argc, argv, match_to_a(match));
01734 }
01735 
01736 static VALUE
01737 match_entry(VALUE match, long n)
01738 {
01739     /* n should not exceed num_regs */
01740     return rb_reg_nth_match((int)n, match);
01741 }
01742 
01743 
01744 /*
01745  *  call-seq:
01746  *
01747  *     mtch.values_at([index]*)   -> array
01748  *
01749  *  Uses each <i>index</i> to access the matching values, returning an array of
01750  *  the corresponding matches.
01751  *
01752  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
01753  *     m.to_a               #=> ["HX1138", "H", "X", "113", "8"]
01754  *     m.values_at(0, 2, -2)   #=> ["HX1138", "X", "113"]
01755  */
01756 
01757 static VALUE
01758 match_values_at(int argc, VALUE *argv, VALUE match)
01759 {
01760     struct re_registers *regs;
01761 
01762     match_check(match);
01763     regs = RMATCH_REGS(match);
01764     return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
01765 }
01766 
01767 
01768 /*
01769  *  call-seq:
01770  *     mtch.to_s   -> str
01771  *
01772  *  Returns the entire matched string.
01773  *
01774  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01775  *     m.to_s   #=> "HX1138"
01776  */
01777 
01778 static VALUE
01779 match_to_s(VALUE match)
01780 {
01781     VALUE str = rb_reg_last_match(match);
01782 
01783     match_check(match);
01784     if (NIL_P(str)) str = rb_str_new(0,0);
01785     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01786     if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
01787     return str;
01788 }
01789 
01790 
01791 /*
01792  *  call-seq:
01793  *     mtch.string   -> str
01794  *
01795  *  Returns a frozen copy of the string passed in to <code>match</code>.
01796  *
01797  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01798  *     m.string   #=> "THX1138."
01799  */
01800 
01801 static VALUE
01802 match_string(VALUE match)
01803 {
01804     match_check(match);
01805     return RMATCH(match)->str;  /* str is frozen */
01806 }
01807 
01808 struct backref_name_tag {
01809     const UChar *name;
01810     long len;
01811 };
01812 
01813 static int
01814 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
01815           int back_num, int *back_refs, OnigRegex regex, void *arg0)
01816 {
01817     struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
01818     int i;
01819 
01820     for (i = 0; i < back_num; i++) {
01821         arg[back_refs[i]].name = name;
01822         arg[back_refs[i]].len = name_end - name;
01823     }
01824     return 0;
01825 }
01826 
01827 /*
01828  * call-seq:
01829  *    mtch.inspect   -> str
01830  *
01831  * Returns a printable version of <i>mtch</i>.
01832  *
01833  *     puts /.$/.match("foo").inspect
01834  *     #=> #<MatchData "o">
01835  *
01836  *     puts /(.)(.)(.)/.match("foo").inspect
01837  *     #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o">
01838  *
01839  *     puts /(.)(.)?(.)/.match("fo").inspect
01840  *     #=> #<MatchData "fo" 1:"f" 2:nil 3:"o">
01841  *
01842  *     puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect
01843  *     #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g">
01844  *
01845  */
01846 
01847 static VALUE
01848 match_inspect(VALUE match)
01849 {
01850     const char *cname = rb_obj_classname(match);
01851     VALUE str;
01852     int i;
01853     struct re_registers *regs = RMATCH_REGS(match);
01854     int num_regs = regs->num_regs;
01855     struct backref_name_tag *names;
01856     VALUE regexp = RMATCH(match)->regexp;
01857 
01858     if (regexp == 0) {
01859         return rb_sprintf("#<%s:%p>", cname, (void*)match);
01860     }
01861 
01862     names = ALLOCA_N(struct backref_name_tag, num_regs);
01863     MEMZERO(names, struct backref_name_tag, num_regs);
01864 
01865     onig_foreach_name(RREGEXP(regexp)->ptr,
01866             match_inspect_name_iter, names);
01867 
01868     str = rb_str_buf_new2("#<");
01869     rb_str_buf_cat2(str, cname);
01870 
01871     for (i = 0; i < num_regs; i++) {
01872         VALUE v;
01873         rb_str_buf_cat2(str, " ");
01874         if (0 < i) {
01875             if (names[i].name)
01876                 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
01877             else {
01878                 rb_str_catf(str, "%d", i);
01879             }
01880             rb_str_buf_cat2(str, ":");
01881         }
01882         v = rb_reg_nth_match(i, match);
01883         if (v == Qnil)
01884             rb_str_buf_cat2(str, "nil");
01885         else
01886             rb_str_buf_append(str, rb_str_inspect(v));
01887     }
01888     rb_str_buf_cat2(str, ">");
01889 
01890     return str;
01891 }
01892 
01893 VALUE rb_cRegexp;
01894 
01895 static int
01896 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
01897 {
01898     const char *p = *pp;
01899     int code;
01900     int meta_prefix = 0, ctrl_prefix = 0;
01901     size_t len;
01902     int retbyte;
01903 
01904     retbyte = -1;
01905     if (p == end || *p++ != '\\') {
01906         errcpy(err, "too short escaped multibyte character");
01907         return -1;
01908     }
01909 
01910 again:
01911     if (p == end) {
01912         errcpy(err, "too short escape sequence");
01913         return -1;
01914     }
01915     switch (*p++) {
01916       case '\\': code = '\\'; break;
01917       case 'n': code = '\n'; break;
01918       case 't': code = '\t'; break;
01919       case 'r': code = '\r'; break;
01920       case 'f': code = '\f'; break;
01921       case 'v': code = '\013'; break;
01922       case 'a': code = '\007'; break;
01923       case 'e': code = '\033'; break;
01924 
01925       /* \OOO */
01926       case '0': case '1': case '2': case '3':
01927       case '4': case '5': case '6': case '7':
01928         p--;
01929         code = scan_oct(p, end < p+3 ? end-p : 3, &len);
01930         p += len;
01931         break;
01932 
01933       case 'x': /* \xHH */
01934         code = scan_hex(p, end < p+2 ? end-p : 2, &len);
01935         if (len < 1) {
01936             errcpy(err, "invalid hex escape");
01937             return -1;
01938         }
01939         p += len;
01940         break;
01941 
01942       case 'M': /* \M-X, \M-\C-X, \M-\cX */
01943         if (meta_prefix) {
01944             errcpy(err, "duplicate meta escape");
01945             return -1;
01946         }
01947         meta_prefix = 1;
01948         if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
01949             if (*p == '\\') {
01950                 p++;
01951                 goto again;
01952             }
01953             else {
01954                 code = *p++;
01955                 break;
01956             }
01957         }
01958         errcpy(err, "too short meta escape");
01959         return -1;
01960 
01961       case 'C': /* \C-X, \C-\M-X */
01962         if (p == end || *p++ != '-') {
01963             errcpy(err, "too short control escape");
01964             return -1;
01965         }
01966       case 'c': /* \cX, \c\M-X */
01967         if (ctrl_prefix) {
01968             errcpy(err, "duplicate control escape");
01969             return -1;
01970         }
01971         ctrl_prefix = 1;
01972         if (p < end && (*p & 0x80) == 0) {
01973             if (*p == '\\') {
01974                 p++;
01975                 goto again;
01976             }
01977             else {
01978                 code = *p++;
01979                 break;
01980             }
01981         }
01982         errcpy(err, "too short control escape");
01983         return -1;
01984 
01985       default:
01986         errcpy(err, "unexpected escape sequence");
01987         return -1;
01988     }
01989     if (code < 0 || 0xff < code) {
01990         errcpy(err, "invalid escape code");
01991         return -1;
01992     }
01993 
01994     if (ctrl_prefix)
01995         code &= 0x1f;
01996     if (meta_prefix)
01997         code |= 0x80;
01998 
01999     *pp = p;
02000     return code;
02001 }
02002 
02003 static int
02004 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
02005         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02006 {
02007     const char *p = *pp;
02008     int chmaxlen = rb_enc_mbmaxlen(enc);
02009     char *chbuf = ALLOCA_N(char, chmaxlen);
02010     int chlen = 0;
02011     int byte;
02012     int l;
02013 
02014     memset(chbuf, 0, chmaxlen);
02015 
02016     byte = read_escaped_byte(&p, end, err);
02017     if (byte == -1) {
02018         return -1;
02019     }
02020 
02021     chbuf[chlen++] = byte;
02022     while (chlen < chmaxlen &&
02023            MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
02024         byte = read_escaped_byte(&p, end, err);
02025         if (byte == -1) {
02026             return -1;
02027         }
02028         chbuf[chlen++] = byte;
02029     }
02030 
02031     l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
02032     if (MBCLEN_INVALID_P(l)) {
02033         errcpy(err, "invalid multibyte escape");
02034         return -1;
02035     }
02036     if (1 < chlen || (chbuf[0] & 0x80)) {
02037         rb_str_buf_cat(buf, chbuf, chlen);
02038 
02039         if (*encp == 0)
02040             *encp = enc;
02041         else if (*encp != enc) {
02042             errcpy(err, "escaped non ASCII character in UTF-8 regexp");
02043             return -1;
02044         }
02045     }
02046     else {
02047         char escbuf[5];
02048         snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
02049         rb_str_buf_cat(buf, escbuf, 4);
02050     }
02051     *pp = p;
02052     return 0;
02053 }
02054 
02055 static int
02056 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
02057 {
02058     if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */
02059         0x10ffff < code) {
02060         errcpy(err, "invalid Unicode range");
02061         return -1;
02062     }
02063     return 0;
02064 }
02065 
02066 static int
02067 append_utf8(unsigned long uv,
02068         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02069 {
02070     if (check_unicode_range(uv, err) != 0)
02071         return -1;
02072     if (uv < 0x80) {
02073         char escbuf[5];
02074         snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
02075         rb_str_buf_cat(buf, escbuf, 4);
02076     }
02077     else {
02078         int len;
02079         char utf8buf[6];
02080         len = rb_uv_to_utf8(utf8buf, uv);
02081         rb_str_buf_cat(buf, utf8buf, len);
02082 
02083         if (*encp == 0)
02084             *encp = rb_utf8_encoding();
02085         else if (*encp != rb_utf8_encoding()) {
02086             errcpy(err, "UTF-8 character in non UTF-8 regexp");
02087             return -1;
02088         }
02089     }
02090     return 0;
02091 }
02092 
02093 static int
02094 unescape_unicode_list(const char **pp, const char *end,
02095         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02096 {
02097     const char *p = *pp;
02098     int has_unicode = 0;
02099     unsigned long code;
02100     size_t len;
02101 
02102     while (p < end && ISSPACE(*p)) p++;
02103 
02104     while (1) {
02105         code = ruby_scan_hex(p, end-p, &len);
02106         if (len == 0)
02107             break;
02108         if (6 < len) { /* max 10FFFF */
02109             errcpy(err, "invalid Unicode range");
02110             return -1;
02111         }
02112         p += len;
02113         if (append_utf8(code, buf, encp, err) != 0)
02114             return -1;
02115         has_unicode = 1;
02116 
02117         while (p < end && ISSPACE(*p)) p++;
02118     }
02119 
02120     if (has_unicode == 0) {
02121         errcpy(err, "invalid Unicode list");
02122         return -1;
02123     }
02124 
02125     *pp = p;
02126 
02127     return 0;
02128 }
02129 
02130 static int
02131 unescape_unicode_bmp(const char **pp, const char *end,
02132         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02133 {
02134     const char *p = *pp;
02135     size_t len;
02136     unsigned long code;
02137 
02138     if (end < p+4) {
02139         errcpy(err, "invalid Unicode escape");
02140         return -1;
02141     }
02142     code = ruby_scan_hex(p, 4, &len);
02143     if (len != 4) {
02144         errcpy(err, "invalid Unicode escape");
02145         return -1;
02146     }
02147     if (append_utf8(code, buf, encp, err) != 0)
02148         return -1;
02149     *pp = p + 4;
02150     return 0;
02151 }
02152 
02153 static int
02154 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
02155         VALUE buf, rb_encoding **encp, int *has_property,
02156         onig_errmsg_buffer err)
02157 {
02158     char c;
02159     char smallbuf[2];
02160 
02161     while (p < end) {
02162         int chlen = rb_enc_precise_mbclen(p, end, enc);
02163         if (!MBCLEN_CHARFOUND_P(chlen)) {
02164             errcpy(err, "invalid multibyte character");
02165             return -1;
02166         }
02167         chlen = MBCLEN_CHARFOUND_LEN(chlen);
02168         if (1 < chlen || (*p & 0x80)) {
02169             rb_str_buf_cat(buf, p, chlen);
02170             p += chlen;
02171             if (*encp == 0)
02172                 *encp = enc;
02173             else if (*encp != enc) {
02174                 errcpy(err, "non ASCII character in UTF-8 regexp");
02175                 return -1;
02176             }
02177             continue;
02178         }
02179 
02180         switch (c = *p++) {
02181           case '\\':
02182             if (p == end) {
02183                 errcpy(err, "too short escape sequence");
02184                 return -1;
02185             }
02186             switch (c = *p++) {
02187               case '1': case '2': case '3':
02188               case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
02189                 {
02190                     size_t octlen;
02191                     if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
02192                         /* backref or 7bit octal.
02193                            no need to unescape anyway.
02194                            re-escaping may break backref */
02195                         goto escape_asis;
02196                     }
02197                 }
02198                 /* xxx: How about more than 199 subexpressions? */
02199 
02200               case '0': /* \0, \0O, \0OO */
02201 
02202               case 'x': /* \xHH */
02203               case 'c': /* \cX, \c\M-X */
02204               case 'C': /* \C-X, \C-\M-X */
02205               case 'M': /* \M-X, \M-\C-X, \M-\cX */
02206                 p = p-2;
02207                 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
02208                     return -1;
02209                 break;
02210 
02211               case 'u':
02212                 if (p == end) {
02213                     errcpy(err, "too short escape sequence");
02214                     return -1;
02215                 }
02216                 if (*p == '{') {
02217                     /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
02218                     p++;
02219                     if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
02220                         return -1;
02221                     if (p == end || *p++ != '}') {
02222                         errcpy(err, "invalid Unicode list");
02223                         return -1;
02224                     }
02225                     break;
02226                 }
02227                 else {
02228                     /* \uHHHH */
02229                     if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
02230                         return -1;
02231                     break;
02232                 }
02233 
02234               case 'p': /* \p{Hiragana} */
02235               case 'P':
02236                 if (!*encp) {
02237                     *has_property = 1;
02238                 }
02239                 goto escape_asis;
02240 
02241               default: /* \n, \\, \d, \9, etc. */
02242 escape_asis:
02243                 smallbuf[0] = '\\';
02244                 smallbuf[1] = c;
02245                 rb_str_buf_cat(buf, smallbuf, 2);
02246                 break;
02247             }
02248             break;
02249 
02250           default:
02251             rb_str_buf_cat(buf, &c, 1);
02252             break;
02253         }
02254     }
02255 
02256     return 0;
02257 }
02258 
02259 static VALUE
02260 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
02261         rb_encoding **fixed_enc, onig_errmsg_buffer err)
02262 {
02263     VALUE buf;
02264     int has_property = 0;
02265 
02266     buf = rb_str_buf_new(0);
02267 
02268     if (rb_enc_asciicompat(enc))
02269         *fixed_enc = 0;
02270     else {
02271         *fixed_enc = enc;
02272         rb_enc_associate(buf, enc);
02273     }
02274 
02275     if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
02276         return Qnil;
02277 
02278     if (has_property && !*fixed_enc) {
02279         *fixed_enc = enc;
02280     }
02281 
02282     if (*fixed_enc) {
02283         rb_enc_associate(buf, *fixed_enc);
02284     }
02285 
02286     return buf;
02287 }
02288 
02289 VALUE
02290 rb_reg_check_preprocess(VALUE str)
02291 {
02292     rb_encoding *fixed_enc = 0;
02293     onig_errmsg_buffer err = "";
02294     VALUE buf;
02295     char *p, *end;
02296     rb_encoding *enc;
02297 
02298     StringValue(str);
02299     p = RSTRING_PTR(str);
02300     end = p + RSTRING_LEN(str);
02301     enc = rb_enc_get(str);
02302 
02303     buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
02304     RB_GC_GUARD(str);
02305 
02306     if (buf == Qnil) {
02307         return rb_reg_error_desc(str, 0, err);
02308     }
02309     return Qnil;
02310 }
02311 
02312 static VALUE
02313 rb_reg_preprocess_dregexp(VALUE ary, int options)
02314 {
02315     rb_encoding *fixed_enc = 0;
02316     rb_encoding *regexp_enc = 0;
02317     onig_errmsg_buffer err = "";
02318     int i;
02319     VALUE result = 0;
02320     rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02321 
02322     if (RARRAY_LEN(ary) == 0) {
02323         rb_raise(rb_eArgError, "no arguments given");
02324     }
02325 
02326     for (i = 0; i < RARRAY_LEN(ary); i++) {
02327         VALUE str = RARRAY_PTR(ary)[i];
02328         VALUE buf;
02329         char *p, *end;
02330         rb_encoding *src_enc;
02331 
02332         src_enc = rb_enc_get(str);
02333         if (options & ARG_ENCODING_NONE &&
02334                 src_enc != ascii8bit) {
02335             if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)
02336                 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02337             else
02338                 src_enc = ascii8bit;
02339         }
02340 
02341         StringValue(str);
02342         p = RSTRING_PTR(str);
02343         end = p + RSTRING_LEN(str);
02344 
02345         buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
02346 
02347         if (buf == Qnil)
02348             rb_raise(rb_eArgError, "%s", err);
02349 
02350         if (fixed_enc != 0) {
02351             if (regexp_enc != 0 && regexp_enc != fixed_enc) {
02352                 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s",
02353                          rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
02354             }
02355             regexp_enc = fixed_enc;
02356         }
02357 
02358         if (!result)
02359             result = rb_str_new3(str);
02360         else
02361             rb_str_buf_append(result, str);
02362     }
02363     if (regexp_enc) {
02364         rb_enc_associate(result, regexp_enc);
02365     }
02366 
02367     return result;
02368 }
02369 
02370 static int
02371 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
02372                   int options, onig_errmsg_buffer err,
02373                   const char *sourcefile, int sourceline)
02374 {
02375     struct RRegexp *re = RREGEXP(obj);
02376     VALUE unescaped;
02377     rb_encoding *fixed_enc = 0;
02378     rb_encoding *a_enc = rb_ascii8bit_encoding();
02379 
02380     if (!OBJ_UNTRUSTED(obj) && rb_safe_level() >= 4)
02381         rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
02382     rb_check_frozen(obj);
02383     if (FL_TEST(obj, REG_LITERAL))
02384         rb_raise(rb_eSecurityError, "can't modify literal regexp");
02385     if (re->ptr)
02386         rb_raise(rb_eTypeError, "already initialized regexp");
02387     re->ptr = 0;
02388 
02389     if (rb_enc_dummy_p(enc)) {
02390             errcpy(err, "can't make regexp with dummy encoding");
02391             return -1;
02392     }
02393 
02394     unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
02395     if (unescaped == Qnil)
02396         return -1;
02397 
02398     if (fixed_enc) {
02399         if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
02400             (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
02401             errcpy(err, "incompatible character encoding");
02402             return -1;
02403         }
02404         if (fixed_enc != a_enc) {
02405             options |= ARG_ENCODING_FIXED;
02406             enc = fixed_enc;
02407         }
02408     }
02409     else if (!(options & ARG_ENCODING_FIXED)) {
02410        enc = rb_usascii_encoding();
02411     }
02412 
02413     rb_enc_associate((VALUE)re, enc);
02414     if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
02415         re->basic.flags |= KCODE_FIXED;
02416     }
02417     if (options & ARG_ENCODING_NONE) {
02418         re->basic.flags |= REG_ENCODING_NONE;
02419     }
02420 
02421     re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
02422                           options & ARG_REG_OPTION_MASK, err,
02423                           sourcefile, sourceline);
02424     if (!re->ptr) return -1;
02425     re->src = rb_enc_str_new(s, len, enc);
02426     OBJ_FREEZE(re->src);
02427     RB_GC_GUARD(unescaped);
02428     return 0;
02429 }
02430 
02431 static int
02432 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err,
02433         const char *sourcefile, int sourceline)
02434 {
02435     int ret;
02436     rb_encoding *enc = rb_enc_get(str);
02437     if (options & ARG_ENCODING_NONE) {
02438         rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02439         if (enc != ascii8bit) {
02440             if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
02441                 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02442                 return -1;
02443             }
02444             enc = ascii8bit;
02445         }
02446     }
02447     ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
02448                             options, err, sourcefile, sourceline);
02449     OBJ_INFECT(obj, str);
02450     RB_GC_GUARD(str);
02451     return ret;
02452 }
02453 
02454 static VALUE
02455 rb_reg_s_alloc(VALUE klass)
02456 {
02457     NEWOBJ(re, struct RRegexp);
02458     OBJSETUP(re, klass, T_REGEXP);
02459 
02460     re->ptr = 0;
02461     re->src = 0;
02462     re->usecnt = 0;
02463 
02464     return (VALUE)re;
02465 }
02466 
02467 VALUE
02468 rb_reg_alloc(void)
02469 {
02470     return rb_reg_s_alloc(rb_cRegexp);
02471 }
02472 
02473 VALUE
02474 rb_reg_new_str(VALUE s, int options)
02475 {
02476     return rb_reg_init_str(rb_reg_alloc(), s, options);
02477 }
02478 
02479 VALUE
02480 rb_reg_init_str(VALUE re, VALUE s, int options)
02481 {
02482     onig_errmsg_buffer err = "";
02483 
02484     if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
02485         rb_reg_raise_str(s, options, err);
02486     }
02487 
02488     return re;
02489 }
02490 
02491 VALUE
02492 rb_reg_new_ary(VALUE ary, int opt)
02493 {
02494     return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt);
02495 }
02496 
02497 VALUE
02498 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
02499 {
02500     VALUE re = rb_reg_alloc();
02501     onig_errmsg_buffer err = "";
02502 
02503     if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) {
02504         rb_enc_reg_raise(s, len, enc, options, err);
02505     }
02506 
02507     return re;
02508 }
02509 
02510 VALUE
02511 rb_reg_new(const char *s, long len, int options)
02512 {
02513     return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
02514 }
02515 
02516 VALUE
02517 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
02518 {
02519     VALUE re = rb_reg_alloc();
02520     onig_errmsg_buffer err = "";
02521 
02522     if (!str) str = rb_str_new(0,0);
02523     if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
02524         rb_set_errinfo(rb_reg_error_desc(str, options, err));
02525         return Qnil;
02526     }
02527     FL_SET(re, REG_LITERAL);
02528     return re;
02529 }
02530 
02531 static VALUE reg_cache;
02532 
02533 VALUE
02534 rb_reg_regcomp(VALUE str)
02535 {
02536     volatile VALUE save_str = str;
02537     if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str)
02538         && ENCODING_GET(reg_cache) == ENCODING_GET(str)
02539         && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
02540         return reg_cache;
02541 
02542     return reg_cache = rb_reg_new_str(save_str, 0);
02543 }
02544 
02545 static st_index_t reg_hash(VALUE re);
02546 /*
02547  * call-seq:
02548  *   rxp.hash   -> fixnum
02549  *
02550  * Produce a hash based on the text and options of this regular expression.
02551  */
02552 
02553 static VALUE
02554 rb_reg_hash(VALUE re)
02555 {
02556     st_index_t hashval = reg_hash(re);
02557     return LONG2FIX(hashval);
02558 }
02559 
02560 static st_index_t
02561 reg_hash(VALUE re)
02562 {
02563     st_index_t hashval;
02564 
02565     rb_reg_check(re);
02566     hashval = RREGEXP(re)->ptr->options;
02567     hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re)));
02568     return rb_hash_end(hashval);
02569 }
02570 
02571 
02572 /*
02573  *  call-seq:
02574  *     rxp == other_rxp      -> true or false
02575  *     rxp.eql?(other_rxp)   -> true or false
02576  *
02577  *  Equality---Two regexps are equal if their patterns are identical, they have
02578  *  the same character set code, and their <code>casefold?</code> values are the
02579  *  same.
02580  *
02581  *     /abc/  == /abc/x   #=> false
02582  *     /abc/  == /abc/i   #=> false
02583  *     /abc/  == /abc/n   #=> false
02584  *     /abc/u == /abc/n   #=> false
02585  */
02586 
02587 static VALUE
02588 rb_reg_equal(VALUE re1, VALUE re2)
02589 {
02590     if (re1 == re2) return Qtrue;
02591     if (TYPE(re2) != T_REGEXP) return Qfalse;
02592     rb_reg_check(re1); rb_reg_check(re2);
02593     if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
02594     if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
02595     if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse;
02596     if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
02597     if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) {
02598         return Qtrue;
02599     }
02600     return Qfalse;
02601 }
02602 
02603 /*
02604  * call-seq:
02605  *    mtch.hash   -> integer
02606  *
02607  * Produce a hash based on the target string, regexp and matched
02608  * positions of this matchdata.
02609  */
02610 
02611 static VALUE
02612 match_hash(VALUE match)
02613 {
02614     const struct re_registers *regs;
02615     st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str));
02616 
02617     rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp));
02618     regs = RMATCH_REGS(match);
02619     hashval = rb_hash_uint(hashval, regs->num_regs);
02620     hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg)));
02621     hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end)));
02622     hashval = rb_hash_end(hashval);
02623     return LONG2FIX(hashval);
02624 }
02625 
02626 /*
02627  * call-seq:
02628  *    mtch == mtch2   -> true or false
02629  *
02630  *  Equality---Two matchdata are equal if their target strings,
02631  *  patterns, and matched positions are identical.
02632  */
02633 
02634 static VALUE
02635 match_equal(VALUE match1, VALUE match2)
02636 {
02637     const struct re_registers *regs1, *regs2;
02638     if (match1 == match2) return Qtrue;
02639     if (TYPE(match2) != T_MATCH) return Qfalse;
02640     if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse;
02641     if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse;
02642     regs1 = RMATCH_REGS(match1);
02643     regs2 = RMATCH_REGS(match2);
02644     if (regs1->num_regs != regs2->num_regs) return Qfalse;
02645     if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse;
02646     if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse;
02647     return Qtrue;
02648 }
02649 
02650 static VALUE
02651 reg_operand(VALUE s, int check)
02652 {
02653     if (SYMBOL_P(s)) {
02654         return rb_sym_to_s(s);
02655     }
02656     else {
02657         VALUE tmp = rb_check_string_type(s);
02658         if (check && NIL_P(tmp)) {
02659             rb_raise(rb_eTypeError, "can't convert %s to String",
02660                      rb_obj_classname(s));
02661         }
02662         return tmp;
02663     }
02664 }
02665 
02666 static long
02667 reg_match_pos(VALUE re, VALUE *strp, long pos)
02668 {
02669     VALUE str = *strp;
02670 
02671     if (NIL_P(str)) {
02672         rb_backref_set(Qnil);
02673         return -1;
02674     }
02675     *strp = str = reg_operand(str, TRUE);
02676     if (pos != 0) {
02677         if (pos < 0) {
02678             VALUE l = rb_str_length(str);
02679             pos += NUM2INT(l);
02680             if (pos < 0) {
02681                 return pos;
02682             }
02683         }
02684         pos = rb_str_offset(str, pos);
02685     }
02686     return rb_reg_search(re, str, pos, 0);
02687 }
02688 
02689 /*
02690  *  call-seq:
02691  *     rxp =~ str    -> integer or nil
02692  *
02693  *  Match---Matches <i>rxp</i> against <i>str</i>.
02694  *
02695  *     /at/ =~ "input data"   #=> 7
02696  *     /ax/ =~ "input data"   #=> nil
02697  *
02698  *  If <code>=~</code> is used with a regexp literal with named captures,
02699  *  captured strings (or nil) is assigned to local variables named by
02700  *  the capture names.
02701  *
02702  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = y  "
02703  *     p lhs    #=> "x"
02704  *     p rhs    #=> "y"
02705  *
02706  *  If it is not matched, nil is assigned for the variables.
02707  *
02708  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = "
02709  *     p lhs    #=> nil
02710  *     p rhs    #=> nil
02711  *
02712  *  This assignment is implemented in the Ruby parser.
02713  *  The parser detects 'regexp-literal =~ expression' for the assignment.
02714  *  The regexp must be a literal without interpolation and placed at left hand side.
02715  *
02716  *  The assignment is not occur if the regexp is not a literal.
02717  *
02718  *     re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
02719  *     re =~ "  x = y  "
02720  *     p lhs    # undefined local variable
02721  *     p rhs    # undefined local variable
02722  *
02723  *  A regexp interpolation, <code>#{}</code>, also disables
02724  *  the assignment.
02725  *
02726  *     rhs_pat = /(?<rhs>\w+)/
02727  *     /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
02728  *     p lhs    # undefined local variable
02729  *
02730  *  The assignment is not occur if the regexp is placed at right hand side.
02731  *
02732  *    "  x = y  " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
02733  *    p lhs, rhs # undefined local variable
02734  *
02735  */
02736 
02737 VALUE
02738 rb_reg_match(VALUE re, VALUE str)
02739 {
02740     long pos = reg_match_pos(re, &str, 0);
02741     if (pos < 0) return Qnil;
02742     pos = rb_str_sublen(str, pos);
02743     return LONG2FIX(pos);
02744 }
02745 
02746 /*
02747  *  call-seq:
02748  *     rxp === str   -> true or false
02749  *
02750  *  Case Equality---Synonym for <code>Regexp#=~</code> used in case statements.
02751  *
02752  *     a = "HELLO"
02753  *     case a
02754  *     when /^[a-z]*$/; print "Lower case\n"
02755  *     when /^[A-Z]*$/; print "Upper case\n"
02756  *     else;            print "Mixed case\n"
02757  *     end
02758  *
02759  *  <em>produces:</em>
02760  *
02761  *     Upper case
02762  */
02763 
02764 VALUE
02765 rb_reg_eqq(VALUE re, VALUE str)
02766 {
02767     long start;
02768 
02769     str = reg_operand(str, FALSE);
02770     if (NIL_P(str)) {
02771         rb_backref_set(Qnil);
02772         return Qfalse;
02773     }
02774     start = rb_reg_search(re, str, 0, 0);
02775     if (start < 0) {
02776         return Qfalse;
02777     }
02778     return Qtrue;
02779 }
02780 
02781 
02782 /*
02783  *  call-seq:
02784  *     ~ rxp   -> integer or nil
02785  *
02786  *  Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
02787  *  Equivalent to <code><i>rxp</i> =~ $_</code>.
02788  *
02789  *     $_ = "input data"
02790  *     ~ /at/   #=> 7
02791  */
02792 
02793 VALUE
02794 rb_reg_match2(VALUE re)
02795 {
02796     long start;
02797     VALUE line = rb_lastline_get();
02798 
02799     if (TYPE(line) != T_STRING) {
02800         rb_backref_set(Qnil);
02801         return Qnil;
02802     }
02803 
02804     start = rb_reg_search(re, line, 0, 0);
02805     if (start < 0) {
02806         return Qnil;
02807     }
02808     start = rb_str_sublen(line, start);
02809     return LONG2FIX(start);
02810 }
02811 
02812 
02813 /*
02814  *  call-seq:
02815  *     rxp.match(str)       -> matchdata or nil
02816  *     rxp.match(str,pos)   -> matchdata or nil
02817  *
02818  *  Returns a <code>MatchData</code> object describing the match, or
02819  *  <code>nil</code> if there was no match. This is equivalent to retrieving the
02820  *  value of the special variable <code>$~</code> following a normal match.
02821  *  If the second parameter is present, it specifies the position in the string
02822  *  to begin the search.
02823  *
02824  *     /(.)(.)(.)/.match("abc")[2]   #=> "b"
02825  *     /(.)(.)/.match("abc", 1)[2]   #=> "c"
02826  *
02827  *  If a block is given, invoke the block with MatchData if match succeed, so
02828  *  that you can write
02829  *
02830  *     pat.match(str) {|m| ...}
02831  *
02832  *  instead of
02833  *
02834  *     if m = pat.match(str)
02835  *       ...
02836  *     end
02837  *
02838  *  The return value is a value from block execution in this case.
02839  */
02840 
02841 static VALUE
02842 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
02843 {
02844     VALUE result, str, initpos;
02845     long pos;
02846 
02847     if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
02848         pos = NUM2LONG(initpos);
02849     }
02850     else {
02851         pos = 0;
02852     }
02853 
02854     pos = reg_match_pos(re, &str, pos);
02855     if (pos < 0) {
02856         rb_backref_set(Qnil);
02857         return Qnil;
02858     }
02859     result = rb_backref_get();
02860     rb_match_busy(result);
02861     if (!NIL_P(result) && rb_block_given_p()) {
02862         return rb_yield(result);
02863     }
02864     return result;
02865 }
02866 
02867 /*
02868  * Document-method: compile
02869  *
02870  * Synonym for <code>Regexp.new</code>
02871  */
02872 
02873 
02874 /*
02875  *  call-seq:
02876  *     Regexp.new(string, [options [, lang]])        -> regexp
02877  *     Regexp.new(regexp)                            -> regexp
02878  *     Regexp.compile(string, [options [, lang]])    -> regexp
02879  *     Regexp.compile(regexp)                        -> regexp
02880  *
02881  *  Constructs a new regular expression from <i>pattern</i>, which can be either
02882  *  a <code>String</code> or a <code>Regexp</code> (in which case that regexp's
02883  *  options are propagated, and new options may not be specified (a change as of
02884  *  Ruby 1.8). If <i>options</i> is a <code>Fixnum</code>, it should be one or
02885  *  more of the constants <code>Regexp::EXTENDED</code>,
02886  *  <code>Regexp::IGNORECASE</code>, and <code>Regexp::MULTILINE</code>,
02887  *  <em>or</em>-ed together. Otherwise, if <i>options</i> is not
02888  *  <code>nil</code>, the regexp will be case insensitive.
02889  *  When the <i>lang</i> parameter is `n' or `N' sets the regexp no encoding.
02890  *
02891  *     r1 = Regexp.new('^a-z+:\\s+\w+')           #=> /^a-z+:\s+\w+/
02892  *     r2 = Regexp.new('cat', true)               #=> /cat/i
02893  *     r3 = Regexp.new('dog', Regexp::EXTENDED)   #=> /dog/x
02894  *     r4 = Regexp.new(r2)                        #=> /cat/i
02895  */
02896 
02897 static VALUE
02898 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
02899 {
02900     onig_errmsg_buffer err = "";
02901     int flags = 0;
02902     VALUE str;
02903     rb_encoding *enc;
02904     const char *ptr;
02905     long len;
02906 
02907     if (argc == 0 || argc > 3) {
02908         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..3)", argc);
02909     }
02910     if (TYPE(argv[0]) == T_REGEXP) {
02911         VALUE re = argv[0];
02912 
02913         if (argc > 1) {
02914             rb_warn("flags ignored");
02915         }
02916         rb_reg_check(re);
02917         flags = rb_reg_options(re);
02918         ptr = RREGEXP_SRC_PTR(re);
02919         len = RREGEXP_SRC_LEN(re);
02920         enc = rb_enc_get(re);
02921         if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) {
02922             str = rb_enc_str_new(ptr, len, enc);
02923             rb_reg_raise_str(str, flags, err);
02924         }
02925     }
02926     else {
02927         if (argc >= 2) {
02928             if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
02929             else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
02930         }
02931         enc = 0;
02932         if (argc == 3 && !NIL_P(argv[2])) {
02933             char *kcode = StringValuePtr(argv[2]);
02934             if (kcode[0] == 'n' || kcode[0] == 'N') {
02935                 enc = rb_ascii8bit_encoding();
02936                 flags |= ARG_ENCODING_NONE;
02937             }
02938             else {
02939                 rb_warn("encoding option is ignored - %s", kcode);
02940             }
02941         }
02942         str = argv[0];
02943         ptr = StringValuePtr(str);
02944         if (enc
02945             ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0)
02946             : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) {
02947             rb_reg_raise_str(str, flags, err);
02948         }
02949     }
02950     return self;
02951 }
02952 
02953 VALUE
02954 rb_reg_quote(VALUE str)
02955 {
02956     rb_encoding *enc = rb_enc_get(str);
02957     char *s, *send, *t;
02958     VALUE tmp;
02959     int c, clen;
02960     int ascii_only = rb_enc_str_asciionly_p(str);
02961 
02962     s = RSTRING_PTR(str);
02963     send = s + RSTRING_LEN(str);
02964     while (s < send) {
02965         c = rb_enc_ascget(s, send, &clen, enc);
02966         if (c == -1) {
02967             s += mbclen(s, send, enc);
02968             continue;
02969         }
02970         switch (c) {
02971           case '[': case ']': case '{': case '}':
02972           case '(': case ')': case '|': case '-':
02973           case '*': case '.': case '\\':
02974           case '?': case '+': case '^': case '$':
02975           case ' ': case '#':
02976           case '\t': case '\f': case '\v': case '\n': case '\r':
02977             goto meta_found;
02978         }
02979         s += clen;
02980     }
02981     tmp = rb_str_new3(str);
02982     if (ascii_only) {
02983         rb_enc_associate(tmp, rb_usascii_encoding());
02984     }
02985     return tmp;
02986 
02987   meta_found:
02988     tmp = rb_str_new(0, RSTRING_LEN(str)*2);
02989     if (ascii_only) {
02990         rb_enc_associate(tmp, rb_usascii_encoding());
02991     }
02992     else {
02993         rb_enc_copy(tmp, str);
02994     }
02995     t = RSTRING_PTR(tmp);
02996     /* copy upto metacharacter */
02997     memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
02998     t += s - RSTRING_PTR(str);
02999 
03000     while (s < send) {
03001         c = rb_enc_ascget(s, send, &clen, enc);
03002         if (c == -1) {
03003             int n = mbclen(s, send, enc);
03004 
03005             while (n--)
03006                 *t++ = *s++;
03007             continue;
03008         }
03009         s += clen;
03010         switch (c) {
03011           case '[': case ']': case '{': case '}':
03012           case '(': case ')': case '|': case '-':
03013           case '*': case '.': case '\\':
03014           case '?': case '+': case '^': case '$':
03015           case '#':
03016             t += rb_enc_mbcput('\\', t, enc);
03017             break;
03018           case ' ':
03019             t += rb_enc_mbcput('\\', t, enc);
03020             t += rb_enc_mbcput(' ', t, enc);
03021             continue;
03022           case '\t':
03023             t += rb_enc_mbcput('\\', t, enc);
03024             t += rb_enc_mbcput('t', t, enc);
03025             continue;
03026           case '\n':
03027             t += rb_enc_mbcput('\\', t, enc);
03028             t += rb_enc_mbcput('n', t, enc);
03029             continue;
03030           case '\r':
03031             t += rb_enc_mbcput('\\', t, enc);
03032             t += rb_enc_mbcput('r', t, enc);
03033             continue;
03034           case '\f':
03035             t += rb_enc_mbcput('\\', t, enc);
03036             t += rb_enc_mbcput('f', t, enc);
03037             continue;
03038           case '\v':
03039             t += rb_enc_mbcput('\\', t, enc);
03040             t += rb_enc_mbcput('v', t, enc);
03041             continue;
03042         }
03043         t += rb_enc_mbcput(c, t, enc);
03044     }
03045     rb_str_resize(tmp, t - RSTRING_PTR(tmp));
03046     OBJ_INFECT(tmp, str);
03047     return tmp;
03048 }
03049 
03050 
03051 /*
03052  *  call-seq:
03053  *     Regexp.escape(str)   -> string
03054  *     Regexp.quote(str)    -> string
03055  *
03056  *  Escapes any characters that would have special meaning in a regular
03057  *  expression. Returns a new string: the escaped text, or an unmodified
03058  *  copy of <i>str</i> if no characters need escaping.  For any string,
03059  *  <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true.
03060  *
03061  *     Regexp.escape('\*?{}.')   #=> \\\*\?\{\}\.
03062  *
03063  */
03064 
03065 static VALUE
03066 rb_reg_s_quote(VALUE c, VALUE str)
03067 {
03068     return rb_reg_quote(reg_operand(str, TRUE));
03069 }
03070 
03071 int
03072 rb_reg_options(VALUE re)
03073 {
03074     int options;
03075 
03076     rb_reg_check(re);
03077     options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
03078     if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
03079     if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
03080     return options;
03081 }
03082 
03083 VALUE
03084 rb_check_regexp_type(VALUE re)
03085 {
03086     return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
03087 }
03088 
03089 /*
03090  *  call-seq:
03091  *     Regexp.try_convert(obj) -> re or nil
03092  *
03093  *  Tries to convert <i>obj</i> into a Regexp using its to_regexp method.
03094  *  Returns the converted regexp, or nil if <i>obj</i> cannot be converted
03095  *  for any reason.
03096  *
03097  *     Regexp.try_convert(/re/)         #=> /re/
03098  *     Regexp.try_convert("re")         #=> nil
03099  *
03100  *     o = Object.new
03101  *     Regexp.try_convert(o)            #=> nil
03102  *     def o.to_regexp() /foo/ end
03103  *     Regexp.try_convert(o)            #=> /foo/
03104  *
03105  */
03106 static VALUE
03107 rb_reg_s_try_convert(VALUE dummy, VALUE re)
03108 {
03109     return rb_check_regexp_type(re);
03110 }
03111 
03112 static VALUE
03113 rb_reg_s_union(VALUE self, VALUE args0)
03114 {
03115     long argc = RARRAY_LEN(args0);
03116 
03117     if (argc == 0) {
03118         VALUE args[1];
03119         args[0] = rb_str_new2("(?!)");
03120         return rb_class_new_instance(1, args, rb_cRegexp);
03121     }
03122     else if (argc == 1) {
03123         VALUE arg = rb_ary_entry(args0, 0);
03124         VALUE re = rb_check_regexp_type(arg);
03125         if (!NIL_P(re))
03126             return re;
03127         else {
03128             VALUE quoted;
03129             quoted = rb_reg_s_quote(Qnil, arg);
03130             return rb_reg_new_str(quoted, 0);
03131         }
03132     }
03133     else {
03134         int i;
03135         VALUE source = rb_str_buf_new(0);
03136         rb_encoding *result_enc;
03137 
03138         int has_asciionly = 0;
03139         rb_encoding *has_ascii_compat_fixed = 0;
03140         rb_encoding *has_ascii_incompat = 0;
03141 
03142         for (i = 0; i < argc; i++) {
03143             volatile VALUE v;
03144             VALUE e = rb_ary_entry(args0, i);
03145 
03146             if (0 < i)
03147                 rb_str_buf_cat_ascii(source, "|");
03148 
03149             v = rb_check_regexp_type(e);
03150             if (!NIL_P(v)) {
03151                 rb_encoding *enc = rb_enc_get(v);
03152                 if (!rb_enc_asciicompat(enc)) {
03153                     if (!has_ascii_incompat)
03154                         has_ascii_incompat = enc;
03155                     else if (has_ascii_incompat != enc)
03156                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03157                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03158                 }
03159                 else if (rb_reg_fixed_encoding_p(v)) {
03160                     if (!has_ascii_compat_fixed)
03161                         has_ascii_compat_fixed = enc;
03162                     else if (has_ascii_compat_fixed != enc)
03163                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03164                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03165                 }
03166                 else {
03167                     has_asciionly = 1;
03168                 }
03169                 v = rb_reg_to_s(v);
03170             }
03171             else {
03172                 rb_encoding *enc;
03173                 StringValue(e);
03174                 enc = rb_enc_get(e);
03175                 if (!rb_enc_str_asciicompat_p(e)) {
03176                     if (!has_ascii_incompat)
03177                         has_ascii_incompat = enc;
03178                     else if (has_ascii_incompat != enc)
03179                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03180                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03181                 }
03182                 else if (rb_enc_str_asciionly_p(e)) {
03183                     has_asciionly = 1;
03184                 }
03185                 else {
03186                     if (!has_ascii_compat_fixed)
03187                         has_ascii_compat_fixed = enc;
03188                     else if (has_ascii_compat_fixed != enc)
03189                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03190                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03191                 }
03192                 v = rb_reg_s_quote(Qnil, e);
03193             }
03194             if (has_ascii_incompat) {
03195                 if (has_asciionly) {
03196                     rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
03197                         rb_enc_name(has_ascii_incompat));
03198                 }
03199                 if (has_ascii_compat_fixed) {
03200                     rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03201                         rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
03202                 }
03203             }
03204 
03205             if (i == 0) {
03206                 rb_enc_copy(source, v);
03207             }
03208             rb_str_append(source, v);
03209         }
03210 
03211         if (has_ascii_incompat) {
03212             result_enc = has_ascii_incompat;
03213         }
03214         else if (has_ascii_compat_fixed) {
03215             result_enc = has_ascii_compat_fixed;
03216         }
03217         else {
03218             result_enc = rb_ascii8bit_encoding();
03219         }
03220 
03221         rb_enc_associate(source, result_enc);
03222         return rb_class_new_instance(1, &source, rb_cRegexp);
03223     }
03224 }
03225 
03226 /*
03227  *  call-seq:
03228  *     Regexp.union(pat1, pat2, ...)            -> new_regexp
03229  *     Regexp.union(pats_ary)                   -> new_regexp
03230  *
03231  *  Return a <code>Regexp</code> object that is the union of the given
03232  *  <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s
03233  *  can be Regexp objects, in which case their options will be preserved, or
03234  *  Strings. If no patterns are given, returns <code>/(?!)/</code>.
03235  *  The behavior is unspecified if any given <em>pattern</em> contains a capture.
03236  *
03237  *     Regexp.union                         #=> /(?!)/
03238  *     Regexp.union("penzance")             #=> /penzance/
03239  *     Regexp.union("a+b*c")                #=> /a\+b\*c/
03240  *     Regexp.union("skiing", "sledding")   #=> /skiing|sledding/
03241  *     Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
03242  *     Regexp.union(/dogs/, /cats/i)        #=> /(?-mix:dogs)|(?i-mx:cats)/
03243  */
03244 static VALUE
03245 rb_reg_s_union_m(VALUE self, VALUE args)
03246 {
03247     VALUE v;
03248     if (RARRAY_LEN(args) == 1 &&
03249         !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
03250         return rb_reg_s_union(self, v);
03251     }
03252     return rb_reg_s_union(self, args);
03253 }
03254 
03255 /* :nodoc: */
03256 static VALUE
03257 rb_reg_init_copy(VALUE copy, VALUE re)
03258 {
03259     onig_errmsg_buffer err = "";
03260     const char *s;
03261     long len;
03262 
03263     if (copy == re) return copy;
03264     rb_check_frozen(copy);
03265     /* need better argument type check */
03266     if (!rb_obj_is_instance_of(re, rb_obj_class(copy))) {
03267         rb_raise(rb_eTypeError, "wrong argument type");
03268     }
03269     rb_reg_check(re);
03270     s = RREGEXP_SRC_PTR(re);
03271     len = RREGEXP_SRC_LEN(re);
03272     if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re),
03273                 err, NULL, 0) != 0) {
03274         rb_reg_raise(s, len, err, re);
03275     }
03276     return copy;
03277 }
03278 
/*
 * Expands the backslash escapes of the replacement template str, using the
 * match registers regs obtained on the subject string src, and returns the
 * expanded string.
 *
 * Escapes handled by the switch below (each one introduced by a backslash):
 *   1 .. 9       numbered group; honoured only when
 *                onig_noname_group_capture_is_active() is true for regexp,
 *                otherwise the escape is dropped from the output
 *   k<name>      named group, resolved by name_to_backref_number();
 *                a k< without a closing > raises RuntimeError
 *   0 and &      the whole match (register 0)
 *   backquote    the part of src before the match
 *   single quote the part of src after the match
 *   +            the highest-numbered register whose start is not -1;
 *                nothing is emitted when only register 0 qualifies
 *   backslash    one literal backslash
 * Any other escaped character is copied together with its backslash.
 *
 * A register outside regs->num_regs, or one whose start is -1, contributes
 * nothing.  If no backslash followed by another character is found, str
 * itself is returned; otherwise a newly built string is returned.
 */
03279 VALUE
03280 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
03281 {
03282     VALUE val = 0;
03283     char *p, *s, *e;
03284     int no, clen;
03285     rb_encoding *str_enc = rb_enc_get(str);
03286     rb_encoding *src_enc = rb_enc_get(src);
03287     int acompat = rb_enc_asciicompat(str_enc);
          /* ASCGET yields the ASCII code at s, or -1 for a non-ASCII character.
           * For ASCII-compatible encodings it inspects the byte directly and
           * sets the length to 1; otherwise it defers to rb_enc_ascget(). */
03288 #define ASCGET(s,e,cl) (acompat ? (*cl=1,ISASCII(s[0])?s[0]:-1) : rb_enc_ascget(s, e, cl, str_enc))
03289 
          /* p: start of template text not yet copied to val
           * s: scan cursor, e: end of the template */
03290     p = s = RSTRING_PTR(str);
03291     e = s + RSTRING_LEN(str);
03292 
03293     while (s < e) {
03294         int c = ASCGET(s, e, &clen);
03295         char *ss;
03296 
03297         if (c == -1) {
03298             s += mbclen(s, e, str_enc);
03299             continue;
03300         }
              /* ss remembers where the current character (a candidate
               * backslash) starts */
03301         ss = s;
03302         s += clen;
03303 
              /* only a backslash that is followed by something is an escape */
03304         if (c != '\\' || s == e) continue;
03305 
              /* the output string is created lazily, on the first escape */
03306         if (!val) {
03307             val = rb_str_buf_new(ss-p);
03308         }
              /* flush the literal text preceding the backslash */
03309         rb_enc_str_buf_cat(val, p, ss-p, str_enc);
03310 
03311         c = ASCGET(s, e, &clen);
03312         if (c == -1) {
                  /* backslash + non-ASCII character: copied through unchanged */
03313             s += mbclen(s, e, str_enc);
03314             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03315             p = s;
03316             continue;
03317         }
03318         s += clen;
03319 
              /* by default the two-character escape is consumed (not copied) */
03320         p = s;
03321         switch (c) {
03322           case '1': case '2': case '3': case '4':
03323           case '5': case '6': case '7': case '8': case '9':
03324             if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
03325                 no = c - '0';
03326             }
03327             else {
03328                 continue;
03329             }
03330             break;
03331 
03332           case 'k':
03333             if (s < e && ASCGET(s, e, &clen) == '<') {
03334                 char *name, *name_end;
03335 
03336                 name_end = name = s + clen;
                      /* scan forward to the closing '>' */
03337                 while (name_end < e) {
03338                     c = ASCGET(name_end, e, &clen);
03339                     if (c == '>') break;
03340                     name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
03341                 }
03342                 if (name_end < e) {
03343                     no = name_to_backref_number(regs, regexp, name, name_end);
                          /* resume scanning just past the '>' */
03344                     p = s = name_end + clen;
03345                     break;
03346                 }
03347                 else {
03348                     rb_raise(rb_eRuntimeError, "invalid group name reference format");
03349                 }
03350             }
03351 
                  /* a k not followed by '<' is copied with its backslash */
03352             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03353             continue;
03354 
03355           case '0':
03356           case '&':
03357             no = 0;
03358             break;
03359 
03360           case '`':
03361             rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
03362             continue;
03363 
03364           case '\'':
03365             rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
03366             continue;
03367 
03368           case '+':
                  /* walk down from the last register to one that has a start */
03369             no = regs->num_regs-1;
03370             while (BEG(no) == -1 && no > 0) no--;
03371             if (no == 0) continue;
03372             break;
03373 
03374           case '\\':
                  /* emit only the second backslash */
03375             rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
03376             continue;
03377 
03378           default:
03379             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03380             continue;
03381         }
03382 
              /* append the text of register no, if it exists and has a start */
03383         if (no >= 0) {
03384             if (no >= regs->num_regs) continue;
03385             if (BEG(no) == -1) continue;
03386             rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
03387         }
03388     }
03389 
          /* no escape was processed: the template is returned as is */
03390     if (!val) return str;
          /* flush the literal tail after the last escape */
03391     if (p < e) {
03392         rb_enc_str_buf_cat(val, p, e-p, str_enc);
03393     }
03394 
03395     return val;
03396 }
03397 
03398 static VALUE
03399 kcode_getter(void)
03400 {
03401     rb_warn("variable $KCODE is no longer effective");
03402     return Qnil;
03403 }
03404 
03405 static void
03406 kcode_setter(VALUE val, ID id)
03407 {
03408     rb_warn("variable $KCODE is no longer effective; ignored");
03409 }
03410 
03411 static VALUE
03412 ignorecase_getter(void)
03413 {
03414     rb_warn("variable $= is no longer effective");
03415     return Qfalse;
03416 }
03417 
03418 static void
03419 ignorecase_setter(VALUE val, ID id)
03420 {
03421     rb_warn("variable $= is no longer effective; ignored");
03422 }
03423 
03424 static VALUE
03425 match_getter(void)
03426 {
03427     VALUE match = rb_backref_get();
03428 
03429     if (NIL_P(match)) return Qnil;
03430     rb_match_busy(match);
03431     return match;
03432 }
03433 
03434 static void
03435 match_setter(VALUE val)
03436 {
03437     if (!NIL_P(val)) {
03438         Check_Type(val, T_MATCH);
03439     }
03440     rb_backref_set(val);
03441 }
03442 
03443 /*
03444  *  call-seq:
03445  *     Regexp.last_match           -> matchdata
03446  *     Regexp.last_match(n)        -> str
03447  *
03448  *  The first form returns the <code>MatchData</code> object generated by the
03449  *  last successful pattern match. Equivalent to reading the global variable
03450  *  <code>$~</code>. The second form returns the <i>n</i>th field in this
03451  *  <code>MatchData</code> object.
03452  *  <em>n</em> can be a string or symbol to reference a named capture.
03453  *
03454  *  Note that the <code>last_match</code> is local to the thread and method scope
03455  *  of the method that did the pattern match.
03456  *
03457  *     /c(.)t/ =~ 'cat'        #=> 0
03458  *     Regexp.last_match       #=> #<MatchData "cat" 1:"a">
03459  *     Regexp.last_match(0)    #=> "cat"
03460  *     Regexp.last_match(1)    #=> "a"
03461  *     Regexp.last_match(2)    #=> nil
03462  *
03463  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
03464  *     Regexp.last_match       #=> #<MatchData "var = val" lhs:"var" rhs:"val">
03465  *     Regexp.last_match(:lhs) #=> "var"
03466  *     Regexp.last_match(:rhs) #=> "val"
03467  */
03468 
03469 static VALUE
03470 rb_reg_s_last_match(int argc, VALUE *argv)
03471 {
03472     VALUE nth;
03473 
03474     if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
03475         VALUE match = rb_backref_get();
03476         int n;
03477         if (NIL_P(match)) return Qnil;
03478         n = match_backref_number(match, nth);
03479         return rb_reg_nth_match(n, match);
03480     }
03481     return match_getter();
03482 }
03483 
/*
 * Warning callback: forwards the message s to rb_warn().  The message is
 * passed as an argument to "%s", not as the format string itself.
 */
static void
re_warn(const char *s)
{
    const char *const message = s;

    rb_warn("%s", message);
}
03489 
03490 /*
03491  *  Document-class: RegexpError
03492  *
03493  *  Raised when given an invalid regular expression.
03494  *
03495  *     Regexp.new("?")
03496  *
03497  *  <em>raises the exception:</em>
03498  *
03499  *     RegexpError: target of repeat operator is not specified: /?/
03500  */
03501 
03502 /*
03503  *  Document-class: Regexp
03504  *
03505  *  A <code>Regexp</code> holds a regular expression, used to match a pattern
03506  *  against strings. Regexps are created using the <code>/.../</code> and
03507  *  <code>%r{...}</code> literals, and by the <code>Regexp::new</code>
03508  *  constructor.
03509  *
03510  *  :include: doc/re.rdoc
03511  */
03512 
/*
 * Sets up everything regexp-related: defines the RegexpError, Regexp and
 * MatchData classes with their methods and constants, registers the
 * match-related virtual global variables, and configures the regexp
 * engine defaults (case-conversion table, default encoding, warning
 * callbacks).
 */
03513 void
03514 Init_Regexp(void)
03515 {
03516     rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
03517 
          /* engine defaults; both warning channels are routed to re_warn */
03518     onigenc_set_default_caseconv_table((UChar*)casetable);
03519     onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
03520     onig_set_warn_func(re_warn);
03521     onig_set_verb_warn_func(re_warn);
03522 
          /* match-related globals; only $~ is given a setter */
03523     rb_define_virtual_variable("$~", match_getter, match_setter);
03524     rb_define_virtual_variable("$&", last_match_getter, 0);
03525     rb_define_virtual_variable("$`", prematch_getter, 0);
03526     rb_define_virtual_variable("$'", postmatch_getter, 0);
03527     rb_define_virtual_variable("$+", last_paren_match_getter, 0);
03528 
          /* obsolete globals: their hooks only emit a warning */
03529     rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
03530     rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
03531     rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
03532 
          /* Regexp class: singleton methods */
03533     rb_cRegexp = rb_define_class("Regexp", rb_cObject);
03534     rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
03535     rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
03536     rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
03537     rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
03538     rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
03539     rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
03540     rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
03541 
          /* Regexp class: instance methods */
03542     rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
03543     rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
03544     rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
03545     rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
03546     rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
03547     rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
03548     rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
03549     rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
03550     rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
03551     rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
03552     rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
03553     rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
03554     rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
03555     rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
03556     rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */
03557     rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
03558     rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
03559     rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
03560 
          /* option constants exposed on Regexp */
03561     rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
03562     rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
03563     rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
03564     rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED));
03565 
          /* registers the address of reg_cache via rb_global_variable() */
03566     rb_global_variable(&reg_cache);
03567 
          /* MatchData class; its "new" singleton method is undefined */
03568     rb_cMatch  = rb_define_class("MatchData", rb_cObject);
03569     rb_define_alloc_func(rb_cMatch, match_alloc);
03570     rb_undef_method(CLASS_OF(rb_cMatch), "new");
03571 
03572     rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
03573     rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
03574     rb_define_method(rb_cMatch, "names", match_names, 0);
03575     rb_define_method(rb_cMatch, "size", match_size, 0);
03576     rb_define_method(rb_cMatch, "length", match_size, 0);
03577     rb_define_method(rb_cMatch, "offset", match_offset, 1);
03578     rb_define_method(rb_cMatch, "begin", match_begin, 1);
03579     rb_define_method(rb_cMatch, "end", match_end, 1);
03580     rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
03581     rb_define_method(rb_cMatch, "[]", match_aref, -1);
03582     rb_define_method(rb_cMatch, "captures", match_captures, 0);
03583     rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
03584     rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
03585     rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
03586     rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
03587     rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
03588     rb_define_method(rb_cMatch, "string", match_string, 0);
03589     rb_define_method(rb_cMatch, "hash", match_hash, 0);
03590     rb_define_method(rb_cMatch, "eql?", match_equal, 1);
03591     rb_define_method(rb_cMatch, "==", match_equal, 1);
03592 }
03593 

Generated on Wed Aug 10 09:17:12 2011 for Ruby by  doxygen 1.4.7