00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "ruby/ruby.h"
00013 #include "ruby/re.h"
00014 #include "ruby/encoding.h"
00015 #include "ruby/util.h"
00016 #include "regint.h"
00017 #include <ctype.h>
00018
00019 VALUE rb_eRegexpError;
00020
00021 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
00022 #define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
00023
00024 #define BEG(no) regs->beg[no]
00025 #define END(no) regs->end[no]
00026
#if 'a' == 97   /* it's ascii */
/*
 * Byte-to-byte folding table used for case-insensitive comparison:
 * 'A'..'Z' map to 'a'..'z', every other byte value maps to itself.
 * Printable ASCII entries are spelled as character literals; control
 * characters and bytes >= 0177 keep their octal spelling.
 */
static const char casetable[] = {
    '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
    '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
    '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
    '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
    /* 040 - 077: space, punctuation, digits */
    ' ', '!', '"', '#', '$', '%', '&', '\'',
    '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', ':', ';', '<', '=', '>', '?',
    /* 0100 - 0137: '@', then upper-case letters folded to lower case */
    '@', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    'x', 'y', 'z', '[', '\\', ']', '^', '_',
    /* 0140 - 0177: lower-case letters map to themselves */
    '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    'x', 'y', 'z', '{', '|', '}', '~', '\177',
    /* 0200 - 0377: identity */
    '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
    '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
    '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
    '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
    '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
    '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
    '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
    '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
    '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
    '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
    '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
    '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
    '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
    '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
    '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
    '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#else
# error >>> "You lose. You will need a translation table for your character set." <<<
#endif
00077
00078 int
00079 rb_memcicmp(const void *x, const void *y, long len)
00080 {
00081 const unsigned char *p1 = x, *p2 = y;
00082 int tmp;
00083
00084 while (len--) {
00085 if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++]))
00086 return tmp;
00087 }
00088 return 0;
00089 }
00090
/* Drop any macro called rb_memcmp so the function below can be defined. */
#undef rb_memcmp

/*
 * Function form of rb_memcmp: forwards to memcmp() and returns its
 * result unchanged.
 */
int
rb_memcmp(const void *p1, const void *p2, long len)
{
    const int order = memcmp(p1, p2, len);

    return order;
}
00098
00099 static inline long
00100 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
00101 {
00102 const unsigned char *x = xs, *xe = xs + m;
00103 const unsigned char *y = ys, *ye = ys + n;
00104 #ifndef VALUE_MAX
00105 # if SIZEOF_VALUE == 8
00106 # define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
00107 # elif SIZEOF_VALUE == 4
00108 # define VALUE_MAX 0xFFFFFFFFUL
00109 # endif
00110 #endif
00111 VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
00112
00113 if (m > SIZEOF_VALUE)
00114 rb_bug("!!too long pattern string!!");
00115
00116
00117 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
00118 hx <<= CHAR_BIT;
00119 hy <<= CHAR_BIT;
00120 hx |= *x;
00121 hy |= *y;
00122 }
00123
00124 while (hx != hy) {
00125 if (y == ye)
00126 return -1;
00127 hy <<= CHAR_BIT;
00128 hy |= *y;
00129 hy &= mask;
00130 y++;
00131 }
00132 return y - ys - m;
00133 }
00134
/*
 * Searches the n-byte buffer ys for the m-byte pattern xs using a
 * shift table indexed by the byte that follows the current window.
 *
 * shift[b] is m + 1 for bytes that do not occur in the pattern, and
 * otherwise the distance from the last occurrence of b in the pattern
 * to the end of the pattern.
 *
 * Returns the byte offset of the first occurrence, or -1 if there is
 * none.  Expects m >= 1 (the first pattern byte is always read).
 */
static inline long
rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    const long last = n - m;    /* start offset of the final window */
    long shift[256];
    long pos, k;
    int b;

    /* Preprocessing */
    for (b = 0; b < 256; ++b)
        shift[b] = m + 1;
    for (k = 0; k < m; ++k)
        shift[xs[k]] = m - k;

    /* Searching */
    for (pos = 0; pos <= last; pos += shift[ys[pos + m]]) {
        if (*xs == ys[pos] && memcmp(xs, ys + pos, m) == 0)
            return pos;
        /*
         * For the final window the byte after it would be ys[n], which
         * lies outside the buffer: stop instead of reading it.
         */
        if (pos == last)
            break;
    }
    return -1;
}
00154
/*
 * Maps the byte sequence starting at x to a shift-table index.
 *
 * - Lead byte below 0xC0, or 0xF5 and above: returns byte + 256
 *   (range 256..511); only x[0] is read.
 * - Lead byte 0xC0..0xDF / 0xE0..0xEF / 0xF0..0xF4: mixes in the next
 *   1 / 2 / 3 bytes (h = h * 8353 + byte) and returns the low 8 bits
 *   (range 0..255).  The caller must make those bytes readable.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    unsigned int h = x[0];
    int trailing, k;

    if (h < 0xC0 || h >= 0xF5)
        return h + 256;

    trailing = (h < 0xE0) ? 1 : (h < 0xF0) ? 2 : 3;
    for (k = 1; k <= trailing; ++k)
        h = h * mix + x[k];

    return (unsigned char)h;
}
00186
00187 static inline long
00188 rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
00189 {
00190 const unsigned char *x = xs, *xe = xs + m;
00191 const unsigned char *y = ys;
00192 VALUE i, qstable[512];
00193
00194
00195 for (i = 0; i < 512; ++i) {
00196 qstable[i] = m + 1;
00197 }
00198 for (; x < xe; ++x) {
00199 qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
00200 }
00201
00202 for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) {
00203 if (*xs == *y && memcmp(xs, y, m) == 0)
00204 return y - ys;
00205 }
00206 return -1;
00207 }
00208
00209 long
00210 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
00211 {
00212 const unsigned char *x = x0, *y = y0;
00213
00214 if (m > n) return -1;
00215 else if (m == n) {
00216 return memcmp(x0, y0, m) == 0 ? 0 : -1;
00217 }
00218 else if (m < 1) {
00219 return 0;
00220 }
00221 else if (m == 1) {
00222 const unsigned char *ys = y, *ye = ys + n;
00223 for (; y < ye; ++y) {
00224 if (*x == *y)
00225 return y - ys;
00226 }
00227 return -1;
00228 }
00229 else if (m <= SIZEOF_VALUE) {
00230 return rb_memsearch_ss(x0, m, y0, n);
00231 }
00232 else if (enc == rb_utf8_encoding()){
00233 return rb_memsearch_qs_utf8(x0, m, y0, n);
00234 }
00235 else {
00236 return rb_memsearch_qs(x0, m, y0, n);
00237 }
00238 }
00239
/* Object flag bits (FL_USERn) tested on Regexp objects in this file. */
#define KCODE_FIXED FL_USER4
#define REG_LITERAL FL_USER5
#define REG_ENCODING_NONE FL_USER6

/* The Oniguruma option bits that have a one-letter spelling (i, m, x). */
#define ARG_REG_OPTION_MASK \
    (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)

/* Values stored through *option by rb_char_to_option_kcode(). */
#define ARG_ENCODING_FIXED 16
#define ARG_ENCODING_NONE 32
00249
00250 static int
00251 char_to_option(int c)
00252 {
00253 int val;
00254
00255 switch (c) {
00256 case 'i':
00257 val = ONIG_OPTION_IGNORECASE;
00258 break;
00259 case 'x':
00260 val = ONIG_OPTION_EXTEND;
00261 break;
00262 case 'm':
00263 val = ONIG_OPTION_MULTILINE;
00264 break;
00265 default:
00266 val = 0;
00267 break;
00268 }
00269 return val;
00270 }
00271
00272 static char *
00273 option_to_str(char str[4], int options)
00274 {
00275 char *p = str;
00276 if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
00277 if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
00278 if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
00279 *p = 0;
00280 return str;
00281 }
00282
00283 extern int
00284 rb_char_to_option_kcode(int c, int *option, int *kcode)
00285 {
00286 *option = 0;
00287
00288 switch (c) {
00289 case 'n':
00290 *kcode = rb_ascii8bit_encindex();
00291 return (*option = ARG_ENCODING_NONE);
00292 case 'e':
00293 *kcode = rb_enc_find_index("EUC-JP");
00294 break;
00295 case 's':
00296 *kcode = rb_enc_find_index("Windows-31J");
00297 break;
00298 case 'u':
00299 *kcode = rb_utf8_encindex();
00300 break;
00301 default:
00302 *kcode = -1;
00303 return (*option = char_to_option(c));
00304 }
00305 *option = ARG_ENCODING_FIXED;
00306 return 1;
00307 }
00308
00309 static void
00310 rb_reg_check(VALUE re)
00311 {
00312 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00313 rb_raise(rb_eTypeError, "uninitialized Regexp");
00314 }
00315 }
00316
00317 int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p);
00318
00319 static void
00320 rb_reg_expr_str(VALUE str, const char *s, long len,
00321 rb_encoding *enc, rb_encoding *resenc)
00322 {
00323 const char *p, *pend;
00324 int need_escape = 0;
00325 int c, clen;
00326
00327 p = s; pend = p + len;
00328 if (rb_enc_asciicompat(enc)) {
00329 while (p < pend) {
00330 c = rb_enc_ascget(p, pend, &clen, enc);
00331 if (c == -1) {
00332 if (enc == resenc) {
00333 p += mbclen(p, pend, enc);
00334 }
00335 else {
00336 need_escape = 1;
00337 break;
00338 }
00339 }
00340 else if (c != '/' && rb_enc_isprint(c, enc)) {
00341 p += clen;
00342 }
00343 else {
00344 need_escape = 1;
00345 break;
00346 }
00347 }
00348 }
00349 else {
00350 need_escape = 1;
00351 }
00352
00353 if (!need_escape) {
00354 rb_str_buf_cat(str, s, len);
00355 }
00356 else {
00357 int unicode_p = rb_enc_unicode_p(enc);
00358 p = s;
00359 while (p<pend) {
00360 c = rb_enc_ascget(p, pend, &clen, enc);
00361 if (c == '\\' && p+clen < pend) {
00362 int n = clen + mbclen(p+clen, pend, enc);
00363 rb_str_buf_cat(str, p, n);
00364 p += n;
00365 continue;
00366 }
00367 else if (c == '/') {
00368 char c = '\\';
00369 rb_str_buf_cat(str, &c, 1);
00370 rb_str_buf_cat(str, p, clen);
00371 }
00372 else if (c == -1) {
00373 clen = rb_enc_precise_mbclen(p, pend, enc);
00374 if (!MBCLEN_CHARFOUND_P(clen)) {
00375 c = (unsigned char)*p;
00376 clen = 1;
00377 goto hex;
00378 }
00379 if (resenc) {
00380 unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
00381 rb_str_buf_cat_escaped_char(str, c, unicode_p);
00382 }
00383 else {
00384 clen = MBCLEN_CHARFOUND_LEN(clen);
00385 rb_str_buf_cat(str, p, clen);
00386 }
00387 }
00388 else if (rb_enc_isprint(c, enc)) {
00389 rb_str_buf_cat(str, p, clen);
00390 }
00391 else if (!rb_enc_isspace(c, enc)) {
00392 char b[8];
00393
00394 hex:
00395 snprintf(b, sizeof(b), "\\x%02X", c);
00396 rb_str_buf_cat(str, b, 4);
00397 }
00398 else {
00399 rb_str_buf_cat(str, p, clen);
00400 }
00401 p += clen;
00402 }
00403 }
00404 }
00405
00406 static VALUE
00407 rb_reg_desc(const char *s, long len, VALUE re)
00408 {
00409 rb_encoding *enc = rb_enc_get(re);
00410 VALUE str = rb_str_buf_new2("/");
00411 rb_encoding *resenc = rb_default_internal_encoding();
00412 if (resenc == NULL) resenc = rb_default_external_encoding();
00413
00414 if (re && rb_enc_asciicompat(enc)) {
00415 rb_enc_copy(str, re);
00416 }
00417 else {
00418 rb_enc_associate(str, rb_usascii_encoding());
00419 }
00420 rb_reg_expr_str(str, s, len, enc, resenc);
00421 rb_str_buf_cat2(str, "/");
00422 if (re) {
00423 char opts[4];
00424 rb_reg_check(re);
00425 if (*option_to_str(opts, RREGEXP(re)->ptr->options))
00426 rb_str_buf_cat2(str, opts);
00427 if (RBASIC(re)->flags & REG_ENCODING_NONE)
00428 rb_str_buf_cat2(str, "n");
00429 }
00430 OBJ_INFECT(str, re);
00431 return str;
00432 }
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449 static VALUE
00450 rb_reg_source(VALUE re)
00451 {
00452 VALUE str;
00453
00454 rb_reg_check(re);
00455 str = rb_enc_str_new(RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), rb_enc_get(re));
00456 if (OBJ_TAINTED(re)) OBJ_TAINT(str);
00457 return str;
00458 }
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472 static VALUE
00473 rb_reg_inspect(VALUE re)
00474 {
00475 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00476 return rb_any_to_s(re);
00477 }
00478 return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
00479 }
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502 static VALUE
00503 rb_reg_to_s(VALUE re)
00504 {
00505 int options, opt;
00506 const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
00507 long len;
00508 const UChar* ptr;
00509 VALUE str = rb_str_buf_new2("(?");
00510 char optbuf[5];
00511 rb_encoding *enc = rb_enc_get(re);
00512
00513 rb_reg_check(re);
00514
00515 rb_enc_copy(str, re);
00516 options = RREGEXP(re)->ptr->options;
00517 ptr = (UChar*)RREGEXP_SRC_PTR(re);
00518 len = RREGEXP_SRC_LEN(re);
00519 again:
00520 if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
00521 int err = 1;
00522 ptr += 2;
00523 if ((len -= 2) > 0) {
00524 do {
00525 opt = char_to_option((int )*ptr);
00526 if (opt != 0) {
00527 options |= opt;
00528 }
00529 else {
00530 break;
00531 }
00532 ++ptr;
00533 } while (--len > 0);
00534 }
00535 if (len > 1 && *ptr == '-') {
00536 ++ptr;
00537 --len;
00538 do {
00539 opt = char_to_option((int )*ptr);
00540 if (opt != 0) {
00541 options &= ~opt;
00542 }
00543 else {
00544 break;
00545 }
00546 ++ptr;
00547 } while (--len > 0);
00548 }
00549 if (*ptr == ')') {
00550 --len;
00551 ++ptr;
00552 goto again;
00553 }
00554 if (*ptr == ':' && ptr[len-1] == ')') {
00555 Regexp *rp;
00556
00557 ++ptr;
00558 len -= 2;
00559 err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT,
00560 enc, OnigDefaultSyntax, NULL);
00561 onig_free(rp);
00562 }
00563 if (err) {
00564 options = RREGEXP(re)->ptr->options;
00565 ptr = (UChar*)RREGEXP_SRC_PTR(re);
00566 len = RREGEXP_SRC_LEN(re);
00567 }
00568 }
00569
00570 if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
00571
00572 if ((options & embeddable) != embeddable) {
00573 optbuf[0] = '-';
00574 option_to_str(optbuf + 1, ~options);
00575 rb_str_buf_cat2(str, optbuf);
00576 }
00577
00578 rb_str_buf_cat2(str, ":");
00579 rb_reg_expr_str(str, (char*)ptr, len, enc, NULL);
00580 rb_str_buf_cat2(str, ")");
00581 rb_enc_copy(str, re);
00582
00583 OBJ_INFECT(str, re);
00584 return str;
00585 }
00586
00587 static void
00588 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
00589 {
00590 volatile VALUE desc = rb_reg_desc(s, len, re);
00591
00592 rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
00593 }
00594
00595 static VALUE
00596 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
00597 {
00598 char opts[6];
00599 VALUE desc = rb_str_buf_new2(err);
00600 rb_encoding *resenc = rb_default_internal_encoding();
00601 if (resenc == NULL) resenc = rb_default_external_encoding();
00602
00603 rb_enc_associate(desc, enc);
00604 rb_str_buf_cat2(desc, ": /");
00605 rb_reg_expr_str(desc, s, len, enc, resenc);
00606 opts[0] = '/';
00607 option_to_str(opts + 1, options);
00608 rb_str_buf_cat2(desc, opts);
00609 return rb_exc_new3(rb_eRegexpError, desc);
00610 }
00611
00612 static void
00613 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
00614 {
00615 rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
00616 }
00617
00618 static VALUE
00619 rb_reg_error_desc(VALUE str, int options, const char *err)
00620 {
00621 return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
00622 rb_enc_get(str), options, err);
00623 }
00624
00625 static void
00626 rb_reg_raise_str(VALUE str, int options, const char *err)
00627 {
00628 rb_exc_raise(rb_reg_error_desc(str, options, err));
00629 }
00630
00631
00632
00633
00634
00635
00636
00637
00638
00639
00640
00641
00642
00643 static VALUE
00644 rb_reg_casefold_p(VALUE re)
00645 {
00646 rb_reg_check(re);
00647 if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
00648 return Qfalse;
00649 }
00650
00651
00652
00653
00654
00655
00656
00657
00658
00659
00660
00661
00662
00663
00664
00665
00666
00667
00668
00669
00670
00671
00672
00673
00674
00675 static VALUE
00676 rb_reg_options_m(VALUE re)
00677 {
00678 int options = rb_reg_options(re);
00679 return INT2NUM(options);
00680 }
00681
00682 static int
00683 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
00684 int back_num, int *back_refs, OnigRegex regex, void *arg)
00685 {
00686 VALUE ary = (VALUE)arg;
00687 rb_ary_push(ary, rb_str_new((const char *)name, name_end-name));
00688 return 0;
00689 }
00690
00691
00692
00693
00694
00695
00696
00697
00698
00699
00700
00701
00702
00703
00704
00705
00706
00707 static VALUE
00708 rb_reg_names(VALUE re)
00709 {
00710 VALUE ary = rb_ary_new();
00711 rb_reg_check(re);
00712 onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
00713 return ary;
00714 }
00715
00716 static int
00717 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
00718 int back_num, int *back_refs, OnigRegex regex, void *arg)
00719 {
00720 VALUE hash = (VALUE)arg;
00721 VALUE ary = rb_ary_new2(back_num);
00722 int i;
00723
00724 for(i = 0; i < back_num; i++)
00725 rb_ary_store(ary, i, INT2NUM(back_refs[i]));
00726
00727 rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
00728
00729 return 0;
00730 }
00731
00732
00733
00734
00735
00736
00737
00738
00739
00740
00741
00742
00743
00744
00745
00746
00747
00748
00749
00750
00751
00752
00753
00754 static VALUE
00755 rb_reg_named_captures(VALUE re)
00756 {
00757 VALUE hash = rb_hash_new();
00758 rb_reg_check(re);
00759 onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
00760 return hash;
00761 }
00762
00763 static int
00764 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
00765 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
00766 OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
00767 {
00768 int r;
00769
00770 *reg = (regex_t* )xmalloc(sizeof(regex_t));
00771 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
00772
00773 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
00774 if (r) goto err;
00775
00776 r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
00777 if (r) {
00778 err:
00779 onig_free(*reg);
00780 *reg = NULL;
00781 }
00782 return r;
00783 }
00784
00785 static Regexp*
00786 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err,
00787 const char *sourcefile, int sourceline)
00788 {
00789 Regexp *rp;
00790 int r;
00791 OnigErrorInfo einfo;
00792
00793
00794
00795
00796
00797
00798
00799
00800 r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags,
00801 enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
00802 if (r) {
00803 onig_error_code_to_str((UChar*)err, r, &einfo);
00804 return 0;
00805 }
00806 return rp;
00807 }
00808
00809
00810
00811
00812
00813
00814
00815
00816
00817
00818
00819
00820
00821
00822 VALUE rb_cMatch;
00823
00824 static VALUE
00825 match_alloc(VALUE klass)
00826 {
00827 NEWOBJ(match, struct RMatch);
00828 OBJSETUP(match, klass, T_MATCH);
00829
00830 match->str = 0;
00831 match->rmatch = 0;
00832 match->regexp = 0;
00833 match->rmatch = ALLOC(struct rmatch);
00834 MEMZERO(match->rmatch, struct rmatch, 1);
00835
00836 return (VALUE)match;
00837 }
00838
/* A byte offset into a string together with the matching character offset. */
typedef struct {
    long byte_pos;
    long char_pos;
} pair_t;

/*
 * qsort()/bsearch() comparator ordering pair_t values by byte_pos.
 *
 * Returns -1, 0 or 1.  The two values are compared directly rather than
 * subtracted, so the result never depends on the relative sizes of long
 * and int and cannot be affected by overflow or truncation.
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    const long lhs = ((const pair_t *)pair1)->byte_pos;
    const long rhs = ((const pair_t *)pair2)->byte_pos;

    return (lhs > rhs) - (lhs < rhs);
}
00854
00855 static void
00856 update_char_offset(VALUE match)
00857 {
00858 struct rmatch *rm = RMATCH(match)->rmatch;
00859 struct re_registers *regs;
00860 int i, num_regs, num_pos;
00861 long c;
00862 char *s, *p, *q, *e;
00863 rb_encoding *enc;
00864 pair_t *pairs;
00865
00866 if (rm->char_offset_updated)
00867 return;
00868
00869 regs = &rm->regs;
00870 num_regs = rm->regs.num_regs;
00871
00872 if (rm->char_offset_num_allocated < num_regs) {
00873 REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
00874 rm->char_offset_num_allocated = num_regs;
00875 }
00876
00877 enc = rb_enc_get(RMATCH(match)->str);
00878 if (rb_enc_mbmaxlen(enc) == 1) {
00879 for (i = 0; i < num_regs; i++) {
00880 rm->char_offset[i].beg = BEG(i);
00881 rm->char_offset[i].end = END(i);
00882 }
00883 rm->char_offset_updated = 1;
00884 return;
00885 }
00886
00887 pairs = ALLOCA_N(pair_t, num_regs*2);
00888 num_pos = 0;
00889 for (i = 0; i < num_regs; i++) {
00890 if (BEG(i) < 0)
00891 continue;
00892 pairs[num_pos++].byte_pos = BEG(i);
00893 pairs[num_pos++].byte_pos = END(i);
00894 }
00895 qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00896
00897 s = p = RSTRING_PTR(RMATCH(match)->str);
00898 e = s + RSTRING_LEN(RMATCH(match)->str);
00899 c = 0;
00900 for (i = 0; i < num_pos; i++) {
00901 q = s + pairs[i].byte_pos;
00902 c += rb_enc_strlen(p, q, enc);
00903 pairs[i].char_pos = c;
00904 p = q;
00905 }
00906
00907 for (i = 0; i < num_regs; i++) {
00908 pair_t key, *found;
00909 if (BEG(i) < 0) {
00910 rm->char_offset[i].beg = -1;
00911 rm->char_offset[i].end = -1;
00912 continue;
00913 }
00914
00915 key.byte_pos = BEG(i);
00916 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00917 rm->char_offset[i].beg = found->char_pos;
00918
00919 key.byte_pos = END(i);
00920 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00921 rm->char_offset[i].end = found->char_pos;
00922 }
00923
00924 rm->char_offset_updated = 1;
00925 }
00926
00927 static void
00928 match_check(VALUE match)
00929 {
00930 if (!RMATCH(match)->regexp) {
00931 rb_raise(rb_eTypeError, "uninitialized Match");
00932 }
00933 }
00934
00935
00936 static VALUE
00937 match_init_copy(VALUE obj, VALUE orig)
00938 {
00939 struct rmatch *rm;
00940
00941 if (obj == orig) return obj;
00942
00943 if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
00944 rb_raise(rb_eTypeError, "wrong argument class");
00945 }
00946 RMATCH(obj)->str = RMATCH(orig)->str;
00947 RMATCH(obj)->regexp = RMATCH(orig)->regexp;
00948
00949 rm = RMATCH(obj)->rmatch;
00950 onig_region_copy(&rm->regs, RMATCH_REGS(orig));
00951
00952 if (!RMATCH(orig)->rmatch->char_offset_updated) {
00953 rm->char_offset_updated = 0;
00954 }
00955 else {
00956 if (rm->char_offset_num_allocated < rm->regs.num_regs) {
00957 REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
00958 rm->char_offset_num_allocated = rm->regs.num_regs;
00959 }
00960 MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
00961 struct rmatch_offset, rm->regs.num_regs);
00962 rm->char_offset_updated = 1;
00963 }
00964
00965 return obj;
00966 }
00967
00968
00969
00970
00971
00972
00973
00974
00975
00976
00977
00978
00979 static VALUE
00980 match_regexp(VALUE match)
00981 {
00982 match_check(match);
00983 return RMATCH(match)->regexp;
00984 }
00985
00986
00987
00988
00989
00990
00991
00992
00993
00994
00995
00996
00997
00998
00999
01000 static VALUE
01001 match_names(VALUE match)
01002 {
01003 match_check(match);
01004 return rb_reg_names(RMATCH(match)->regexp);
01005 }
01006
01007
01008
01009
01010
01011
01012
01013
01014
01015
01016
01017
01018
01019 static VALUE
01020 match_size(VALUE match)
01021 {
01022 match_check(match);
01023 return INT2FIX(RMATCH_REGS(match)->num_regs);
01024 }
01025
01026 static int
01027 match_backref_number(VALUE match, VALUE backref)
01028 {
01029 const char *name;
01030 int num;
01031
01032 struct re_registers *regs = RMATCH_REGS(match);
01033 VALUE regexp = RMATCH(match)->regexp;
01034
01035 match_check(match);
01036 switch(TYPE(backref)) {
01037 default:
01038 return NUM2INT(backref);
01039
01040 case T_SYMBOL:
01041 name = rb_id2name(SYM2ID(backref));
01042 break;
01043
01044 case T_STRING:
01045 name = StringValueCStr(backref);
01046 break;
01047 }
01048
01049 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01050 (const unsigned char*)name,
01051 (const unsigned char*)name + strlen(name),
01052 regs);
01053
01054 if (num < 1) {
01055 rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
01056 }
01057
01058 return num;
01059 }
01060
01061 int
01062 rb_reg_backref_number(VALUE match, VALUE backref)
01063 {
01064 return match_backref_number(match, backref);
01065 }
01066
01067
01068
01069
01070
01071
01072
01073
01074
01075
01076
01077
01078
01079
01080
01081
01082
01083
01084
01085 static VALUE
01086 match_offset(VALUE match, VALUE n)
01087 {
01088 int i = match_backref_number(match, n);
01089 struct re_registers *regs = RMATCH_REGS(match);
01090
01091 match_check(match);
01092 if (i < 0 || regs->num_regs <= i)
01093 rb_raise(rb_eIndexError, "index %d out of matches", i);
01094
01095 if (BEG(i) < 0)
01096 return rb_assoc_new(Qnil, Qnil);
01097
01098 update_char_offset(match);
01099 return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
01100 INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
01101 }
01102
01103
01104
01105
01106
01107
01108
01109
01110
01111
01112
01113
01114
01115
01116
01117
01118
01119
01120
01121 static VALUE
01122 match_begin(VALUE match, VALUE n)
01123 {
01124 int i = match_backref_number(match, n);
01125 struct re_registers *regs = RMATCH_REGS(match);
01126
01127 match_check(match);
01128 if (i < 0 || regs->num_regs <= i)
01129 rb_raise(rb_eIndexError, "index %d out of matches", i);
01130
01131 if (BEG(i) < 0)
01132 return Qnil;
01133
01134 update_char_offset(match);
01135 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
01136 }
01137
01138
01139
01140
01141
01142
01143
01144
01145
01146
01147
01148
01149
01150
01151
01152
01153
01154
01155
01156 static VALUE
01157 match_end(VALUE match, VALUE n)
01158 {
01159 int i = match_backref_number(match, n);
01160 struct re_registers *regs = RMATCH_REGS(match);
01161
01162 match_check(match);
01163 if (i < 0 || regs->num_regs <= i)
01164 rb_raise(rb_eIndexError, "index %d out of matches", i);
01165
01166 if (BEG(i) < 0)
01167 return Qnil;
01168
01169 update_char_offset(match);
01170 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
01171 }
01172
01173 #define MATCH_BUSY FL_USER2
01174
01175 void
01176 rb_match_busy(VALUE match)
01177 {
01178 FL_SET(match, MATCH_BUSY);
01179 }
01180
01181
01182
01183
01184
01185
01186
01187
01188
01189
01190
01191
01192
01193
01194
01195
01196
01197
01198
01199
01200
01201
01202
01203
01204
01205
01206
01207
01208
01209
01210 static VALUE
01211 rb_reg_fixed_encoding_p(VALUE re)
01212 {
01213 if (FL_TEST(re, KCODE_FIXED))
01214 return Qtrue;
01215 else
01216 return Qfalse;
01217 }
01218
01219 static VALUE
01220 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
01221 rb_encoding **fixed_enc, onig_errmsg_buffer err);
01222
01223
01224 static void
01225 reg_enc_error(VALUE re, VALUE str)
01226 {
01227 rb_raise(rb_eEncCompatError,
01228 "incompatible encoding regexp match (%s regexp with %s string)",
01229 rb_enc_name(rb_enc_get(re)),
01230 rb_enc_name(rb_enc_get(str)));
01231 }
01232
01233 static rb_encoding*
01234 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
01235 {
01236 rb_encoding *enc = 0;
01237
01238 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
01239 rb_raise(rb_eArgError,
01240 "invalid byte sequence in %s",
01241 rb_enc_name(rb_enc_get(str)));
01242 }
01243
01244 rb_reg_check(re);
01245 enc = rb_enc_get(str);
01246 if (!rb_enc_str_asciicompat_p(str)) {
01247 if (RREGEXP(re)->ptr->enc != enc) {
01248 reg_enc_error(re, str);
01249 }
01250 }
01251 else if (rb_reg_fixed_encoding_p(re)) {
01252 if (RREGEXP(re)->ptr->enc != enc &&
01253 (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
01254 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) {
01255 reg_enc_error(re, str);
01256 }
01257 enc = RREGEXP(re)->ptr->enc;
01258 }
01259 if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
01260 enc != rb_ascii8bit_encoding() &&
01261 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
01262 rb_warn("regexp match /.../n against to %s string",
01263 rb_enc_name(enc));
01264 }
01265 return enc;
01266 }
01267
01268 regex_t *
01269 rb_reg_prepare_re(VALUE re, VALUE str)
01270 {
01271 regex_t *reg = RREGEXP(re)->ptr;
01272 onig_errmsg_buffer err = "";
01273 int r;
01274 OnigErrorInfo einfo;
01275 const char *pattern;
01276 VALUE unescaped;
01277 rb_encoding *fixed_enc = 0;
01278 rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
01279
01280 if (reg->enc == enc) return reg;
01281
01282 rb_reg_check(re);
01283 reg = RREGEXP(re)->ptr;
01284 pattern = RREGEXP_SRC_PTR(re);
01285
01286 unescaped = rb_reg_preprocess(
01287 pattern, pattern + RREGEXP_SRC_LEN(re), enc,
01288 &fixed_enc, err);
01289
01290 if (unescaped == Qnil) {
01291 rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
01292 }
01293
01294 r = onig_new(®, (UChar* )RSTRING_PTR(unescaped),
01295 (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
01296 reg->options, enc,
01297 OnigDefaultSyntax, &einfo);
01298 if (r) {
01299 onig_error_code_to_str((UChar*)err, r, &einfo);
01300 rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
01301 }
01302
01303 RB_GC_GUARD(unescaped);
01304 return reg;
01305 }
01306
01307 long
01308 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
01309 {
01310 long range;
01311 rb_encoding *enc;
01312 UChar *p, *string;
01313
01314 enc = rb_reg_prepare_enc(re, str, 0);
01315
01316 if (reverse) {
01317 range = -pos;
01318 }
01319 else {
01320 range = RSTRING_LEN(str) - pos;
01321 }
01322
01323 if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
01324 string = (UChar*)RSTRING_PTR(str);
01325
01326 if (range > 0) {
01327 p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str));
01328 }
01329 else {
01330 p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str));
01331 }
01332 return p - string;
01333 }
01334
01335 return pos;
01336 }
01337
01338 long
01339 rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
01340 {
01341 long result;
01342 VALUE match;
01343 struct re_registers regi, *regs = ®i;
01344 char *range = RSTRING_PTR(str);
01345 regex_t *reg;
01346 int tmpreg;
01347
01348 if (pos > RSTRING_LEN(str) || pos < 0) {
01349 rb_backref_set(Qnil);
01350 return -1;
01351 }
01352
01353 reg = rb_reg_prepare_re(re, str);
01354 tmpreg = reg != RREGEXP(re)->ptr;
01355 if (!tmpreg) RREGEXP(re)->usecnt++;
01356
01357 match = rb_backref_get();
01358 if (!NIL_P(match)) {
01359 if (FL_TEST(match, MATCH_BUSY)) {
01360 match = Qnil;
01361 }
01362 else {
01363 regs = RMATCH_REGS(match);
01364 }
01365 }
01366 if (NIL_P(match)) {
01367 MEMZERO(regs, struct re_registers, 1);
01368 }
01369 if (!reverse) {
01370 range += RSTRING_LEN(str);
01371 }
01372 result = onig_search(reg,
01373 (UChar*)(RSTRING_PTR(str)),
01374 ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
01375 ((UChar*)(RSTRING_PTR(str)) + pos),
01376 ((UChar*)range),
01377 regs, ONIG_OPTION_NONE);
01378 if (!tmpreg) RREGEXP(re)->usecnt--;
01379 if (tmpreg) {
01380 if (RREGEXP(re)->usecnt) {
01381 onig_free(reg);
01382 }
01383 else {
01384 onig_free(RREGEXP(re)->ptr);
01385 RREGEXP(re)->ptr = reg;
01386 }
01387 }
01388 if (result < 0) {
01389 if (regs == ®i)
01390 onig_region_free(regs, 0);
01391 if (result == ONIG_MISMATCH) {
01392 rb_backref_set(Qnil);
01393 return result;
01394 }
01395 else {
01396 onig_errmsg_buffer err = "";
01397 onig_error_code_to_str((UChar*)err, (int)result);
01398 rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
01399 }
01400 }
01401
01402 if (NIL_P(match)) {
01403 match = match_alloc(rb_cMatch);
01404 onig_region_copy(RMATCH_REGS(match), regs);
01405 onig_region_free(regs, 0);
01406 }
01407 else {
01408 if (rb_safe_level() >= 3)
01409 OBJ_TAINT(match);
01410 else
01411 FL_UNSET(match, FL_TAINT);
01412 }
01413
01414 RMATCH(match)->str = rb_str_new4(str);
01415 RMATCH(match)->regexp = re;
01416 RMATCH(match)->rmatch->char_offset_updated = 0;
01417 rb_backref_set(match);
01418
01419 OBJ_INFECT(match, re);
01420 OBJ_INFECT(match, str);
01421
01422 return result;
01423 }
01424
01425 VALUE
01426 rb_reg_nth_defined(int nth, VALUE match)
01427 {
01428 struct re_registers *regs;
01429 if (NIL_P(match)) return Qnil;
01430 match_check(match);
01431 regs = RMATCH_REGS(match);
01432 if (nth >= regs->num_regs) {
01433 return Qnil;
01434 }
01435 if (nth < 0) {
01436 nth += regs->num_regs;
01437 if (nth <= 0) return Qnil;
01438 }
01439 if (BEG(nth) == -1) return Qfalse;
01440 return Qtrue;
01441 }
01442
01443 VALUE
01444 rb_reg_nth_match(int nth, VALUE match)
01445 {
01446 VALUE str;
01447 long start, end, len;
01448 struct re_registers *regs;
01449
01450 if (NIL_P(match)) return Qnil;
01451 match_check(match);
01452 regs = RMATCH_REGS(match);
01453 if (nth >= regs->num_regs) {
01454 return Qnil;
01455 }
01456 if (nth < 0) {
01457 nth += regs->num_regs;
01458 if (nth <= 0) return Qnil;
01459 }
01460 start = BEG(nth);
01461 if (start == -1) return Qnil;
01462 end = END(nth);
01463 len = end - start;
01464 str = rb_str_subseq(RMATCH(match)->str, start, len);
01465 OBJ_INFECT(str, match);
01466 return str;
01467 }
01468
01469 VALUE
01470 rb_reg_last_match(VALUE match)
01471 {
01472 return rb_reg_nth_match(0, match);
01473 }
01474
01475
01476
01477
01478
01479
01480
01481
01482
01483
01484
01485
01486
01487 VALUE
01488 rb_reg_match_pre(VALUE match)
01489 {
01490 VALUE str;
01491 struct re_registers *regs;
01492
01493 if (NIL_P(match)) return Qnil;
01494 match_check(match);
01495 regs = RMATCH_REGS(match);
01496 if (BEG(0) == -1) return Qnil;
01497 str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
01498 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01499 return str;
01500 }
01501
01502
01503
01504
01505
01506
01507
01508
01509
01510
01511
01512
01513
01514 VALUE
01515 rb_reg_match_post(VALUE match)
01516 {
01517 VALUE str;
01518 long pos;
01519 struct re_registers *regs;
01520
01521 if (NIL_P(match)) return Qnil;
01522 match_check(match);
01523 regs = RMATCH_REGS(match);
01524 if (BEG(0) == -1) return Qnil;
01525 str = RMATCH(match)->str;
01526 pos = END(0);
01527 str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
01528 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01529 return str;
01530 }
01531
01532 VALUE
01533 rb_reg_match_last(VALUE match)
01534 {
01535 int i;
01536 struct re_registers *regs;
01537
01538 if (NIL_P(match)) return Qnil;
01539 match_check(match);
01540 regs = RMATCH_REGS(match);
01541 if (BEG(0) == -1) return Qnil;
01542
01543 for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
01544 ;
01545 if (i == 0) return Qnil;
01546 return rb_reg_nth_match(i, match);
01547 }
01548
01549 static VALUE
01550 last_match_getter(void)
01551 {
01552 return rb_reg_last_match(rb_backref_get());
01553 }
01554
01555 static VALUE
01556 prematch_getter(void)
01557 {
01558 return rb_reg_match_pre(rb_backref_get());
01559 }
01560
01561 static VALUE
01562 postmatch_getter(void)
01563 {
01564 return rb_reg_match_post(rb_backref_get());
01565 }
01566
01567 static VALUE
01568 last_paren_match_getter(void)
01569 {
01570 return rb_reg_match_last(rb_backref_get());
01571 }
01572
01573 static VALUE
01574 match_array(VALUE match, int start)
01575 {
01576 struct re_registers *regs;
01577 VALUE ary;
01578 VALUE target;
01579 int i;
01580 int taint = OBJ_TAINTED(match);
01581
01582 match_check(match);
01583 regs = RMATCH_REGS(match);
01584 ary = rb_ary_new2(regs->num_regs);
01585 target = RMATCH(match)->str;
01586
01587 for (i=start; i<regs->num_regs; i++) {
01588 if (regs->beg[i] == -1) {
01589 rb_ary_push(ary, Qnil);
01590 }
01591 else {
01592 VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01593 if (taint) OBJ_TAINT(str);
01594 rb_ary_push(ary, str);
01595 }
01596 }
01597 return ary;
01598 }
01599
01600
01601
01602
01603
01604
01605
01606
01607
01608
01609
01610
01611
01612
01613
01614
01615
01616
01617
01618
01619
01620
01621
01622
01623
01624
01625
01626
01627 static VALUE
01628 match_to_a(VALUE match)
01629 {
01630 return match_array(match, 0);
01631 }
01632
01633
01634
01635
01636
01637
01638
01639
01640
01641
01642
01643
01644
01645
01646 static VALUE
01647 match_captures(VALUE match)
01648 {
01649 return match_array(match, 1);
01650 }
01651
01652 static int
01653 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
01654 {
01655 int num;
01656
01657 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01658 (const unsigned char* )name, (const unsigned char* )name_end, regs);
01659 if (num >= 1) {
01660 return num;
01661 }
01662 else {
01663 VALUE s = rb_str_new(name, (long )(name_end - name));
01664 rb_raise(rb_eIndexError, "undefined group name reference: %s",
01665 StringValuePtr(s));
01666 }
01667 }
01668
01669
01670
01671
01672
01673
01674
01675
01676
01677
01678
01679
01680
01681
01682
01683
01684
01685
01686
01687
01688
01689
01690
01691
01692
01693
01694
01695 static VALUE
01696 match_aref(int argc, VALUE *argv, VALUE match)
01697 {
01698 VALUE idx, rest;
01699
01700 match_check(match);
01701 rb_scan_args(argc, argv, "11", &idx, &rest);
01702
01703 if (NIL_P(rest)) {
01704 if (FIXNUM_P(idx)) {
01705 if (FIX2INT(idx) >= 0) {
01706 return rb_reg_nth_match(FIX2INT(idx), match);
01707 }
01708 }
01709 else {
01710 const char *p;
01711 int num;
01712
01713 switch (TYPE(idx)) {
01714 case T_SYMBOL:
01715 p = rb_id2name(SYM2ID(idx));
01716 goto name_to_backref;
01717 break;
01718 case T_STRING:
01719 p = StringValuePtr(idx);
01720
01721 name_to_backref:
01722 num = name_to_backref_number(RMATCH_REGS(match),
01723 RMATCH(match)->regexp, p, p + strlen(p));
01724 return rb_reg_nth_match(num, match);
01725 break;
01726
01727 default:
01728 break;
01729 }
01730 }
01731 }
01732
01733 return rb_ary_aref(argc, argv, match_to_a(match));
01734 }
01735
01736 static VALUE
01737 match_entry(VALUE match, long n)
01738 {
01739
01740 return rb_reg_nth_match((int)n, match);
01741 }
01742
01743
01744
01745
01746
01747
01748
01749
01750
01751
01752
01753
01754
01755
01756
01757 static VALUE
01758 match_values_at(int argc, VALUE *argv, VALUE match)
01759 {
01760 struct re_registers *regs;
01761
01762 match_check(match);
01763 regs = RMATCH_REGS(match);
01764 return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
01765 }
01766
01767
01768
01769
01770
01771
01772
01773
01774
01775
01776
01777
01778 static VALUE
01779 match_to_s(VALUE match)
01780 {
01781 VALUE str = rb_reg_last_match(match);
01782
01783 match_check(match);
01784 if (NIL_P(str)) str = rb_str_new(0,0);
01785 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01786 if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
01787 return str;
01788 }
01789
01790
01791
01792
01793
01794
01795
01796
01797
01798
01799
01800
01801 static VALUE
01802 match_string(VALUE match)
01803 {
01804 match_check(match);
01805 return RMATCH(match)->str;
01806 }
01807
01808 struct backref_name_tag {
01809 const UChar *name;
01810 long len;
01811 };
01812
01813 static int
01814 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
01815 int back_num, int *back_refs, OnigRegex regex, void *arg0)
01816 {
01817 struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
01818 int i;
01819
01820 for (i = 0; i < back_num; i++) {
01821 arg[back_refs[i]].name = name;
01822 arg[back_refs[i]].len = name_end - name;
01823 }
01824 return 0;
01825 }
01826
01827
01828
01829
01830
01831
01832
01833
01834
01835
01836
01837
01838
01839
01840
01841
01842
01843
01844
01845
01846
01847 static VALUE
01848 match_inspect(VALUE match)
01849 {
01850 const char *cname = rb_obj_classname(match);
01851 VALUE str;
01852 int i;
01853 struct re_registers *regs = RMATCH_REGS(match);
01854 int num_regs = regs->num_regs;
01855 struct backref_name_tag *names;
01856 VALUE regexp = RMATCH(match)->regexp;
01857
01858 if (regexp == 0) {
01859 return rb_sprintf("#<%s:%p>", cname, (void*)match);
01860 }
01861
01862 names = ALLOCA_N(struct backref_name_tag, num_regs);
01863 MEMZERO(names, struct backref_name_tag, num_regs);
01864
01865 onig_foreach_name(RREGEXP(regexp)->ptr,
01866 match_inspect_name_iter, names);
01867
01868 str = rb_str_buf_new2("#<");
01869 rb_str_buf_cat2(str, cname);
01870
01871 for (i = 0; i < num_regs; i++) {
01872 VALUE v;
01873 rb_str_buf_cat2(str, " ");
01874 if (0 < i) {
01875 if (names[i].name)
01876 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
01877 else {
01878 rb_str_catf(str, "%d", i);
01879 }
01880 rb_str_buf_cat2(str, ":");
01881 }
01882 v = rb_reg_nth_match(i, match);
01883 if (v == Qnil)
01884 rb_str_buf_cat2(str, "nil");
01885 else
01886 rb_str_buf_append(str, rb_str_inspect(v));
01887 }
01888 rb_str_buf_cat2(str, ">");
01889
01890 return str;
01891 }
01892
01893 VALUE rb_cRegexp;
01894
01895 static int
01896 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
01897 {
01898 const char *p = *pp;
01899 int code;
01900 int meta_prefix = 0, ctrl_prefix = 0;
01901 size_t len;
01902 int retbyte;
01903
01904 retbyte = -1;
01905 if (p == end || *p++ != '\\') {
01906 errcpy(err, "too short escaped multibyte character");
01907 return -1;
01908 }
01909
01910 again:
01911 if (p == end) {
01912 errcpy(err, "too short escape sequence");
01913 return -1;
01914 }
01915 switch (*p++) {
01916 case '\\': code = '\\'; break;
01917 case 'n': code = '\n'; break;
01918 case 't': code = '\t'; break;
01919 case 'r': code = '\r'; break;
01920 case 'f': code = '\f'; break;
01921 case 'v': code = '\013'; break;
01922 case 'a': code = '\007'; break;
01923 case 'e': code = '\033'; break;
01924
01925
01926 case '0': case '1': case '2': case '3':
01927 case '4': case '5': case '6': case '7':
01928 p--;
01929 code = scan_oct(p, end < p+3 ? end-p : 3, &len);
01930 p += len;
01931 break;
01932
01933 case 'x':
01934 code = scan_hex(p, end < p+2 ? end-p : 2, &len);
01935 if (len < 1) {
01936 errcpy(err, "invalid hex escape");
01937 return -1;
01938 }
01939 p += len;
01940 break;
01941
01942 case 'M':
01943 if (meta_prefix) {
01944 errcpy(err, "duplicate meta escape");
01945 return -1;
01946 }
01947 meta_prefix = 1;
01948 if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
01949 if (*p == '\\') {
01950 p++;
01951 goto again;
01952 }
01953 else {
01954 code = *p++;
01955 break;
01956 }
01957 }
01958 errcpy(err, "too short meta escape");
01959 return -1;
01960
01961 case 'C':
01962 if (p == end || *p++ != '-') {
01963 errcpy(err, "too short control escape");
01964 return -1;
01965 }
01966 case 'c':
01967 if (ctrl_prefix) {
01968 errcpy(err, "duplicate control escape");
01969 return -1;
01970 }
01971 ctrl_prefix = 1;
01972 if (p < end && (*p & 0x80) == 0) {
01973 if (*p == '\\') {
01974 p++;
01975 goto again;
01976 }
01977 else {
01978 code = *p++;
01979 break;
01980 }
01981 }
01982 errcpy(err, "too short control escape");
01983 return -1;
01984
01985 default:
01986 errcpy(err, "unexpected escape sequence");
01987 return -1;
01988 }
01989 if (code < 0 || 0xff < code) {
01990 errcpy(err, "invalid escape code");
01991 return -1;
01992 }
01993
01994 if (ctrl_prefix)
01995 code &= 0x1f;
01996 if (meta_prefix)
01997 code |= 0x80;
01998
01999 *pp = p;
02000 return code;
02001 }
02002
02003 static int
02004 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
02005 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02006 {
02007 const char *p = *pp;
02008 int chmaxlen = rb_enc_mbmaxlen(enc);
02009 char *chbuf = ALLOCA_N(char, chmaxlen);
02010 int chlen = 0;
02011 int byte;
02012 int l;
02013
02014 memset(chbuf, 0, chmaxlen);
02015
02016 byte = read_escaped_byte(&p, end, err);
02017 if (byte == -1) {
02018 return -1;
02019 }
02020
02021 chbuf[chlen++] = byte;
02022 while (chlen < chmaxlen &&
02023 MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
02024 byte = read_escaped_byte(&p, end, err);
02025 if (byte == -1) {
02026 return -1;
02027 }
02028 chbuf[chlen++] = byte;
02029 }
02030
02031 l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
02032 if (MBCLEN_INVALID_P(l)) {
02033 errcpy(err, "invalid multibyte escape");
02034 return -1;
02035 }
02036 if (1 < chlen || (chbuf[0] & 0x80)) {
02037 rb_str_buf_cat(buf, chbuf, chlen);
02038
02039 if (*encp == 0)
02040 *encp = enc;
02041 else if (*encp != enc) {
02042 errcpy(err, "escaped non ASCII character in UTF-8 regexp");
02043 return -1;
02044 }
02045 }
02046 else {
02047 char escbuf[5];
02048 snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
02049 rb_str_buf_cat(buf, escbuf, 4);
02050 }
02051 *pp = p;
02052 return 0;
02053 }
02054
02055 static int
02056 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
02057 {
02058 if ((0xd800 <= code && code <= 0xdfff) ||
02059 0x10ffff < code) {
02060 errcpy(err, "invalid Unicode range");
02061 return -1;
02062 }
02063 return 0;
02064 }
02065
02066 static int
02067 append_utf8(unsigned long uv,
02068 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02069 {
02070 if (check_unicode_range(uv, err) != 0)
02071 return -1;
02072 if (uv < 0x80) {
02073 char escbuf[5];
02074 snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
02075 rb_str_buf_cat(buf, escbuf, 4);
02076 }
02077 else {
02078 int len;
02079 char utf8buf[6];
02080 len = rb_uv_to_utf8(utf8buf, uv);
02081 rb_str_buf_cat(buf, utf8buf, len);
02082
02083 if (*encp == 0)
02084 *encp = rb_utf8_encoding();
02085 else if (*encp != rb_utf8_encoding()) {
02086 errcpy(err, "UTF-8 character in non UTF-8 regexp");
02087 return -1;
02088 }
02089 }
02090 return 0;
02091 }
02092
02093 static int
02094 unescape_unicode_list(const char **pp, const char *end,
02095 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02096 {
02097 const char *p = *pp;
02098 int has_unicode = 0;
02099 unsigned long code;
02100 size_t len;
02101
02102 while (p < end && ISSPACE(*p)) p++;
02103
02104 while (1) {
02105 code = ruby_scan_hex(p, end-p, &len);
02106 if (len == 0)
02107 break;
02108 if (6 < len) {
02109 errcpy(err, "invalid Unicode range");
02110 return -1;
02111 }
02112 p += len;
02113 if (append_utf8(code, buf, encp, err) != 0)
02114 return -1;
02115 has_unicode = 1;
02116
02117 while (p < end && ISSPACE(*p)) p++;
02118 }
02119
02120 if (has_unicode == 0) {
02121 errcpy(err, "invalid Unicode list");
02122 return -1;
02123 }
02124
02125 *pp = p;
02126
02127 return 0;
02128 }
02129
02130 static int
02131 unescape_unicode_bmp(const char **pp, const char *end,
02132 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02133 {
02134 const char *p = *pp;
02135 size_t len;
02136 unsigned long code;
02137
02138 if (end < p+4) {
02139 errcpy(err, "invalid Unicode escape");
02140 return -1;
02141 }
02142 code = ruby_scan_hex(p, 4, &len);
02143 if (len != 4) {
02144 errcpy(err, "invalid Unicode escape");
02145 return -1;
02146 }
02147 if (append_utf8(code, buf, encp, err) != 0)
02148 return -1;
02149 *pp = p + 4;
02150 return 0;
02151 }
02152
02153 static int
02154 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
02155 VALUE buf, rb_encoding **encp, int *has_property,
02156 onig_errmsg_buffer err)
02157 {
02158 char c;
02159 char smallbuf[2];
02160
02161 while (p < end) {
02162 int chlen = rb_enc_precise_mbclen(p, end, enc);
02163 if (!MBCLEN_CHARFOUND_P(chlen)) {
02164 errcpy(err, "invalid multibyte character");
02165 return -1;
02166 }
02167 chlen = MBCLEN_CHARFOUND_LEN(chlen);
02168 if (1 < chlen || (*p & 0x80)) {
02169 rb_str_buf_cat(buf, p, chlen);
02170 p += chlen;
02171 if (*encp == 0)
02172 *encp = enc;
02173 else if (*encp != enc) {
02174 errcpy(err, "non ASCII character in UTF-8 regexp");
02175 return -1;
02176 }
02177 continue;
02178 }
02179
02180 switch (c = *p++) {
02181 case '\\':
02182 if (p == end) {
02183 errcpy(err, "too short escape sequence");
02184 return -1;
02185 }
02186 switch (c = *p++) {
02187 case '1': case '2': case '3':
02188 case '4': case '5': case '6': case '7':
02189 {
02190 size_t octlen;
02191 if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
02192
02193
02194
02195 goto escape_asis;
02196 }
02197 }
02198
02199
02200 case '0':
02201
02202 case 'x':
02203 case 'c':
02204 case 'C':
02205 case 'M':
02206 p = p-2;
02207 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
02208 return -1;
02209 break;
02210
02211 case 'u':
02212 if (p == end) {
02213 errcpy(err, "too short escape sequence");
02214 return -1;
02215 }
02216 if (*p == '{') {
02217
02218 p++;
02219 if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
02220 return -1;
02221 if (p == end || *p++ != '}') {
02222 errcpy(err, "invalid Unicode list");
02223 return -1;
02224 }
02225 break;
02226 }
02227 else {
02228
02229 if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
02230 return -1;
02231 break;
02232 }
02233
02234 case 'p':
02235 case 'P':
02236 if (!*encp) {
02237 *has_property = 1;
02238 }
02239 goto escape_asis;
02240
02241 default:
02242 escape_asis:
02243 smallbuf[0] = '\\';
02244 smallbuf[1] = c;
02245 rb_str_buf_cat(buf, smallbuf, 2);
02246 break;
02247 }
02248 break;
02249
02250 default:
02251 rb_str_buf_cat(buf, &c, 1);
02252 break;
02253 }
02254 }
02255
02256 return 0;
02257 }
02258
02259 static VALUE
02260 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
02261 rb_encoding **fixed_enc, onig_errmsg_buffer err)
02262 {
02263 VALUE buf;
02264 int has_property = 0;
02265
02266 buf = rb_str_buf_new(0);
02267
02268 if (rb_enc_asciicompat(enc))
02269 *fixed_enc = 0;
02270 else {
02271 *fixed_enc = enc;
02272 rb_enc_associate(buf, enc);
02273 }
02274
02275 if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
02276 return Qnil;
02277
02278 if (has_property && !*fixed_enc) {
02279 *fixed_enc = enc;
02280 }
02281
02282 if (*fixed_enc) {
02283 rb_enc_associate(buf, *fixed_enc);
02284 }
02285
02286 return buf;
02287 }
02288
02289 VALUE
02290 rb_reg_check_preprocess(VALUE str)
02291 {
02292 rb_encoding *fixed_enc = 0;
02293 onig_errmsg_buffer err = "";
02294 VALUE buf;
02295 char *p, *end;
02296 rb_encoding *enc;
02297
02298 StringValue(str);
02299 p = RSTRING_PTR(str);
02300 end = p + RSTRING_LEN(str);
02301 enc = rb_enc_get(str);
02302
02303 buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
02304 RB_GC_GUARD(str);
02305
02306 if (buf == Qnil) {
02307 return rb_reg_error_desc(str, 0, err);
02308 }
02309 return Qnil;
02310 }
02311
02312 static VALUE
02313 rb_reg_preprocess_dregexp(VALUE ary, int options)
02314 {
02315 rb_encoding *fixed_enc = 0;
02316 rb_encoding *regexp_enc = 0;
02317 onig_errmsg_buffer err = "";
02318 int i;
02319 VALUE result = 0;
02320 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02321
02322 if (RARRAY_LEN(ary) == 0) {
02323 rb_raise(rb_eArgError, "no arguments given");
02324 }
02325
02326 for (i = 0; i < RARRAY_LEN(ary); i++) {
02327 VALUE str = RARRAY_PTR(ary)[i];
02328 VALUE buf;
02329 char *p, *end;
02330 rb_encoding *src_enc;
02331
02332 src_enc = rb_enc_get(str);
02333 if (options & ARG_ENCODING_NONE &&
02334 src_enc != ascii8bit) {
02335 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)
02336 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02337 else
02338 src_enc = ascii8bit;
02339 }
02340
02341 StringValue(str);
02342 p = RSTRING_PTR(str);
02343 end = p + RSTRING_LEN(str);
02344
02345 buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
02346
02347 if (buf == Qnil)
02348 rb_raise(rb_eArgError, "%s", err);
02349
02350 if (fixed_enc != 0) {
02351 if (regexp_enc != 0 && regexp_enc != fixed_enc) {
02352 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s",
02353 rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
02354 }
02355 regexp_enc = fixed_enc;
02356 }
02357
02358 if (!result)
02359 result = rb_str_new3(str);
02360 else
02361 rb_str_buf_append(result, str);
02362 }
02363 if (regexp_enc) {
02364 rb_enc_associate(result, regexp_enc);
02365 }
02366
02367 return result;
02368 }
02369
02370 static int
02371 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
02372 int options, onig_errmsg_buffer err,
02373 const char *sourcefile, int sourceline)
02374 {
02375 struct RRegexp *re = RREGEXP(obj);
02376 VALUE unescaped;
02377 rb_encoding *fixed_enc = 0;
02378 rb_encoding *a_enc = rb_ascii8bit_encoding();
02379
02380 if (!OBJ_UNTRUSTED(obj) && rb_safe_level() >= 4)
02381 rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
02382 rb_check_frozen(obj);
02383 if (FL_TEST(obj, REG_LITERAL))
02384 rb_raise(rb_eSecurityError, "can't modify literal regexp");
02385 if (re->ptr)
02386 rb_raise(rb_eTypeError, "already initialized regexp");
02387 re->ptr = 0;
02388
02389 if (rb_enc_dummy_p(enc)) {
02390 errcpy(err, "can't make regexp with dummy encoding");
02391 return -1;
02392 }
02393
02394 unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
02395 if (unescaped == Qnil)
02396 return -1;
02397
02398 if (fixed_enc) {
02399 if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
02400 (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
02401 errcpy(err, "incompatible character encoding");
02402 return -1;
02403 }
02404 if (fixed_enc != a_enc) {
02405 options |= ARG_ENCODING_FIXED;
02406 enc = fixed_enc;
02407 }
02408 }
02409 else if (!(options & ARG_ENCODING_FIXED)) {
02410 enc = rb_usascii_encoding();
02411 }
02412
02413 rb_enc_associate((VALUE)re, enc);
02414 if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
02415 re->basic.flags |= KCODE_FIXED;
02416 }
02417 if (options & ARG_ENCODING_NONE) {
02418 re->basic.flags |= REG_ENCODING_NONE;
02419 }
02420
02421 re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
02422 options & ARG_REG_OPTION_MASK, err,
02423 sourcefile, sourceline);
02424 if (!re->ptr) return -1;
02425 re->src = rb_enc_str_new(s, len, enc);
02426 OBJ_FREEZE(re->src);
02427 RB_GC_GUARD(unescaped);
02428 return 0;
02429 }
02430
02431 static int
02432 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err,
02433 const char *sourcefile, int sourceline)
02434 {
02435 int ret;
02436 rb_encoding *enc = rb_enc_get(str);
02437 if (options & ARG_ENCODING_NONE) {
02438 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02439 if (enc != ascii8bit) {
02440 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
02441 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02442 return -1;
02443 }
02444 enc = ascii8bit;
02445 }
02446 }
02447 ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
02448 options, err, sourcefile, sourceline);
02449 OBJ_INFECT(obj, str);
02450 RB_GC_GUARD(str);
02451 return ret;
02452 }
02453
02454 static VALUE
02455 rb_reg_s_alloc(VALUE klass)
02456 {
02457 NEWOBJ(re, struct RRegexp);
02458 OBJSETUP(re, klass, T_REGEXP);
02459
02460 re->ptr = 0;
02461 re->src = 0;
02462 re->usecnt = 0;
02463
02464 return (VALUE)re;
02465 }
02466
02467 VALUE
02468 rb_reg_alloc(void)
02469 {
02470 return rb_reg_s_alloc(rb_cRegexp);
02471 }
02472
02473 VALUE
02474 rb_reg_new_str(VALUE s, int options)
02475 {
02476 return rb_reg_init_str(rb_reg_alloc(), s, options);
02477 }
02478
02479 VALUE
02480 rb_reg_init_str(VALUE re, VALUE s, int options)
02481 {
02482 onig_errmsg_buffer err = "";
02483
02484 if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
02485 rb_reg_raise_str(s, options, err);
02486 }
02487
02488 return re;
02489 }
02490
02491 VALUE
02492 rb_reg_new_ary(VALUE ary, int opt)
02493 {
02494 return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt);
02495 }
02496
02497 VALUE
02498 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
02499 {
02500 VALUE re = rb_reg_alloc();
02501 onig_errmsg_buffer err = "";
02502
02503 if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) {
02504 rb_enc_reg_raise(s, len, enc, options, err);
02505 }
02506
02507 return re;
02508 }
02509
02510 VALUE
02511 rb_reg_new(const char *s, long len, int options)
02512 {
02513 return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
02514 }
02515
02516 VALUE
02517 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
02518 {
02519 VALUE re = rb_reg_alloc();
02520 onig_errmsg_buffer err = "";
02521
02522 if (!str) str = rb_str_new(0,0);
02523 if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
02524 rb_set_errinfo(rb_reg_error_desc(str, options, err));
02525 return Qnil;
02526 }
02527 FL_SET(re, REG_LITERAL);
02528 return re;
02529 }
02530
02531 static VALUE reg_cache;
02532
02533 VALUE
02534 rb_reg_regcomp(VALUE str)
02535 {
02536 volatile VALUE save_str = str;
02537 if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str)
02538 && ENCODING_GET(reg_cache) == ENCODING_GET(str)
02539 && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
02540 return reg_cache;
02541
02542 return reg_cache = rb_reg_new_str(save_str, 0);
02543 }
02544
02545 static st_index_t reg_hash(VALUE re);
02546
02547
02548
02549
02550
02551
02552
02553 static VALUE
02554 rb_reg_hash(VALUE re)
02555 {
02556 st_index_t hashval = reg_hash(re);
02557 return LONG2FIX(hashval);
02558 }
02559
02560 static st_index_t
02561 reg_hash(VALUE re)
02562 {
02563 st_index_t hashval;
02564
02565 rb_reg_check(re);
02566 hashval = RREGEXP(re)->ptr->options;
02567 hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re)));
02568 return rb_hash_end(hashval);
02569 }
02570
02571
02572
02573
02574
02575
02576
02577
02578
02579
02580
02581
02582
02583
02584
02585
02586
02587 static VALUE
02588 rb_reg_equal(VALUE re1, VALUE re2)
02589 {
02590 if (re1 == re2) return Qtrue;
02591 if (TYPE(re2) != T_REGEXP) return Qfalse;
02592 rb_reg_check(re1); rb_reg_check(re2);
02593 if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
02594 if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
02595 if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse;
02596 if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
02597 if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) {
02598 return Qtrue;
02599 }
02600 return Qfalse;
02601 }
02602
02603
02604
02605
02606
02607
02608
02609
02610
02611 static VALUE
02612 match_hash(VALUE match)
02613 {
02614 const struct re_registers *regs;
02615 st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str));
02616
02617 rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp));
02618 regs = RMATCH_REGS(match);
02619 hashval = rb_hash_uint(hashval, regs->num_regs);
02620 hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg)));
02621 hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end)));
02622 hashval = rb_hash_end(hashval);
02623 return LONG2FIX(hashval);
02624 }
02625
02626
02627
02628
02629
02630
02631
02632
02633
02634 static VALUE
02635 match_equal(VALUE match1, VALUE match2)
02636 {
02637 const struct re_registers *regs1, *regs2;
02638 if (match1 == match2) return Qtrue;
02639 if (TYPE(match2) != T_MATCH) return Qfalse;
02640 if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse;
02641 if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse;
02642 regs1 = RMATCH_REGS(match1);
02643 regs2 = RMATCH_REGS(match2);
02644 if (regs1->num_regs != regs2->num_regs) return Qfalse;
02645 if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse;
02646 if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse;
02647 return Qtrue;
02648 }
02649
02650 static VALUE
02651 reg_operand(VALUE s, int check)
02652 {
02653 if (SYMBOL_P(s)) {
02654 return rb_sym_to_s(s);
02655 }
02656 else {
02657 VALUE tmp = rb_check_string_type(s);
02658 if (check && NIL_P(tmp)) {
02659 rb_raise(rb_eTypeError, "can't convert %s to String",
02660 rb_obj_classname(s));
02661 }
02662 return tmp;
02663 }
02664 }
02665
02666 static long
02667 reg_match_pos(VALUE re, VALUE *strp, long pos)
02668 {
02669 VALUE str = *strp;
02670
02671 if (NIL_P(str)) {
02672 rb_backref_set(Qnil);
02673 return -1;
02674 }
02675 *strp = str = reg_operand(str, TRUE);
02676 if (pos != 0) {
02677 if (pos < 0) {
02678 VALUE l = rb_str_length(str);
02679 pos += NUM2INT(l);
02680 if (pos < 0) {
02681 return pos;
02682 }
02683 }
02684 pos = rb_str_offset(str, pos);
02685 }
02686 return rb_reg_search(re, str, pos, 0);
02687 }
02688
02689
02690
02691
02692
02693
02694
02695
02696
02697
02698
02699
02700
02701
02702
02703
02704
02705
02706
02707
02708
02709
02710
02711
02712
02713
02714
02715
02716
02717
02718
02719
02720
02721
02722
02723
02724
02725
02726
02727
02728
02729
02730
02731
02732
02733
02734
02735
02736
02737 VALUE
02738 rb_reg_match(VALUE re, VALUE str)
02739 {
02740 long pos = reg_match_pos(re, &str, 0);
02741 if (pos < 0) return Qnil;
02742 pos = rb_str_sublen(str, pos);
02743 return LONG2FIX(pos);
02744 }
02745
02746
02747
02748
02749
02750
02751
02752
02753
02754
02755
02756
02757
02758
02759
02760
02761
02762
02763
02764 VALUE
02765 rb_reg_eqq(VALUE re, VALUE str)
02766 {
02767 long start;
02768
02769 str = reg_operand(str, FALSE);
02770 if (NIL_P(str)) {
02771 rb_backref_set(Qnil);
02772 return Qfalse;
02773 }
02774 start = rb_reg_search(re, str, 0, 0);
02775 if (start < 0) {
02776 return Qfalse;
02777 }
02778 return Qtrue;
02779 }
02780
02781
02782
02783
02784
02785
02786
02787
02788
02789
02790
02791
02792
02793 VALUE
02794 rb_reg_match2(VALUE re)
02795 {
02796 long start;
02797 VALUE line = rb_lastline_get();
02798
02799 if (TYPE(line) != T_STRING) {
02800 rb_backref_set(Qnil);
02801 return Qnil;
02802 }
02803
02804 start = rb_reg_search(re, line, 0, 0);
02805 if (start < 0) {
02806 return Qnil;
02807 }
02808 start = rb_str_sublen(line, start);
02809 return LONG2FIX(start);
02810 }
02811
02812
02813
02814
02815
02816
02817
02818
02819
02820
02821
02822
02823
02824
02825
02826
02827
02828
02829
02830
02831
02832
02833
02834
02835
02836
02837
02838
02839
02840
02841 static VALUE
02842 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
02843 {
02844 VALUE result, str, initpos;
02845 long pos;
02846
02847 if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
02848 pos = NUM2LONG(initpos);
02849 }
02850 else {
02851 pos = 0;
02852 }
02853
02854 pos = reg_match_pos(re, &str, pos);
02855 if (pos < 0) {
02856 rb_backref_set(Qnil);
02857 return Qnil;
02858 }
02859 result = rb_backref_get();
02860 rb_match_busy(result);
02861 if (!NIL_P(result) && rb_block_given_p()) {
02862 return rb_yield(result);
02863 }
02864 return result;
02865 }
02866
02867
02868
02869
02870
02871
02872
02873
02874
02875
02876
02877
02878
02879
02880
02881
02882
02883
02884
02885
02886
02887
02888
02889
02890
02891
02892
02893
02894
02895
02896
02897 static VALUE
02898 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
02899 {
02900 onig_errmsg_buffer err = "";
02901 int flags = 0;
02902 VALUE str;
02903 rb_encoding *enc;
02904 const char *ptr;
02905 long len;
02906
02907 if (argc == 0 || argc > 3) {
02908 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..3)", argc);
02909 }
02910 if (TYPE(argv[0]) == T_REGEXP) {
02911 VALUE re = argv[0];
02912
02913 if (argc > 1) {
02914 rb_warn("flags ignored");
02915 }
02916 rb_reg_check(re);
02917 flags = rb_reg_options(re);
02918 ptr = RREGEXP_SRC_PTR(re);
02919 len = RREGEXP_SRC_LEN(re);
02920 enc = rb_enc_get(re);
02921 if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) {
02922 str = rb_enc_str_new(ptr, len, enc);
02923 rb_reg_raise_str(str, flags, err);
02924 }
02925 }
02926 else {
02927 if (argc >= 2) {
02928 if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
02929 else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
02930 }
02931 enc = 0;
02932 if (argc == 3 && !NIL_P(argv[2])) {
02933 char *kcode = StringValuePtr(argv[2]);
02934 if (kcode[0] == 'n' || kcode[0] == 'N') {
02935 enc = rb_ascii8bit_encoding();
02936 flags |= ARG_ENCODING_NONE;
02937 }
02938 else {
02939 rb_warn("encoding option is ignored - %s", kcode);
02940 }
02941 }
02942 str = argv[0];
02943 ptr = StringValuePtr(str);
02944 if (enc
02945 ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0)
02946 : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) {
02947 rb_reg_raise_str(str, flags, err);
02948 }
02949 }
02950 return self;
02951 }
02952
02953 VALUE
02954 rb_reg_quote(VALUE str)
02955 {
02956 rb_encoding *enc = rb_enc_get(str);
02957 char *s, *send, *t;
02958 VALUE tmp;
02959 int c, clen;
02960 int ascii_only = rb_enc_str_asciionly_p(str);
02961
02962 s = RSTRING_PTR(str);
02963 send = s + RSTRING_LEN(str);
02964 while (s < send) {
02965 c = rb_enc_ascget(s, send, &clen, enc);
02966 if (c == -1) {
02967 s += mbclen(s, send, enc);
02968 continue;
02969 }
02970 switch (c) {
02971 case '[': case ']': case '{': case '}':
02972 case '(': case ')': case '|': case '-':
02973 case '*': case '.': case '\\':
02974 case '?': case '+': case '^': case '$':
02975 case ' ': case '#':
02976 case '\t': case '\f': case '\v': case '\n': case '\r':
02977 goto meta_found;
02978 }
02979 s += clen;
02980 }
02981 tmp = rb_str_new3(str);
02982 if (ascii_only) {
02983 rb_enc_associate(tmp, rb_usascii_encoding());
02984 }
02985 return tmp;
02986
02987 meta_found:
02988 tmp = rb_str_new(0, RSTRING_LEN(str)*2);
02989 if (ascii_only) {
02990 rb_enc_associate(tmp, rb_usascii_encoding());
02991 }
02992 else {
02993 rb_enc_copy(tmp, str);
02994 }
02995 t = RSTRING_PTR(tmp);
02996
02997 memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
02998 t += s - RSTRING_PTR(str);
02999
03000 while (s < send) {
03001 c = rb_enc_ascget(s, send, &clen, enc);
03002 if (c == -1) {
03003 int n = mbclen(s, send, enc);
03004
03005 while (n--)
03006 *t++ = *s++;
03007 continue;
03008 }
03009 s += clen;
03010 switch (c) {
03011 case '[': case ']': case '{': case '}':
03012 case '(': case ')': case '|': case '-':
03013 case '*': case '.': case '\\':
03014 case '?': case '+': case '^': case '$':
03015 case '#':
03016 t += rb_enc_mbcput('\\', t, enc);
03017 break;
03018 case ' ':
03019 t += rb_enc_mbcput('\\', t, enc);
03020 t += rb_enc_mbcput(' ', t, enc);
03021 continue;
03022 case '\t':
03023 t += rb_enc_mbcput('\\', t, enc);
03024 t += rb_enc_mbcput('t', t, enc);
03025 continue;
03026 case '\n':
03027 t += rb_enc_mbcput('\\', t, enc);
03028 t += rb_enc_mbcput('n', t, enc);
03029 continue;
03030 case '\r':
03031 t += rb_enc_mbcput('\\', t, enc);
03032 t += rb_enc_mbcput('r', t, enc);
03033 continue;
03034 case '\f':
03035 t += rb_enc_mbcput('\\', t, enc);
03036 t += rb_enc_mbcput('f', t, enc);
03037 continue;
03038 case '\v':
03039 t += rb_enc_mbcput('\\', t, enc);
03040 t += rb_enc_mbcput('v', t, enc);
03041 continue;
03042 }
03043 t += rb_enc_mbcput(c, t, enc);
03044 }
03045 rb_str_resize(tmp, t - RSTRING_PTR(tmp));
03046 OBJ_INFECT(tmp, str);
03047 return tmp;
03048 }
03049
03050
03051
03052
03053
03054
03055
03056
03057
03058
03059
03060
03061
03062
03063
03064
03065 static VALUE
03066 rb_reg_s_quote(VALUE c, VALUE str)
03067 {
03068 return rb_reg_quote(reg_operand(str, TRUE));
03069 }
03070
03071 int
03072 rb_reg_options(VALUE re)
03073 {
03074 int options;
03075
03076 rb_reg_check(re);
03077 options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
03078 if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
03079 if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
03080 return options;
03081 }
03082
03083 VALUE
03084 rb_check_regexp_type(VALUE re)
03085 {
03086 return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
03087 }
03088
03089
03090
03091
03092
03093
03094
03095
03096
03097
03098
03099
03100
03101
03102
03103
03104
03105
03106 static VALUE
03107 rb_reg_s_try_convert(VALUE dummy, VALUE re)
03108 {
03109 return rb_check_regexp_type(re);
03110 }
03111
03112 static VALUE
03113 rb_reg_s_union(VALUE self, VALUE args0)
03114 {
03115 long argc = RARRAY_LEN(args0);
03116
03117 if (argc == 0) {
03118 VALUE args[1];
03119 args[0] = rb_str_new2("(?!)");
03120 return rb_class_new_instance(1, args, rb_cRegexp);
03121 }
03122 else if (argc == 1) {
03123 VALUE arg = rb_ary_entry(args0, 0);
03124 VALUE re = rb_check_regexp_type(arg);
03125 if (!NIL_P(re))
03126 return re;
03127 else {
03128 VALUE quoted;
03129 quoted = rb_reg_s_quote(Qnil, arg);
03130 return rb_reg_new_str(quoted, 0);
03131 }
03132 }
03133 else {
03134 int i;
03135 VALUE source = rb_str_buf_new(0);
03136 rb_encoding *result_enc;
03137
03138 int has_asciionly = 0;
03139 rb_encoding *has_ascii_compat_fixed = 0;
03140 rb_encoding *has_ascii_incompat = 0;
03141
03142 for (i = 0; i < argc; i++) {
03143 volatile VALUE v;
03144 VALUE e = rb_ary_entry(args0, i);
03145
03146 if (0 < i)
03147 rb_str_buf_cat_ascii(source, "|");
03148
03149 v = rb_check_regexp_type(e);
03150 if (!NIL_P(v)) {
03151 rb_encoding *enc = rb_enc_get(v);
03152 if (!rb_enc_asciicompat(enc)) {
03153 if (!has_ascii_incompat)
03154 has_ascii_incompat = enc;
03155 else if (has_ascii_incompat != enc)
03156 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03157 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03158 }
03159 else if (rb_reg_fixed_encoding_p(v)) {
03160 if (!has_ascii_compat_fixed)
03161 has_ascii_compat_fixed = enc;
03162 else if (has_ascii_compat_fixed != enc)
03163 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03164 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03165 }
03166 else {
03167 has_asciionly = 1;
03168 }
03169 v = rb_reg_to_s(v);
03170 }
03171 else {
03172 rb_encoding *enc;
03173 StringValue(e);
03174 enc = rb_enc_get(e);
03175 if (!rb_enc_str_asciicompat_p(e)) {
03176 if (!has_ascii_incompat)
03177 has_ascii_incompat = enc;
03178 else if (has_ascii_incompat != enc)
03179 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03180 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03181 }
03182 else if (rb_enc_str_asciionly_p(e)) {
03183 has_asciionly = 1;
03184 }
03185 else {
03186 if (!has_ascii_compat_fixed)
03187 has_ascii_compat_fixed = enc;
03188 else if (has_ascii_compat_fixed != enc)
03189 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03190 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03191 }
03192 v = rb_reg_s_quote(Qnil, e);
03193 }
03194 if (has_ascii_incompat) {
03195 if (has_asciionly) {
03196 rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
03197 rb_enc_name(has_ascii_incompat));
03198 }
03199 if (has_ascii_compat_fixed) {
03200 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03201 rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
03202 }
03203 }
03204
03205 if (i == 0) {
03206 rb_enc_copy(source, v);
03207 }
03208 rb_str_append(source, v);
03209 }
03210
03211 if (has_ascii_incompat) {
03212 result_enc = has_ascii_incompat;
03213 }
03214 else if (has_ascii_compat_fixed) {
03215 result_enc = has_ascii_compat_fixed;
03216 }
03217 else {
03218 result_enc = rb_ascii8bit_encoding();
03219 }
03220
03221 rb_enc_associate(source, result_enc);
03222 return rb_class_new_instance(1, &source, rb_cRegexp);
03223 }
03224 }
03225
03226
03227
03228
03229
03230
03231
03232
03233
03234
03235
03236
03237
03238
03239
03240
03241
03242
03243
03244 static VALUE
03245 rb_reg_s_union_m(VALUE self, VALUE args)
03246 {
03247 VALUE v;
03248 if (RARRAY_LEN(args) == 1 &&
03249 !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
03250 return rb_reg_s_union(self, v);
03251 }
03252 return rb_reg_s_union(self, args);
03253 }
03254
03255
03256 static VALUE
03257 rb_reg_init_copy(VALUE copy, VALUE re)
03258 {
03259 onig_errmsg_buffer err = "";
03260 const char *s;
03261 long len;
03262
03263 if (copy == re) return copy;
03264 rb_check_frozen(copy);
03265
03266 if (!rb_obj_is_instance_of(re, rb_obj_class(copy))) {
03267 rb_raise(rb_eTypeError, "wrong argument type");
03268 }
03269 rb_reg_check(re);
03270 s = RREGEXP_SRC_PTR(re);
03271 len = RREGEXP_SRC_LEN(re);
03272 if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re),
03273 err, NULL, 0) != 0) {
03274 rb_reg_raise(s, len, err, re);
03275 }
03276 return copy;
03277 }
03278
03279 VALUE
03280 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
03281 {
03282 VALUE val = 0;
03283 char *p, *s, *e;
03284 int no, clen;
03285 rb_encoding *str_enc = rb_enc_get(str);
03286 rb_encoding *src_enc = rb_enc_get(src);
03287 int acompat = rb_enc_asciicompat(str_enc);
03288 #define ASCGET(s,e,cl) (acompat ? (*cl=1,ISASCII(s[0])?s[0]:-1) : rb_enc_ascget(s, e, cl, str_enc))
03289
03290 p = s = RSTRING_PTR(str);
03291 e = s + RSTRING_LEN(str);
03292
03293 while (s < e) {
03294 int c = ASCGET(s, e, &clen);
03295 char *ss;
03296
03297 if (c == -1) {
03298 s += mbclen(s, e, str_enc);
03299 continue;
03300 }
03301 ss = s;
03302 s += clen;
03303
03304 if (c != '\\' || s == e) continue;
03305
03306 if (!val) {
03307 val = rb_str_buf_new(ss-p);
03308 }
03309 rb_enc_str_buf_cat(val, p, ss-p, str_enc);
03310
03311 c = ASCGET(s, e, &clen);
03312 if (c == -1) {
03313 s += mbclen(s, e, str_enc);
03314 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03315 p = s;
03316 continue;
03317 }
03318 s += clen;
03319
03320 p = s;
03321 switch (c) {
03322 case '1': case '2': case '3': case '4':
03323 case '5': case '6': case '7': case '8': case '9':
03324 if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
03325 no = c - '0';
03326 }
03327 else {
03328 continue;
03329 }
03330 break;
03331
03332 case 'k':
03333 if (s < e && ASCGET(s, e, &clen) == '<') {
03334 char *name, *name_end;
03335
03336 name_end = name = s + clen;
03337 while (name_end < e) {
03338 c = ASCGET(name_end, e, &clen);
03339 if (c == '>') break;
03340 name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
03341 }
03342 if (name_end < e) {
03343 no = name_to_backref_number(regs, regexp, name, name_end);
03344 p = s = name_end + clen;
03345 break;
03346 }
03347 else {
03348 rb_raise(rb_eRuntimeError, "invalid group name reference format");
03349 }
03350 }
03351
03352 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03353 continue;
03354
03355 case '0':
03356 case '&':
03357 no = 0;
03358 break;
03359
03360 case '`':
03361 rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
03362 continue;
03363
03364 case '\'':
03365 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
03366 continue;
03367
03368 case '+':
03369 no = regs->num_regs-1;
03370 while (BEG(no) == -1 && no > 0) no--;
03371 if (no == 0) continue;
03372 break;
03373
03374 case '\\':
03375 rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
03376 continue;
03377
03378 default:
03379 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03380 continue;
03381 }
03382
03383 if (no >= 0) {
03384 if (no >= regs->num_regs) continue;
03385 if (BEG(no) == -1) continue;
03386 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
03387 }
03388 }
03389
03390 if (!val) return str;
03391 if (p < e) {
03392 rb_enc_str_buf_cat(val, p, e-p, str_enc);
03393 }
03394
03395 return val;
03396 }
03397
03398 static VALUE
03399 kcode_getter(void)
03400 {
03401 rb_warn("variable $KCODE is no longer effective");
03402 return Qnil;
03403 }
03404
03405 static void
03406 kcode_setter(VALUE val, ID id)
03407 {
03408 rb_warn("variable $KCODE is no longer effective; ignored");
03409 }
03410
03411 static VALUE
03412 ignorecase_getter(void)
03413 {
03414 rb_warn("variable $= is no longer effective");
03415 return Qfalse;
03416 }
03417
03418 static void
03419 ignorecase_setter(VALUE val, ID id)
03420 {
03421 rb_warn("variable $= is no longer effective; ignored");
03422 }
03423
03424 static VALUE
03425 match_getter(void)
03426 {
03427 VALUE match = rb_backref_get();
03428
03429 if (NIL_P(match)) return Qnil;
03430 rb_match_busy(match);
03431 return match;
03432 }
03433
03434 static void
03435 match_setter(VALUE val)
03436 {
03437 if (!NIL_P(val)) {
03438 Check_Type(val, T_MATCH);
03439 }
03440 rb_backref_set(val);
03441 }
03442
03443
03444
03445
03446
03447
03448
03449
03450
03451
03452
03453
03454
03455
03456
03457
03458
03459
03460
03461
03462
03463
03464
03465
03466
03467
03468
03469 static VALUE
03470 rb_reg_s_last_match(int argc, VALUE *argv)
03471 {
03472 VALUE nth;
03473
03474 if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
03475 VALUE match = rb_backref_get();
03476 int n;
03477 if (NIL_P(match)) return Qnil;
03478 n = match_backref_number(match, nth);
03479 return rb_reg_nth_match(n, match);
03480 }
03481 return match_getter();
03482 }
03483
/*
 * Warning callback handed to onig_set_warn_func() and
 * onig_set_verb_warn_func() in Init_Regexp.  The message is passed as a
 * "%s" argument so it is never interpreted as a format string.
 */
static void
re_warn(const char *s)
{
    rb_warn("%s", s);
    return;
}
03489
03490
03491
03492
03493
03494
03495
03496
03497
03498
03499
03500
03501
03502
03503
03504
03505
03506
03507
03508
03509
03510
03511
03512
03513 void
03514 Init_Regexp(void)
03515 {
03516 rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
03517
03518 onigenc_set_default_caseconv_table((UChar*)casetable);
03519 onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
03520 onig_set_warn_func(re_warn);
03521 onig_set_verb_warn_func(re_warn);
03522
03523 rb_define_virtual_variable("$~", match_getter, match_setter);
03524 rb_define_virtual_variable("$&", last_match_getter, 0);
03525 rb_define_virtual_variable("$`", prematch_getter, 0);
03526 rb_define_virtual_variable("$'", postmatch_getter, 0);
03527 rb_define_virtual_variable("$+", last_paren_match_getter, 0);
03528
03529 rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
03530 rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
03531 rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
03532
03533 rb_cRegexp = rb_define_class("Regexp", rb_cObject);
03534 rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
03535 rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
03536 rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
03537 rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
03538 rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
03539 rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
03540 rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
03541
03542 rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
03543 rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
03544 rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
03545 rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
03546 rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
03547 rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
03548 rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
03549 rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
03550 rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
03551 rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
03552 rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
03553 rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
03554 rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
03555 rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
03556 rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0);
03557 rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
03558 rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
03559 rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
03560
03561 rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
03562 rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
03563 rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
03564 rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED));
03565
03566 rb_global_variable(®_cache);
03567
03568 rb_cMatch = rb_define_class("MatchData", rb_cObject);
03569 rb_define_alloc_func(rb_cMatch, match_alloc);
03570 rb_undef_method(CLASS_OF(rb_cMatch), "new");
03571
03572 rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
03573 rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
03574 rb_define_method(rb_cMatch, "names", match_names, 0);
03575 rb_define_method(rb_cMatch, "size", match_size, 0);
03576 rb_define_method(rb_cMatch, "length", match_size, 0);
03577 rb_define_method(rb_cMatch, "offset", match_offset, 1);
03578 rb_define_method(rb_cMatch, "begin", match_begin, 1);
03579 rb_define_method(rb_cMatch, "end", match_end, 1);
03580 rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
03581 rb_define_method(rb_cMatch, "[]", match_aref, -1);
03582 rb_define_method(rb_cMatch, "captures", match_captures, 0);
03583 rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
03584 rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
03585 rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
03586 rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
03587 rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
03588 rb_define_method(rb_cMatch, "string", match_string, 0);
03589 rb_define_method(rb_cMatch, "hash", match_hash, 0);
03590 rb_define_method(rb_cMatch, "eql?", match_equal, 1);
03591 rb_define_method(rb_cMatch, "==", match_equal, 1);
03592 }
03593