00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #include "regenc.h"
00032
/*
 * Debug tracing hook.  Call sites pass a fully parenthesized printf argument
 * list, e.g. DEBUG_GB18030(("state %d\n", state)).  Tracing is compiled out:
 * change the condition below to 1 to route the arguments to printf.
 */
#if 0
#define DEBUG_GB18030(arg) printf arg
#else
#define DEBUG_GB18030(arg)
#endif
00038
/*
 * Byte classes stored in GB18030_MAP.  The numeric values are used as
 * array indexes and switch labels, so they must stay 0..3 in this order.
 */
enum {
  C1 = 0,  /* 0x00-0x2F, 0x3A-0x3F, 0x7F, 0xFF: never a non-first byte of a multibyte character */
  C2 = 1,  /* 0x40-0x7E, 0x80: may end a two-byte character */
  C4 = 2,  /* 0x30-0x39: may be the 2nd or 4th byte of a four-byte character */
  CM = 3   /* 0x81-0xFE: lead byte, two-byte trail, or 3rd byte of a four-byte character */
};
00045
/*
 * Byte-class lookup table: GB18030_MAP[b] is the class (C1, C2, C4 or CM)
 * of byte value b.  Each row below covers sixteen consecutive byte values.
 * It is consulted by the backward scan in gb18030_left_adjust_char_head()
 * and by gb18030_is_allowed_reverse_match().
 */
00046 static const char GB18030_MAP[] = {
00047 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, /* 0x00-0x0F */
00048 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, /* 0x10-0x1F */
00049 C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, /* 0x20-0x2F */
00050 C4, C4, C4, C4, C4, C4, C4, C4, C4, C4, C1, C1, C1, C1, C1, C1, /* 0x30-0x3F: 0x30-0x39 are C4 */
00051 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, /* 0x40-0x4F */
00052 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, /* 0x50-0x5F */
00053 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, /* 0x60-0x6F */
00054 C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C2, C1, /* 0x70-0x7F: 0x7F is C1 */
00055 C2, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, /* 0x80-0x8F: 0x80 is C2 */
00056 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, /* 0x90-0x9F */
00057 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, /* 0xA0-0xAF */
00058 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, /* 0xB0-0xBF */
00059 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, /* 0xC0-0xCF */
00060 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, /* 0xD0-0xDF */
00061 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, /* 0xE0-0xEF */
00062 CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, CM, C1 /* 0xF0-0xFF: 0xFF is C1 */
00063 };
00064
/*
 * Forward-decoding automaton used by gb18030_mbc_enc_len().
 *
 * state_t: S0..S3 are the live states (also the row index into trans[]);
 * ACCEPT and FAILURE are negative terminal results.
 * trans[state][byte] gives the next state after consuming one byte.
 * A and F are table-local shorthands for ACCEPT and FAILURE and are
 * undefined again right after the table.
 */
00065 typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2, S3 } state_t;
00066 #define A ACCEPT
00067 #define F FAILURE
00068 static const signed char trans[][0x100] = {
/* S0 (first byte): 0x00-0x7F accept as a one-byte character,
 * 0x81-0xFE go to S1, 0x80 and 0xFF fail. */
00069 {
00070 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00071 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00072 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00073 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00074 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00075 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00076 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00077 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00078 F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00079 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00080 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00081 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00082 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00083 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00084 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00085 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
00086 },
/* S1 (second byte): 0x30-0x39 continue to S2 (four-byte form),
 * 0x40-0x7E and 0x80-0xFE accept as a two-byte character, the rest fail. */
00087 {
00088 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00089 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00090 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00091 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, F, F, F, F, F, F,
00092 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00093 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00094 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00095 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
00096 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00097 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00098 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00099 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00100 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00101 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00102 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
00103 A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
00104 },
/* S2 (third byte): 0x81-0xFE continue to S3, everything else fails. */
00105 {
00106 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00107 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00108 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00109 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00110 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00111 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00112 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00113 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00114 F, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
00115 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
00116 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
00117 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
00118 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
00119 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
00120 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
00121 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, F
00122 },
/* S3 (fourth byte): only 0x30-0x39 accept, everything else fails. */
00123 {
00124 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00125 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00126 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00127 A, A, A, A, A, A, A, A, A, A, F, F, F, F, F, F,
00128 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00129 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00130 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00131 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00132 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00133 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00134 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00135 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00136 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00137 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00138 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
00139 F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
00140 }
00141 };
00142 #undef A
00143 #undef F
00144
00145 static int
00146 gb18030_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00147 {
00148 int firstbyte = *p++;
00149 state_t s = trans[0][firstbyte];
00150 #define RETURN(n) \
00151 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) : \
00152 ONIGENC_CONSTRUCT_MBCLEN_INVALID()
00153 if (s < 0) RETURN(1);
00154 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(2-1);
00155 s = trans[s][*p++];
00156 if (s < 0) RETURN(2);
00157 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-2);
00158 s = trans[s][*p++];
00159 if (s < 0) RETURN(3);
00160 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-3);
00161 s = trans[s][*p++];
00162 RETURN(4);
00163 #undef RETURN
00164 }
00165
00166 static OnigCodePoint
00167 gb18030_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00168 {
00169 int c, i, len;
00170 OnigCodePoint n;
00171
00172 len = enclen(enc, p, end);
00173 n = (OnigCodePoint )(*p++);
00174 if (len == 1) return n;
00175
00176 for (i = 1; i < len; i++) {
00177 if (p >= end) break;
00178 c = *p++;
00179 n <<= 8; n += c;
00180 }
00181 return n;
00182 }
00183
00184 static int
00185 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00186 {
00187 return onigenc_mb4_code_to_mbc(enc, code, buf);
00188 }
00189
00190 static int
00191 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
00192 UChar* lower, OnigEncoding enc)
00193 {
00194 return onigenc_mbn_mbc_case_fold(enc, flag,
00195 pp, end, lower);
00196 }
00197
/*
 * Disabled code: the "#if 0" guard keeps this wrapper around
 * onigenc_mbn_is_mbc_ambiguous() out of the build.
 */
00198 #if 0
00199 static int
00200 gb18030_is_mbc_ambiguous(OnigCaseFoldType flag,
00201 const UChar** pp, const UChar* end, OnigEncoding enc)
00202 {
00203 return onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end);
00204 }
00205 #endif
00206
00207 static int
00208 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
00209 {
00210 return onigenc_mb4_is_code_ctype(enc, code, ctype);
00211 }
00212
/*
 * States of the backward scan in gb18030_left_adjust_char_head().
 *
 * Each name spells out the byte classes consumed so far, where CX stands
 * for "one byte of any class", CMC4 for a CM byte followed by a C4 byte and
 * C4CM for a C4 byte followed by a CM byte; "odd"/"even" is the parity of
 * how many such units have been seen.  The enumerators are numbered
 * consecutively from 0; keep the order stable.
 */
enum state {
  S_START = 0,            /* nothing consumed yet */
  S_one_C2,               /* a single C2 byte */
  S_one_C4,               /* a single C4 byte */
  S_one_CM,               /* a single CM byte */

  /* runs of CM bytes ending in one byte of any class */
  S_odd_CM_one_CX,
  S_even_CM_one_CX,

  /* repeated (CM, C4) pairs, optionally preceded by a lone C4 */
  S_one_CMC4,
  S_odd_CMC4,
  S_one_C4_odd_CMC4,
  S_even_CMC4,
  S_one_C4_even_CMC4,

  /* runs of CM bytes in front of (CM, C4) pairs */
  S_odd_CM_odd_CMC4,
  S_even_CM_odd_CMC4,
  S_odd_CM_even_CMC4,
  S_even_CM_even_CMC4,

  /* repeated (C4, CM) pairs, optionally preceded by a lone CM */
  S_odd_C4CM,
  S_one_CM_odd_C4CM,
  S_even_C4CM,
  S_one_CM_even_C4CM,

  /* runs of CM bytes in front of (C4, CM) pairs */
  S_even_CM_odd_C4CM,
  S_odd_CM_odd_C4CM,
  S_even_CM_even_C4CM,
  S_odd_CM_even_C4CM
};
00246
00247 static UChar*
00248 gb18030_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00249 {
00250 const UChar *p;
00251 enum state state = S_START;
00252
00253 DEBUG_GB18030(("----------------\n"));
00254 for (p = s; p >= start; p--) {
00255 DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
00256 switch (state) {
00257 case S_START:
00258 switch (GB18030_MAP[*p]) {
00259 case C1:
00260 return (UChar *)s;
00261 case C2:
00262 state = S_one_C2;
00263 break;
00264 case C4:
00265 state = S_one_C4;
00266 break;
00267 case CM:
00268 state = S_one_CM;
00269 break;
00270 }
00271 break;
00272 case S_one_C2:
00273 switch (GB18030_MAP[*p]) {
00274 case C1:
00275 case C2:
00276 case C4:
00277 return (UChar *)s;
00278 case CM:
00279 state = S_odd_CM_one_CX;
00280 break;
00281 }
00282 break;
00283 case S_one_C4:
00284 switch (GB18030_MAP[*p]) {
00285 case C1:
00286 case C2:
00287 case C4:
00288 return (UChar *)s;
00289 case CM:
00290 state = S_one_CMC4;
00291 break;
00292 }
00293 break;
00294 case S_one_CM:
00295 switch (GB18030_MAP[*p]) {
00296 case C1:
00297 case C2:
00298 return (UChar *)s;
00299 case C4:
00300 state = S_odd_C4CM;
00301 break;
00302 case CM:
00303 state = S_odd_CM_one_CX;
00304 break;
00305 }
00306 break;
00307
00308 case S_odd_CM_one_CX:
00309 switch (GB18030_MAP[*p]) {
00310 case C1:
00311 case C2:
00312 case C4:
00313 return (UChar *)(s - 1);
00314 case CM:
00315 state = S_even_CM_one_CX;
00316 break;
00317 }
00318 break;
00319 case S_even_CM_one_CX:
00320 switch (GB18030_MAP[*p]) {
00321 case C1:
00322 case C2:
00323 case C4:
00324 return (UChar *)s;
00325 case CM:
00326 state = S_odd_CM_one_CX;
00327 break;
00328 }
00329 break;
00330
00331 case S_one_CMC4:
00332 switch (GB18030_MAP[*p]) {
00333 case C1:
00334 case C2:
00335 return (UChar *)(s - 1);
00336 case C4:
00337 state = S_one_C4_odd_CMC4;
00338 break;
00339 case CM:
00340 state = S_even_CM_one_CX;
00341 break;
00342 }
00343 break;
00344 case S_odd_CMC4:
00345 switch (GB18030_MAP[*p]) {
00346 case C1:
00347 case C2:
00348 return (UChar *)(s - 1);
00349 case C4:
00350 state = S_one_C4_odd_CMC4;
00351 break;
00352 case CM:
00353 state = S_odd_CM_odd_CMC4;
00354 break;
00355 }
00356 break;
00357 case S_one_C4_odd_CMC4:
00358 switch (GB18030_MAP[*p]) {
00359 case C1:
00360 case C2:
00361 case C4:
00362 return (UChar *)(s - 1);
00363 case CM:
00364 state = S_even_CMC4;
00365 break;
00366 }
00367 break;
00368 case S_even_CMC4:
00369 switch (GB18030_MAP[*p]) {
00370 case C1:
00371 case C2:
00372 return (UChar *)(s - 3);
00373 case C4:
00374 state = S_one_C4_even_CMC4;
00375 break;
00376 case CM:
00377 state = S_odd_CM_even_CMC4;
00378 break;
00379 }
00380 break;
00381 case S_one_C4_even_CMC4:
00382 switch (GB18030_MAP[*p]) {
00383 case C1:
00384 case C2:
00385 case C4:
00386 return (UChar *)(s - 3);
00387 case CM:
00388 state = S_odd_CMC4;
00389 break;
00390 }
00391 break;
00392
00393 case S_odd_CM_odd_CMC4:
00394 switch (GB18030_MAP[*p]) {
00395 case C1:
00396 case C2:
00397 case C4:
00398 return (UChar *)(s - 3);
00399 case CM:
00400 state = S_even_CM_odd_CMC4;
00401 break;
00402 }
00403 break;
00404 case S_even_CM_odd_CMC4:
00405 switch (GB18030_MAP[*p]) {
00406 case C1:
00407 case C2:
00408 case C4:
00409 return (UChar *)(s - 1);
00410 case CM:
00411 state = S_odd_CM_odd_CMC4;
00412 break;
00413 }
00414 break;
00415
00416 case S_odd_CM_even_CMC4:
00417 switch (GB18030_MAP[*p]) {
00418 case C1:
00419 case C2:
00420 case C4:
00421 return (UChar *)(s - 1);
00422 case CM:
00423 state = S_even_CM_even_CMC4;
00424 break;
00425 }
00426 break;
00427 case S_even_CM_even_CMC4:
00428 switch (GB18030_MAP[*p]) {
00429 case C1:
00430 case C2:
00431 case C4:
00432 return (UChar *)(s - 3);
00433 case CM:
00434 state = S_odd_CM_even_CMC4;
00435 break;
00436 }
00437 break;
00438
00439 case S_odd_C4CM:
00440 switch (GB18030_MAP[*p]) {
00441 case C1:
00442 case C2:
00443 case C4:
00444 return (UChar *)s;
00445 case CM:
00446 state = S_one_CM_odd_C4CM;
00447 break;
00448 }
00449 break;
00450 case S_one_CM_odd_C4CM:
00451 switch (GB18030_MAP[*p]) {
00452 case C1:
00453 case C2:
00454 return (UChar *)(s - 2);
00455 case C4:
00456 state = S_even_C4CM;
00457 break;
00458 case CM:
00459 state = S_even_CM_odd_C4CM;
00460 break;
00461 }
00462 break;
00463 case S_even_C4CM:
00464 switch (GB18030_MAP[*p]) {
00465 case C1:
00466 case C2:
00467 case C4:
00468 return (UChar *)(s - 2);
00469 case CM:
00470 state = S_one_CM_even_C4CM;
00471 break;
00472 }
00473 break;
00474 case S_one_CM_even_C4CM:
00475 switch (GB18030_MAP[*p]) {
00476 case C1:
00477 case C2:
00478 return (UChar *)(s - 0);
00479 case C4:
00480 state = S_odd_C4CM;
00481 break;
00482 case CM:
00483 state = S_even_CM_even_C4CM;
00484 break;
00485 }
00486 break;
00487
00488 case S_even_CM_odd_C4CM:
00489 switch (GB18030_MAP[*p]) {
00490 case C1:
00491 case C2:
00492 case C4:
00493 return (UChar *)(s - 0);
00494 case CM:
00495 state = S_odd_CM_odd_C4CM;
00496 break;
00497 }
00498 break;
00499 case S_odd_CM_odd_C4CM:
00500 switch (GB18030_MAP[*p]) {
00501 case C1:
00502 case C2:
00503 case C4:
00504 return (UChar *)(s - 2);
00505 case CM:
00506 state = S_even_CM_odd_C4CM;
00507 break;
00508 }
00509 break;
00510
00511 case S_even_CM_even_C4CM:
00512 switch (GB18030_MAP[*p]) {
00513 case C1:
00514 case C2:
00515 case C4:
00516 return (UChar *)(s - 2);
00517 case CM:
00518 state = S_odd_CM_even_C4CM;
00519 break;
00520 }
00521 break;
00522 case S_odd_CM_even_C4CM:
00523 switch (GB18030_MAP[*p]) {
00524 case C1:
00525 case C2:
00526 case C4:
00527 return (UChar *)(s - 0);
00528 case CM:
00529 state = S_even_CM_even_C4CM;
00530 break;
00531 }
00532 break;
00533 }
00534 }
00535
00536 DEBUG_GB18030(("state %d\n", state));
00537 switch (state) {
00538 case S_START: return (UChar *)(s - 0);
00539 case S_one_C2: return (UChar *)(s - 0);
00540 case S_one_C4: return (UChar *)(s - 0);
00541 case S_one_CM: return (UChar *)(s - 0);
00542
00543 case S_odd_CM_one_CX: return (UChar *)(s - 1);
00544 case S_even_CM_one_CX: return (UChar *)(s - 0);
00545
00546 case S_one_CMC4: return (UChar *)(s - 1);
00547 case S_odd_CMC4: return (UChar *)(s - 1);
00548 case S_one_C4_odd_CMC4: return (UChar *)(s - 1);
00549 case S_even_CMC4: return (UChar *)(s - 3);
00550 case S_one_C4_even_CMC4: return (UChar *)(s - 3);
00551
00552 case S_odd_CM_odd_CMC4: return (UChar *)(s - 3);
00553 case S_even_CM_odd_CMC4: return (UChar *)(s - 1);
00554
00555 case S_odd_CM_even_CMC4: return (UChar *)(s - 1);
00556 case S_even_CM_even_CMC4: return (UChar *)(s - 3);
00557
00558 case S_odd_C4CM: return (UChar *)(s - 0);
00559 case S_one_CM_odd_C4CM: return (UChar *)(s - 2);
00560 case S_even_C4CM: return (UChar *)(s - 2);
00561 case S_one_CM_even_C4CM: return (UChar *)(s - 0);
00562
00563 case S_even_CM_odd_C4CM: return (UChar *)(s - 0);
00564 case S_odd_CM_odd_C4CM: return (UChar *)(s - 2);
00565 case S_even_CM_even_C4CM: return (UChar *)(s - 2);
00566 case S_odd_CM_even_C4CM: return (UChar *)(s - 0);
00567 }
00568
00569 return (UChar* )s;
00570 }
00571
00572 static int
00573 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
00574 {
00575 return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
00576 }
00577
/*
 * Encoding descriptor for GB18030.  The initializer is positional, so the
 * entry order must match the encoding structure declared in the Onigmo
 * headers.  NOTE(review): the slot names in the comments below are inferred
 * from the names of the functions stored there -- confirm against the
 * OnigEncodingType declaration.
 */
00578 OnigEncodingDefine(gb18030, GB18030) = {
00579 gb18030_mbc_enc_len, /* character length / validity check */
00580 "GB18030", /* encoding name */
00581 4, /* presumably the maximum bytes per character -- confirm */
00582 1, /* presumably the minimum bytes per character -- confirm */
00583 onigenc_is_mbc_newline_0x0a, /* newline test */
00584 gb18030_mbc_to_code, /* bytes -> code value */
00585 onigenc_mb4_code_to_mbclen, /* code value -> byte length */
00586 gb18030_code_to_mbc, /* code value -> bytes */
00587 gb18030_mbc_case_fold, /* case folding of one character */
00588 onigenc_ascii_apply_all_case_fold, /* ASCII case-fold enumeration */
00589 onigenc_ascii_get_case_fold_codes_by_str, /* ASCII case-fold lookup by string */
00590 onigenc_minimum_property_name_to_ctype, /* property name -> ctype */
00591 gb18030_is_code_ctype, /* ctype membership test */
00592 onigenc_not_support_get_ctype_code_range, /* ctype code ranges: not supported */
00593 gb18030_left_adjust_char_head, /* back up to a character boundary */
00594 gb18030_is_allowed_reverse_match /* reverse-match permission test */
00595 };
00596
00597