enc/gb18030.c

Go to the documentation of this file.
00001 /**********************************************************************
00002   gb18030.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2005-2007  KUBO Takehiro <kubo AT jiubao DOT org>
00006  *                          K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
00007  * All rights reserved.
00008  *
00009  * Redistribution and use in source and binary forms, with or without
00010  * modification, are permitted provided that the following conditions
00011  * are met:
00012  * 1. Redistributions of source code must retain the above copyright
00013  *    notice, this list of conditions and the following disclaimer.
00014  * 2. Redistributions in binary form must reproduce the above copyright
00015  *    notice, this list of conditions and the following disclaimer in the
00016  *    documentation and/or other materials provided with the distribution.
00017  *
00018  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00019  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00022  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00023  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00024  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00025  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00026  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00027  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00028  * SUCH DAMAGE.
00029  */
00030 
00031 #include "regenc.h"
00032 
/*
 * Debug tracing hook.  The argument is a complete, parenthesized printf
 * argument list, e.g. DEBUG_GB18030(("state %d\n", state)).
 * Tracing is compiled out; switch the condition below to 1 to route the
 * argument list to printf.
 */
#if 0
#define DEBUG_GB18030(arg) printf arg
#else
#define DEBUG_GB18030(arg)
#endif
00038 
/* Classification of a single byte by the positions it may occupy. */
enum {
  C1 = 0, /* one-byte char */
  C2 = 1, /* one-byte or second of two-byte char */
  C4 = 2, /* one-byte or second or fourth of four-byte char */
  CM = 3  /* first of two- or four-byte char or second of two-byte char */
};

/*
 * Byte value -> byte class lookup (256 entries, one per byte value).
 * R15/R16 repeat one class 15 or 16 times so that each range of byte
 * values is spelled out once.
 */
#define R15(c) c, c, c, c, c, c, c, c, c, c, c, c, c, c, c
#define R16(c) R15(c), c
static const char GB18030_MAP[] = {
  /* 0x00-0x2f */ R16(C1), R16(C1), R16(C1),
  /* 0x30-0x39 */ C4, C4, C4, C4, C4, C4, C4, C4, C4, C4,
  /* 0x3a-0x3f */ C1, C1, C1, C1, C1, C1,
  /* 0x40-0x7e */ R16(C2), R16(C2), R16(C2), R15(C2),
  /* 0x7f      */ C1,
  /* 0x80      */ C2,
  /* 0x81-0x8f */ R15(CM),
  /* 0x90-0xef */ R16(CM), R16(CM), R16(CM), R16(CM), R16(CM), R16(CM),
  /* 0xf0-0xfe */ R15(CM),
  /* 0xff      */ C1
};
#undef R16
#undef R15
00064 
/*
 * States of the forward length-detection automaton.  Negative values are
 * terminal (FAILURE / ACCEPT); S0..S3 index the rows of trans[] below.
 */
typedef enum {
  FAILURE = -2,
  ACCEPT  = -1,
  S0 = 0,
  S1,
  S2,
  S3
} state_t;

/*
 * trans[state][byte] gives the state entered after reading `byte` in
 * `state`.  S0 is the row used for the first byte of a character.
 * X15/X16 repeat one entry 15 or 16 times so that each range of byte
 * values is written once.
 */
#define A ACCEPT
#define F FAILURE
#define X15(v) v, v, v, v, v, v, v, v, v, v, v, v, v, v, v
#define X16(v) X15(v), v
static const signed char trans[][0x100] = {
  { /* S0 */
    /* 0x00-0x7f */ X16(A), X16(A), X16(A), X16(A),
                    X16(A), X16(A), X16(A), X16(A),
    /* 0x80      */ F,
    /* 0x81-0x8f */ X15(S1),
    /* 0x90-0xef */ X16(S1), X16(S1), X16(S1), X16(S1), X16(S1), X16(S1),
    /* 0xf0-0xfe */ X15(S1),
    /* 0xff      */ F
  },
  { /* S1 */
    /* 0x00-0x2f */ X16(F), X16(F), X16(F),
    /* 0x30-0x39 */ S2, S2, S2, S2, S2, S2, S2, S2, S2, S2,
    /* 0x3a-0x3f */ F, F, F, F, F, F,
    /* 0x40-0x6f */ X16(A), X16(A), X16(A),
    /* 0x70-0x7e */ X15(A),
    /* 0x7f      */ F,
    /* 0x80-0xef */ X16(A), X16(A), X16(A), X16(A), X16(A), X16(A), X16(A),
    /* 0xf0-0xfe */ X15(A),
    /* 0xff      */ F
  },
  { /* S2 */
    /* 0x00-0x7f */ X16(F), X16(F), X16(F), X16(F),
                    X16(F), X16(F), X16(F), X16(F),
    /* 0x80      */ F,
    /* 0x81-0x8f */ X15(S3),
    /* 0x90-0xef */ X16(S3), X16(S3), X16(S3), X16(S3), X16(S3), X16(S3),
    /* 0xf0-0xfe */ X15(S3),
    /* 0xff      */ F
  },
  { /* S3 */
    /* 0x00-0x2f */ X16(F), X16(F), X16(F),
    /* 0x30-0x39 */ A, A, A, A, A, A, A, A, A, A,
    /* 0x3a-0x3f */ F, F, F, F, F, F,
    /* 0x40-0xff */ X16(F), X16(F), X16(F), X16(F), X16(F), X16(F),
                    X16(F), X16(F), X16(F), X16(F), X16(F), X16(F)
  }
};
#undef X16
#undef X15
#undef A
#undef F
00144 
00145 static int
00146 gb18030_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00147 {
00148   int firstbyte = *p++;
00149   state_t s = trans[0][firstbyte];
00150 #define RETURN(n) \
00151     return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) : \
00152                          ONIGENC_CONSTRUCT_MBCLEN_INVALID()
00153   if (s < 0) RETURN(1);
00154   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(2-1);
00155   s = trans[s][*p++];
00156   if (s < 0) RETURN(2);
00157   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-2);
00158   s = trans[s][*p++];
00159   if (s < 0) RETURN(3);
00160   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-3);
00161   s = trans[s][*p++];
00162   RETURN(4);
00163 #undef RETURN
00164 }
00165 
00166 static OnigCodePoint
00167 gb18030_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00168 {
00169   int c, i, len;
00170   OnigCodePoint n;
00171 
00172   len = enclen(enc, p, end);
00173   n = (OnigCodePoint )(*p++);
00174   if (len == 1) return n;
00175 
00176   for (i = 1; i < len; i++) {
00177     if (p >= end) break;
00178     c = *p++;
00179     n <<= 8;  n += c;
00180   }
00181   return n;
00182 }
00183 
00184 static int
00185 gb18030_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00186 {
00187   return onigenc_mb4_code_to_mbc(enc, code, buf);
00188 }
00189 
00190 static int
00191 gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
00192                       UChar* lower, OnigEncoding enc)
00193 {
00194   return onigenc_mbn_mbc_case_fold(enc, flag,
00195                                    pp, end, lower);
00196 }
00197 
#if 0
/*
 * Compiled out by the enclosing #if 0.  Forwards to
 * onigenc_mbn_is_mbc_ambiguous() and returns its result unchanged.
 */
static int
gb18030_is_mbc_ambiguous(OnigCaseFoldType flag,
                         const UChar** pp, const UChar* end, OnigEncoding enc)
{
  const int result = onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end);

  return result;
}
#endif
00206 
00207 static int
00208 gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
00209 {
00210   return onigenc_mb4_is_code_ctype(enc, code, ctype);
00211 }
00212 
00213 enum state {
00214   S_START,
00215   S_one_C2,
00216   S_one_C4,
00217   S_one_CM,
00218 
00219   S_odd_CM_one_CX,
00220   S_even_CM_one_CX,
00221 
00222   /* CMC4 : pair of "CM C4" */
00223   S_one_CMC4,
00224   S_odd_CMC4,
00225   S_one_C4_odd_CMC4,
00226   S_even_CMC4,
00227   S_one_C4_even_CMC4,
00228 
00229   S_odd_CM_odd_CMC4,
00230   S_even_CM_odd_CMC4,
00231 
00232   S_odd_CM_even_CMC4,
00233   S_even_CM_even_CMC4,
00234 
00235   /* C4CM : pair of "C4 CM" */
00236   S_odd_C4CM,
00237   S_one_CM_odd_C4CM,
00238   S_even_C4CM,
00239   S_one_CM_even_C4CM,
00240 
00241   S_even_CM_odd_C4CM,
00242   S_odd_CM_odd_C4CM,
00243   S_even_CM_even_C4CM,
00244   S_odd_CM_even_C4CM
00245 };
00246 
00247 static UChar*
00248 gb18030_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00249 {
00250   const UChar *p;
00251   enum state state = S_START;
00252 
00253   DEBUG_GB18030(("----------------\n"));
00254   for (p = s; p >= start; p--) {
00255     DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
00256     switch (state) {
00257     case S_START:
00258       switch (GB18030_MAP[*p]) {
00259       case C1:
00260         return (UChar *)s;
00261       case C2:
00262         state = S_one_C2; /* C2 */
00263         break;
00264       case C4:
00265         state = S_one_C4; /* C4 */
00266         break;
00267       case CM:
00268         state = S_one_CM; /* CM */
00269         break;
00270       }
00271       break;
00272     case S_one_C2: /* C2 */
00273       switch (GB18030_MAP[*p]) {
00274       case C1:
00275       case C2:
00276       case C4:
00277         return (UChar *)s;
00278       case CM:
00279         state = S_odd_CM_one_CX; /* CM C2 */
00280         break;
00281       }
00282       break;
00283     case S_one_C4: /* C4 */
00284       switch (GB18030_MAP[*p]) {
00285       case C1:
00286       case C2:
00287       case C4:
00288         return (UChar *)s;
00289       case CM:
00290         state = S_one_CMC4;
00291         break;
00292       }
00293       break;
00294     case S_one_CM: /* CM */
00295       switch (GB18030_MAP[*p]) {
00296       case C1:
00297       case C2:
00298         return (UChar *)s;
00299       case C4:
00300         state = S_odd_C4CM;
00301         break;
00302       case CM:
00303         state = S_odd_CM_one_CX; /* CM CM */
00304         break;
00305       }
00306       break;
00307 
00308     case S_odd_CM_one_CX: /* CM C2 */ /* CM CM */ /* CM CM CM C4 */
00309       switch (GB18030_MAP[*p]) {
00310       case C1:
00311       case C2:
00312       case C4:
00313         return (UChar *)(s - 1);
00314       case CM:
00315         state = S_even_CM_one_CX;
00316         break;
00317       }
00318       break;
00319     case S_even_CM_one_CX: /* CM CM C2 */ /* CM CM CM */ /* CM CM C4 */
00320       switch (GB18030_MAP[*p]) {
00321       case C1:
00322       case C2:
00323       case C4:
00324         return (UChar *)s;
00325       case CM:
00326         state = S_odd_CM_one_CX;
00327         break;
00328       }
00329       break;
00330 
00331     case S_one_CMC4: /* CM C4 */
00332       switch (GB18030_MAP[*p]) {
00333       case C1:
00334       case C2:
00335         return (UChar *)(s - 1);
00336       case C4:
00337         state = S_one_C4_odd_CMC4; /* C4 CM C4 */
00338         break;
00339       case CM:
00340         state = S_even_CM_one_CX; /* CM CM C4 */
00341         break;
00342       }
00343       break;
00344     case S_odd_CMC4: /* CM C4 CM C4 CM C4 */
00345       switch (GB18030_MAP[*p]) {
00346       case C1:
00347       case C2:
00348         return (UChar *)(s - 1);
00349       case C4:
00350         state = S_one_C4_odd_CMC4;
00351         break;
00352       case CM:
00353         state = S_odd_CM_odd_CMC4;
00354         break;
00355       }
00356       break;
00357     case S_one_C4_odd_CMC4: /* C4 CM C4 */
00358       switch (GB18030_MAP[*p]) {
00359       case C1:
00360       case C2:
00361       case C4:
00362         return (UChar *)(s - 1);
00363       case CM:
00364         state = S_even_CMC4; /* CM C4 CM C4 */
00365         break;
00366       }
00367       break;
00368     case S_even_CMC4: /* CM C4 CM C4 */
00369       switch (GB18030_MAP[*p]) {
00370       case C1:
00371       case C2:
00372         return (UChar *)(s - 3);
00373       case C4:
00374         state = S_one_C4_even_CMC4;
00375         break;
00376       case CM:
00377         state = S_odd_CM_even_CMC4;
00378         break;
00379       }
00380       break;
00381     case S_one_C4_even_CMC4: /* C4 CM C4 CM C4 */
00382       switch (GB18030_MAP[*p]) {
00383       case C1:
00384       case C2:
00385       case C4:
00386         return (UChar *)(s - 3);
00387       case CM:
00388         state = S_odd_CMC4;
00389         break;
00390       }
00391       break;
00392 
00393     case S_odd_CM_odd_CMC4: /* CM CM C4 CM C4 CM C4 */
00394       switch (GB18030_MAP[*p]) {
00395       case C1:
00396       case C2:
00397       case C4:
00398         return (UChar *)(s - 3);
00399       case CM:
00400         state = S_even_CM_odd_CMC4;
00401         break;
00402       }
00403       break;
00404     case S_even_CM_odd_CMC4: /* CM CM CM C4 CM C4 CM C4 */
00405       switch (GB18030_MAP[*p]) {
00406       case C1:
00407       case C2:
00408       case C4:
00409         return (UChar *)(s - 1);
00410       case CM:
00411         state = S_odd_CM_odd_CMC4;
00412         break;
00413       }
00414       break;
00415 
00416     case S_odd_CM_even_CMC4: /* CM CM C4 CM C4 */
00417       switch (GB18030_MAP[*p]) {
00418       case C1:
00419       case C2:
00420       case C4:
00421         return (UChar *)(s - 1);
00422       case CM:
00423         state = S_even_CM_even_CMC4;
00424         break;
00425       }
00426       break;
00427     case S_even_CM_even_CMC4: /* CM CM CM C4 CM C4 */
00428       switch (GB18030_MAP[*p]) {
00429       case C1:
00430       case C2:
00431       case C4:
00432         return (UChar *)(s - 3);
00433       case CM:
00434         state = S_odd_CM_even_CMC4;
00435         break;
00436       }
00437       break;
00438 
00439     case S_odd_C4CM: /* C4 CM */  /* C4 CM C4 CM C4 CM*/
00440       switch (GB18030_MAP[*p]) {
00441       case C1:
00442       case C2:
00443       case C4:
00444         return (UChar *)s;
00445       case CM:
00446         state = S_one_CM_odd_C4CM; /* CM C4 CM */
00447         break;
00448       }
00449       break;
00450     case S_one_CM_odd_C4CM: /* CM C4 CM */ /* CM C4 CM C4 CM C4 CM */
00451       switch (GB18030_MAP[*p]) {
00452       case C1:
00453       case C2:
00454         return (UChar *)(s - 2); /* |CM C4 CM */
00455       case C4:
00456         state = S_even_C4CM;
00457         break;
00458       case CM:
00459         state = S_even_CM_odd_C4CM;
00460         break;
00461       }
00462       break;
00463     case S_even_C4CM: /* C4 CM C4 CM */
00464       switch (GB18030_MAP[*p]) {
00465       case C1:
00466       case C2:
00467       case C4:
00468         return (UChar *)(s - 2);  /* C4|CM C4 CM */
00469       case CM:
00470         state = S_one_CM_even_C4CM;
00471         break;
00472       }
00473       break;
00474     case S_one_CM_even_C4CM: /* CM C4 CM C4 CM */
00475       switch (GB18030_MAP[*p]) {
00476       case C1:
00477       case C2:
00478         return (UChar *)(s - 0);  /*|CM C4 CM C4|CM */
00479       case C4:
00480         state = S_odd_C4CM;
00481         break;
00482       case CM:
00483         state = S_even_CM_even_C4CM;
00484         break;
00485       }
00486       break;
00487 
00488     case S_even_CM_odd_C4CM: /* CM CM C4 CM */
00489       switch (GB18030_MAP[*p]) {
00490       case C1:
00491       case C2:
00492       case C4:
00493         return (UChar *)(s - 0); /* |CM CM|C4|CM */
00494       case CM:
00495         state = S_odd_CM_odd_C4CM;
00496         break;
00497       }
00498       break;
00499     case S_odd_CM_odd_C4CM: /* CM CM CM C4 CM */
00500       switch (GB18030_MAP[*p]) {
00501       case C1:
00502       case C2:
00503       case C4:
00504         return (UChar *)(s - 2); /* |CM CM|CM C4 CM */
00505       case CM:
00506         state = S_even_CM_odd_C4CM;
00507         break;
00508       }
00509       break;
00510 
00511     case S_even_CM_even_C4CM: /* CM CM C4 CM C4 CM */
00512       switch (GB18030_MAP[*p]) {
00513       case C1:
00514       case C2:
00515       case C4:
00516         return (UChar *)(s - 2); /* |CM CM|C4|CM C4 CM */
00517       case CM:
00518         state = S_odd_CM_even_C4CM;
00519         break;
00520       }
00521       break;
00522     case S_odd_CM_even_C4CM: /* CM CM CM C4 CM C4 CM */
00523       switch (GB18030_MAP[*p]) {
00524       case C1:
00525       case C2:
00526       case C4:
00527         return (UChar *)(s - 0);  /* |CM CM|CM C4 CM C4|CM */
00528       case CM:
00529         state = S_even_CM_even_C4CM;
00530         break;
00531       }
00532       break;
00533     }
00534   }
00535 
00536   DEBUG_GB18030(("state %d\n", state));
00537   switch (state) {
00538   case S_START:             return (UChar *)(s - 0);
00539   case S_one_C2:            return (UChar *)(s - 0);
00540   case S_one_C4:            return (UChar *)(s - 0);
00541   case S_one_CM:            return (UChar *)(s - 0);
00542 
00543   case S_odd_CM_one_CX:     return (UChar *)(s - 1);
00544   case S_even_CM_one_CX:    return (UChar *)(s - 0);
00545 
00546   case S_one_CMC4:          return (UChar *)(s - 1);
00547   case S_odd_CMC4:          return (UChar *)(s - 1);
00548   case S_one_C4_odd_CMC4:   return (UChar *)(s - 1);
00549   case S_even_CMC4:         return (UChar *)(s - 3);
00550   case S_one_C4_even_CMC4:  return (UChar *)(s - 3);
00551 
00552   case S_odd_CM_odd_CMC4:   return (UChar *)(s - 3);
00553   case S_even_CM_odd_CMC4:  return (UChar *)(s - 1);
00554 
00555   case S_odd_CM_even_CMC4:  return (UChar *)(s - 1);
00556   case S_even_CM_even_CMC4: return (UChar *)(s - 3);
00557 
00558   case S_odd_C4CM:          return (UChar *)(s - 0);
00559   case S_one_CM_odd_C4CM:   return (UChar *)(s - 2);
00560   case S_even_C4CM:         return (UChar *)(s - 2);
00561   case S_one_CM_even_C4CM:  return (UChar *)(s - 0);
00562 
00563   case S_even_CM_odd_C4CM:  return (UChar *)(s - 0);
00564   case S_odd_CM_odd_C4CM:   return (UChar *)(s - 2);
00565   case S_even_CM_even_C4CM: return (UChar *)(s - 2);
00566   case S_odd_CM_even_C4CM:  return (UChar *)(s - 0);
00567   }
00568 
00569   return (UChar* )s;  /* never come here. (escape warning) */
00570 }
00571 
00572 static int
00573 gb18030_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
00574 {
00575   return GB18030_MAP[*s] == C1 ? TRUE : FALSE;
00576 }
00577 
00578 OnigEncodingDefine(gb18030, GB18030) = {
00579   gb18030_mbc_enc_len,
00580   "GB18030",   /* name */
00581   4,          /* max enc length */
00582   1,          /* min enc length */
00583   onigenc_is_mbc_newline_0x0a,
00584   gb18030_mbc_to_code,
00585   onigenc_mb4_code_to_mbclen,
00586   gb18030_code_to_mbc,
00587   gb18030_mbc_case_fold,
00588   onigenc_ascii_apply_all_case_fold,
00589   onigenc_ascii_get_case_fold_codes_by_str,
00590   onigenc_minimum_property_name_to_ctype,
00591   gb18030_is_code_ctype,
00592   onigenc_not_support_get_ctype_code_range,
00593   gb18030_left_adjust_char_head,
00594   gb18030_is_allowed_reverse_match
00595 };
00596 
00597 

Generated on Wed Aug 10 09:15:05 2011 for Ruby by  doxygen 1.4.7