00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "regenc.h"
00031
/*
 * Per-lead-byte length table for plain Big5, indexed by the first byte of a
 * character.  Entries 0xA1..0xFE hold 2; every other entry holds 1.
 * The row labels give the index of the first entry in each row.
 */
static const int EncLen_BIG5[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xA0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
/*
 * Per-lead-byte length table for Big5-HKSCS, indexed by the first byte of a
 * character.  Entries 0x87..0xFE hold 2; every other entry holds 1.
 * The row labels give the index of the first entry in each row.
 */
static const int EncLen_BIG5_HKSCS[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xA0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
};
00069
/*
 * Per-lead-byte length table for Big5-UAO, indexed by the first byte of a
 * character.  Entries 0x81..0xFE hold 2; every other entry holds 1.
 * The row labels give the index of the first entry in each row.
 */
static const int EncLen_BIG5_UAO[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xA0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
};
00089
/*
 * States of the byte recognizer driven by trans[][] below.  Negative values
 * are terminal verdicts; non-negative values are row indices into trans.
 */
typedef enum {
  FAILURE = -2,
  ACCEPT  = -1,
  S0      = 0,
  S1
} state_t;

/* Short aliases so every table row fits on one line; removed again below. */
#define A ACCEPT
#define F FAILURE

/*
 * trans[row][byte] is the next state after consuming `byte` while in `row`.
 *   row 0: first byte, used by big5_mbc_enc_len
 *   row 1: second byte (the state reached through the `1` entries)
 *   row 2: first byte, used by big5_hkscs_mbc_enc_len and big5_uao_mbc_enc_len
 * The row labels inside each table give the index of the first entry.
 */
static const signed char trans[][0x100] = {
  { /* row 0 */
    /* 0x00 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x10 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x20 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x30 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x80 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x90 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0xA0 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  },
  { /* row 1 */
    /* 0x00 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x10 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x20 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x30 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
    /* 0x80 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x90 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0xA0 */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xB0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xC0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xD0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xE0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xF0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
  },
  { /* row 2 */
    /* 0x00 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x10 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x20 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x30 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x80 */ F, F, F, F, F, F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  }
};

#undef A
#undef F
00151
00152 static int
00153 big5_mbc_enc_len0(const UChar* p, const UChar* e, int tridx, const int tbl[])
00154 {
00155 int firstbyte = *p++;
00156 state_t s = trans[tridx][firstbyte];
00157 #define RETURN(n) \
00158 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) : \
00159 ONIGENC_CONSTRUCT_MBCLEN_INVALID()
00160 if (s < 0) RETURN(1);
00161 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(tbl[firstbyte]-1);
00162 s = trans[s][*p++];
00163 RETURN(2);
00164 #undef RETURN
00165 }
00166
00167 static int
00168 big5_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00169 {
00170 return big5_mbc_enc_len0(p, e, 0, EncLen_BIG5);
00171 }
00172
00173 static int
00174 big5_hkscs_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00175 {
00176 return big5_mbc_enc_len0(p, e, 2, EncLen_BIG5_HKSCS);
00177 }
00178
00179 static int
00180 big5_uao_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00181 {
00182 return big5_mbc_enc_len0(p, e, 2, EncLen_BIG5_UAO);
00183 }
00184
00185 static OnigCodePoint
00186 big5_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00187 {
00188 return onigenc_mbn_mbc_to_code(enc, p, end);
00189 }
00190
00191 static int
00192 big5_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00193 {
00194 return onigenc_mb2_code_to_mbc(enc, code, buf);
00195 }
00196
00197 static int
00198 big5_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
00199 UChar* lower, OnigEncoding enc)
00200 {
00201 return onigenc_mbn_mbc_case_fold(enc, flag,
00202 pp, end, lower);
00203 }
00204
#if 0
/*
 * Compiled out by the surrounding #if 0.  When enabled it would forward to
 * onigenc_mbn_is_mbc_ambiguous and return that result unchanged.
 */
static int
big5_is_mbc_ambiguous(OnigCaseFoldType flag,
                      const UChar** pp, const UChar* end, OnigEncoding enc)
{
  const int result = onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end);

  return result;
}
#endif
00213
00214 static int
00215 big5_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
00216 {
00217 return onigenc_mb2_is_code_ctype(enc, code, ctype);
00218 }
00219
/*
 * Flag table indexed by byte value, read through BIG5_ISMB_TRAIL.
 * Entries 0x40..0x7E and 0x80..0xFE are 1; every other entry is 0.
 * The row labels give the index of the first entry in each row.
 */
static const char BIG5_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};
00238
/*
 * Variant predicates: an encoding is recognised as Big5-HKSCS or Big5-UAO by
 * comparing its precise_mbc_enc_len callback with the variant's length
 * function.
 */
#define BIG5_HKSCS_P(enc) ((enc)->precise_mbc_enc_len == big5_hkscs_mbc_enc_len)
#define BIG5_UAO_P(enc) ((enc)->precise_mbc_enc_len == big5_uao_mbc_enc_len)

/*
 * Non-zero when `byte` starts a two-byte character in the current encoding.
 * The lead-byte table is chosen per variant: EncLen_BIG5_HKSCS for
 * Big5-HKSCS, EncLen_BIG5_UAO for Big5-UAO, EncLen_BIG5 otherwise.
 * Previously Big5-UAO fell through to EncLen_BIG5, so its lead bytes
 * 0x81..0xA0 were not treated as lead bytes.
 *
 * NOTE: this macro reads a variable named `enc` from the scope in which it
 * is expanded; it is not a macro parameter.  `byte` is evaluated exactly
 * once, because only one branch of the conditional chain is taken.
 */
#define BIG5_ISMB_FIRST(byte) ( \
  BIG5_HKSCS_P(enc) ? EncLen_BIG5_HKSCS[byte] > 1 : \
  BIG5_UAO_P(enc) ? EncLen_BIG5_UAO[byte] > 1 : \
  EncLen_BIG5[byte] > 1 \
  )

/* Non-zero when BIG5_CAN_BE_TRAIL_TABLE flags `byte`. */
#define BIG5_ISMB_TRAIL(byte) BIG5_CAN_BE_TRAIL_TABLE[(byte)]
00247
00248 static UChar*
00249 big5_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00250 {
00251 const UChar *p;
00252 int len;
00253
00254 if (s <= start) return (UChar* )s;
00255 p = s;
00256
00257 if (BIG5_ISMB_TRAIL(*p)) {
00258 while (p > start) {
00259 if (! BIG5_ISMB_FIRST(*--p)) {
00260 p++;
00261 break;
00262 }
00263 }
00264 }
00265 len = enclen(enc, p, end);
00266 if (p + len > s) return (UChar* )p;
00267 p += len;
00268 return (UChar* )(p + ((s - p) & ~1));
00269 }
00270
00271 static int
00272 big5_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
00273 {
00274 const UChar c = *s;
00275
00276 return (BIG5_ISMB_TRAIL(c) ? FALSE : TRUE);
00277 }
00278
00279
00280
00281
00282
00283
00284
00285
00286 OnigEncodingDefine(big5, BIG5) = {
00287 big5_mbc_enc_len,
00288 "Big5",
00289 2,
00290 1,
00291 onigenc_is_mbc_newline_0x0a,
00292 big5_mbc_to_code,
00293 onigenc_mb2_code_to_mbclen,
00294 big5_code_to_mbc,
00295 big5_mbc_case_fold,
00296 onigenc_ascii_apply_all_case_fold,
00297 onigenc_ascii_get_case_fold_codes_by_str,
00298 onigenc_minimum_property_name_to_ctype,
00299 big5_is_code_ctype,
00300 onigenc_not_support_get_ctype_code_range,
00301 big5_left_adjust_char_head,
00302 big5_is_allowed_reverse_match
00303 };
00304 ENC_ALIAS("CP950", "Big5")
00305
00306
00307
00308
00309
00310
00311
00312 OnigEncodingDefine(big5_hkscs, BIG5_HKSCS) = {
00313 big5_hkscs_mbc_enc_len,
00314 "Big5-HKSCS",
00315 2,
00316 1,
00317 onigenc_is_mbc_newline_0x0a,
00318 big5_mbc_to_code,
00319 onigenc_mb2_code_to_mbclen,
00320 big5_code_to_mbc,
00321 big5_mbc_case_fold,
00322 onigenc_ascii_apply_all_case_fold,
00323 onigenc_ascii_get_case_fold_codes_by_str,
00324 onigenc_minimum_property_name_to_ctype,
00325 big5_is_code_ctype,
00326 onigenc_not_support_get_ctype_code_range,
00327 big5_left_adjust_char_head,
00328 big5_is_allowed_reverse_match
00329 };
00330 ENC_ALIAS("CP951", "Big5-HKSCS")
00331
00332
00333
00334
00335
00336 OnigEncodingDefine(big5_uao, BIG5_UAO) = {
00337 big5_uao_mbc_enc_len,
00338 "Big5-UAO",
00339 2,
00340 1,
00341 onigenc_is_mbc_newline_0x0a,
00342 big5_mbc_to_code,
00343 onigenc_mb2_code_to_mbclen,
00344 big5_code_to_mbc,
00345 big5_mbc_case_fold,
00346 onigenc_ascii_apply_all_case_fold,
00347 onigenc_ascii_get_case_fold_codes_by_str,
00348 onigenc_minimum_property_name_to_ctype,
00349 big5_is_code_ctype,
00350 onigenc_not_support_get_ctype_code_range,
00351 big5_left_adjust_char_head,
00352 big5_is_allowed_reverse_match
00353 };
00354