00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "regenc.h"
00031
/*
 * Encoded length of a GBK character, indexed by the value of its first byte.
 *
 *   0x00-0x80, 0xFF : 1  (single-byte character)
 *   0x81-0xFE       : 2  (first byte of a two-byte character)
 */
static const int EncLen_GBK[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xA0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xB0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
00050
/*
 * Flags, indexed by byte value, telling whether that byte may appear as the
 * second (trailing) byte of a two-byte GBK character.
 *
 *   0x40-0x7E, 0x80-0xFE : 1
 *   everything else      : 0  (including 0x7F and 0xFF)
 */
static const char GBK_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};
00069
/* Non-zero when `byte` starts a multi-byte character (EncLen_GBK entry > 1). */
#define GBK_ISMB_FIRST(byte)  (EncLen_GBK[(byte)] > 1)
/* Non-zero when `byte` is flagged in GBK_CAN_BE_TRAIL_TABLE as a possible trail byte. */
#define GBK_ISMB_TRAIL(byte)  (GBK_CAN_BE_TRAIL_TABLE[(byte)])
00072
/*
 * States of the byte-sequence validator.  Negative values are terminal;
 * non-negative values are row indices into `trans`.
 */
typedef enum {
  FAILURE = -2,   /* byte sequence is not valid */
  ACCEPT  = -1,   /* a complete character has been read */
  S0      = 0,    /* initial state: looking at the first byte */
  S1              /* a lead byte was read: looking at the second byte */
} state_t;

/* Short aliases, local to the table below, to keep its rows compact. */
#define A ACCEPT
#define F FAILURE
/*
 * State transition table: trans[state][byte] gives the next state.
 *
 *   row S0: 0x00-0x80 -> ACCEPT, 0x81-0xFE -> S1, 0xFF -> FAILURE
 *   row S1: 0x40-0x7E and 0x80-0xFE -> ACCEPT, everything else -> FAILURE
 */
static const signed char trans[2][0x100] = {
  { /* S0 */
    /* 0x00 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x10 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x20 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x30 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x80 */ A, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  },
  { /* S1 */
    /* 0x00 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x10 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x20 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x30 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
    /* 0x80 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x90 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xA0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xB0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xC0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xD0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xE0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xF0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
  }
};
#undef A
#undef F
00116
00117 static int
00118 gbk_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00119 {
00120 int firstbyte = *p++;
00121 state_t s = trans[0][firstbyte];
00122 #define RETURN(n) \
00123 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) : \
00124 ONIGENC_CONSTRUCT_MBCLEN_INVALID()
00125 if (s < 0) RETURN(1);
00126 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_GBK[firstbyte]-1);
00127 s = trans[s][*p++];
00128 RETURN(2);
00129 #undef RETURN
00130 }
00131
00132 static OnigCodePoint
00133 gbk_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00134 {
00135 return onigenc_mbn_mbc_to_code(enc, p, end);
00136 }
00137
00138 static int
00139 gbk_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00140 {
00141 return onigenc_mb2_code_to_mbc(enc, code, buf);
00142 }
00143
00144 static int
00145 gbk_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end,
00146 UChar* lower, OnigEncoding enc)
00147 {
00148 return onigenc_mbn_mbc_case_fold(enc, flag,
00149 pp, end, lower);
00150 }
00151
#if 0
/*
 * Disabled: excluded from compilation by the surrounding #if 0.
 * Would delegate entirely to onigenc_mbn_is_mbc_ambiguous().
 */
static int
gbk_is_mbc_ambiguous(OnigCaseFoldType flag,
                     const UChar** pp, const UChar* end, OnigEncoding enc)
{
  int result = onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end);

  return result;
}
#endif
00160
00161 static int
00162 gbk_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
00163 {
00164 return onigenc_mb2_is_code_ctype(enc, code, ctype);
00165 }
00166
00167 static UChar*
00168 gbk_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00169 {
00170 const UChar *p;
00171 int len;
00172
00173 if (s <= start) return (UChar* )s;
00174 p = s;
00175
00176 if (GBK_ISMB_TRAIL(*p)) {
00177 while (p > start) {
00178 if (! GBK_ISMB_FIRST(*--p)) {
00179 p++;
00180 break;
00181 }
00182 }
00183 }
00184 len = enclen(enc, p, end);
00185 if (p + len > s) return (UChar* )p;
00186 p += len;
00187 return (UChar* )(p + ((s - p) & ~1));
00188 }
00189
00190 static int
00191 gbk_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
00192 {
00193 const UChar c = *s;
00194 return (GBK_ISMB_TRAIL(c) ? FALSE : TRUE);
00195 }
00196
/*
 * Encoding descriptor for GBK.
 *
 * This is a positional initializer: the meaning of each slot is fixed by the
 * encoding structure that OnigEncodingDefine() declares, which is defined
 * outside this file.  Entries named gbk_* are implemented in this file;
 * entries named onigenc_* come from elsewhere.
 */
OnigEncodingDefine(gbk, GBK) = {
gbk_mbc_enc_len,
"GBK", /* encoding name */
2, /* NOTE(review): presumably the maximum bytes per character (EncLen_GBK tops out at 2) -- confirm */
1, /* NOTE(review): presumably the minimum bytes per character -- confirm */
onigenc_is_mbc_newline_0x0a,
gbk_mbc_to_code,
onigenc_mb2_code_to_mbclen,
gbk_code_to_mbc,
gbk_mbc_case_fold,
onigenc_ascii_apply_all_case_fold,
onigenc_ascii_get_case_fold_codes_by_str,
onigenc_minimum_property_name_to_ctype,
gbk_is_code_ctype,
onigenc_not_support_get_ctype_code_range,
gbk_left_adjust_char_head,
gbk_is_allowed_reverse_match
};
00215
00216
00217
00218
00219
00220
00221
/* NOTE(review): presumably makes "CP936" an alias name for the "GBK" encoding;
 * ENC_ALIAS is defined outside this file -- confirm. */
ENC_ALIAS("CP936", "GBK")
00223