00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #define RUBY_NKF_REVISION "$Revision: 27947 $"
00011 #define RUBY_NKF_VERSION NKF_VERSION " (" NKF_RELEASE_DATE ")"
00012
#include "ruby/ruby.h"
#include "ruby/encoding.h"

#include <limits.h>
00015
00016
00017
00018
/*
 * Replace the stdio character primitives with in-memory versions.
 *
 * getc() yields the next byte of `input` and advances `input_ctr`, or -1
 * once `input_ctr` has reached `i_len`.  ungetc() simply steps `input_ctr`
 * back by one.  Both ignore their stream argument, and ungetc() also
 * ignores the character it is given.
 */
#undef getc
#undef ungetc
#define getc(f)      ((input_ctr < i_len) ? input[input_ctr++] : -1)
#define ungetc(c, f) (input_ctr--)

/* Starting value for `incsize`, the output buffer growth step. */
#define INCSIZE 32

/*
 * Route putchar() to rb_nkf_putchar().  TRUE and FALSE are undefined here
 * as well -- presumably so the nkf sources included further down can
 * define their own (TODO confirm).
 */
#undef putchar
#undef TRUE
#undef FALSE
#define putchar(c) rb_nkf_putchar(c)
00029
00030
00031
00032 static unsigned char *output;
00033 static unsigned char *input;
00034 static int input_ctr;
00035 static int i_len;
00036 static int output_ctr;
00037 static int o_len;
00038 static int incsize;
00039
00040 static VALUE result;
00041
00042 static int
00043 rb_nkf_putchar(unsigned int c)
00044 {
00045 if (output_ctr >= o_len) {
00046 o_len += incsize;
00047 rb_str_resize(result, o_len);
00048 incsize *= 2;
00049 output = (unsigned char *)RSTRING_PTR(result);
00050 }
00051 output[output_ctr++] = c;
00052
00053 return c;
00054 }
00055
00056
00057
00058
00059 #define PERL_XS 1
00060 #include "nkf-utf8/config.h"
00061 #include "nkf-utf8/utf8tbl.c"
00062 #include "nkf-utf8/nkf.c"
00063
00064 rb_encoding* rb_nkf_enc_get(const char *name)
00065 {
00066 int idx = rb_enc_find_index(name);
00067 if (idx < 0) {
00068 nkf_encoding *nkf_enc = nkf_enc_find(name);
00069 idx = rb_enc_find_index(nkf_enc_name(nkf_enc_to_base_encoding(nkf_enc)));
00070 if (idx < 0) {
00071 idx = rb_define_dummy_encoding(name);
00072 }
00073 }
00074 return rb_enc_from_index(idx);
00075 }
00076
00077 int nkf_split_options(const char *arg)
00078 {
00079 int count = 0;
00080 unsigned char option[256];
00081 int i = 0, j = 0;
00082 int is_escaped = FALSE;
00083 int is_single_quoted = FALSE;
00084 int is_double_quoted = FALSE;
00085 for(i = 0; arg[i]; i++){
00086 if(j == 255){
00087 return -1;
00088 }else if(is_single_quoted){
00089 if(arg[i] == '\''){
00090 is_single_quoted = FALSE;
00091 }else{
00092 option[j++] = arg[i];
00093 }
00094 }else if(is_escaped){
00095 is_escaped = FALSE;
00096 option[j++] = arg[i];
00097 }else if(arg[i] == '\\'){
00098 is_escaped = TRUE;
00099 }else if(is_double_quoted){
00100 if(arg[i] == '"'){
00101 is_double_quoted = FALSE;
00102 }else{
00103 option[j++] = arg[i];
00104 }
00105 }else if(arg[i] == '\''){
00106 is_single_quoted = TRUE;
00107 }else if(arg[i] == '"'){
00108 is_double_quoted = TRUE;
00109 }else if(arg[i] == ' '){
00110 option[j] = '\0';
00111 options(option);
00112 j = 0;
00113 }else{
00114 option[j++] = arg[i];
00115 }
00116 }
00117 if(j){
00118 option[j] = '\0';
00119 options(option);
00120 }
00121 return count;
00122 }
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135 static VALUE
00136 rb_nkf_convert(VALUE obj, VALUE opt, VALUE src)
00137 {
00138 volatile VALUE tmp;
00139 reinit();
00140 StringValue(opt);
00141 nkf_split_options(RSTRING_PTR(opt));
00142 if (!output_encoding) rb_raise(rb_eArgError, "no output encoding given");
00143
00144 switch (nkf_enc_to_index(output_encoding)) {
00145 case UTF_8_BOM: output_encoding = nkf_enc_from_index(UTF_8); break;
00146 case UTF_16BE_BOM: output_encoding = nkf_enc_from_index(UTF_16BE); break;
00147 case UTF_16LE_BOM: output_encoding = nkf_enc_from_index(UTF_16LE); break;
00148 case UTF_32BE_BOM: output_encoding = nkf_enc_from_index(UTF_32BE); break;
00149 case UTF_32LE_BOM: output_encoding = nkf_enc_from_index(UTF_32LE); break;
00150 }
00151 output_bom_f = FALSE;
00152
00153 incsize = INCSIZE;
00154
00155 input_ctr = 0;
00156 StringValue(src);
00157 input = (unsigned char *)RSTRING_PTR(src);
00158 i_len = RSTRING_LENINT(src);
00159 tmp = result = rb_str_new(0, i_len*3 + 10);
00160
00161 output_ctr = 0;
00162 output = (unsigned char *)RSTRING_PTR(result);
00163 o_len = RSTRING_LENINT(result);
00164 *output = '\0';
00165
00166 kanji_convert(NULL);
00167 rb_str_set_len(result, output_ctr);
00168 OBJ_INFECT(result, src);
00169
00170 if (mimeout_f)
00171 rb_enc_associate(result, rb_usascii_encoding());
00172 else
00173 rb_enc_associate(result, rb_nkf_enc_get(nkf_enc_name(output_encoding)));
00174
00175 return result;
00176 }
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187 static VALUE
00188 rb_nkf_guess(VALUE obj, VALUE src)
00189 {
00190 reinit();
00191
00192 input_ctr = 0;
00193 StringValue(src);
00194 input = (unsigned char *)RSTRING_PTR(src);
00195 i_len = RSTRING_LENINT(src);
00196
00197 guess_f = TRUE;
00198 kanji_convert( NULL );
00199 guess_f = FALSE;
00200
00201 return rb_enc_from_encoding(rb_nkf_enc_get(get_guessed_code()));
00202 }
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330
00331
00332
00333
00334
00335
00336
00337
00338
00339
00340
00341
00342
00343
00344
00345
00346
00347
00348
00349
00350
00351
00352
00353
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369
00370
00371
00372
00373
00374
00375
00376
00377
00378
00379
00380
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475 void
00476 Init_nkf()
00477 {
00478 VALUE mNKF = rb_define_module("NKF");
00479
00480 rb_define_module_function(mNKF, "nkf", rb_nkf_convert, 2);
00481 rb_define_module_function(mNKF, "guess", rb_nkf_guess, 1);
00482 rb_define_alias(rb_singleton_class(mNKF), "guess", "guess");
00483
00484 rb_define_const(mNKF, "AUTO", Qnil);
00485 rb_define_const(mNKF, "NOCONV", Qnil);
00486 rb_define_const(mNKF, "UNKNOWN", Qnil);
00487 rb_define_const(mNKF, "BINARY", rb_enc_from_encoding(rb_nkf_enc_get("BINARY")));
00488 rb_define_const(mNKF, "ASCII", rb_enc_from_encoding(rb_nkf_enc_get("US-ASCII")));
00489 rb_define_const(mNKF, "JIS", rb_enc_from_encoding(rb_nkf_enc_get("ISO-2022-JP")));
00490 rb_define_const(mNKF, "EUC", rb_enc_from_encoding(rb_nkf_enc_get("EUC-JP")));
00491 rb_define_const(mNKF, "SJIS", rb_enc_from_encoding(rb_nkf_enc_get("Shift_JIS")));
00492 rb_define_const(mNKF, "UTF8", rb_enc_from_encoding(rb_utf8_encoding()));
00493 rb_define_const(mNKF, "UTF16", rb_enc_from_encoding(rb_nkf_enc_get("UTF-16BE")));
00494 rb_define_const(mNKF, "UTF32", rb_enc_from_encoding(rb_nkf_enc_get("UTF-32BE")));
00495
00496
00497 rb_define_const(mNKF, "VERSION", rb_str_new2(RUBY_NKF_VERSION));
00498
00499 rb_define_const(mNKF, "NKF_VERSION", rb_str_new2(NKF_VERSION));
00500
00501 rb_define_const(mNKF, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE));
00502 }
00503