Ruby: ext/nkf/nkf.c Source File

00001 /*
00002  *  NKF - Ruby extension for Network Kanji Filter
00003  *
00004  *  original nkf2.x is maintained at http://sourceforge.jp/projects/nkf/
00005  *
00006  *  $Id: nkf.c 27947 2010-05-21 10:11:44Z nobu $
00007  *
00008  */
00009 
00010 #define RUBY_NKF_REVISION "$Revision: 27947 $"
00011 #define RUBY_NKF_VERSION NKF_VERSION " (" NKF_RELEASE_DATE ")"
00012 
00013 #include "ruby/ruby.h"
00014 #include "ruby/encoding.h"
00015 
/*
 * Redirect the stdio-style character I/O used by the nkf sources included
 * further down: reads are served from the in-memory `input` buffer and
 * writes are routed through rb_nkf_putchar().
 */

#undef getc
#undef ungetc
/* Next input byte, or -1 once input_ctr has reached i_len; f is ignored. */
#define getc(f)         (input_ctr < i_len ? input[input_ctr++] : -1)
/* Step the read cursor back by one byte; both arguments are ignored. */
#define ungetc(c,f)     input_ctr--

/* Starting value for the output buffer growth step (see incsize). */
#define INCSIZE         32
#undef putchar
#undef TRUE
#undef FALSE
#define putchar(c)      rb_nkf_putchar(c)
00029 
00030 /* Input/Output pointers */
00031 
00032 static unsigned char *output;
00033 static unsigned char *input;
00034 static int input_ctr;
00035 static int i_len;
00036 static int output_ctr;
00037 static int o_len;
00038 static int incsize;
00039 
00040 static VALUE result;
00041 
00042 static int
00043 rb_nkf_putchar(unsigned int c)
00044 {
00045   if (output_ctr >= o_len) {
00046     o_len += incsize;
00047     rb_str_resize(result, o_len);
00048     incsize *= 2;
00049     output = (unsigned char *)RSTRING_PTR(result);
00050   }
00051   output[output_ctr++] = c;
00052 
00053   return c;
00054 }
00055 
00056 /* Include kanji filter main part */
00057 /* getchar and putchar will be replaced during inclusion */
00058 
00059 #define PERL_XS 1
00060 #include "nkf-utf8/config.h"
00061 #include "nkf-utf8/utf8tbl.c"
00062 #include "nkf-utf8/nkf.c"
00063 
00064 rb_encoding* rb_nkf_enc_get(const char *name)
00065 {
00066     int idx = rb_enc_find_index(name);
00067     if (idx < 0) {
00068         nkf_encoding *nkf_enc = nkf_enc_find(name);
00069         idx = rb_enc_find_index(nkf_enc_name(nkf_enc_to_base_encoding(nkf_enc)));
00070         if (idx < 0) {
00071             idx = rb_define_dummy_encoding(name);
00072         }
00073     }
00074     return rb_enc_from_index(idx);
00075 }
00076 
00077 int nkf_split_options(const char *arg)
00078 {
00079     int count = 0;
00080     unsigned char option[256];
00081     int i = 0, j = 0;
00082     int is_escaped = FALSE;
00083     int is_single_quoted = FALSE;
00084     int is_double_quoted = FALSE;
00085     for(i = 0; arg[i]; i++){
00086         if(j == 255){
00087             return -1;
00088         }else if(is_single_quoted){
00089             if(arg[i] == '\''){
00090                 is_single_quoted = FALSE;
00091             }else{
00092                 option[j++] = arg[i];
00093             }
00094         }else if(is_escaped){
00095             is_escaped = FALSE;
00096             option[j++] = arg[i];
00097         }else if(arg[i] == '\\'){
00098             is_escaped = TRUE;
00099         }else if(is_double_quoted){
00100             if(arg[i] == '"'){
00101                 is_double_quoted = FALSE;
00102             }else{
00103                 option[j++] = arg[i];
00104             }
00105         }else if(arg[i] == '\''){
00106             is_single_quoted = TRUE;
00107         }else if(arg[i] == '"'){
00108             is_double_quoted = TRUE;
00109         }else if(arg[i] == ' '){
00110             option[j] = '\0';
00111             options(option);
00112             j = 0;
00113         }else{
00114             option[j++] = arg[i];
00115         }
00116     }
00117     if(j){
00118         option[j] = '\0';
00119         options(option);
00120     }
00121     return count;
00122 }
00123 
00124 /*
00125  *  call-seq:
00126  *     NKF.nkf(opt, str)   => string
00127  *
00128  *  Convert _str_ and return converted result.
00129  *  Conversion details are specified by _opt_ as String.
00130  *
00131  *     require 'nkf'
00132  *     output = NKF.nkf("-s", input)
00133  */
00134 
00135 static VALUE
00136 rb_nkf_convert(VALUE obj, VALUE opt, VALUE src)
00137 {
00138     volatile VALUE tmp;
00139     reinit();
00140     StringValue(opt);
00141     nkf_split_options(RSTRING_PTR(opt));
00142     if (!output_encoding) rb_raise(rb_eArgError, "no output encoding given");
00143 
00144     switch (nkf_enc_to_index(output_encoding)) {
00145     case UTF_8_BOM:    output_encoding = nkf_enc_from_index(UTF_8); break;
00146     case UTF_16BE_BOM: output_encoding = nkf_enc_from_index(UTF_16BE); break;
00147     case UTF_16LE_BOM: output_encoding = nkf_enc_from_index(UTF_16LE); break;
00148     case UTF_32BE_BOM: output_encoding = nkf_enc_from_index(UTF_32BE); break;
00149     case UTF_32LE_BOM: output_encoding = nkf_enc_from_index(UTF_32LE); break;
00150     }
00151     output_bom_f = FALSE;
00152 
00153     incsize = INCSIZE;
00154 
00155     input_ctr = 0;
00156     StringValue(src);
00157     input = (unsigned char *)RSTRING_PTR(src);
00158     i_len = RSTRING_LENINT(src);
00159     tmp = result = rb_str_new(0, i_len*3 + 10);
00160 
00161     output_ctr = 0;
00162     output     = (unsigned char *)RSTRING_PTR(result);
00163     o_len      = RSTRING_LENINT(result);
00164     *output    = '\0';
00165 
00166     kanji_convert(NULL);
00167     rb_str_set_len(result, output_ctr);
00168     OBJ_INFECT(result, src);
00169 
00170     if (mimeout_f)
00171         rb_enc_associate(result, rb_usascii_encoding());
00172     else
00173         rb_enc_associate(result, rb_nkf_enc_get(nkf_enc_name(output_encoding)));
00174 
00175     return result;
00176 }
00177 
00178 
00179 /*
00180  *  call-seq:
00181  *     NKF.guess(str)  => encoding
00182  *
00183  *  Returns guessed encoding of _str_ by nkf routine.
00184  *
00185  */
00186 
00187 static VALUE
00188 rb_nkf_guess(VALUE obj, VALUE src)
00189 {
00190     reinit();
00191 
00192     input_ctr = 0;
00193     StringValue(src);
00194     input = (unsigned char *)RSTRING_PTR(src);
00195     i_len = RSTRING_LENINT(src);
00196 
00197     guess_f = TRUE;
00198     kanji_convert( NULL );
00199     guess_f = FALSE;
00200 
00201     return rb_enc_from_encoding(rb_nkf_enc_get(get_guessed_code()));
00202 }
00203 
00204 
00205 /*
00206  *  NKF - Ruby extension for Network Kanji Filter
00207  *
00208  *  == Description
00209  *
00210  *  This is a Ruby Extension version of nkf (Network Kanji Filter).
00211  *  It converts the second argument and returns the converted result. Conversion
00212  *  details are specified by flags given as the first argument.
00213  *
00214  *  *Nkf* is yet another kanji code converter among networks, hosts and terminals.
00215  *  It converts input kanji code to designated kanji code
00216  *  such as ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 or UTF-16.
00217  *
00218  *  One of the most distinctive features of *nkf* is its guessing of the input kanji encoding.
00219  *  It currently recognizes ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 and UTF-16.
00220  *  So users needn't set the input kanji code explicitly.
00221  *
00222  *  By default, X0201 kana is converted into X0208 kana.
00223  *  For X0201 kana, SO/SI, SSO and ESC-(-I methods are supported.
00224  *  For automatic code detection, nkf assumes no X0201 kana in Shift_JIS.
00225  *  To accept X0201 in Shift_JIS, use <b>-X</b>, <b>-x</b> or <b>-S</b>.
00226  *
00227  *  == Flags
00228  *
00229  *  === -b -u
00230  *
00231  *  Output is buffered (DEFAULT), Output is unbuffered.
00232  *
00233  *  === -j -s -e -w -w16 -w32
00234  *
00235  *  Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP,
00236  *  UTF-8N, UTF-16BE, UTF-32BE.
00237  *  Without this option and compile option, ISO-2022-JP is assumed.
00238  *
00239  *  === -J -S -E -W -W16 -W32
00240  *
00241  *  Input assumption is JIS 7 bit, Shift_JIS, EUC-JP,
00242  *  UTF-8, UTF-16, UTF-32.
00243  *
00244  *  ==== -J
00245  *
00246  *  Assume  JIS input. It also accepts EUC-JP.
00247  *  This is the default. This flag does not exclude Shift_JIS.
00248  *
00249  *  ==== -S
00250  *
00251  *  Assume Shift_JIS and X0201 kana input. It also accepts JIS.
00252  *  EUC-JP is recognized as X0201 kana. Without <b>-x</b> flag,
00253  *  X0201 kana (halfwidth kana) is converted into X0208.
00254  *
00255  *  ==== -E
00256  *
00257  *  Assume EUC-JP input. It also accepts JIS.
00258  *  Same as -J.
00259  *
00260  *  === -t
00261  *
00262  *  No conversion.
00263  *
00264  *  === -i_
00265  *
00266  *  Output sequence to designate JIS-kanji. (DEFAULT B)
00267  *
00268  *  === -o_
00269  *
00270  *  Output sequence to designate ASCII. (DEFAULT B)
00271  *
00272  *  === -r
00273  *
00274  *  {de/en}crypt ROT13/47
00275  *
00276  *  === -h[123] --hiragana --katakana --katakana-hiragana
00277  *
00278  *  [-h1 --hiragana] Katakana to Hiragana conversion.
00279  *
00280  *  [-h2 --katakana] Hiragana to Katakana conversion.
00281  *
00282  *  [-h3 --katakana-hiragana] Katakana to Hiragana and Hiragana to Katakana conversion.
00283  *
00284  *  === -T
00285  *
00286  *  Text mode output (MS-DOS)
00287  *
00288  *  === -l
00289  *
00290  *  ISO8859-1 (Latin-1) support
00291  *
00292  *  === -f[<code>m</code> [- <code>n</code>]]
00293  *
00294  *  Folding on <code>m</code> length with <code>n</code> margin in a line.
00295  *  Without this option, fold length is 60 and fold margin is 10.
00296  *
00297  *  === -F
00298  *
00299  *  New line preserving line folding.
00300  *
00301  *  === -Z[0-3]
00302  *
00303  *  Convert X0208 alphabet (Fullwidth Alphabets) to ASCII.
00304  *
00305  *  [-Z -Z0] Convert X0208 alphabet to ASCII.
00306  *
00307  *  [-Z1] Converts X0208 kankaku to single ASCII space.
00308  *
00309  *  [-Z2] Converts X0208 kankaku to double ASCII spaces.
00310  *
00311  *  [-Z3] Replacing Fullwidth >, <, ", & into '&gt;', '&lt;', '&quot;', '&amp;' as in HTML.
00312  *
00313  *  === -X -x
00314  *
00315  *  Assume X0201 kana in MS-Kanji.
00316  *  With <b>-X</b> or without this option, X0201 is converted into X0208 Kana.
00317  *  With <b>-x</b>, try to preserve X0208 kana and do not convert X0201 kana to X0208.
00318  *  In JIS output, ESC-(-I is used. In EUC output, SSO is used.
00319  *
00320  *  === -B[0-2]
00321  *
00322  *  Assume broken JIS-Kanji input, which lost ESC.
00323  *  Useful when your site is using old B-News Nihongo patch.
00324  *
00325  *  [-B1] allows any char after ESC-( or ESC-$.
00326  *
00327  *  [-B2] forces ASCII after NL.
00328  *
00329  *  === -I
00330  *
00331  *  Replacing non iso-2022-jp char into a geta character
00332  *  (substitute character in Japanese).
00333  *
00334  *  === -d -c
00335  *
00336  *  Delete \r in line feed, Add \r in line feed.
00337  *
00338  *  === -m[BQN0]
00339  *
00340  *  MIME ISO-2022-JP/ISO8859-1 decode. (DEFAULT)
00341  *  To see ISO8859-1 (Latin-1) -l is necessary.
00342  *
00343  *  [-mB] Decode MIME base64 encoded stream. Remove header or other part before
00344  *  conversion.
00345  *
00346  *  [-mQ] Decode MIME quoted stream. '_' in quoted stream is converted to space.
00347  *
00348  *  [-mN] Non-strict decoding.
00349  *  It allows line break in the middle of the base64 encoding.
00350  *
00351  *  [-m0] No MIME decode.
00352  *
00353  *  === -M
00354  *
00355  *  MIME encode. Header style. All ASCII code and control characters are intact.
00356  *  Kanji conversion is performed before encoding, so this cannot be used as a picture encoder.
00357  *
00358  *  [-MB] MIME encode Base64 stream.
00359  *
00360  *  [-MQ] Perform quoted encoding.
00361  *
00362  *  === -l
00363  *
00364  *  Input and output code is ISO8859-1 (Latin-1) and ISO-2022-JP.
00365  *  <b>-s</b>, <b>-e</b> and <b>-x</b> are not compatible with this option.
00366  *
00367  *  === -L[uwm]
00368  *
00369  *  new line mode
00370  *  Without this option, nkf doesn't convert line breaks.
00371  *
00372  *  [-Lu] unix (LF)
00373  *
00374  *  [-Lw] windows (CRLF)
00375  *
00376  *  [-Lm] mac (CR)
00377  *
00378  *  === --fj --unix --mac --msdos --windows
00379  *
00380  *  Convert for these systems.
00381  *
00382  *  === --jis --euc --sjis --mime --base64
00383  *
00384  *  convert for named code
00385  *
00386  *  === --jis-input --euc-input --sjis-input --mime-input --base64-input
00387  *
00388  *  assume input system
00389  *
00390  *  === --ic=<code>input codeset</code> --oc=<code>output codeset</code>
00391  *
00392  *  Set the input or output codeset.
00393  *  NKF supports the following codesets; codeset names are case insensitive.
00394  *
00395  *  [ISO-2022-JP] a.k.a. RFC1468, 7bit JIS, JUNET
00396  *
00397  *  [EUC-JP (eucJP-nkf)] a.k.a. AT&T JIS, Japanese EUC, UJIS
00398  *
00399  *  [eucJP-ascii] a.k.a. x-eucjp-open-19970715-ascii
00400  *
00401  *  [eucJP-ms] a.k.a. x-eucjp-open-19970715-ms
00402  *
00403  *  [CP51932] Microsoft Version of EUC-JP.
00404  *
00405  *  [Shift_JIS] SJIS, MS-Kanji
00406  *
00407  *  [Windows-31J] a.k.a. CP932
00408  *
00409  *  [UTF-8] same as UTF-8N
00410  *
00411  *  [UTF-8N] UTF-8 without BOM
00412  *
00413  *  [UTF-8-BOM] UTF-8 with BOM
00414  *
00415  *  [UTF-16] same as UTF-16BE
00416  *
00417  *  [UTF-16BE] UTF-16 Big Endian without BOM
00418  *
00419  *  [UTF-16BE-BOM] UTF-16 Big Endian with BOM
00420  *
00421  *  [UTF-16LE] UTF-16 Little Endian without BOM
00422  *
00423  *  [UTF-16LE-BOM] UTF-16 Little Endian with BOM
00424  *
00425  *  [UTF-32] same as UTF-32BE
00426  *
00427  *  [UTF-32BE] UTF-32 Big Endian without BOM
00428  *
00429  *  [UTF-32BE-BOM] UTF-32 Big Endian with BOM
00430  *
00431  *  [UTF-32LE] UTF-32 Little Endian without BOM
00432  *
00433  *  [UTF-32LE-BOM] UTF-32 Little Endian with BOM
00434  *
00435  *  [UTF8-MAC] NFD-normalized UTF-8, a.k.a. UTF8-NFD (input only)
00436  *
00437  *  === --fb-{skip, html, xml, perl, java, subchar}
00438  *
00439  *  Specify the way that nkf handles unassigned characters.
00440  *  Without this option, --fb-skip is assumed.
00441  *
00442  *  === --prefix= <code>escape character</code> <code>target character</code> ..
00443  *
00444  *  When nkf converts to Shift_JIS,
00445  *  nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
00446  *  1st byte of argument is the escape character and following bytes are target characters.
00447  *
00448  *  === --no-cp932ext
00449  *
00450  *  Handle the characters extended in CP932 as unassigned characters.
00451  *
00452  *  === --no-best-fit-chars
00453  *
00454  *  When Unicode to Encoded byte conversion,
00455  *  don't convert characters which are not round-trip safe.
00456  *  When Unicode to Unicode conversion,
00457  *  with this and -x option, nkf can be used as UTF converter.
00458  *  (In other words, without this and -x option, nkf doesn't save some characters)
00459  *
00460  *  When nkf converts strings related to paths, you should use this option.
00461  *
00462  *  === --cap-input
00463  *
00464  *  Decode hex encoded characters.
00465  *
00466  *  === --url-input
00467  *
00468  *  Unescape percent escaped characters.
00469  *
00470  *  === --
00471  *
00472  *  Ignore rest of -option.
00473  */
00474 
00475 void
00476 Init_nkf()
00477 {
00478     VALUE mNKF = rb_define_module("NKF");
00479 
00480     rb_define_module_function(mNKF, "nkf", rb_nkf_convert, 2);
00481     rb_define_module_function(mNKF, "guess", rb_nkf_guess, 1);
00482     rb_define_alias(rb_singleton_class(mNKF), "guess", "guess");
00483 
00484     rb_define_const(mNKF, "AUTO",       Qnil);
00485     rb_define_const(mNKF, "NOCONV",     Qnil);
00486     rb_define_const(mNKF, "UNKNOWN",    Qnil);
00487     rb_define_const(mNKF, "BINARY",     rb_enc_from_encoding(rb_nkf_enc_get("BINARY")));
00488     rb_define_const(mNKF, "ASCII",      rb_enc_from_encoding(rb_nkf_enc_get("US-ASCII")));
00489     rb_define_const(mNKF, "JIS",        rb_enc_from_encoding(rb_nkf_enc_get("ISO-2022-JP")));
00490     rb_define_const(mNKF, "EUC",        rb_enc_from_encoding(rb_nkf_enc_get("EUC-JP")));
00491     rb_define_const(mNKF, "SJIS",       rb_enc_from_encoding(rb_nkf_enc_get("Shift_JIS")));
00492     rb_define_const(mNKF, "UTF8",       rb_enc_from_encoding(rb_utf8_encoding()));
00493     rb_define_const(mNKF, "UTF16",      rb_enc_from_encoding(rb_nkf_enc_get("UTF-16BE")));
00494     rb_define_const(mNKF, "UTF32",      rb_enc_from_encoding(rb_nkf_enc_get("UTF-32BE")));
00495 
00496     /* Full version string of nkf */
00497     rb_define_const(mNKF, "VERSION", rb_str_new2(RUBY_NKF_VERSION));
00498     /* Version of nkf */
00499     rb_define_const(mNKF, "NKF_VERSION", rb_str_new2(NKF_VERSION));
00500     /* Release date of nkf */
00501     rb_define_const(mNKF, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE));
00502 }
00503