win_iconv.c 59 KB


  1. /*
  2. * iconv implementation using Win32 API to convert.
  3. *
  4. * This file is placed in the public domain.
  5. */
  6. /* for WC_NO_BEST_FIT_CHARS */
  7. #ifndef WINVER
  8. # define WINVER 0x0500
  9. #endif
  10. #define STRICT
  #include <windows.h>
  #include <errno.h>
  #include <limits.h>
  #include <string.h>
  #include <stdlib.h>
  15. /* WORKAROUND: */
  16. #ifndef UNDER_CE
  17. #define GetProcAddressA GetProcAddress
  18. #endif
  19. #if 0
  20. # define MAKE_EXE
  21. # define MAKE_DLL
  22. # define USE_LIBICONV_DLL
  23. #endif
  24. #if !defined(DEFAULT_LIBICONV_DLL)
  25. # define DEFAULT_LIBICONV_DLL ""
  26. #endif
  27. #define MB_CHAR_MAX 16
  28. #define UNICODE_MODE_BOM_DONE 1
  29. #define UNICODE_MODE_SWAPPED 2
  30. #define FLAG_USE_BOM 1
  31. #define FLAG_TRANSLIT 2 /* //TRANSLIT */
  32. #define FLAG_IGNORE 4 /* //IGNORE */
  33. typedef unsigned char uchar;
  34. typedef unsigned short ushort;
  35. typedef unsigned int uint;
  36. typedef void* iconv_t;
  37. iconv_t iconv_open(const char *tocode, const char *fromcode);
  38. int iconv_close(iconv_t cd);
  39. size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft);
  40. /* libiconv interface for vim */
  #if defined(MAKE_DLL)
  /*
   * iconvctl() entry point, compiled only when MAKE_DLL is defined.
   * No request is implemented: all arguments are ignored and 0 is
   * returned unconditionally.
   */
  int
  iconvctl (iconv_t cd, int request, void* argument)
  {
      /* not supported */
      (void)cd;
      (void)request;
      (void)argument;
      return 0;
  }
  #endif
  49. typedef struct compat_t compat_t;
  50. typedef struct csconv_t csconv_t;
  51. typedef struct rec_iconv_t rec_iconv_t;
  52. typedef iconv_t (*f_iconv_open)(const char *tocode, const char *fromcode);
  53. typedef int (*f_iconv_close)(iconv_t cd);
  54. typedef size_t (*f_iconv)(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft);
  55. typedef int* (*f_errno)(void);
  56. typedef int (*f_mbtowc)(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize);
  57. typedef int (*f_wctomb)(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize);
  58. typedef int (*f_mblen)(csconv_t *cv, const uchar *buf, int bufsize);
  59. typedef int (*f_flush)(csconv_t *cv, uchar *buf, int bufsize);
  60. #define COMPAT_IN 1
  61. #define COMPAT_OUT 2
  62. /* unicode mapping for compatibility with other conversion table. */
  63. struct compat_t {
  64. uint in;
  65. uint out;
  66. uint flag;
  67. };
  68. struct csconv_t {
  69. int codepage;
  70. int flags;
  71. f_mbtowc mbtowc;
  72. f_wctomb wctomb;
  73. f_mblen mblen;
  74. f_flush flush;
  75. DWORD mode;
  76. compat_t *compat;
  77. };
  78. struct rec_iconv_t {
  79. iconv_t cd;
  80. f_iconv_close iconv_close;
  81. f_iconv iconv;
  82. f_errno _errno;
  83. csconv_t from;
  84. csconv_t to;
  85. #if defined(USE_LIBICONV_DLL)
  86. HMODULE hlibiconv;
  87. #endif
  88. };
  89. static int win_iconv_open(rec_iconv_t *cd, const char *tocode, const char *fromcode);
  90. static int win_iconv_close(iconv_t cd);
  91. static size_t win_iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft);
  92. static int load_mlang();
  93. static int make_csconv(const char *name, csconv_t *cv);
  94. static int name_to_codepage(const char *name);
  95. static uint utf16_to_ucs4(const ushort *wbuf);
  96. static void ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize);
  97. static int mbtowc_flags(int codepage);
  98. static int must_use_null_useddefaultchar(int codepage);
  99. static char *strrstr(const char *str, const char *token);
  100. static char *xstrndup(const char *s, size_t n);
  101. static int seterror(int err);
  102. #if defined(USE_LIBICONV_DLL)
  103. static int libiconv_iconv_open(rec_iconv_t *cd, const char *tocode, const char *fromcode);
  104. static PVOID MyImageDirectoryEntryToData(LPVOID Base, BOOLEAN MappedAsImage, USHORT DirectoryEntry, PULONG Size);
  105. static HMODULE find_imported_module_by_funcname(HMODULE hModule, const char *funcname);
  106. static HMODULE hwiniconv;
  107. #endif
  108. static int sbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize);
  109. static int dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize);
  110. static int mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize);
  111. static int utf8_mblen(csconv_t *cv, const uchar *buf, int bufsize);
  112. static int eucjp_mblen(csconv_t *cv, const uchar *buf, int bufsize);
  113. static int kernel_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize);
  114. static int kernel_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize);
  115. static int mlang_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize);
  116. static int mlang_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize);
  117. static int utf16_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize);
  118. static int utf16_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize);
  119. static int utf32_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize);
  120. static int utf32_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize);
  121. static int iso2022jp_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize);
  122. static int iso2022jp_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize);
  123. static int iso2022jp_flush(csconv_t *cv, uchar *buf, int bufsize);
  /*
   * Encoding name -> Windows code page number.
   *
   * The table is terminated by a {0, NULL} entry.  Several names appear
   * more than once, sometimes with different code pages.
   * NOTE(review): the lookup code is not visible here; the entries are
   * presumably scanned in order with a case-insensitive compare, in which
   * case the first match wins and later duplicates are unreachable --
   * confirm against name_to_codepage().
   */
  static struct {
      int codepage;
      const char *name;
  } codepage_alias[] = {
      {65001, "CP65001"},
      {65001, "UTF8"},
      {65001, "UTF-8"},

      {1200, "CP1200"},
      {1200, "UTF16LE"},
      {1200, "UTF-16LE"},
      {1200, "UCS2LE"},
      {1200, "UCS-2LE"},

      {1201, "CP1201"},
      {1201, "UTF16BE"},
      {1201, "UTF-16BE"},
      {1201, "UCS2BE"},
      {1201, "UCS-2BE"},
      {1201, "unicodeFFFE"},

      {12000, "CP12000"},
      {12000, "UTF32LE"},
      {12000, "UTF-32LE"},
      {12000, "UCS4LE"},
      {12000, "UCS-4LE"},

      {12001, "CP12001"},
      {12001, "UTF32BE"},
      {12001, "UTF-32BE"},
      {12001, "UCS4BE"},
      {12001, "UCS-4BE"},

  #ifndef GLIB_COMPILATION
      /*
       * Default is big endian.
       * See rfc2781 4.3 Interpreting text labelled as UTF-16.
       */
      {1201, "UTF16"},
      {1201, "UTF-16"},
      {1201, "UCS2"},
      {1201, "UCS-2"},
      {12001, "UTF32"},
      {12001, "UTF-32"},
      {12001, "UCS-4"},
      {12001, "UCS4"},
  #else
      /* Default is little endian, because the platform is */
      {1200, "UTF16"},
      {1200, "UTF-16"},
      {1200, "UCS2"},
      {1200, "UCS-2"},
      {12000, "UTF32"},
      {12000, "UTF-32"},
      {12000, "UCS4"},
      {12000, "UCS-4"},
  #endif

      /* copy from libiconv `iconv -l` */
      /* !IsValidCodePage(367) */
      {20127, "ANSI_X3.4-1968"},
      {20127, "ANSI_X3.4-1986"},
      {20127, "ASCII"},
      {20127, "CP367"},
      {20127, "IBM367"},
      {20127, "ISO-IR-6"},
      {20127, "ISO646-US"},
      {20127, "ISO_646.IRV:1991"},
      {20127, "US"},
      {20127, "US-ASCII"},
      {20127, "CSASCII"},

      /* !IsValidCodePage(819) */
      {1252, "CP819"},
      {1252, "IBM819"},
      {28591, "ISO-8859-1"},
      {28591, "ISO-IR-100"},
      {28591, "ISO8859-1"},
      {28591, "ISO_8859-1"},
      {28591, "ISO_8859-1:1987"},
      {28591, "L1"},
      {28591, "LATIN1"},
      {28591, "CSISOLATIN1"},

      {1250, "CP1250"},
      {1250, "MS-EE"},
      {1250, "WINDOWS-1250"},

      {1251, "CP1251"},
      {1251, "MS-CYRL"},
      {1251, "WINDOWS-1251"},

      {1252, "CP1252"},
      {1252, "MS-ANSI"},
      {1252, "WINDOWS-1252"},

      {1253, "CP1253"},
      {1253, "MS-GREEK"},
      {1253, "WINDOWS-1253"},

      {1254, "CP1254"},
      {1254, "MS-TURK"},
      {1254, "WINDOWS-1254"},

      {1255, "CP1255"},
      {1255, "MS-HEBR"},
      {1255, "WINDOWS-1255"},

      {1256, "CP1256"},
      {1256, "MS-ARAB"},
      {1256, "WINDOWS-1256"},

      {1257, "CP1257"},
      {1257, "WINBALTRIM"},
      {1257, "WINDOWS-1257"},

      {1258, "CP1258"},
      {1258, "WINDOWS-1258"},

      {850, "850"},
      {850, "CP850"},
      {850, "IBM850"},
      {850, "CSPC850MULTILINGUAL"},

      /* !IsValidCodePage(862) */
      {862, "862"},
      {862, "CP862"},
      {862, "IBM862"},
      {862, "CSPC862LATINHEBREW"},

      {866, "866"},
      {866, "CP866"},
      {866, "IBM866"},
      {866, "CSIBM866"},

      /* !IsValidCodePage(154) */
      {154, "CP154"},
      {154, "CYRILLIC-ASIAN"},
      {154, "PT154"},
      {154, "PTCP154"},
      {154, "CSPTCP154"},

      /* !IsValidCodePage(1133) */
      {1133, "CP1133"},
      {1133, "IBM-CP1133"},

      {874, "CP874"},
      {874, "WINDOWS-874"},

      /* !IsValidCodePage(51932) */
      {51932, "CP51932"},
      {51932, "MS51932"},
      {51932, "WINDOWS-51932"},
      {51932, "EUC-JP"},

      {932, "CP932"},
      {932, "MS932"},
      {932, "SHIFT_JIS"},
      {932, "SHIFT_JIS-MS"},
      /* Historical misspellings, kept so names that worked before still work. */
      {932, "SHIFFT_JIS"},
      {932, "SHIFFT_JIS-MS"},
      {932, "SJIS"},
      {932, "SJIS-MS"},
      {932, "SJIS-OPEN"},
      {932, "SJIS-WIN"},
      {932, "WINDOWS-31J"},
      {932, "WINDOWS-932"},
      {932, "CSWINDOWS31J"},

      {50221, "CP50221"},
      {50221, "ISO-2022-JP"},
      {50221, "ISO-2022-JP-MS"},
      {50221, "ISO2022-JP"},
      {50221, "ISO2022-JP-MS"},
      {50221, "MS50221"},
      {50221, "WINDOWS-50221"},

      {936, "CP936"},
      {936, "GBK"},
      {936, "MS936"},
      {936, "WINDOWS-936"},

      {950, "CP950"},
      {950, "BIG5"},
      {950, "BIG5HKSCS"},
      {950, "BIG5-HKSCS"},

      {949, "CP949"},
      {949, "UHC"},
      {949, "EUC-KR"},

      {1361, "CP1361"},
      {1361, "JOHAB"},

      {437, "437"},
      {437, "CP437"},
      {437, "IBM437"},
      {437, "CSPC8CODEPAGE437"},

      {737, "CP737"},

      {775, "CP775"},
      {775, "IBM775"},
      {775, "CSPC775BALTIC"},

      {852, "852"},
      {852, "CP852"},
      {852, "IBM852"},
      {852, "CSPCP852"},

      /* !IsValidCodePage(853) */
      {853, "CP853"},

      {855, "855"},
      {855, "CP855"},
      {855, "IBM855"},
      {855, "CSIBM855"},

      {857, "857"},
      {857, "CP857"},
      {857, "IBM857"},
      {857, "CSIBM857"},

      /* !IsValidCodePage(858) */
      {858, "CP858"},

      {860, "860"},
      {860, "CP860"},
      {860, "IBM860"},
      {860, "CSIBM860"},

      {861, "861"},
      {861, "CP-IS"},
      {861, "CP861"},
      {861, "IBM861"},
      {861, "CSIBM861"},

      {863, "863"},
      {863, "CP863"},
      {863, "IBM863"},
      {863, "CSIBM863"},

      {864, "CP864"},
      {864, "IBM864"},
      {864, "CSIBM864"},

      {865, "865"},
      {865, "CP865"},
      {865, "IBM865"},
      {865, "CSIBM865"},

      {869, "869"},
      {869, "CP-GR"},
      {869, "CP869"},
      {869, "IBM869"},
      {869, "CSIBM869"},

      /* !IsValidCodePage(1125) */
      {1125, "CP1125"},

      /*
       * Code Page Identifiers
       * http://msdn2.microsoft.com/en-us/library/ms776446.aspx
       */
      {37, "IBM037"}, /* IBM EBCDIC US-Canada */
      {437, "IBM437"}, /* OEM United States */
      {500, "IBM500"}, /* IBM EBCDIC International */
      {708, "ASMO-708"}, /* Arabic (ASMO 708) */
      /* 709 Arabic (ASMO-449+, BCON V4) */
      /* 710 Arabic - Transparent Arabic */
      {720, "DOS-720"}, /* Arabic (Transparent ASMO); Arabic (DOS) */
      {737, "ibm737"}, /* OEM Greek (formerly 437G); Greek (DOS) */
      {775, "ibm775"}, /* OEM Baltic; Baltic (DOS) */
      {850, "ibm850"}, /* OEM Multilingual Latin 1; Western European (DOS) */
      {852, "ibm852"}, /* OEM Latin 2; Central European (DOS) */
      {855, "IBM855"}, /* OEM Cyrillic (primarily Russian) */
      {857, "ibm857"}, /* OEM Turkish; Turkish (DOS) */
      {858, "IBM00858"}, /* OEM Multilingual Latin 1 + Euro symbol */
      {860, "IBM860"}, /* OEM Portuguese; Portuguese (DOS) */
      {861, "ibm861"}, /* OEM Icelandic; Icelandic (DOS) */
      {862, "DOS-862"}, /* OEM Hebrew; Hebrew (DOS) */
      {863, "IBM863"}, /* OEM French Canadian; French Canadian (DOS) */
      {864, "IBM864"}, /* OEM Arabic; Arabic (864) */
      {865, "IBM865"}, /* OEM Nordic; Nordic (DOS) */
      {866, "cp866"}, /* OEM Russian; Cyrillic (DOS) */
      {869, "ibm869"}, /* OEM Modern Greek; Greek, Modern (DOS) */
      {870, "IBM870"}, /* IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 */
      {874, "windows-874"}, /* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */
      {875, "cp875"}, /* IBM EBCDIC Greek Modern */
      {932, "shift_jis"}, /* ANSI/OEM Japanese; Japanese (Shift-JIS) */
      {932, "shift-jis"}, /* alternative name for it */
      {936, "gb2312"}, /* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */
      {949, "ks_c_5601-1987"}, /* ANSI/OEM Korean (Unified Hangul Code) */
      {950, "big5"}, /* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */
      {950, "big5hkscs"}, /* ANSI/OEM Traditional Chinese (Hong Kong SAR); Chinese Traditional (Big5-HKSCS) */
      {950, "big5-hkscs"}, /* alternative name for it */
      {1026, "IBM1026"}, /* IBM EBCDIC Turkish (Latin 5) */
      {1047, "IBM01047"}, /* IBM EBCDIC Latin 1/Open System */
      {1140, "IBM01140"}, /* IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) */
      {1141, "IBM01141"}, /* IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) */
      {1142, "IBM01142"}, /* IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) */
      {1143, "IBM01143"}, /* IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) */
      {1144, "IBM01144"}, /* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */
      {1145, "IBM01145"}, /* IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) */
      {1146, "IBM01146"}, /* IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) */
      {1147, "IBM01147"}, /* IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) */
      {1148, "IBM01148"}, /* IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) */
      {1149, "IBM01149"}, /* IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) */
      {1250, "windows-1250"}, /* ANSI Central European; Central European (Windows) */
      {1251, "windows-1251"}, /* ANSI Cyrillic; Cyrillic (Windows) */
      {1252, "windows-1252"}, /* ANSI Latin 1; Western European (Windows) */
      {1253, "windows-1253"}, /* ANSI Greek; Greek (Windows) */
      {1254, "windows-1254"}, /* ANSI Turkish; Turkish (Windows) */
      {1255, "windows-1255"}, /* ANSI Hebrew; Hebrew (Windows) */
      {1256, "windows-1256"}, /* ANSI Arabic; Arabic (Windows) */
      {1257, "windows-1257"}, /* ANSI Baltic; Baltic (Windows) */
      {1258, "windows-1258"}, /* ANSI/OEM Vietnamese; Vietnamese (Windows) */
      {1361, "Johab"}, /* Korean (Johab) */
      {10000, "macintosh"}, /* MAC Roman; Western European (Mac) */
      {10001, "x-mac-japanese"}, /* Japanese (Mac) */
      {10002, "x-mac-chinesetrad"}, /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */
      {10003, "x-mac-korean"}, /* Korean (Mac) */
      {10004, "x-mac-arabic"}, /* Arabic (Mac) */
      {10005, "x-mac-hebrew"}, /* Hebrew (Mac) */
      {10006, "x-mac-greek"}, /* Greek (Mac) */
      {10007, "x-mac-cyrillic"}, /* Cyrillic (Mac) */
      {10008, "x-mac-chinesesimp"}, /* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */
      {10010, "x-mac-romanian"}, /* Romanian (Mac) */
      {10017, "x-mac-ukrainian"}, /* Ukrainian (Mac) */
      {10021, "x-mac-thai"}, /* Thai (Mac) */
      {10029, "x-mac-ce"}, /* MAC Latin 2; Central European (Mac) */
      {10079, "x-mac-icelandic"}, /* Icelandic (Mac) */
      {10081, "x-mac-turkish"}, /* Turkish (Mac) */
      {10082, "x-mac-croatian"}, /* Croatian (Mac) */
      {20000, "x-Chinese_CNS"}, /* CNS Taiwan; Chinese Traditional (CNS) */
      {20001, "x-cp20001"}, /* TCA Taiwan */
      {20002, "x_Chinese-Eten"}, /* Eten Taiwan; Chinese Traditional (Eten) */
      {20003, "x-cp20003"}, /* IBM5550 Taiwan */
      {20004, "x-cp20004"}, /* TeleText Taiwan */
      {20005, "x-cp20005"}, /* Wang Taiwan */
      {20105, "x-IA5"}, /* IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) */
      {20106, "x-IA5-German"}, /* IA5 German (7-bit) */
      {20107, "x-IA5-Swedish"}, /* IA5 Swedish (7-bit) */
      {20108, "x-IA5-Norwegian"}, /* IA5 Norwegian (7-bit) */
      {20127, "us-ascii"}, /* US-ASCII (7-bit) */
      {20261, "x-cp20261"}, /* T.61 */
      {20269, "x-cp20269"}, /* ISO 6937 Non-Spacing Accent */
      {20273, "IBM273"}, /* IBM EBCDIC Germany */
      {20277, "IBM277"}, /* IBM EBCDIC Denmark-Norway */
      {20278, "IBM278"}, /* IBM EBCDIC Finland-Sweden */
      {20280, "IBM280"}, /* IBM EBCDIC Italy */
      {20284, "IBM284"}, /* IBM EBCDIC Latin America-Spain */
      {20285, "IBM285"}, /* IBM EBCDIC United Kingdom */
      {20290, "IBM290"}, /* IBM EBCDIC Japanese Katakana Extended */
      {20297, "IBM297"}, /* IBM EBCDIC France */
      {20420, "IBM420"}, /* IBM EBCDIC Arabic */
      {20423, "IBM423"}, /* IBM EBCDIC Greek */
      {20424, "IBM424"}, /* IBM EBCDIC Hebrew */
      {20833, "x-EBCDIC-KoreanExtended"}, /* IBM EBCDIC Korean Extended */
      {20838, "IBM-Thai"}, /* IBM EBCDIC Thai */
      {20866, "koi8-r"}, /* Russian (KOI8-R); Cyrillic (KOI8-R) */
      {20871, "IBM871"}, /* IBM EBCDIC Icelandic */
      {20880, "IBM880"}, /* IBM EBCDIC Cyrillic Russian */
      {20905, "IBM905"}, /* IBM EBCDIC Turkish */
      {20924, "IBM00924"}, /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */
      /* NOTE(review): "EUC-JP" is already listed above for 51932. */
      {20932, "EUC-JP"}, /* Japanese (JIS 0208-1990 and 0121-1990) */
      {20936, "x-cp20936"}, /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */
      {20949, "x-cp20949"}, /* Korean Wansung */
      {21025, "cp1025"}, /* IBM EBCDIC Cyrillic Serbian-Bulgarian */
      /* 21027 (deprecated) */
      {21866, "koi8-u"}, /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */
      {28591, "iso-8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */
      {28591, "iso8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */
      {28592, "iso-8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */
      {28592, "iso8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */
      {28593, "iso-8859-3"}, /* ISO 8859-3 Latin 3 */
      {28593, "iso8859-3"}, /* ISO 8859-3 Latin 3 */
      {28594, "iso-8859-4"}, /* ISO 8859-4 Baltic */
      {28594, "iso8859-4"}, /* ISO 8859-4 Baltic */
      {28595, "iso-8859-5"}, /* ISO 8859-5 Cyrillic */
      {28595, "iso8859-5"}, /* ISO 8859-5 Cyrillic */
      {28596, "iso-8859-6"}, /* ISO 8859-6 Arabic */
      {28596, "iso8859-6"}, /* ISO 8859-6 Arabic */
      {28597, "iso-8859-7"}, /* ISO 8859-7 Greek */
      {28597, "iso8859-7"}, /* ISO 8859-7 Greek */
      {28598, "iso-8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */
      {28598, "iso8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */
      {28599, "iso-8859-9"}, /* ISO 8859-9 Turkish */
      {28599, "iso8859-9"}, /* ISO 8859-9 Turkish */
      {28603, "iso-8859-13"}, /* ISO 8859-13 Estonian */
      {28603, "iso8859-13"}, /* ISO 8859-13 Estonian */
      {28605, "iso-8859-15"}, /* ISO 8859-15 Latin 9 */
      {28605, "iso8859-15"}, /* ISO 8859-15 Latin 9 */
      {29001, "x-Europa"}, /* Europa 3 */
      {38598, "iso-8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */
      {38598, "iso8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */
      /* NOTE(review): "iso-2022-jp" differs only in case from the 50221 entry above and is repeated for 50222. */
      {50220, "iso-2022-jp"}, /* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) */
      {50221, "csISO2022JP"}, /* ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) */
      {50222, "iso-2022-jp"}, /* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) */
      {50225, "iso-2022-kr"}, /* ISO 2022 Korean */
      {50225, "iso2022-kr"}, /* ISO 2022 Korean */
      {50227, "x-cp50227"}, /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */
      /* 50229 ISO 2022 Traditional Chinese */
      /* 50930 EBCDIC Japanese (Katakana) Extended */
      /* 50931 EBCDIC US-Canada and Japanese */
      /* 50933 EBCDIC Korean Extended and Korean */
      /* 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese */
      /* 50936 EBCDIC Simplified Chinese */
      /* 50937 EBCDIC US-Canada and Traditional Chinese */
      /* 50939 EBCDIC Japanese (Latin) Extended and Japanese */
      {51932, "euc-jp"}, /* EUC Japanese */
      {51936, "EUC-CN"}, /* EUC Simplified Chinese; Chinese Simplified (EUC) */
      {51949, "euc-kr"}, /* EUC Korean */
      /* 51950 EUC Traditional Chinese */
      {52936, "hz-gb-2312"}, /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */
      {54936, "GB18030"}, /* Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) */
      {57002, "x-iscii-de"}, /* ISCII Devanagari */
      {57003, "x-iscii-be"}, /* ISCII Bengali */
      {57004, "x-iscii-ta"}, /* ISCII Tamil */
      {57005, "x-iscii-te"}, /* ISCII Telugu */
      {57006, "x-iscii-as"}, /* ISCII Assamese */
      {57007, "x-iscii-or"}, /* ISCII Oriya */
      {57008, "x-iscii-ka"}, /* ISCII Kannada */
      {57009, "x-iscii-ma"}, /* ISCII Malayalam */
      {57010, "x-iscii-gu"}, /* ISCII Gujarati */
      {57011, "x-iscii-pa"}, /* ISCII Punjabi */

      {0, NULL}
  };
  /*
   * SJIS SHIFTJIS table              CP932 table
   * ---- --------------------------- --------------------------------
   *   5C U+00A5 YEN SIGN             U+005C REVERSE SOLIDUS
   *   7E U+203E OVERLINE             U+007E TILDE
   * 815C U+2014 EM DASH              U+2015 HORIZONTAL BAR
   * 815F U+005C REVERSE SOLIDUS      U+FF3C FULLWIDTH REVERSE SOLIDUS
   * 8160 U+301C WAVE DASH            U+FF5E FULLWIDTH TILDE
   * 8161 U+2016 DOUBLE VERTICAL LINE U+2225 PARALLEL TO
   * 817C U+2212 MINUS SIGN           U+FF0D FULLWIDTH HYPHEN-MINUS
   * 8191 U+00A2 CENT SIGN            U+FFE0 FULLWIDTH CENT SIGN
   * 8192 U+00A3 POUND SIGN           U+FFE1 FULLWIDTH POUND SIGN
   * 81CA U+00AC NOT SIGN             U+FFE2 FULLWIDTH NOT SIGN
   *
   * EUC-JP and ISO-2022-JP should be compatible with CP932.
   *
   * Kernel and MLang use different Unicode mapping tables.  Make sure
   * you know which API is used.
   */
  524. static compat_t cp932_compat[] = {
  525. {0x00A5, 0x005C, COMPAT_OUT},
  526. {0x203E, 0x007E, COMPAT_OUT},
  527. {0x2014, 0x2015, COMPAT_OUT},
  528. {0x301C, 0xFF5E, COMPAT_OUT},
  529. {0x2016, 0x2225, COMPAT_OUT},
  530. {0x2212, 0xFF0D, COMPAT_OUT},
  531. {0x00A2, 0xFFE0, COMPAT_OUT},
  532. {0x00A3, 0xFFE1, COMPAT_OUT},
  533. {0x00AC, 0xFFE2, COMPAT_OUT},
  534. {0, 0, 0}
  535. };
  536. static compat_t cp20932_compat[] = {
  537. {0x00A5, 0x005C, COMPAT_OUT},
  538. {0x203E, 0x007E, COMPAT_OUT},
  539. {0x2014, 0x2015, COMPAT_OUT},
  540. {0xFF5E, 0x301C, COMPAT_OUT|COMPAT_IN},
  541. {0x2225, 0x2016, COMPAT_OUT|COMPAT_IN},
  542. {0xFF0D, 0x2212, COMPAT_OUT|COMPAT_IN},
  543. {0xFFE0, 0x00A2, COMPAT_OUT|COMPAT_IN},
  544. {0xFFE1, 0x00A3, COMPAT_OUT|COMPAT_IN},
  545. {0xFFE2, 0x00AC, COMPAT_OUT|COMPAT_IN},
  546. {0, 0, 0}
  547. };
  548. static compat_t *cp51932_compat = cp932_compat;
  549. /* cp20932_compat for kernel. cp932_compat for mlang. */
  550. static compat_t *cp5022x_compat = cp932_compat;
  551. typedef HRESULT (WINAPI *CONVERTINETSTRING)(
  552. LPDWORD lpdwMode,
  553. DWORD dwSrcEncoding,
  554. DWORD dwDstEncoding,
  555. LPCSTR lpSrcStr,
  556. LPINT lpnSrcSize,
  557. LPBYTE lpDstStr,
  558. LPINT lpnDstSize
  559. );
  560. typedef HRESULT (WINAPI *CONVERTINETMULTIBYTETOUNICODE)(
  561. LPDWORD lpdwMode,
  562. DWORD dwSrcEncoding,
  563. LPCSTR lpSrcStr,
  564. LPINT lpnMultiCharCount,
  565. LPWSTR lpDstStr,
  566. LPINT lpnWideCharCount
  567. );
  568. typedef HRESULT (WINAPI *CONVERTINETUNICODETOMULTIBYTE)(
  569. LPDWORD lpdwMode,
  570. DWORD dwEncoding,
  571. LPCWSTR lpSrcStr,
  572. LPINT lpnWideCharCount,
  573. LPSTR lpDstStr,
  574. LPINT lpnMultiCharCount
  575. );
  576. typedef HRESULT (WINAPI *ISCONVERTINETSTRINGAVAILABLE)(
  577. DWORD dwSrcEncoding,
  578. DWORD dwDstEncoding
  579. );
  580. typedef HRESULT (WINAPI *LCIDTORFC1766A)(
  581. LCID Locale,
  582. LPSTR pszRfc1766,
  583. int nChar
  584. );
  585. typedef HRESULT (WINAPI *LCIDTORFC1766W)(
  586. LCID Locale,
  587. LPWSTR pszRfc1766,
  588. int nChar
  589. );
  590. typedef HRESULT (WINAPI *RFC1766TOLCIDA)(
  591. LCID *pLocale,
  592. LPSTR pszRfc1766
  593. );
  594. typedef HRESULT (WINAPI *RFC1766TOLCIDW)(
  595. LCID *pLocale,
  596. LPWSTR pszRfc1766
  597. );
  598. static CONVERTINETSTRING ConvertINetString;
  599. static CONVERTINETMULTIBYTETOUNICODE ConvertINetMultiByteToUnicode;
  600. static CONVERTINETUNICODETOMULTIBYTE ConvertINetUnicodeToMultiByte;
  601. static ISCONVERTINETSTRINGAVAILABLE IsConvertINetStringAvailable;
  602. static LCIDTORFC1766A LcidToRfc1766A;
  603. static RFC1766TOLCIDA Rfc1766ToLcidA;
  604. static int
  605. load_mlang()
  606. {
  607. HMODULE h;
  608. if (ConvertINetString != NULL)
  609. return TRUE;
  610. h = LoadLibrary(TEXT("mlang.dll"));
  611. if (!h)
  612. return FALSE;
  613. ConvertINetString = (CONVERTINETSTRING)GetProcAddressA(h, "ConvertINetString");
  614. ConvertINetMultiByteToUnicode = (CONVERTINETMULTIBYTETOUNICODE)GetProcAddressA(h, "ConvertINetMultiByteToUnicode");
  615. ConvertINetUnicodeToMultiByte = (CONVERTINETUNICODETOMULTIBYTE)GetProcAddressA(h, "ConvertINetUnicodeToMultiByte");
  616. IsConvertINetStringAvailable = (ISCONVERTINETSTRINGAVAILABLE)GetProcAddressA(h, "IsConvertINetStringAvailable");
  617. LcidToRfc1766A = (LCIDTORFC1766A)GetProcAddressA(h, "LcidToRfc1766A");
  618. Rfc1766ToLcidA = (RFC1766TOLCIDA)GetProcAddressA(h, "Rfc1766ToLcidA");
  619. return TRUE;
  620. }
  621. iconv_t
  622. iconv_open(const char *tocode, const char *fromcode)
  623. {
  624. rec_iconv_t *cd;
  625. cd = (rec_iconv_t *)calloc(1, sizeof(rec_iconv_t));
  626. if (cd == NULL)
  627. return (iconv_t)(-1);
  628. #if defined(USE_LIBICONV_DLL)
  629. errno = 0;
  630. if (libiconv_iconv_open(cd, tocode, fromcode))
  631. return (iconv_t)cd;
  632. #endif
  633. /* reset the errno to prevent reporting wrong error code.
  634. * 0 for unsorted error. */
  635. errno = 0;
  636. if (win_iconv_open(cd, tocode, fromcode))
  637. return (iconv_t)cd;
  638. free(cd);
  639. return (iconv_t)(-1);
  640. }
  641. int
  642. iconv_close(iconv_t _cd)
  643. {
  644. rec_iconv_t *cd = (rec_iconv_t *)_cd;
  645. int r = cd->iconv_close(cd->cd);
  646. int e = *(cd->_errno());
  647. #if defined(USE_LIBICONV_DLL)
  648. if (cd->hlibiconv != NULL)
  649. FreeLibrary(cd->hlibiconv);
  650. #endif
  651. free(cd);
  652. errno = e;
  653. return r;
  654. }
  655. size_t
  656. iconv(iconv_t _cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
  657. {
  658. rec_iconv_t *cd = (rec_iconv_t *)_cd;
  659. size_t r = cd->iconv(cd->cd, inbuf, inbytesleft, outbuf, outbytesleft);
  660. errno = *(cd->_errno());
  661. return r;
  662. }
  663. static int
  664. win_iconv_open(rec_iconv_t *cd, const char *tocode, const char *fromcode)
  665. {
  666. if (!make_csconv(fromcode, &cd->from) || !make_csconv(tocode, &cd->to))
  667. return FALSE;
  668. cd->iconv_close = win_iconv_close;
  669. cd->iconv = win_iconv;
  670. cd->_errno = _errno;
  671. cd->cd = (iconv_t)cd;
  672. return TRUE;
  673. }
  674. static int
  675. win_iconv_close(iconv_t cd)
  676. {
  677. return 0;
  678. }
  679. static size_t
  680. win_iconv(iconv_t _cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
  681. {
  682. rec_iconv_t *cd = (rec_iconv_t *)_cd;
  683. ushort wbuf[MB_CHAR_MAX]; /* enough room for one character */
  684. int insize;
  685. int outsize;
  686. int wsize;
  687. DWORD frommode;
  688. DWORD tomode;
  689. uint wc;
  690. compat_t *cp;
  691. int i;
  692. if (inbuf == NULL || *inbuf == NULL)
  693. {
  694. if (outbuf != NULL && *outbuf != NULL && cd->to.flush != NULL)
  695. {
  696. tomode = cd->to.mode;
  697. outsize = cd->to.flush(&cd->to, (uchar *)*outbuf, *outbytesleft);
  698. if (outsize == -1)
  699. {
  700. if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG)
  701. {
  702. outsize = 0;
  703. }
  704. else
  705. {
  706. cd->to.mode = tomode;
  707. return (size_t)(-1);
  708. }
  709. }
  710. *outbuf += outsize;
  711. *outbytesleft -= outsize;
  712. }
  713. cd->from.mode = 0;
  714. cd->to.mode = 0;
  715. return 0;
  716. }
  717. while (*inbytesleft != 0)
  718. {
  719. frommode = cd->from.mode;
  720. tomode = cd->to.mode;
  721. wsize = MB_CHAR_MAX;
  722. insize = cd->from.mbtowc(&cd->from, (const uchar *)*inbuf, *inbytesleft, wbuf, &wsize);
  723. if (insize == -1)
  724. {
  725. if (cd->to.flags & FLAG_IGNORE)
  726. {
  727. cd->from.mode = frommode;
  728. insize = 1;
  729. wsize = 0;
  730. }
  731. else
  732. {
  733. cd->from.mode = frommode;
  734. return (size_t)(-1);
  735. }
  736. }
  737. if (wsize == 0)
  738. {
  739. *inbuf += insize;
  740. *inbytesleft -= insize;
  741. continue;
  742. }
  743. if (cd->from.compat != NULL)
  744. {
  745. wc = utf16_to_ucs4(wbuf);
  746. cp = cd->from.compat;
  747. for (i = 0; cp[i].in != 0; ++i)
  748. {
  749. if ((cp[i].flag & COMPAT_IN) && cp[i].out == wc)
  750. {
  751. ucs4_to_utf16(cp[i].in, wbuf, &wsize);
  752. break;
  753. }
  754. }
  755. }
  756. if (cd->to.compat != NULL)
  757. {
  758. wc = utf16_to_ucs4(wbuf);
  759. cp = cd->to.compat;
  760. for (i = 0; cp[i].in != 0; ++i)
  761. {
  762. if ((cp[i].flag & COMPAT_OUT) && cp[i].in == wc)
  763. {
  764. ucs4_to_utf16(cp[i].out, wbuf, &wsize);
  765. break;
  766. }
  767. }
  768. }
  769. outsize = cd->to.wctomb(&cd->to, wbuf, wsize, (uchar *)*outbuf, *outbytesleft);
  770. if (outsize == -1)
  771. {
  772. if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG)
  773. {
  774. cd->to.mode = tomode;
  775. outsize = 0;
  776. }
  777. else
  778. {
  779. cd->from.mode = frommode;
  780. cd->to.mode = tomode;
  781. return (size_t)(-1);
  782. }
  783. }
  784. *inbuf += insize;
  785. *outbuf += outsize;
  786. *inbytesleft -= insize;
  787. *outbytesleft -= outsize;
  788. }
  789. return 0;
  790. }
  791. static int
  792. make_csconv(const char *_name, csconv_t *cv)
  793. {
  794. CPINFO cpinfo;
  795. int use_compat = TRUE;
  796. int flag = 0;
  797. char *name;
  798. char *p;
  799. name = xstrndup(_name, strlen(_name));
  800. if (name == NULL)
  801. return FALSE;
  802. /* check for option "enc_name//opt1//opt2" */
  803. while ((p = strrstr(name, "//")) != NULL)
  804. {
  805. if (_stricmp(p + 2, "nocompat") == 0)
  806. use_compat = FALSE;
  807. else if (_stricmp(p + 2, "translit") == 0)
  808. flag |= FLAG_TRANSLIT;
  809. else if (_stricmp(p + 2, "ignore") == 0)
  810. flag |= FLAG_IGNORE;
  811. *p = 0;
  812. }
  813. cv->mode = 0;
  814. cv->flags = flag;
  815. cv->mblen = NULL;
  816. cv->flush = NULL;
  817. cv->compat = NULL;
  818. cv->codepage = name_to_codepage(name);
  819. if (cv->codepage == 1200 || cv->codepage == 1201)
  820. {
  821. cv->mbtowc = utf16_mbtowc;
  822. cv->wctomb = utf16_wctomb;
  823. if (_stricmp(name, "UTF-16") == 0 || _stricmp(name, "UTF16") == 0 ||
  824. _stricmp(name, "UCS-2") == 0 || _stricmp(name, "UCS2") == 0)
  825. cv->flags |= FLAG_USE_BOM;
  826. }
  827. else if (cv->codepage == 12000 || cv->codepage == 12001)
  828. {
  829. cv->mbtowc = utf32_mbtowc;
  830. cv->wctomb = utf32_wctomb;
  831. if (_stricmp(name, "UTF-32") == 0 || _stricmp(name, "UTF32") == 0 ||
  832. _stricmp(name, "UCS-4") == 0 || _stricmp(name, "UCS4") == 0)
  833. cv->flags |= FLAG_USE_BOM;
  834. }
  835. else if (cv->codepage == 65001)
  836. {
  837. cv->mbtowc = kernel_mbtowc;
  838. cv->wctomb = kernel_wctomb;
  839. cv->mblen = utf8_mblen;
  840. }
  841. else if ((cv->codepage == 50220 || cv->codepage == 50221 || cv->codepage == 50222) && load_mlang())
  842. {
  843. cv->mbtowc = iso2022jp_mbtowc;
  844. cv->wctomb = iso2022jp_wctomb;
  845. cv->flush = iso2022jp_flush;
  846. }
  847. else if (cv->codepage == 51932 && load_mlang())
  848. {
  849. cv->mbtowc = mlang_mbtowc;
  850. cv->wctomb = mlang_wctomb;
  851. cv->mblen = eucjp_mblen;
  852. }
  853. else if (IsValidCodePage(cv->codepage)
  854. && GetCPInfo(cv->codepage, &cpinfo) != 0)
  855. {
  856. cv->mbtowc = kernel_mbtowc;
  857. cv->wctomb = kernel_wctomb;
  858. if (cpinfo.MaxCharSize == 1)
  859. cv->mblen = sbcs_mblen;
  860. else if (cpinfo.MaxCharSize == 2)
  861. cv->mblen = dbcs_mblen;
  862. else
  863. cv->mblen = mbcs_mblen;
  864. }
  865. else
  866. {
  867. /* not supported */
  868. free(name);
  869. errno = EINVAL;
  870. return FALSE;
  871. }
  872. if (use_compat)
  873. {
  874. switch (cv->codepage)
  875. {
  876. case 932: cv->compat = cp932_compat; break;
  877. case 20932: cv->compat = cp20932_compat; break;
  878. case 51932: cv->compat = cp51932_compat; break;
  879. case 50220: case 50221: case 50222: cv->compat = cp5022x_compat; break;
  880. }
  881. }
  882. free(name);
  883. return TRUE;
  884. }
  885. static int
  886. name_to_codepage(const char *name)
  887. {
  888. int i;
  889. if (*name == '\0' ||
  890. strcmp(name, "char") == 0)
  891. return GetACP();
  892. else if (strcmp(name, "wchar_t") == 0)
  893. return 1200;
  894. else if (_strnicmp(name, "cp", 2) == 0)
  895. return atoi(name + 2); /* CP123 */
  896. else if ('0' <= name[0] && name[0] <= '9')
  897. return atoi(name); /* 123 */
  898. else if (_strnicmp(name, "xx", 2) == 0)
  899. return atoi(name + 2); /* XX123 for debug */
  900. for (i = 0; codepage_alias[i].name != NULL; ++i)
  901. if (_stricmp(name, codepage_alias[i].name) == 0)
  902. return codepage_alias[i].codepage;
  903. return -1;
  904. }
  905. /*
  906. * http://www.faqs.org/rfcs/rfc2781.html
  907. */
  908. static uint
  909. utf16_to_ucs4(const ushort *wbuf)
  910. {
  911. uint wc = wbuf[0];
  912. if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF)
  913. wc = ((wbuf[0] & 0x3FF) << 10) + (wbuf[1] & 0x3FF) + 0x10000;
  914. return wc;
  915. }
  916. static void
  917. ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize)
  918. {
  919. if (wc < 0x10000)
  920. {
  921. wbuf[0] = wc;
  922. *wbufsize = 1;
  923. }
  924. else
  925. {
  926. wc -= 0x10000;
  927. wbuf[0] = 0xD800 | ((wc >> 10) & 0x3FF);
  928. wbuf[1] = 0xDC00 | (wc & 0x3FF);
  929. *wbufsize = 2;
  930. }
  931. }
  /*
   * Check if codepage is one of those for which the dwFlags parameter
   * to MultiByteToWideChar() must be zero. Return zero or
   * MB_ERR_INVALID_CHARS. The docs in the Platform SDK for Windows
   * Server 2003 R2 claim that codepage 65001 is also one of these, but
   * that doesn't seem to be the case. The MSDN docs for MSVS2008 leave
   * out 65001 (UTF-8), and that indeed seems to be correct on XP: it
   * works fine to pass MB_ERR_INVALID_CHARS in dwFlags when converting
   * from UTF-8.
   */
  942. static int
  943. mbtowc_flags(int codepage)
  944. {
  945. return (codepage == 50220 || codepage == 50221 ||
  946. codepage == 50222 || codepage == 50225 ||
  947. codepage == 50227 || codepage == 50229 ||
  948. codepage == 52936 || codepage == 54936 ||
  949. (codepage >= 57002 && codepage <= 57011) ||
  950. codepage == 65000 || codepage == 42) ? 0 : MB_ERR_INVALID_CHARS;
  951. }
  /*
   * Check if codepage is one of those for which the lpUsedDefaultChar
   * parameter to WideCharToMultiByte() must be NULL. The docs in the
   * Platform SDK for Windows Server 2003 R2 claim that this is the
   * list below, while the MSDN docs for MSVS2008 claim that it is only
   * for 65000 (UTF-7) and 65001 (UTF-8). This time the earlier Platform
   * SDK seems to be correct, at least for XP.
   */
  /*
   * Return 1 when codepage is in the list below, 0 otherwise.
   * kernel_wctomb() passes a NULL lpUsedDefaultChar to
   * WideCharToMultiByte() for the codepages reported here.
   */
  static int
  must_use_null_useddefaultchar(int codepage)
  {
      if (codepage >= 57002 && codepage <= 57011)
          return 1;

      switch (codepage)
      {
      case 42:
      case 50220:
      case 50221:
      case 50222:
      case 50225:
      case 50227:
      case 50229:
      case 52936:
      case 54936:
      case 65000:
      case 65001:
          return 1;
      default:
          return 0;
      }
  }
  /*
   * Find the last occurrence of token in str.
   *
   * Returns a pointer to the start of the last match, or NULL when
   * there is none.  An empty token never matches.
   *
   * The scan walks an index down from the end of the string rather than
   * decrementing a pointer, so no pointer before the start of str is
   * ever formed (which would be undefined behaviour, and would happen
   * immediately for an empty str).
   */
  static char *
  strrstr(const char *str, const char *token)
  {
      size_t toklen = strlen(token);
      size_t i;

      for (i = strlen(str); i > 0; --i)
      {
          const char *p = str + (i - 1);

          if (p[0] == token[0] && strncmp(p, token, toklen) == 0)
              return (char *)p;
      }
      return NULL;
  }
  /*
   * Return a newly malloc'ed, NUL-terminated copy of the first n bytes
   * of s, or NULL if the allocation fails.  Exactly n bytes are copied
   * (no check for an earlier NUL in s), so s must hold at least n bytes.
   * The caller frees the result.
   */
  static char *
  xstrndup(const char *s, size_t n)
  {
      char *copy = (char *)malloc(n + 1);

      if (copy != NULL)
      {
          memcpy(copy, s, n);
          copy[n] = '\0';
      }
      return copy;
  }
  /*
   * Store err in errno and return -1, so that callers can write
   * "return seterror(E...);" on a failure path.
   */
  static int
  seterror(int err)
  {
      return (errno = err, -1);
  }
  998. #if defined(USE_LIBICONV_DLL)
  999. static int
  1000. libiconv_iconv_open(rec_iconv_t *cd, const char *tocode, const char *fromcode)
  1001. {
  1002. HMODULE hlibiconv = NULL;
  1003. HMODULE hmsvcrt = NULL;
  1004. char *dllname;
  1005. const char *p;
  1006. const char *e;
  1007. f_iconv_open _iconv_open;
  1008. /*
  1009. * always try to load dll, so that we can switch dll in runtime.
  1010. */
  1011. /* XXX: getenv() can't get variable set by SetEnvironmentVariable() */
  1012. p = getenv("WINICONV_LIBICONV_DLL");
  1013. if (p == NULL)
  1014. p = DEFAULT_LIBICONV_DLL;
  1015. /* parse comma separated value */
  1016. for ( ; *p != 0; p = (*e == ',') ? e + 1 : e)
  1017. {
  1018. e = strchr(p, ',');
  1019. if (p == e)
  1020. continue;
  1021. else if (e == NULL)
  1022. e = p + strlen(p);
  1023. dllname = xstrndup(p, e - p);
  1024. if (dllname == NULL)
  1025. return FALSE;
  1026. hlibiconv = LoadLibraryA(dllname);
  1027. free(dllname);
  1028. if (hlibiconv != NULL)
  1029. {
  1030. if (hlibiconv == hwiniconv)
  1031. {
  1032. FreeLibrary(hlibiconv);
  1033. hlibiconv = NULL;
  1034. continue;
  1035. }
  1036. break;
  1037. }
  1038. }
  1039. if (hlibiconv == NULL)
  1040. goto failed;
  1041. hmsvcrt = find_imported_module_by_funcname(hlibiconv, "_errno");
  1042. if (hmsvcrt == NULL)
  1043. goto failed;
  1044. _iconv_open = (f_iconv_open)GetProcAddressA(hlibiconv, "libiconv_open");
  1045. if (_iconv_open == NULL)
  1046. _iconv_open = (f_iconv_open)GetProcAddressA(hlibiconv, "iconv_open");
  1047. cd->iconv_close = (f_iconv_close)GetProcAddressA(hlibiconv, "libiconv_close");
  1048. if (cd->iconv_close == NULL)
  1049. cd->iconv_close = (f_iconv_close)GetProcAddressA(hlibiconv, "iconv_close");
  1050. cd->iconv = (f_iconv)GetProcAddressA(hlibiconv, "libiconv");
  1051. if (cd->iconv == NULL)
  1052. cd->iconv = (f_iconv)GetProcAddressA(hlibiconv, "iconv");
  1053. cd->_errno = (f_errno)GetProcAddressA(hmsvcrt, "_errno");
  1054. if (_iconv_open == NULL || cd->iconv_close == NULL
  1055. || cd->iconv == NULL || cd->_errno == NULL)
  1056. goto failed;
  1057. cd->cd = _iconv_open(tocode, fromcode);
  1058. if (cd->cd == (iconv_t)(-1))
  1059. goto failed;
  1060. cd->hlibiconv = hlibiconv;
  1061. return TRUE;
  1062. failed:
  1063. if (hlibiconv != NULL)
  1064. FreeLibrary(hlibiconv);
  1065. /* do not free hmsvcrt which is obtained by GetModuleHandle() */
  1066. return FALSE;
  1067. }
  1068. /*
  1069. * Reference:
  1070. * http://forums.belution.com/ja/vc/000/234/78s.shtml
  1071. * http://nienie.com/~masapico/api_ImageDirectoryEntryToData.html
  1072. *
  1073. * The formal way is
  1074. * imagehlp.h or dbghelp.h
  1075. * imagehlp.lib or dbghelp.lib
  1076. * ImageDirectoryEntryToData()
  1077. */
  1078. #define TO_DOS_HEADER(base) ((PIMAGE_DOS_HEADER)(base))
  1079. #define TO_NT_HEADERS(base) ((PIMAGE_NT_HEADERS)((LPBYTE)(base) + TO_DOS_HEADER(base)->e_lfanew))
  1080. static PVOID
  1081. MyImageDirectoryEntryToData(LPVOID Base, BOOLEAN MappedAsImage, USHORT DirectoryEntry, PULONG Size)
  1082. {
  1083. /* TODO: MappedAsImage? */
  1084. PIMAGE_DATA_DIRECTORY p;
  1085. p = TO_NT_HEADERS(Base)->OptionalHeader.DataDirectory + DirectoryEntry;
  1086. if (p->VirtualAddress == 0) {
  1087. *Size = 0;
  1088. return NULL;
  1089. }
  1090. *Size = p->Size;
  1091. return (PVOID)((LPBYTE)Base + p->VirtualAddress);
  1092. }
  1093. static HMODULE
  1094. find_imported_module_by_funcname(HMODULE hModule, const char *funcname)
  1095. {
  1096. DWORD_PTR Base;
  1097. ULONG Size;
  1098. PIMAGE_IMPORT_DESCRIPTOR Imp;
  1099. PIMAGE_THUNK_DATA Name; /* Import Name Table */
  1100. PIMAGE_IMPORT_BY_NAME ImpName;
  1101. Base = (DWORD_PTR)hModule;
  1102. Imp = (PIMAGE_IMPORT_DESCRIPTOR)MyImageDirectoryEntryToData(
  1103. (LPVOID)Base,
  1104. TRUE,
  1105. IMAGE_DIRECTORY_ENTRY_IMPORT,
  1106. &Size);
  1107. if (Imp == NULL)
  1108. return NULL;
  1109. for ( ; Imp->OriginalFirstThunk != 0; ++Imp)
  1110. {
  1111. Name = (PIMAGE_THUNK_DATA)(Base + Imp->OriginalFirstThunk);
  1112. for ( ; Name->u1.Ordinal != 0; ++Name)
  1113. {
  1114. if (!IMAGE_SNAP_BY_ORDINAL(Name->u1.Ordinal))
  1115. {
  1116. ImpName = (PIMAGE_IMPORT_BY_NAME)
  1117. (Base + (DWORD_PTR)Name->u1.AddressOfData);
  1118. if (strcmp((char *)ImpName->Name, funcname) == 0)
  1119. return GetModuleHandleA((char *)(Base + Imp->Name));
  1120. }
  1121. }
  1122. }
  1123. return NULL;
  1124. }
  1125. #endif
  1126. static int
  1127. sbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize)
  1128. {
  1129. return 1;
  1130. }
  1131. static int
  1132. dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize)
  1133. {
  1134. int len = IsDBCSLeadByteEx(cv->codepage, buf[0]) ? 2 : 1;
  1135. if (bufsize < len)
  1136. return seterror(EINVAL);
  1137. return len;
  1138. }
  1139. static int
  1140. mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize)
  1141. {
  1142. int len = 0;
  1143. if (cv->codepage == 54936) {
  1144. if (buf[0] <= 0x7F) len = 1;
  1145. else if (buf[0] >= 0x81 && buf[0] <= 0xFE &&
  1146. bufsize >= 2 &&
  1147. ((buf[1] >= 0x40 && buf[1] <= 0x7E) ||
  1148. (buf[1] >= 0x80 && buf[1] <= 0xFE))) len = 2;
  1149. else if (buf[0] >= 0x81 && buf[0] <= 0xFE &&
  1150. bufsize >= 4 &&
  1151. buf[1] >= 0x30 && buf[1] <= 0x39) len = 4;
  1152. else
  1153. return seterror(EINVAL);
  1154. return len;
  1155. }
  1156. else
  1157. return seterror(EINVAL);
  1158. }
  1159. static int
  1160. utf8_mblen(csconv_t *cv, const uchar *buf, int bufsize)
  1161. {
  1162. int len = 0;
  1163. if (buf[0] < 0x80) len = 1;
  1164. else if ((buf[0] & 0xE0) == 0xC0) len = 2;
  1165. else if ((buf[0] & 0xF0) == 0xE0) len = 3;
  1166. else if ((buf[0] & 0xF8) == 0xF0) len = 4;
  1167. else if ((buf[0] & 0xFC) == 0xF8) len = 5;
  1168. else if ((buf[0] & 0xFE) == 0xFC) len = 6;
  1169. if (len == 0)
  1170. return seterror(EILSEQ);
  1171. else if (bufsize < len)
  1172. return seterror(EINVAL);
  1173. return len;
  1174. }
  1175. static int
  1176. eucjp_mblen(csconv_t *cv, const uchar *buf, int bufsize)
  1177. {
  1178. if (buf[0] < 0x80) /* ASCII */
  1179. return 1;
  1180. else if (buf[0] == 0x8E) /* JIS X 0201 */
  1181. {
  1182. if (bufsize < 2)
  1183. return seterror(EINVAL);
  1184. else if (!(0xA1 <= buf[1] && buf[1] <= 0xDF))
  1185. return seterror(EILSEQ);
  1186. return 2;
  1187. }
  1188. else if (buf[0] == 0x8F) /* JIS X 0212 */
  1189. {
  1190. if (bufsize < 3)
  1191. return seterror(EINVAL);
  1192. else if (!(0xA1 <= buf[1] && buf[1] <= 0xFE)
  1193. || !(0xA1 <= buf[2] && buf[2] <= 0xFE))
  1194. return seterror(EILSEQ);
  1195. return 3;
  1196. }
  1197. else /* JIS X 0208 */
  1198. {
  1199. if (bufsize < 2)
  1200. return seterror(EINVAL);
  1201. else if (!(0xA1 <= buf[0] && buf[0] <= 0xFE)
  1202. || !(0xA1 <= buf[1] && buf[1] <= 0xFE))
  1203. return seterror(EILSEQ);
  1204. return 2;
  1205. }
  1206. }
  1207. static int
  1208. kernel_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize)
  1209. {
  1210. int len;
  1211. len = cv->mblen(cv, buf, bufsize);
  1212. if (len == -1)
  1213. return -1;
  1214. *wbufsize = MultiByteToWideChar(cv->codepage, mbtowc_flags (cv->codepage),
  1215. (const char *)buf, len, (wchar_t *)wbuf, *wbufsize);
  1216. if (*wbufsize == 0)
  1217. return seterror(EILSEQ);
  1218. return len;
  1219. }
  1220. static int
  1221. kernel_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize)
  1222. {
  1223. BOOL usedDefaultChar = 0;
  1224. BOOL *p = NULL;
  1225. int flags = 0;
  1226. int len;
  1227. if (bufsize == 0)
  1228. return seterror(E2BIG);
  1229. if (!must_use_null_useddefaultchar(cv->codepage))
  1230. {
  1231. p = &usedDefaultChar;
  1232. #ifdef WC_NO_BEST_FIT_CHARS
  1233. if (!(cv->flags & FLAG_TRANSLIT))
  1234. flags |= WC_NO_BEST_FIT_CHARS;
  1235. #endif
  1236. }
  1237. len = WideCharToMultiByte(cv->codepage, flags,
  1238. (const wchar_t *)wbuf, wbufsize, (char *)buf, bufsize, NULL, p);
  1239. if (len == 0)
  1240. {
  1241. if (GetLastError() == ERROR_INSUFFICIENT_BUFFER)
  1242. return seterror(E2BIG);
  1243. return seterror(EILSEQ);
  1244. }
  1245. else if (usedDefaultChar && !(cv->flags & FLAG_TRANSLIT))
  1246. return seterror(EILSEQ);
  1247. else if (cv->mblen(cv, buf, len) != len) /* validate result */
  1248. return seterror(EILSEQ);
  1249. return len;
  1250. }
  1251. /*
  1252. * It seems that the mode (cv->mode) is fixnum.
  1253. * For example, when converting iso-2022-jp(cp50221) to unicode:
  1254. * in ascii sequence: mode=0xC42C0000
  1255. * in jisx0208 sequence: mode=0xC42C0001
  1256. * "C42C" is same for each convert session.
  1257. * It should be: ((codepage-1)<<16)|state
  1258. */
  1259. static int
  1260. mlang_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize)
  1261. {
  1262. int len;
  1263. int insize;
  1264. HRESULT hr;
  1265. len = cv->mblen(cv, buf, bufsize);
  1266. if (len == -1)
  1267. return -1;
  1268. insize = len;
  1269. hr = ConvertINetMultiByteToUnicode(&cv->mode, cv->codepage,
  1270. (const char *)buf, &insize, (wchar_t *)wbuf, wbufsize);
  1271. if (hr != S_OK || insize != len)
  1272. return seterror(EILSEQ);
  1273. return len;
  1274. }
  1275. static int
  1276. mlang_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize)
  1277. {
  1278. char tmpbuf[MB_CHAR_MAX]; /* enough room for one character */
  1279. int tmpsize = MB_CHAR_MAX;
  1280. int insize = wbufsize;
  1281. HRESULT hr;
  1282. hr = ConvertINetUnicodeToMultiByte(&cv->mode, cv->codepage,
  1283. (const wchar_t *)wbuf, &wbufsize, tmpbuf, &tmpsize);
  1284. if (hr != S_OK || insize != wbufsize)
  1285. return seterror(EILSEQ);
  1286. else if (bufsize < tmpsize)
  1287. return seterror(E2BIG);
  1288. else if (cv->mblen(cv, (uchar *)tmpbuf, tmpsize) != tmpsize)
  1289. return seterror(EILSEQ);
  1290. memcpy(buf, tmpbuf, tmpsize);
  1291. return tmpsize;
  1292. }
  1293. static int
  1294. utf16_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize)
  1295. {
  1296. int codepage = cv->codepage;
  1297. /* swap endian: 1200 <-> 1201 */
  1298. if (cv->mode & UNICODE_MODE_SWAPPED)
  1299. codepage ^= 1;
  1300. if (bufsize < 2)
  1301. return seterror(EINVAL);
  1302. if (codepage == 1200) /* little endian */
  1303. wbuf[0] = (buf[1] << 8) | buf[0];
  1304. else if (codepage == 1201) /* big endian */
  1305. wbuf[0] = (buf[0] << 8) | buf[1];
  1306. if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE))
  1307. {
  1308. cv->mode |= UNICODE_MODE_BOM_DONE;
  1309. if (wbuf[0] == 0xFFFE)
  1310. {
  1311. cv->mode |= UNICODE_MODE_SWAPPED;
  1312. *wbufsize = 0;
  1313. return 2;
  1314. }
  1315. else if (wbuf[0] == 0xFEFF)
  1316. {
  1317. *wbufsize = 0;
  1318. return 2;
  1319. }
  1320. }
  1321. if (0xDC00 <= wbuf[0] && wbuf[0] <= 0xDFFF)
  1322. return seterror(EILSEQ);
  1323. if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF)
  1324. {
  1325. if (bufsize < 4)
  1326. return seterror(EINVAL);
  1327. if (codepage == 1200) /* little endian */
  1328. wbuf[1] = (buf[3] << 8) | buf[2];
  1329. else if (codepage == 1201) /* big endian */
  1330. wbuf[1] = (buf[2] << 8) | buf[3];
  1331. if (!(0xDC00 <= wbuf[1] && wbuf[1] <= 0xDFFF))
  1332. return seterror(EILSEQ);
  1333. *wbufsize = 2;
  1334. return 4;
  1335. }
  1336. *wbufsize = 1;
  1337. return 2;
  1338. }
  1339. static int
  1340. utf16_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize)
  1341. {
  1342. if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE))
  1343. {
  1344. int r;
  1345. cv->mode |= UNICODE_MODE_BOM_DONE;
  1346. if (bufsize < 2)
  1347. return seterror(E2BIG);
  1348. if (cv->codepage == 1200) /* little endian */
  1349. memcpy(buf, "\xFF\xFE", 2);
  1350. else if (cv->codepage == 1201) /* big endian */
  1351. memcpy(buf, "\xFE\xFF", 2);
  1352. r = utf16_wctomb(cv, wbuf, wbufsize, buf + 2, bufsize - 2);
  1353. if (r == -1)
  1354. return -1;
  1355. return r + 2;
  1356. }
  1357. if (bufsize < 2)
  1358. return seterror(E2BIG);
  1359. if (cv->codepage == 1200) /* little endian */
  1360. {
  1361. buf[0] = (wbuf[0] & 0x00FF);
  1362. buf[1] = (wbuf[0] & 0xFF00) >> 8;
  1363. }
  1364. else if (cv->codepage == 1201) /* big endian */
  1365. {
  1366. buf[0] = (wbuf[0] & 0xFF00) >> 8;
  1367. buf[1] = (wbuf[0] & 0x00FF);
  1368. }
  1369. if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF)
  1370. {
  1371. if (bufsize < 4)
  1372. return seterror(E2BIG);
  1373. if (cv->codepage == 1200) /* little endian */
  1374. {
  1375. buf[2] = (wbuf[1] & 0x00FF);
  1376. buf[3] = (wbuf[1] & 0xFF00) >> 8;
  1377. }
  1378. else if (cv->codepage == 1201) /* big endian */
  1379. {
  1380. buf[2] = (wbuf[1] & 0xFF00) >> 8;
  1381. buf[3] = (wbuf[1] & 0x00FF);
  1382. }
  1383. return 4;
  1384. }
  1385. return 2;
  1386. }
  1387. static int
  1388. utf32_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize)
  1389. {
  1390. int codepage = cv->codepage;
  1391. uint wc;
  1392. /* swap endian: 12000 <-> 12001 */
  1393. if (cv->mode & UNICODE_MODE_SWAPPED)
  1394. codepage ^= 1;
  1395. if (bufsize < 4)
  1396. return seterror(EINVAL);
  1397. if (codepage == 12000) /* little endian */
  1398. wc = (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | buf[0];
  1399. else if (codepage == 12001) /* big endian */
  1400. wc = (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
  1401. if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE))
  1402. {
  1403. cv->mode |= UNICODE_MODE_BOM_DONE;
  1404. if (wc == 0xFFFE0000)
  1405. {
  1406. cv->mode |= UNICODE_MODE_SWAPPED;
  1407. *wbufsize = 0;
  1408. return 4;
  1409. }
  1410. else if (wc == 0x0000FEFF)
  1411. {
  1412. *wbufsize = 0;
  1413. return 4;
  1414. }
  1415. }
  1416. if ((0xD800 <= wc && wc <= 0xDFFF) || 0x10FFFF < wc)
  1417. return seterror(EILSEQ);
  1418. ucs4_to_utf16(wc, wbuf, wbufsize);
  1419. return 4;
  1420. }
  1421. static int
  1422. utf32_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize)
  1423. {
  1424. uint wc;
  1425. if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE))
  1426. {
  1427. int r;
  1428. cv->mode |= UNICODE_MODE_BOM_DONE;
  1429. if (bufsize < 4)
  1430. return seterror(E2BIG);
  1431. if (cv->codepage == 12000) /* little endian */
  1432. memcpy(buf, "\xFF\xFE\x00\x00", 4);
  1433. else if (cv->codepage == 12001) /* big endian */
  1434. memcpy(buf, "\x00\x00\xFE\xFF", 4);
  1435. r = utf32_wctomb(cv, wbuf, wbufsize, buf + 4, bufsize - 4);
  1436. if (r == -1)
  1437. return -1;
  1438. return r + 4;
  1439. }
  1440. if (bufsize < 4)
  1441. return seterror(E2BIG);
  1442. wc = utf16_to_ucs4(wbuf);
  1443. if (cv->codepage == 12000) /* little endian */
  1444. {
  1445. buf[0] = wc & 0x000000FF;
  1446. buf[1] = (wc & 0x0000FF00) >> 8;
  1447. buf[2] = (wc & 0x00FF0000) >> 16;
  1448. buf[3] = (wc & 0xFF000000) >> 24;
  1449. }
  1450. else if (cv->codepage == 12001) /* big endian */
  1451. {
  1452. buf[0] = (wc & 0xFF000000) >> 24;
  1453. buf[1] = (wc & 0x00FF0000) >> 16;
  1454. buf[2] = (wc & 0x0000FF00) >> 8;
  1455. buf[3] = wc & 0x000000FF;
  1456. }
  1457. return 4;
  1458. }
  1459. /*
  1460. * 50220: ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
  1461. * 50221: ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow
  1462. * 1 byte Kana)
  1463. * 50222: ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte
  1464. * Kana - SO/SI)
  1465. *
  1466. * MultiByteToWideChar() and WideCharToMultiByte() behave differently
  1467. * depending on Windows version. On XP, WideCharToMultiByte() doesn't
  1468. * terminate result sequence with ascii escape. But Vista does.
  1469. * Use MLang instead.
  1470. */
  /*
   * ISO-2022 conversion state as stored in cv->mode: the character set
   * id occupies bits 8..15 and the shift state the low 8 bits.
   */
  #define ISO2022_MODE(cs, shift) (((cs) << 8) | (shift))
  #define ISO2022_MODE_CS(mode) (((mode) >> 8) & 0xFF)
  #define ISO2022_MODE_SHIFT(mode) ((mode) & 0xFF)
  /* shift states */
  #define ISO2022_SI 0
  #define ISO2022_SO 1
  /* shift in: the single control byte 0x0F */
  static const char iso2022_SI_seq[] = "\x0F";
  /* shift out: the single control byte 0x0E */
  static const char iso2022_SO_seq[] = "\x0E";
  typedef struct iso2022_esc_t iso2022_esc_t;
  /* One escape sequence and the character set it selects. */
  struct iso2022_esc_t {
      const char *esc;    /* bytes of the escape sequence */
      int esc_len;        /* number of bytes in esc */
      int len;            /* bytes per character in this character set */
      int cs;             /* character set id (ISO2022JP_CS_*) */
  };
  /*
   * Character set ids.  They are also used as indices into
   * iso2022jp_esc[], so they must match the row order of that table.
   */
  #define ISO2022JP_CS_ASCII 0
  #define ISO2022JP_CS_JISX0201_ROMAN 1
  #define ISO2022JP_CS_JISX0201_KANA 2
  #define ISO2022JP_CS_JISX0208_1978 3
  #define ISO2022JP_CS_JISX0208_1983 4
  #define ISO2022JP_CS_JISX0212 5
  /*
   * Escape sequence table, terminated by an entry with esc == NULL.
   * The converters both scan it for a matching escape sequence and
   * index it directly by character set id, so do not reorder the rows.
   */
  static iso2022_esc_t iso2022jp_esc[] = {
      {"\x1B\x28\x42", 3, 1, ISO2022JP_CS_ASCII},
      {"\x1B\x28\x4A", 3, 1, ISO2022JP_CS_JISX0201_ROMAN},
      {"\x1B\x28\x49", 3, 1, ISO2022JP_CS_JISX0201_KANA},
      {"\x1B\x24\x40", 3, 2, ISO2022JP_CS_JISX0208_1983}, /* unify 1978 with 1983 */
      {"\x1B\x24\x42", 3, 2, ISO2022JP_CS_JISX0208_1983},
      {"\x1B\x24\x28\x44", 4, 2, ISO2022JP_CS_JISX0212},
      {NULL, 0, 0, 0}
  };
  1502. static int
  1503. iso2022jp_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize)
  1504. {
  1505. iso2022_esc_t *iesc = iso2022jp_esc;
  1506. char tmp[MB_CHAR_MAX];
  1507. int insize;
  1508. HRESULT hr;
  1509. DWORD dummy = 0;
  1510. int len;
  1511. int esc_len;
  1512. int cs;
  1513. int shift;
  1514. int i;
  1515. if (buf[0] == 0x1B)
  1516. {
  1517. for (i = 0; iesc[i].esc != NULL; ++i)
  1518. {
  1519. esc_len = iesc[i].esc_len;
  1520. if (bufsize < esc_len)
  1521. {
  1522. if (strncmp((char *)buf, iesc[i].esc, bufsize) == 0)
  1523. return seterror(EINVAL);
  1524. }
  1525. else
  1526. {
  1527. if (strncmp((char *)buf, iesc[i].esc, esc_len) == 0)
  1528. {
  1529. cv->mode = ISO2022_MODE(iesc[i].cs, ISO2022_SI);
  1530. *wbufsize = 0;
  1531. return esc_len;
  1532. }
  1533. }
  1534. }
  1535. /* not supported escape sequence */
  1536. return seterror(EILSEQ);
  1537. }
  1538. else if (buf[0] == iso2022_SO_seq[0])
  1539. {
  1540. cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SO);
  1541. *wbufsize = 0;
  1542. return 1;
  1543. }
  1544. else if (buf[0] == iso2022_SI_seq[0])
  1545. {
  1546. cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SI);
  1547. *wbufsize = 0;
  1548. return 1;
  1549. }
  1550. cs = ISO2022_MODE_CS(cv->mode);
  1551. shift = ISO2022_MODE_SHIFT(cv->mode);
  1552. /* reset the mode for informal sequence */
  1553. if (buf[0] < 0x20)
  1554. {
  1555. cs = ISO2022JP_CS_ASCII;
  1556. shift = ISO2022_SI;
  1557. }
  1558. len = iesc[cs].len;
  1559. if (bufsize < len)
  1560. return seterror(EINVAL);
  1561. for (i = 0; i < len; ++i)
  1562. if (!(buf[i] < 0x80))
  1563. return seterror(EILSEQ);
  1564. esc_len = iesc[cs].esc_len;
  1565. memcpy(tmp, iesc[cs].esc, esc_len);
  1566. if (shift == ISO2022_SO)
  1567. {
  1568. memcpy(tmp + esc_len, iso2022_SO_seq, 1);
  1569. esc_len += 1;
  1570. }
  1571. memcpy(tmp + esc_len, buf, len);
  1572. if ((cv->codepage == 50220 || cv->codepage == 50221
  1573. || cv->codepage == 50222) && shift == ISO2022_SO)
  1574. {
  1575. /* XXX: shift-out cannot be used for mbtowc (both kernel and
  1576. * mlang) */
  1577. esc_len = iesc[ISO2022JP_CS_JISX0201_KANA].esc_len;
  1578. memcpy(tmp, iesc[ISO2022JP_CS_JISX0201_KANA].esc, esc_len);
  1579. memcpy(tmp + esc_len, buf, len);
  1580. }
  1581. insize = len + esc_len;
  1582. hr = ConvertINetMultiByteToUnicode(&dummy, cv->codepage,
  1583. (const char *)tmp, &insize, (wchar_t *)wbuf, wbufsize);
  1584. if (hr != S_OK || insize != len + esc_len)
  1585. return seterror(EILSEQ);
  1586. /* Check for conversion error. Assuming defaultChar is 0x3F. */
  1587. /* ascii should be converted from ascii */
  1588. if (wbuf[0] == buf[0]
  1589. && cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI))
  1590. return seterror(EILSEQ);
  1591. /* reset the mode for informal sequence */
  1592. if (cv->mode != ISO2022_MODE(cs, shift))
  1593. cv->mode = ISO2022_MODE(cs, shift);
  1594. return len;
  1595. }
  1596. static int
  1597. iso2022jp_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize)
  1598. {
  1599. iso2022_esc_t *iesc = iso2022jp_esc;
  1600. char tmp[MB_CHAR_MAX];
  1601. int tmpsize = MB_CHAR_MAX;
  1602. int insize = wbufsize;
  1603. HRESULT hr;
  1604. DWORD dummy = 0;
  1605. int len;
  1606. int esc_len;
  1607. int cs;
  1608. int shift;
  1609. int i;
  1610. /*
  1611. * MultiByte = [escape sequence] + character + [escape sequence]
  1612. *
  1613. * Whether trailing escape sequence is added depends on which API is
  1614. * used (kernel or MLang, and its version).
  1615. */
  1616. hr = ConvertINetUnicodeToMultiByte(&dummy, cv->codepage,
  1617. (const wchar_t *)wbuf, &wbufsize, tmp, &tmpsize);
  1618. if (hr != S_OK || insize != wbufsize)
  1619. return seterror(EILSEQ);
  1620. else if (bufsize < tmpsize)
  1621. return seterror(E2BIG);
  1622. if (tmpsize == 1)
  1623. {
  1624. cs = ISO2022JP_CS_ASCII;
  1625. esc_len = 0;
  1626. }
  1627. else
  1628. {
  1629. for (i = 1; iesc[i].esc != NULL; ++i)
  1630. {
  1631. esc_len = iesc[i].esc_len;
  1632. if (strncmp(tmp, iesc[i].esc, esc_len) == 0)
  1633. {
  1634. cs = iesc[i].cs;
  1635. break;
  1636. }
  1637. }
  1638. if (iesc[i].esc == NULL)
  1639. /* not supported escape sequence */
  1640. return seterror(EILSEQ);
  1641. }
  1642. shift = ISO2022_SI;
  1643. if (tmp[esc_len] == iso2022_SO_seq[0])
  1644. {
  1645. shift = ISO2022_SO;
  1646. esc_len += 1;
  1647. }
  1648. len = iesc[cs].len;
  1649. /* Check for converting error. Assuming defaultChar is 0x3F. */
  1650. /* ascii should be converted from ascii */
  1651. if (cs == ISO2022JP_CS_ASCII && !(wbuf[0] < 0x80))
  1652. return seterror(EILSEQ);
  1653. else if (tmpsize < esc_len + len)
  1654. return seterror(EILSEQ);
  1655. if (cv->mode == ISO2022_MODE(cs, shift))
  1656. {
  1657. /* remove escape sequence */
  1658. if (esc_len != 0)
  1659. memmove(tmp, tmp + esc_len, len);
  1660. esc_len = 0;
  1661. }
  1662. else
  1663. {
  1664. if (cs == ISO2022JP_CS_ASCII)
  1665. {
  1666. esc_len = iesc[ISO2022JP_CS_ASCII].esc_len;
  1667. memmove(tmp + esc_len, tmp, len);
  1668. memcpy(tmp, iesc[ISO2022JP_CS_ASCII].esc, esc_len);
  1669. }
  1670. if (ISO2022_MODE_SHIFT(cv->mode) == ISO2022_SO)
  1671. {
  1672. /* shift-in before changing to other mode */
  1673. memmove(tmp + 1, tmp, len + esc_len);
  1674. memcpy(tmp, iso2022_SI_seq, 1);
  1675. esc_len += 1;
  1676. }
  1677. }
  1678. if (bufsize < len + esc_len)
  1679. return seterror(E2BIG);
  1680. memcpy(buf, tmp, len + esc_len);
  1681. cv->mode = ISO2022_MODE(cs, shift);
  1682. return len + esc_len;
  1683. }
  1684. static int
  1685. iso2022jp_flush(csconv_t *cv, uchar *buf, int bufsize)
  1686. {
  1687. iso2022_esc_t *iesc = iso2022jp_esc;
  1688. int esc_len;
  1689. if (cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI))
  1690. {
  1691. esc_len = 0;
  1692. if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI)
  1693. esc_len += 1;
  1694. if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII)
  1695. esc_len += iesc[ISO2022JP_CS_ASCII].esc_len;
  1696. if (bufsize < esc_len)
  1697. return seterror(E2BIG);
  1698. esc_len = 0;
  1699. if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI)
  1700. {
  1701. memcpy(buf, iso2022_SI_seq, 1);
  1702. esc_len += 1;
  1703. }
  1704. if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII)
  1705. {
  1706. memcpy(buf + esc_len, iesc[ISO2022JP_CS_ASCII].esc,
  1707. iesc[ISO2022JP_CS_ASCII].esc_len);
  1708. esc_len += iesc[ISO2022JP_CS_ASCII].esc_len;
  1709. }
  1710. return esc_len;
  1711. }
  1712. return 0;
  1713. }
  1714. #if defined(MAKE_DLL) && defined(USE_LIBICONV_DLL)
  1715. BOOL WINAPI
  1716. DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpReserved)
  1717. {
  1718. switch( fdwReason )
  1719. {
  1720. case DLL_PROCESS_ATTACH:
  1721. hwiniconv = (HMODULE)hinstDLL;
  1722. break;
  1723. case DLL_THREAD_ATTACH:
  1724. case DLL_THREAD_DETACH:
  1725. case DLL_PROCESS_DETACH:
  1726. break;
  1727. }
  1728. return TRUE;
  1729. }
  1730. #endif
  1731. #if defined(MAKE_EXE)
  1732. #include <stdio.h>
  1733. #include <fcntl.h>
  1734. #include <io.h>
  1735. int
  1736. main(int argc, char **argv)
  1737. {
  1738. char *fromcode = NULL;
  1739. char *tocode = NULL;
  1740. int i;
  1741. char inbuf[BUFSIZ];
  1742. char outbuf[BUFSIZ];
  1743. char *pin;
  1744. char *pout;
  1745. size_t inbytesleft;
  1746. size_t outbytesleft;
  1747. size_t rest = 0;
  1748. iconv_t cd;
  1749. size_t r;
  1750. FILE *in = stdin;
  1751. int ignore = 0;
  1752. char *p;
  1753. _setmode(_fileno(stdin), _O_BINARY);
  1754. _setmode(_fileno(stdout), _O_BINARY);
  1755. for (i = 1; i < argc; ++i)
  1756. {
  1757. if (strcmp(argv[i], "-l") == 0)
  1758. {
  1759. for (i = 0; codepage_alias[i].name != NULL; ++i)
  1760. printf("%s\n", codepage_alias[i].name);
  1761. return 0;
  1762. }
  1763. if (strcmp(argv[i], "-f") == 0)
  1764. fromcode = argv[++i];
  1765. else if (strcmp(argv[i], "-t") == 0)
  1766. tocode = argv[++i];
  1767. else if (strcmp(argv[i], "-c") == 0)
  1768. ignore = 1;
  1769. else
  1770. {
  1771. in = fopen(argv[i], "rb");
  1772. if (in == NULL)
  1773. {
  1774. fprintf(stderr, "cannot open %s\n", argv[i]);
  1775. return 1;
  1776. }
  1777. break;
  1778. }
  1779. }
  1780. if (fromcode == NULL || tocode == NULL)
  1781. {
  1782. printf("usage: %s [-c] -f from-enc -t to-enc [file]\n", argv[0]);
  1783. return 0;
  1784. }
  1785. if (ignore)
  1786. {
  1787. p = tocode;
  1788. tocode = (char *)malloc(strlen(p) + strlen("//IGNORE") + 1);
  1789. if (tocode == NULL)
  1790. {
  1791. perror("fatal error");
  1792. return 1;
  1793. }
  1794. strcpy(tocode, p);
  1795. strcat(tocode, "//IGNORE");
  1796. }
  1797. cd = iconv_open(tocode, fromcode);
  1798. if (cd == (iconv_t)(-1))
  1799. {
  1800. perror("iconv_open error");
  1801. return 1;
  1802. }
  1803. while ((inbytesleft = fread(inbuf + rest, 1, sizeof(inbuf) - rest, in)) != 0
  1804. || rest != 0)
  1805. {
  1806. inbytesleft += rest;
  1807. pin = inbuf;
  1808. pout = outbuf;
  1809. outbytesleft = sizeof(outbuf);
  1810. r = iconv(cd, &pin, &inbytesleft, &pout, &outbytesleft);
  1811. fwrite(outbuf, 1, sizeof(outbuf) - outbytesleft, stdout);
  1812. if (r == (size_t)(-1) && errno != E2BIG && (errno != EINVAL || feof(in)))
  1813. {
  1814. perror("conversion error");
  1815. return 1;
  1816. }
  1817. memmove(inbuf, pin, inbytesleft);
  1818. rest = inbytesleft;
  1819. }
  1820. pout = outbuf;
  1821. outbytesleft = sizeof(outbuf);
  1822. r = iconv(cd, NULL, NULL, &pout, &outbytesleft);
  1823. fwrite(outbuf, 1, sizeof(outbuf) - outbytesleft, stdout);
  1824. if (r == (size_t)(-1))
  1825. {
  1826. perror("conversion error");
  1827. return 1;
  1828. }
  1829. iconv_close(cd);
  1830. return 0;
  1831. }
  1832. #endif