utf8_to_ucs2.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378
  1. /**************************************************************************
  2. * Copyright (C), AirM2M Tech. Co., Ltd.
  3. *
  4. * Name: utf8_to_ucs2.c
  5. * Author: liweiqiang
  6. * Version: V0.1
  7. * Date: 2013/7/19
  8. *
  9. * Description:
  10. * utf8 转换为 ucs2 le/be
  11. **************************************************************************/
  12. #include "stdio.h"
  13. #include "errno.h"
  14. #if 0
  /*****************************************************************************
   * Encode one Unicode code point (UCS-2 or UCS-4 value) as UTF-8.
   *
   * Parameters:
   *   unic     code point to encode, 0 .. 0x7FFFFFFF
   *   pOutput  buffer that receives the UTF-8 bytes
   *   outSize  capacity of pOutput in bytes; it only has to be as large as the
   *            encoding of this particular code point (1 to 6 bytes)
   *
   * Returns:
   *   The number of UTF-8 bytes written, or 0 on error (NULL buffer, buffer
   *   too small, or unic above 0x7FFFFFFF). Nothing is written on error.
   *
   * Notes:
   *   The classic 1..6 byte scheme is used, so values above U+10FFFF are still
   *   encoded (as 4, 5 or 6 bytes). Surrogate values are not rejected.
   ****************************************************************************/
  static int enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput,
                                     int outSize)
  {
      /* Marker bits of the lead byte, indexed by the total sequence length. */
      static const unsigned char lead_mark[7] = {
          0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
      };
      int len;
      int i;

      if (pOutput == NULL)
          return 0;

      if (unic <= 0x0000007FUL)
          len = 1;        /* 0xxxxxxx */
      else if (unic <= 0x000007FFUL)
          len = 2;        /* 110xxxxx 10xxxxxx */
      else if (unic <= 0x0000FFFFUL)
          len = 3;        /* 1110xxxx 10xxxxxx 10xxxxxx */
      else if (unic <= 0x001FFFFFUL)
          len = 4;        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
      else if (unic <= 0x03FFFFFFUL)
          len = 5;        /* 111110xx followed by four 10xxxxxx bytes */
      else if (unic <= 0x7FFFFFFFUL)
          len = 6;        /* 1111110x followed by five 10xxxxxx bytes */
      else
          return 0;

      if (outSize < len)
          return 0;

      if (len == 1)
      {
          pOutput[0] = (unsigned char)unic;
          return 1;
      }

      /* Fill the continuation bytes from the end, six payload bits each. */
      for (i = len - 1; i > 0; i--)
      {
          pOutput[i] = (unsigned char)((unic & 0x3F) | 0x80);
          unic >>= 6;
      }

      /* The bits left over fit below the marker of the lead byte. */
      pOutput[0] = (unsigned char)(lead_mark[len] | unic);
      return len;
  }
  90. #endif
  /*
   * Return the number of leading 1 bits in byte c, with 0 mapped to 1.
   *
   * For a UTF-8 lead byte this is the length of the sequence it starts:
   * 0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4, and so on up
   * to 8 for 0xFF. A continuation byte (10xxxxxx) also yields 1, so callers
   * treat it like a single-byte character.
   *
   * The byte is examined as unsigned char: where plain char is signed, bytes
   * >= 0x80 are negative and left-shifting a negative value is undefined
   * behaviour.
   */
  static int enc_get_utf8_size(char c)
  {
      unsigned char bits = (unsigned char)c;
      int count = 0;

      while (bits & 0x80u)
      {
          bits = (unsigned char)(bits << 1);
          count++;
      }
      return count == 0 ? 1 : count;
  }
  /*****************************************************************************
   * Decode one UTF-8 encoded character into a single 2-byte UCS-2 unit.
   *
   * Parameters:
   *   pInput   points at the lead byte of the UTF-8 sequence. The function has
   *            no length argument: the caller must guarantee that every byte
   *            of the sequence announced by the lead byte is readable.
   *   pOutput  receives exactly two bytes
   *   endian   non-zero - big endian (high byte first)
   *            0        - little endian (low byte first)
   *
   * Returns:
   *   2 (the number of bytes written to pOutput) on success, -1 on failure.
   *   Failure means a continuation byte is not of the form 10xxxxxx, or the
   *   lead byte starts a sequence of four or more bytes, which cannot be
   *   represented in UCS-2.
   *
   * Notes:
   *   - Bytes 0x80..0xBF in lead position are copied through like single-byte
   *     characters rather than rejected.
   *   - Overlong forms and surrogate values are not rejected.
   *   - All arithmetic is done on unsigned values; shifting a negative plain
   *     char left is undefined behaviour.
   ****************************************************************************/
  static int enc_utf8_to_unicode_one(const char* pInput, char* pOutput, int endian)
  {
      const unsigned char *in = (const unsigned char *)pInput;
      const unsigned char lead = in[0];
      unsigned int code;
      char hi;
      char lo;

      if (lead < 0xC0)
      {
          /* 0xxxxxxx, or a stray 10xxxxxx byte passed through unchanged */
          code = lead;
      }
      else if ((lead & 0xE0) == 0xC0)
      {
          /* 110xxxxx 10xxxxxx, e.g. the middle dot U+00B7 is C2 B7 */
          if ((in[1] & 0xC0) != 0x80)
              return -1;
          code = ((unsigned int)(lead & 0x1F) << 6) | (unsigned int)(in[1] & 0x3F);
      }
      else if ((lead & 0xF0) == 0xE0)
      {
          /* 1110xxxx 10xxxxxx 10xxxxxx; byte 3 is only read if byte 2 is valid */
          if ((in[1] & 0xC0) != 0x80)
              return -1;
          if ((in[2] & 0xC0) != 0x80)
              return -1;
          code = ((unsigned int)(lead & 0x0F) << 12)
               | ((unsigned int)(in[1] & 0x3F) << 6)
               | (unsigned int)(in[2] & 0x3F);
      }
      else
      {
          /* four or more bytes: outside the range of UCS-2 */
          return -1;
      }

      hi = (char)(code >> 8);
      lo = (char)(code & 0xFF);

      if (endian)
      {
          pOutput[0] = hi;
          pOutput[1] = lo;
      }
      else
      {
          pOutput[0] = lo;
          pOutput[1] = hi;
      }
      return 2;
  }
  /*
   * Convert a UTF-8 buffer to UCS-2, iconv()-style.
   *
   * Parameters:
   *   _inbuf        in/out: start of the UTF-8 input; advanced past the bytes
   *                 that were converted
   *   inbytesleft   in/out: input bytes available; reduced by the bytes consumed
   *   _outbuf       start of the output buffer. The pointer itself is NOT
   *                 advanced; only *outbytesleft reflects how much was written.
   *   outbytesleft  in/out: output capacity in bytes; reduced by the bytes written
   *   endian        non-zero - big endian output, 0 - little endian output
   *
   * Returns:
   *   0 on success, (size_t)-1 on failure with errno set:
   *     E2BIG   fewer than 2 bytes of output space remain
   *     EINVAL  invalid UTF-8 sequence, a sequence that cannot be represented
   *             in UCS-2, or a multi-byte sequence cut off by the end of input
   *   On failure the in/out arguments describe everything converted before
   *   the offending character.
   */
  static size_t enc_utf8_to_unicode(char **_inbuf, size_t *inbytesleft, char **_outbuf, size_t *outbytesleft, int endian)
  {
      const size_t in_len = *inbytesleft;
      const size_t out_cap = *outbytesleft;
      const char *src = *_inbuf;
      char *dst = *_outbuf;
      size_t in_pos = 0;
      size_t out_pos = 0;
      int result = 0;

      while (in_pos < in_len)
      {
          size_t seq_len;
          int written;

          /* every character produces exactly two output bytes */
          if (out_cap - out_pos < 2)
          {
              errno = E2BIG;
              result = -1;
              break;
          }

          seq_len = (size_t)enc_get_utf8_size(src[in_pos]);

          /* The single-character decoder reads the whole sequence without
           * knowing the buffer length, so a truncated tail must be rejected
           * here to avoid reading past the end of the input. */
          if (seq_len > in_len - in_pos)
          {
              errno = EINVAL;
              result = -1;
              break;
          }

          written = enc_utf8_to_unicode_one(&src[in_pos], &dst[out_pos], endian);
          if (written < 0)
          {
              errno = EINVAL;
              result = -1;
              break;
          }

          out_pos += (size_t)written;
          in_pos += seq_len;
      }

      *inbytesleft -= in_pos;
      *_inbuf += in_pos;
      *outbytesleft -= out_pos;
      return (size_t)result;
  }
  /*
   * UTF-8 to little-endian UCS-2: forwards all four iconv-style arguments to
   * enc_utf8_to_unicode() with the endian flag set to 0 and returns its result
   * unchanged.
   */
  size_t iconv_utf8_to_ucs2(char **_inbuf, size_t *inbytesleft, char **_outbuf, size_t *outbytesleft)
  {
      const int endian_flag = 0; /* 0 selects little-endian output */
      size_t status;

      status = enc_utf8_to_unicode(_inbuf, inbytesleft, _outbuf, outbytesleft, endian_flag);
      return status;
  }
  /*
   * UTF-8 to big-endian UCS-2: forwards all four iconv-style arguments to
   * enc_utf8_to_unicode() with the endian flag set to 1 and returns its result
   * unchanged.
   */
  size_t iconv_utf8_to_ucs2be(char **_inbuf, size_t *inbytesleft, char **_outbuf, size_t *outbytesleft)
  {
      const int endian_flag = 1; /* 1 selects big-endian output */
      size_t status;

      status = enc_utf8_to_unicode(_inbuf, inbytesleft, _outbuf, outbytesleft, endian_flag);
      return status;
  }
  /*
   * Convert a UCS-2 buffer to UTF-8, iconv()-style.
   *
   * Parameters:
   *   _inbuf        in/out: start of the UCS-2 input; advanced past the bytes
   *                 that were converted
   *   inbytesleft   in/out: input bytes available; reduced by the bytes consumed
   *   _outbuf       start of the output buffer. The pointer itself is NOT
   *                 advanced; only *outbytesleft reflects how much was written.
   *   outbytesleft  in/out: output capacity in bytes; reduced by the bytes written
   *   endian        1 - input is big endian; any other value - little endian
   *
   * Returns:
   *   0 on success, (size_t)-1 on failure with errno set:
   *     E2BIG   the next character does not fit in the remaining output space
   *     EINVAL  the input ends with a single dangling byte (half a UCS-2 unit);
   *             all complete units before it have been converted
   *
   * Notes:
   *   Each 16-bit unit is encoded on its own as 1, 2 or 3 UTF-8 bytes;
   *   surrogate values get no special treatment.
   */
  static size_t enc_unicode_to_utf8(char **_inbuf, size_t *inbytesleft, char **_outbuf, size_t *outbytesleft, int endian)
  {
      const size_t in_len = *inbytesleft;
      const size_t out_cap = *outbytesleft;
      const unsigned char *src = (const unsigned char *)*_inbuf;
      char *dst = *_outbuf;
      size_t in_pos = 0;
      size_t out_pos = 0;
      int result = 0;

      while (in_len - in_pos >= 2)
      {
          const unsigned int first = src[in_pos];
          const unsigned int second = src[in_pos + 1];
          const unsigned int code = (endian == 1) ? ((first << 8) | second)
                                                  : ((second << 8) | first);
          size_t need;

          if (code <= 0x7F)
              need = 1;
          else if (code <= 0x07FF)
              need = 2;
          else
              need = 3;

          if (need > out_cap - out_pos)
          {
              errno = E2BIG;
              result = -1;
              break;
          }

          switch (need)
          {
          case 1:
              /* 0xxxxxxx */
              dst[out_pos] = (char)code;
              break;
          case 2:
              /* 110xxxxx 10xxxxxx */
              dst[out_pos]     = (char)(0xC0 | (code >> 6));
              dst[out_pos + 1] = (char)(0x80 | (code & 0x3F));
              break;
          default:
              /* 1110xxxx 10xxxxxx 10xxxxxx */
              dst[out_pos]     = (char)(0xE0 | (code >> 12));
              dst[out_pos + 1] = (char)(0x80 | ((code >> 6) & 0x3F));
              dst[out_pos + 2] = (char)(0x80 | (code & 0x3F));
              break;
          }

          in_pos += 2;
          out_pos += need;
      }

      /* One byte left over is an incomplete UCS-2 unit; report it instead of
       * returning success with unconverted input. */
      if (result == 0 && in_pos < in_len)
      {
          errno = EINVAL;
          result = -1;
      }

      *inbytesleft -= in_pos;
      *_inbuf += in_pos;
      *outbytesleft -= out_pos;
      return (size_t)result;
  }
  /*
   * Little-endian UCS-2 to UTF-8: forwards all four iconv-style arguments to
   * enc_unicode_to_utf8() with the endian flag set to 0 and returns its result
   * unchanged.
   */
  size_t iconv_ucs2_to_utf8(char **_inbuf, size_t *inbytesleft, char **_outbuf, size_t *outbytesleft)
  {
      const int endian_flag = 0; /* 0 selects little-endian input */
      size_t status;

      status = enc_unicode_to_utf8(_inbuf, inbytesleft, _outbuf, outbytesleft, endian_flag);
      return status;
  }
  /*
   * Big-endian UCS-2 to UTF-8: forwards all four iconv-style arguments to
   * enc_unicode_to_utf8() with the endian flag set to 1 and returns its result
   * unchanged.
   */
  size_t iconv_ucs2be_to_utf8(char **_inbuf, size_t *inbytesleft, char **_outbuf, size_t *outbytesleft)
  {
      const int endian_flag = 1; /* 1 selects big-endian input */
      size_t status;

      status = enc_unicode_to_utf8(_inbuf, inbytesleft, _outbuf, outbytesleft, endian_flag);
      return status;
  }