qrdectxt.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403
  1. /*Copyright (C) 2008-2009 Timothy B. Terriberry (tterribe@xiph.org)
  2. You can redistribute this library and/or modify it under the terms of the
  3. GNU Lesser General Public License as published by the Free Software
  4. Foundation; either version 2.1 of the License, or (at your option) any later
  5. version.*/
  6. #include <stdio.h>
  7. //#include <stdlib.h>
  8. #include <string.h>
  9. #include <iconv.h>
  10. #include "qrcode.h"
  11. #include "qrdec.h"
  12. #include "util.h"
  13. #include "image.h"
  14. #include "error.h"
  15. #include "img_scanner.h"
  /*Reports whether a byte string is pure 7-bit ASCII.
    _text: The bytes to inspect.
    _len:  The number of bytes to inspect; nothing is read when it is not
            positive.
    Return: 0 as soon as a byte with a value of 0x80 or above is seen,
             otherwise 1.*/
  static int text_is_ascii(const unsigned char *_text,int _len){
    int pos;
    pos=0;
    while(pos<_len){
      if(_text[pos]>0x7F)return 0;
      pos++;
    }
    return 1;
  }
  /*Reports whether a byte string avoids the 0x80...0x9F range.
    _text: The bytes to inspect.
    _len:  The number of bytes to inspect; nothing is read when it is not
            positive.
    Return: 0 as soon as a byte in [0x80,0xA0) is seen, otherwise 1.
    Historical note: the single-expression form of this range test was
     reported to be miscompiled by gcc 3.4.4 on ARM with any optimizations
     enabled.*/
  static int text_is_latin1(const unsigned char *_text,int _len){
    int pos;
    for(pos=0;pos<_len;pos++){
      unsigned char c;
      c=_text[pos];
      /*Bytes outside the rejected window are fine; keep scanning.*/
      if(c<0x80||c>=0xA0)continue;
      return 0;
    }
    return 1;
  }
  /*Moves a conversion descriptor to the front of a 3-entry list.
    _enc_list: The list to reorder in place.
    _enc:      The descriptor to promote.
    The first slot equal to _enc is located, the entries ahead of it are each
     shifted back by one slot (keeping their relative order), and _enc is
     stored in slot 0.
    The list is left untouched when _enc is not present.*/
  static void enc_list_mtf(iconv_t _enc_list[3],iconv_t _enc){
    int pos;
    /*Find the first slot that holds _enc.*/
    for(pos=0;pos<3&&_enc_list[pos]!=_enc;pos++);
    if(pos>=3)return;
    /*Slide the preceding entries back to open up slot 0.*/
    for(;pos>0;pos--)_enc_list[pos]=_enc_list[pos-1];
    _enc_list[0]=_enc;
  }
  39. //#include "assert.h"
  40. int qr_code_data_list_extract_text(const qr_code_data_list *_qrlist,
  41. zbar_image_scanner_t *iscn,
  42. zbar_image_t *img)
  43. {
  44. iconv_t sjis_cd;
  45. iconv_t utf8_cd;
  46. iconv_t latin1_cd;
  47. const qr_code_data *qrdata;
  48. int nqrdata;
  49. unsigned char *mark;
  50. // char **text;
  51. int ntext;
  52. int i;
  53. qrdata=_qrlist->qrdata;
  54. nqrdata=_qrlist->nqrdata;
  55. // text=(char **)malloc(nqrdata*sizeof(*text));
  56. mark=(unsigned char *)calloc(nqrdata,sizeof(*mark));
  57. ntext=0;
  58. /*This is the encoding the standard says is the default.*/
  59. latin1_cd=iconv_open_ext("UTF-8","ISO8859-1");
  60. /*But this one is often used, as well.*/
  61. sjis_cd=iconv_open_ext("UTF-8","SJIS");
  62. /*This is a trivial conversion just to check validity without extra code.*/
  63. utf8_cd=iconv_open_ext("UTF-8","UTF-8");
  64. for(i=0;i<nqrdata;i++)if(!mark[i]){
  65. const qr_code_data *qrdataj;
  66. const qr_code_data_entry *entry;
  67. iconv_t enc_list[3];
  68. iconv_t eci_cd;
  69. int sa[16];
  70. int sa_size;
  71. char *sa_text;
  72. size_t sa_ntext;
  73. size_t sa_ctext;
  74. int fnc1;
  75. int eci;
  76. int err;
  77. int j;
  78. int k;
  79. /*Step 0: Collect the other QR codes belonging to this S-A group.*/
  80. if(qrdata[i].sa_size){
  81. unsigned sa_parity;
  82. sa_size=qrdata[i].sa_size;
  83. sa_parity=qrdata[i].sa_parity;
  84. for(j=0;j<sa_size;j++)sa[j]=-1;
  85. for(j=i;j<nqrdata;j++)if(!mark[j]){
  86. /*TODO: We could also match version, ECC level, etc. if size and
  87. parity alone are too ambiguous.*/
  88. if(qrdata[j].sa_size==sa_size&&qrdata[j].sa_parity==sa_parity&&
  89. sa[qrdata[j].sa_index]<0){
  90. sa[qrdata[j].sa_index]=j;
  91. mark[j]=1;
  92. }
  93. }
  94. /*TODO: If the S-A group is complete, check the parity.*/
  95. }
  96. else{
  97. sa[0]=i;
  98. sa_size=1;
  99. }
  100. sa_ctext=0;
  101. fnc1=0;
  102. /*Step 1: Detect FNC1 markers and estimate the required buffer size.*/
  103. for(j=0;j<sa_size;j++)if(sa[j]>=0){
  104. qrdataj=qrdata+sa[j];
  105. for(k=0;k<qrdataj->nentries;k++){
  106. int shift;
  107. entry=qrdataj->entries+k;
  108. shift=0;
  109. switch(entry->mode){
  110. /*FNC1 applies to the entire code and ignores subsequent markers.*/
  111. case QR_MODE_FNC1_1ST:
  112. case QR_MODE_FNC1_2ND:fnc1=1;break;
  113. /*2 SJIS bytes will be at most 4 UTF-8 bytes.*/
  114. case QR_MODE_KANJI:shift++;
  115. /*We assume at most 4 UTF-8 bytes per input byte.
  116. I believe this is true for all the encodings we actually use.*/
  117. case QR_MODE_BYTE:shift++;
  118. default:{
  119. /*The remaining two modes are already valid UTF-8.*/
  120. if(QR_MODE_HAS_DATA(entry->mode)){
  121. sa_ctext+=entry->payload.data.len<<shift;
  122. }
  123. }break;
  124. }
  125. }
  126. }
  127. /*Step 2: Convert the entries.*/
  128. sa_text=(char *)malloc((sa_ctext+1)*sizeof(*sa_text));
  129. sa_ntext=0;
  130. eci=-1;
  131. enc_list[0]=sjis_cd;
  132. enc_list[1]=latin1_cd;
  133. enc_list[2]=utf8_cd;
  134. eci_cd=(iconv_t)-1;
  135. err=0;
  136. zbar_symbol_t *syms = NULL, **sym = &syms;
  137. for(j = 0; j < sa_size && !err; j++, sym = &(*sym)->next) {
  138. *sym = _zbar_image_scanner_alloc_sym(iscn, ZBAR_QRCODE, 0);
  139. (*sym)->datalen = sa_ntext;
  140. if(sa[j]<0){
  141. /* generic placeholder for unfinished results */
  142. (*sym)->type = ZBAR_PARTIAL;
  143. /*Skip all contiguous missing segments.*/
  144. for(j++;j<sa_size&&sa[j]<0;j++);
  145. /*If there aren't any more, stop.*/
  146. if(j>=sa_size)break;
  147. /* mark break in data */
  148. sa_text[sa_ntext++]='\0';
  149. (*sym)->datalen = sa_ntext;
  150. /* advance to next symbol */
  151. sym = &(*sym)->next;
  152. *sym = _zbar_image_scanner_alloc_sym(iscn, ZBAR_QRCODE, 0);
  153. }
  154. qrdataj=qrdata+sa[j];
  155. /* expose bounding box */
  156. sym_add_point(*sym, qrdataj->bbox[0][0], qrdataj->bbox[0][1]);
  157. sym_add_point(*sym, qrdataj->bbox[2][0], qrdataj->bbox[2][1]);
  158. sym_add_point(*sym, qrdataj->bbox[3][0], qrdataj->bbox[3][1]);
  159. sym_add_point(*sym, qrdataj->bbox[1][0], qrdataj->bbox[1][1]);
  160. for(k=0;k<qrdataj->nentries&&!err;k++){
  161. size_t inleft;
  162. size_t outleft;
  163. char *in;
  164. char *out;
  165. entry=qrdataj->entries+k;
  166. switch(entry->mode){
  167. case QR_MODE_NUM:{
  168. if(sa_ctext-sa_ntext>=(size_t)entry->payload.data.len){
  169. memcpy(sa_text+sa_ntext,entry->payload.data.buf,
  170. entry->payload.data.len*sizeof(*sa_text));
  171. sa_ntext+=entry->payload.data.len;
  172. }
  173. else err=1;
  174. }break;
  175. case QR_MODE_ALNUM:{
  176. char *p;
  177. in=(char *)entry->payload.data.buf;
  178. inleft=entry->payload.data.len;
  179. /*FNC1 uses '%' as an escape character.*/
  180. if(fnc1)for(;;){
  181. size_t plen;
  182. char c;
  183. p=memchr(in,'%',inleft*sizeof(*in));
  184. if(p==NULL)break;
  185. plen=p-in;
  186. if(sa_ctext-sa_ntext<plen+1)break;
  187. memcpy(sa_text+sa_ntext,in,plen*sizeof(*in));
  188. sa_ntext+=plen;
  189. /*Two '%'s is a literal '%'*/
  190. if(plen+1<inleft&&p[1]=='%'){
  191. c='%';
  192. plen++;
  193. p++;
  194. }
  195. /*One '%' is the ASCII group separator.*/
  196. else c=0x1D;
  197. sa_text[sa_ntext++]=c;
  198. inleft-=plen+1;
  199. in=p+1;
  200. }
  201. else p=NULL;
  202. if(p!=NULL||sa_ctext-sa_ntext<inleft)err=1;
  203. else{
  204. memcpy(sa_text+sa_ntext,in,inleft*sizeof(*sa_text));
  205. sa_ntext+=inleft;
  206. }
  207. }break;
  208. /*TODO: This will not handle a multi-byte sequence split between
  209. multiple data blocks.
  210. Does such a thing occur?
  211. Is it allowed?
  212. It requires copying buffers around to handle correctly.*/
  213. case QR_MODE_BYTE:{
  214. in=(char *)entry->payload.data.buf;
  215. inleft=entry->payload.data.len;
  216. out=sa_text+sa_ntext;
  217. outleft=sa_ctext-sa_ntext;
  218. /*If we have no specified encoding, attempt to auto-detect it.*/
  219. if(eci<0){
  220. int ei;
  221. /*First check for the UTF-8 BOM.*/
  222. if(inleft>=3&&
  223. in[0]==(char)0xEF&&in[1]==(char)0xBB&&in[2]==(char)0xBF){
  224. in+=3;
  225. inleft-=3;
  226. /*Actually try converting (to check validity).*/
  227. err=utf8_cd==(iconv_t)-1||
  228. iconv_ext(utf8_cd,&in,&inleft,&out,&outleft)==(size_t)-1;
  229. if(!err){
  230. sa_ntext=out-sa_text;
  231. enc_list_mtf(enc_list,utf8_cd);
  232. continue;
  233. }
  234. in=(char *)entry->payload.data.buf;
  235. inleft=entry->payload.data.len;
  236. out=sa_text+sa_ntext;
  237. outleft=sa_ctext-sa_ntext;
  238. }
  239. /*If the text is 8-bit clean, prefer UTF-8 over SJIS, since SJIS
  240. will corrupt the backslashes used for DoCoMo formats.*/
  241. else if(text_is_ascii((unsigned char *)in,inleft)){
  242. //enc_list_mtf(enc_list,utf8_cd);
  243. memcpy(sa_text+sa_ntext,
  244. entry->payload.data.buf,
  245. entry->payload.data.len*sizeof(*sa_text));
  246. sa_ntext+=entry->payload.data.len;
  247. break;
  248. }
  249. /*Try our list of encodings.*/
  250. for(ei=0;ei<3;ei++)if(enc_list[ei]!=(iconv_t)-1){
  251. /*According to the standard, ISO/IEC 8859-1 (one hyphen) is
  252. supposed to be used, but reality is not always so.
  253. It's got an invalid range that is used often with SJIS
  254. and UTF-8, though, which makes detection easier.
  255. However, iconv() does not properly reject characters in
  256. those ranges, since ISO-8859-1 (two hyphens) defines a
  257. number of seldom-used control code characters there.
  258. So if we see any of those characters, move this
  259. conversion to the end of the list.*/
  260. if(ei<2&&enc_list[ei]==latin1_cd&&
  261. !text_is_latin1((unsigned char *)in,inleft)){
  262. int ej;
  263. for(ej=ei+1;ej<3;ej++)enc_list[ej-1]=enc_list[ej];
  264. enc_list[2]=latin1_cd;
  265. }
  266. err=iconv_ext(enc_list[ei],&in,&inleft,&out,&outleft)==(size_t)-1;
  267. if(!err){
  268. sa_ntext=out-sa_text;
  269. enc_list_mtf(enc_list,enc_list[ei]);
  270. break;
  271. }
  272. in=(char *)entry->payload.data.buf;
  273. inleft=entry->payload.data.len;
  274. out=sa_text+sa_ntext;
  275. outleft=sa_ctext-sa_ntext;
  276. }
  277. }
  278. /*We were actually given a character set; use it.*/
  279. else{
  280. err=eci_cd==(iconv_t)-1||
  281. iconv_ext(eci_cd,&in,&inleft,&out,&outleft)==(size_t)-1;
  282. if(!err)sa_ntext=out-sa_text;
  283. }
  284. }break;
  285. /*Kanji mode always uses SJIS.*/
  286. case QR_MODE_KANJI:{
  287. in=(char *)entry->payload.data.buf;
  288. inleft=entry->payload.data.len;
  289. out=sa_text+sa_ntext;
  290. outleft=sa_ctext-sa_ntext;
  291. err=sjis_cd==(iconv_t)-1||
  292. iconv_ext(sjis_cd,&in,&inleft,&out,&outleft)==(size_t)-1;
  293. if(!err)sa_ntext=out-sa_text;
  294. }break;
  295. /*Check to see if a character set was specified.*/
  296. case QR_MODE_ECI:{
  297. const char *enc;
  298. char buf[16];
  299. unsigned cur_eci;
  300. cur_eci=entry->payload.eci;
  301. if(cur_eci<=QR_ECI_ISO8859_16&&cur_eci!=14){
  302. if(cur_eci!=QR_ECI_GLI0&&cur_eci!=QR_ECI_CP437){
  303. sprintf_(buf,"ISO8859-%i",QR_MAXI(cur_eci,3)-2);
  304. enc=buf;
  305. }
  306. /*Note that CP437 requires an iconv compiled with
  307. --enable-extra-encodings, and thus may not be available.*/
  308. else enc="CP437";
  309. }
  310. else if(cur_eci==QR_ECI_SJIS)enc="SJIS";
  311. /*Don't know what this ECI code specifies, but not an encoding that
  312. we recognize.*/
  313. else continue;
  314. eci=cur_eci;
  315. eci_cd=iconv_open_ext("UTF-8",enc);
  316. }break;
  317. /*Silence stupid compiler warnings.*/
  318. default:break;
  319. }
  320. }
  321. /*If eci should be reset between codes, do so.*/
  322. if(eci<=QR_ECI_GLI1){
  323. eci=-1;
  324. if(eci_cd!=(iconv_t)-1)iconv_close_ext(eci_cd);
  325. }
  326. }
  327. if(eci_cd!=(iconv_t)-1)iconv_close_ext(eci_cd);
  328. if(!err){
  329. sa_text[sa_ntext++]='\0';
  330. if(sa_ctext+1>sa_ntext){
  331. sa_text=(char *)realloc(sa_text,sa_ntext*sizeof(*sa_text));
  332. }
  333. zbar_symbol_t *sa_sym;
  334. if(sa_size == 1)
  335. sa_sym = syms;
  336. else {
  337. /* create "virtual" container symbol for composite result */
  338. sa_sym = _zbar_image_scanner_alloc_sym(iscn, ZBAR_QRCODE, 0);
  339. sa_sym->syms = _zbar_symbol_set_create();
  340. sa_sym->syms->head = syms;
  341. /* cheap out w/axis aligned bbox for now */
  342. int xmin = img->width, xmax = -2;
  343. int ymin = img->height, ymax = -2;
  344. /* fixup data references */
  345. for(; syms; syms = syms->next) {
  346. _zbar_symbol_refcnt(syms, 1);
  347. if(syms->type == ZBAR_PARTIAL)
  348. sa_sym->type = ZBAR_PARTIAL;
  349. else
  350. for(j = 0; j < syms->npts; j++) {
  351. int u = syms->pts[j].x;
  352. if(xmin >= u) xmin = u - 1;
  353. if(xmax <= u) xmax = u + 1;
  354. u = syms->pts[j].y;
  355. if(ymin >= u) ymin = u - 1;
  356. if(ymax <= u) ymax = u + 1;
  357. }
  358. syms->data = sa_text + syms->datalen;
  359. int next = (syms->next) ? syms->next->datalen : sa_ntext;
  360. assert(next > syms->datalen);
  361. syms->datalen = next - syms->datalen - 1;
  362. }
  363. if(xmax >= -1) {
  364. sym_add_point(sa_sym, xmin, ymin);
  365. sym_add_point(sa_sym, xmin, ymax);
  366. sym_add_point(sa_sym, xmax, ymax);
  367. sym_add_point(sa_sym, xmax, ymin);
  368. }
  369. }
  370. sa_sym->data = sa_text;
  371. sa_sym->data_alloc = sa_ntext;
  372. sa_sym->datalen = sa_ntext - 1;
  373. _zbar_image_scanner_add_sym(iscn, sa_sym);
  374. }
  375. else {
  376. _zbar_image_scanner_recycle_syms(iscn, syms);
  377. free(sa_text);
  378. }
  379. }
  380. if(utf8_cd!=(iconv_t)-1)iconv_close_ext(utf8_cd);
  381. if(sjis_cd!=(iconv_t)-1)iconv_close_ext(sjis_cd);
  382. if(latin1_cd!=(iconv_t)-1)iconv_close_ext(latin1_cd);
  383. free(mark);
  384. return ntext;
  385. }