diff options
Diffstat (limited to 'zbar/qrcode/qrdectxt.c')
-rw-r--r-- | zbar/qrcode/qrdectxt.c | 573 |
1 files changed, 573 insertions, 0 deletions
diff --git a/zbar/qrcode/qrdectxt.c b/zbar/qrcode/qrdectxt.c new file mode 100644 index 0000000..5676e33 --- /dev/null +++ b/zbar/qrcode/qrdectxt.c @@ -0,0 +1,573 @@ +/*Copyright (C) 2008-2010 Timothy B. Terriberry (tterribe@xiph.org) + You can redistribute this library and/or modify it under the terms of the + GNU Lesser General Public License as published by the Free Software + Foundation; either version 2.1 of the License, or (at your option) any later + version.*/ +#include "config.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <iconv.h> + +#include "decoder.h" +#include "error.h" +#include "image.h" +#include "img_scanner.h" +#include "qrcode.h" +#include "qrdec.h" +#include "util.h" + +#define ENC_LIST_SIZE 4 + +static int text_is_ascii(const unsigned char *_text, int _len) +{ + int i; + for (i = 0; i < _len; i++) + if (_text[i] >= 0x80) + return 0; + return 1; +} + +static int text_is_latin1(const unsigned char *_text, int _len) +{ + int i; + for (i = 0; i < _len; i++) { + /*The following line fails to compile correctly with gcc 3.4.4 on ARM with + any optimizations enabled.*/ + if (_text[i] >= 0x80 && _text[i] < 0xA0) + return 0; + } + return 1; +} + +static int text_is_big5(const unsigned char *_text, int _len) +{ + int i; + for (i = 0; i < _len; i++) { + if (_text[i] == 0xFF) + return 0; + else if (_text[i] >= 0x80) { // first byte is big5 + i++; + if (i >= _len) // second byte not exists + return 0; + if (_text[i] < 0x40 || (_text[i] > 0x7E && _text[i] < 0xA1) || + _text[i] > 0xFE) { // second byte not in range + return 0; + } + } else { // normal ascii encoding, it's okay + } + } + return 1; +} + +static void enc_list_mtf(iconv_t _enc_list[ENC_LIST_SIZE], iconv_t _enc) +{ + int i; + for (i = 0; i < ENC_LIST_SIZE; i++) + if (_enc_list[i] == _enc) { + int j; + for (j = i; j-- > 0;) + _enc_list[j + 1] = _enc_list[j]; + _enc_list[0] = _enc; + break; + } +} + +int qr_code_data_list_extract_text(const qr_code_data_list *_qrlist, + zbar_image_scanner_t *iscn, + zbar_image_t *img) +{ + iconv_t sjis_cd; + iconv_t utf8_cd; + iconv_t latin1_cd; + iconv_t big5_cd; + const qr_code_data *qrdata; + int nqrdata; + unsigned char *mark; + int ntext; + int i; + int raw_binary = 0; + zbar_image_scanner_get_config(iscn, ZBAR_QRCODE, ZBAR_CFG_BINARY, + &raw_binary); + qrdata = _qrlist->qrdata; + nqrdata = _qrlist->nqrdata; + mark = (unsigned char *)calloc(nqrdata, sizeof(*mark)); + ntext = 0; + /*This is the encoding the standard says is the default.*/ + latin1_cd = iconv_open("UTF-8", "ISO8859-1"); + /*But this one is often used, as well.*/ + sjis_cd = iconv_open("UTF-8", "SJIS"); + /*This is a trivial conversion just to check validity without extra code.*/ + utf8_cd = iconv_open("UTF-8", "UTF-8"); + /* add support for big5 encoding. */ + big5_cd = iconv_open("UTF-8", "BIG-5"); + for (i = 0; i < nqrdata; i++) + if (!mark[i]) { + const qr_code_data *qrdataj; + const qr_code_data_entry *entry; + iconv_t enc_list[ENC_LIST_SIZE]; + iconv_t eci_cd; + int sa[16]; + int sa_size; + char *sa_text; + size_t sa_ntext; + size_t sa_ctext; + int fnc1; + int fnc1_2ai; + int has_kanji; + int eci; + int err; + int j; + int k; + zbar_symbol_t *syms = NULL, **sym = &syms; + qr_point dir; + int horiz; + char *bytebuf_text; + size_t bytebuf_ntext; + + /*Step 0: Collect the other QR codes belonging to this S-A group.*/ + if (qrdata[i].sa_size) { + unsigned sa_parity; + sa_size = qrdata[i].sa_size; + sa_parity = qrdata[i].sa_parity; + for (j = 0; j < sa_size; j++) + sa[j] = -1; + for (j = i; j < nqrdata; j++) + if (!mark[j]) { + /*TODO: We could also match version, ECC level, etc. if size and + parity alone are too ambiguous.*/ + if (qrdata[j].sa_size == sa_size && + qrdata[j].sa_parity == sa_parity && + sa[qrdata[j].sa_index] < 0) { + sa[qrdata[j].sa_index] = j; + mark[j] = 1; + } + } + /*TODO: If the S-A group is complete, check the parity.*/ + } else { + sa[0] = i; + sa_size = 1; + } + + sa_ctext = 0; + fnc1 = 0; + fnc1_2ai = 0; + has_kanji = 0; + /*Step 1: Detect FNC1 markers and estimate the required buffer size.*/ + for (j = 0; j < sa_size; j++) + if (sa[j] >= 0) { + qrdataj = qrdata + sa[j]; + for (k = 0; k < qrdataj->nentries; k++) { + int shift; + entry = qrdataj->entries + k; + shift = 0; + switch (entry->mode) { + /*FNC1 applies to the entire code and ignores subsequent markers.*/ + case QR_MODE_FNC1_1ST: { + if (!fnc1) + fnc1 = MOD(ZBAR_MOD_GS1); + } break; + case QR_MODE_FNC1_2ND: { + if (!fnc1) { + fnc1 = MOD(ZBAR_MOD_AIM); + fnc1_2ai = entry->payload.ai; + sa_ctext += 2; + } + } break; + /*We assume at most 4 UTF-8 bytes per input byte. + I believe this is true for all the encodings we actually use.*/ + case QR_MODE_KANJI: + has_kanji = 1; + case QR_MODE_BYTE: + shift = 2; + default: { + /*The remaining two modes are already valid UTF-8.*/ + if (QR_MODE_HAS_DATA(entry->mode)) { + sa_ctext += entry->payload.data.len << shift; + } + } break; + } + } + } + + /*Step 2: Convert the entries.*/ + sa_text = (char *)malloc((sa_ctext + 1) * sizeof(*sa_text)); + sa_ntext = 0; + /*Add the encoded Application Indicator for FNC1 in the second position.*/ + if (fnc1 == MOD(ZBAR_MOD_AIM)) { + if (fnc1_2ai < 100) { + /*The Application Indicator is a 2-digit number.*/ + sa_text[sa_ntext++] = '0' + fnc1_2ai / 10; + sa_text[sa_ntext++] = '0' + fnc1_2ai % 10; + } + /*The Application Indicator is a single letter. + We already checked that it lies in one of the ranges A...Z, a...z + when we decoded it.*/ + else + sa_text[sa_ntext++] = (char)(fnc1_2ai - 100); + } + eci = -1; + enc_list[0] = sjis_cd; + enc_list[1] = latin1_cd; + enc_list[2] = big5_cd; + enc_list[3] = utf8_cd; + eci_cd = (iconv_t)-1; + err = 0; + + bytebuf_text = (char *)malloc((sa_ctext + 1) * sizeof(*sa_text)); + bytebuf_ntext = 0; + + for (j = 0; j < sa_size && !err; j++, sym = &(*sym)->next) { + *sym = _zbar_image_scanner_alloc_sym(iscn, ZBAR_QRCODE, 0); + (*sym)->datalen = sa_ntext; + if (sa[j] < 0) { + /* generic placeholder for unfinished results */ + (*sym)->type = ZBAR_PARTIAL; + + /*Skip all contiguous missing segments.*/ + for (j++; j < sa_size && sa[j] < 0; j++) + ; + /*If there aren't any more, stop.*/ + if (j >= sa_size) + break; + + /* mark break in data */ + sa_text[sa_ntext++] = '\0'; + (*sym)->datalen = sa_ntext; + + /* advance to next symbol */ + sym = &(*sym)->next; + *sym = _zbar_image_scanner_alloc_sym(iscn, ZBAR_QRCODE, 0); + } + + qrdataj = qrdata + sa[j]; + /* expose bounding box */ + sym_add_point(*sym, qrdataj->bbox[0][0], qrdataj->bbox[0][1]); + sym_add_point(*sym, qrdataj->bbox[2][0], qrdataj->bbox[2][1]); + sym_add_point(*sym, qrdataj->bbox[3][0], qrdataj->bbox[3][1]); + sym_add_point(*sym, qrdataj->bbox[1][0], qrdataj->bbox[1][1]); + + /* approx symbol "up" direction */ + dir[0] = (qrdataj->bbox[0][0] - qrdataj->bbox[2][0] + + qrdataj->bbox[1][0] - qrdataj->bbox[3][0]); + dir[1] = (qrdataj->bbox[2][1] - qrdataj->bbox[0][1] + + qrdataj->bbox[3][1] - qrdataj->bbox[1][1]); + horiz = abs(dir[0]) > abs(dir[1]); + (*sym)->orient = horiz + 2 * (dir[1 - horiz] < 0); + + for (k = 0; k <= qrdataj->nentries && !err; k++) { + size_t inleft; + size_t outleft; + char *in; + char *out; + + // Check if bytebuf_text is empty INSIDE for loop. + if (bytebuf_ntext > 0) { + entry = (k == qrdataj->nentries) ? NULL : + qrdataj->entries + k; + // next entry is not byte mode, convert bytes to text. + if (entry == NULL || (entry->mode != QR_MODE_BYTE && + entry->mode != QR_MODE_KANJI)) { + in = bytebuf_text; + inleft = bytebuf_ntext; + out = sa_text + sa_ntext; + outleft = sa_ctext - sa_ntext; + /*If we have no specified encoding, attempt to auto-detect it + unless configured with ZBAR_CFG_BINARY.*/ + if (eci < 0) { + if (raw_binary) { + /* copy all remaining bytes to output buffer. */ + memcpy(out, in, inleft); + sa_ntext += inleft; + bytebuf_ntext = 0; + } else { + int ei; + /*If there was data encoded in kanji mode, assume it's SJIS.*/ + if (has_kanji) + enc_list_mtf(enc_list, sjis_cd); + /*Otherwise check for the UTF-8 BOM. + UTF-8 is rarely specified with ECI, and few decoders + currently support doing so, so this is the best way for + encoders to reliably indicate it.*/ + else if (inleft >= 3 && + in[0] == (char)0xEF && + in[1] == (char)0xBB && + in[2] == (char)0xBF) { + in += 3; + inleft -= 3; + /*Actually try converting (to check validity).*/ + err = utf8_cd == (iconv_t)-1 || + iconv(utf8_cd, &in, &inleft, &out, + &outleft) == (size_t)-1; + if (!err) { + sa_ntext = out - sa_text; + enc_list_mtf(enc_list, utf8_cd); + bytebuf_ntext = 0; + } + in = bytebuf_text; + inleft = bytebuf_ntext; + out = sa_text + sa_ntext; + outleft = sa_ctext - sa_ntext; + } + /*If the text is 8-bit clean, prefer UTF-8 over SJIS, since + SJIS will corrupt the backslashes used for DoCoMo formats.*/ + else if (text_is_ascii((unsigned char *)in, + inleft)) { + enc_list_mtf(enc_list, utf8_cd); + } + /* Check if it's big5 encoding. */ + else if (text_is_big5((unsigned char *)in, + inleft)) { + enc_list_mtf(enc_list, big5_cd); + } + + /*Try our list of encodings.*/ + for (ei = 0; ei < ENC_LIST_SIZE; ei++) + if (enc_list[ei] != (iconv_t)-1) { + /*According to the 2005 version of the standard, + ISO/IEC 8859-1 (one hyphen) is supposed to be used, but + reality is not always so (and in the 2000 version of the + standard, it was JIS8/SJIS that was the default). + It's got an invalid range that is used often with SJIS + and UTF-8, though, which makes detection easier. + However, iconv() does not properly reject characters in + those ranges, since ISO-8859-1 (two hyphens) defines a + number of seldom-used control code characters there. + So if we see any of those characters, move this + conversion to the end of the list.*/ + if (ei < 3 && + enc_list[ei] == latin1_cd && + !text_is_latin1( + (unsigned char *)in, + inleft)) { + int ej; + for (ej = ei + 1; + ej < ENC_LIST_SIZE; ej++) + enc_list[ej - 1] = + enc_list[ej]; + enc_list[3] = latin1_cd; + } + err = iconv(enc_list[ei], &in, + &inleft, &out, + &outleft) == (size_t)-1; + if (!err) { + sa_ntext = out - sa_text; + enc_list_mtf(enc_list, + enc_list[ei]); + break; + } + in = bytebuf_text; + inleft = bytebuf_ntext; + out = sa_text + sa_ntext; + outleft = sa_ctext - sa_ntext; + } + } + } + /*We were actually given a character set; use it. + The spec says that in this case, data should be treated as if it + came from the given character set even when encoded in kanji + mode.*/ + else { + err = eci_cd == (iconv_t)-1 || + iconv(eci_cd, &in, &inleft, &out, + &outleft) == (size_t)-1; + if (!err) + sa_ntext = out - sa_text; + } + bytebuf_ntext = 0; + } + } + if (k == qrdataj->nentries) + break; + + entry = qrdataj->entries + k; + switch (entry->mode) { + case QR_MODE_NUM: { + if (sa_ctext - sa_ntext >= + (size_t)entry->payload.data.len) { + memcpy(sa_text + sa_ntext, entry->payload.data.buf, + entry->payload.data.len * sizeof(*sa_text)); + sa_ntext += entry->payload.data.len; + } else + err = 1; + } break; + case QR_MODE_ALNUM: { + char *p; + in = (char *)entry->payload.data.buf; + inleft = entry->payload.data.len; + /*FNC1 uses '%' as an escape character.*/ + if (fnc1) + for (;;) { + size_t plen; + char c; + p = memchr(in, '%', inleft * sizeof(*in)); + if (p == NULL) + break; + plen = p - in; + if (sa_ctext - sa_ntext < plen + 1) + break; + memcpy(sa_text + sa_ntext, in, + plen * sizeof(*in)); + sa_ntext += plen; + /*Two '%'s is a literal '%'*/ + if (plen + 1 < inleft && p[1] == '%') { + c = '%'; + plen++; + p++; + } + /*One '%' is the ASCII group separator.*/ + else + c = 0x1D; + sa_text[sa_ntext++] = c; + inleft -= plen + 1; + in = p + 1; + } + else + p = NULL; + if (p != NULL || sa_ctext - sa_ntext < inleft) + err = 1; + else { + memcpy(sa_text + sa_ntext, in, + inleft * sizeof(*sa_text)); + sa_ntext += inleft; + } + } break; + /* DONE: This handles a multi-byte sequence split between + multiple data blocks. */ + case QR_MODE_BYTE: + case QR_MODE_KANJI: { + // copy byte to bytebuf + in = (char *)entry->payload.data.buf; + inleft = entry->payload.data.len; + memcpy(bytebuf_text + bytebuf_ntext, in, + inleft * sizeof(*bytebuf_text)); + bytebuf_ntext += inleft; + } break; + /*Check to see if a character set was specified.*/ + case QR_MODE_ECI: { + const char *enc; + char buf[16]; + unsigned cur_eci; + cur_eci = entry->payload.eci; + if (cur_eci <= QR_ECI_ISO8859_16 && cur_eci != 14) { + if (cur_eci != QR_ECI_GLI0 && + cur_eci != QR_ECI_CP437) { + sprintf(buf, "ISO8859-%i", + QR_MAXI(cur_eci, 3) - 2); + enc = buf; + } + /*Note that CP437 requires an iconv compiled with + --enable-extra-encodings, and thus may not be available.*/ + else + enc = "CP437"; + } else if (cur_eci == QR_ECI_SJIS) + enc = "SJIS"; + else if (cur_eci == QR_ECI_UTF8) + enc = "UTF-8"; + /*Don't know what this ECI code specifies, but not an encoding that + we recognize.*/ + else + continue; + eci = cur_eci; + eci_cd = iconv_open("UTF-8", enc); + } break; + /*Silence stupid compiler warnings.*/ + default: + break; + } + } + /*If eci should be reset between codes, do so.*/ + if (eci <= QR_ECI_GLI1) { + eci = -1; + if (eci_cd != (iconv_t)-1) { + iconv_close(eci_cd); + eci_cd = (iconv_t)-1; + } + } + } + + free(bytebuf_text); + + if (eci_cd != (iconv_t)-1) + iconv_close(eci_cd); + if (!err) { + zbar_symbol_t *sa_sym; + sa_text[sa_ntext++] = '\0'; + if (sa_ctext + 1 > sa_ntext) { + sa_text = + (char *)realloc(sa_text, sa_ntext * sizeof(*sa_text)); + } + + if (sa_size == 1) + sa_sym = syms; + else { + /* cheap out w/axis aligned bbox for now */ + int xmin = img->width, xmax = -2; + int ymin = img->height, ymax = -2; + + /* create "virtual" container symbol for composite result */ + sa_sym = + _zbar_image_scanner_alloc_sym(iscn, ZBAR_QRCODE, 0); + sa_sym->syms = _zbar_symbol_set_create(); + sa_sym->syms->head = syms; + + /* fixup data references */ + for (; syms; syms = syms->next) { + int next; + _zbar_symbol_refcnt(syms, 1); + if (syms->type == ZBAR_PARTIAL) + sa_sym->type = ZBAR_PARTIAL; + else + for (j = 0; j < syms->npts; j++) { + int u = syms->pts[j].x; + if (xmin >= u) + xmin = u - 1; + if (xmax <= u) + xmax = u + 1; + u = syms->pts[j].y; + if (ymin >= u) + ymin = u - 1; + if (ymax <= u) + ymax = u + 1; + } + syms->data = sa_text + syms->datalen; + next = (syms->next) ? syms->next->datalen : sa_ntext; + if (next > syms->datalen) + syms->datalen = next - syms->datalen - 1; + else { + zprintf( + 1, "Assertion `next > syms->datalen' failed\n"); + syms->datalen = 0; + } + } + if (xmax >= -1) { + sym_add_point(sa_sym, xmin, ymin); + sym_add_point(sa_sym, xmin, ymax); + sym_add_point(sa_sym, xmax, ymax); + sym_add_point(sa_sym, xmax, ymin); + } + } + sa_sym->data = sa_text; + sa_sym->data_alloc = sa_ntext; + sa_sym->datalen = sa_ntext - 1; + sa_sym->modifiers = fnc1; + + _zbar_image_scanner_add_sym(iscn, sa_sym); + } else { + _zbar_image_scanner_recycle_syms(iscn, syms); + free(sa_text); + } + } + if (utf8_cd != (iconv_t)-1) + iconv_close(utf8_cd); + if (sjis_cd != (iconv_t)-1) + iconv_close(sjis_cd); + if (latin1_cd != (iconv_t)-1) + iconv_close(latin1_cd); + if (big5_cd != (iconv_t)-1) + iconv_close(big5_cd); + free(mark); + return ntext; +} |