summaryrefslogtreecommitdiffstats
path: root/zbar/qrcode/qrdectxt.c
diff options
context:
space:
mode:
Diffstat (limited to 'zbar/qrcode/qrdectxt.c')
-rw-r--r--zbar/qrcode/qrdectxt.c573
1 files changed, 573 insertions, 0 deletions
diff --git a/zbar/qrcode/qrdectxt.c b/zbar/qrcode/qrdectxt.c
new file mode 100644
index 0000000..5676e33
--- /dev/null
+++ b/zbar/qrcode/qrdectxt.c
@@ -0,0 +1,573 @@
+/*Copyright (C) 2008-2010 Timothy B. Terriberry (tterribe@xiph.org)
+ You can redistribute this library and/or modify it under the terms of the
+ GNU Lesser General Public License as published by the Free Software
+ Foundation; either version 2.1 of the License, or (at your option) any later
+ version.*/
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <iconv.h>
+
+#include "decoder.h"
+#include "error.h"
+#include "image.h"
+#include "img_scanner.h"
+#include "qrcode.h"
+#include "qrdec.h"
+#include "util.h"
+
+#define ENC_LIST_SIZE 4
+
+static int text_is_ascii(const unsigned char *_text, int _len)
+{
+ int i;
+ for (i = 0; i < _len; i++)
+ if (_text[i] >= 0x80)
+ return 0;
+ return 1;
+}
+
+static int text_is_latin1(const unsigned char *_text, int _len)
+{
+ int i;
+ for (i = 0; i < _len; i++) {
+ /*The following line fails to compile correctly with gcc 3.4.4 on ARM with
+ any optimizations enabled.*/
+ if (_text[i] >= 0x80 && _text[i] < 0xA0)
+ return 0;
+ }
+ return 1;
+}
+
+static int text_is_big5(const unsigned char *_text, int _len)
+{
+ int i;
+ for (i = 0; i < _len; i++) {
+ if (_text[i] == 0xFF)
+ return 0;
+ else if (_text[i] >= 0x80) { // first byte is big5
+ i++;
+ if (i >= _len) // second byte not exists
+ return 0;
+ if (_text[i] < 0x40 || (_text[i] > 0x7E && _text[i] < 0xA1) ||
+ _text[i] > 0xFE) { // second byte not in range
+ return 0;
+ }
+ } else { // normal ascii encoding, it's okay
+ }
+ }
+ return 1;
+}
+
+static void enc_list_mtf(iconv_t _enc_list[ENC_LIST_SIZE], iconv_t _enc)
+{
+ int i;
+ for (i = 0; i < ENC_LIST_SIZE; i++)
+ if (_enc_list[i] == _enc) {
+ int j;
+ for (j = i; j-- > 0;)
+ _enc_list[j + 1] = _enc_list[j];
+ _enc_list[0] = _enc;
+ break;
+ }
+}
+
+int qr_code_data_list_extract_text(const qr_code_data_list *_qrlist,
+ zbar_image_scanner_t *iscn,
+ zbar_image_t *img)
+{
+ iconv_t sjis_cd;
+ iconv_t utf8_cd;
+ iconv_t latin1_cd;
+ iconv_t big5_cd;
+ const qr_code_data *qrdata;
+ int nqrdata;
+ unsigned char *mark;
+ int ntext;
+ int i;
+ int raw_binary = 0;
+ zbar_image_scanner_get_config(iscn, ZBAR_QRCODE, ZBAR_CFG_BINARY,
+ &raw_binary);
+ qrdata = _qrlist->qrdata;
+ nqrdata = _qrlist->nqrdata;
+ mark = (unsigned char *)calloc(nqrdata, sizeof(*mark));
+ ntext = 0;
+ /*This is the encoding the standard says is the default.*/
+ latin1_cd = iconv_open("UTF-8", "ISO8859-1");
+ /*But this one is often used, as well.*/
+ sjis_cd = iconv_open("UTF-8", "SJIS");
+ /*This is a trivial conversion just to check validity without extra code.*/
+ utf8_cd = iconv_open("UTF-8", "UTF-8");
+ /* add support for big5 encoding. */
+ big5_cd = iconv_open("UTF-8", "BIG-5");
+ for (i = 0; i < nqrdata; i++)
+ if (!mark[i]) {
+ const qr_code_data *qrdataj;
+ const qr_code_data_entry *entry;
+ iconv_t enc_list[ENC_LIST_SIZE];
+ iconv_t eci_cd;
+ int sa[16];
+ int sa_size;
+ char *sa_text;
+ size_t sa_ntext;
+ size_t sa_ctext;
+ int fnc1;
+ int fnc1_2ai;
+ int has_kanji;
+ int eci;
+ int err;
+ int j;
+ int k;
+ zbar_symbol_t *syms = NULL, **sym = &syms;
+ qr_point dir;
+ int horiz;
+ char *bytebuf_text;
+ size_t bytebuf_ntext;
+
+ /*Step 0: Collect the other QR codes belonging to this S-A group.*/
+ if (qrdata[i].sa_size) {
+ unsigned sa_parity;
+ sa_size = qrdata[i].sa_size;
+ sa_parity = qrdata[i].sa_parity;
+ for (j = 0; j < sa_size; j++)
+ sa[j] = -1;
+ for (j = i; j < nqrdata; j++)
+ if (!mark[j]) {
+ /*TODO: We could also match version, ECC level, etc. if size and
+ parity alone are too ambiguous.*/
+ if (qrdata[j].sa_size == sa_size &&
+ qrdata[j].sa_parity == sa_parity &&
+ sa[qrdata[j].sa_index] < 0) {
+ sa[qrdata[j].sa_index] = j;
+ mark[j] = 1;
+ }
+ }
+ /*TODO: If the S-A group is complete, check the parity.*/
+ } else {
+ sa[0] = i;
+ sa_size = 1;
+ }
+
+ sa_ctext = 0;
+ fnc1 = 0;
+ fnc1_2ai = 0;
+ has_kanji = 0;
+ /*Step 1: Detect FNC1 markers and estimate the required buffer size.*/
+ for (j = 0; j < sa_size; j++)
+ if (sa[j] >= 0) {
+ qrdataj = qrdata + sa[j];
+ for (k = 0; k < qrdataj->nentries; k++) {
+ int shift;
+ entry = qrdataj->entries + k;
+ shift = 0;
+ switch (entry->mode) {
+ /*FNC1 applies to the entire code and ignores subsequent markers.*/
+ case QR_MODE_FNC1_1ST: {
+ if (!fnc1)
+ fnc1 = MOD(ZBAR_MOD_GS1);
+ } break;
+ case QR_MODE_FNC1_2ND: {
+ if (!fnc1) {
+ fnc1 = MOD(ZBAR_MOD_AIM);
+ fnc1_2ai = entry->payload.ai;
+ sa_ctext += 2;
+ }
+ } break;
+ /*We assume at most 4 UTF-8 bytes per input byte.
+ I believe this is true for all the encodings we actually use.*/
+ case QR_MODE_KANJI:
+ has_kanji = 1;
+ case QR_MODE_BYTE:
+ shift = 2;
+ default: {
+ /*The remaining two modes are already valid UTF-8.*/
+ if (QR_MODE_HAS_DATA(entry->mode)) {
+ sa_ctext += entry->payload.data.len << shift;
+ }
+ } break;
+ }
+ }
+ }
+
+ /*Step 2: Convert the entries.*/
+ sa_text = (char *)malloc((sa_ctext + 1) * sizeof(*sa_text));
+ sa_ntext = 0;
+ /*Add the encoded Application Indicator for FNC1 in the second position.*/
+ if (fnc1 == MOD(ZBAR_MOD_AIM)) {
+ if (fnc1_2ai < 100) {
+ /*The Application Indicator is a 2-digit number.*/
+ sa_text[sa_ntext++] = '0' + fnc1_2ai / 10;
+ sa_text[sa_ntext++] = '0' + fnc1_2ai % 10;
+ }
+ /*The Application Indicator is a single letter.
+ We already checked that it lies in one of the ranges A...Z, a...z
+ when we decoded it.*/
+ else
+ sa_text[sa_ntext++] = (char)(fnc1_2ai - 100);
+ }
+ eci = -1;
+ enc_list[0] = sjis_cd;
+ enc_list[1] = latin1_cd;
+ enc_list[2] = big5_cd;
+ enc_list[3] = utf8_cd;
+ eci_cd = (iconv_t)-1;
+ err = 0;
+
+ bytebuf_text = (char *)malloc((sa_ctext + 1) * sizeof(*sa_text));
+ bytebuf_ntext = 0;
+
+ for (j = 0; j < sa_size && !err; j++, sym = &(*sym)->next) {
+ *sym = _zbar_image_scanner_alloc_sym(iscn, ZBAR_QRCODE, 0);
+ (*sym)->datalen = sa_ntext;
+ if (sa[j] < 0) {
+ /* generic placeholder for unfinished results */
+ (*sym)->type = ZBAR_PARTIAL;
+
+ /*Skip all contiguous missing segments.*/
+ for (j++; j < sa_size && sa[j] < 0; j++)
+ ;
+ /*If there aren't any more, stop.*/
+ if (j >= sa_size)
+ break;
+
+ /* mark break in data */
+ sa_text[sa_ntext++] = '\0';
+ (*sym)->datalen = sa_ntext;
+
+ /* advance to next symbol */
+ sym = &(*sym)->next;
+ *sym = _zbar_image_scanner_alloc_sym(iscn, ZBAR_QRCODE, 0);
+ }
+
+ qrdataj = qrdata + sa[j];
+ /* expose bounding box */
+ sym_add_point(*sym, qrdataj->bbox[0][0], qrdataj->bbox[0][1]);
+ sym_add_point(*sym, qrdataj->bbox[2][0], qrdataj->bbox[2][1]);
+ sym_add_point(*sym, qrdataj->bbox[3][0], qrdataj->bbox[3][1]);
+ sym_add_point(*sym, qrdataj->bbox[1][0], qrdataj->bbox[1][1]);
+
+ /* approx symbol "up" direction */
+ dir[0] = (qrdataj->bbox[0][0] - qrdataj->bbox[2][0] +
+ qrdataj->bbox[1][0] - qrdataj->bbox[3][0]);
+ dir[1] = (qrdataj->bbox[2][1] - qrdataj->bbox[0][1] +
+ qrdataj->bbox[3][1] - qrdataj->bbox[1][1]);
+ horiz = abs(dir[0]) > abs(dir[1]);
+ (*sym)->orient = horiz + 2 * (dir[1 - horiz] < 0);
+
+ for (k = 0; k <= qrdataj->nentries && !err; k++) {
+ size_t inleft;
+ size_t outleft;
+ char *in;
+ char *out;
+
+ // Check if bytebuf_text is empty INSIDE for loop.
+ if (bytebuf_ntext > 0) {
+ entry = (k == qrdataj->nentries) ? NULL :
+ qrdataj->entries + k;
+ // next entry is not byte mode, convert bytes to text.
+ if (entry == NULL || (entry->mode != QR_MODE_BYTE &&
+ entry->mode != QR_MODE_KANJI)) {
+ in = bytebuf_text;
+ inleft = bytebuf_ntext;
+ out = sa_text + sa_ntext;
+ outleft = sa_ctext - sa_ntext;
+ /*If we have no specified encoding, attempt to auto-detect it
+ unless configured with ZBAR_CFG_BINARY.*/
+ if (eci < 0) {
+ if (raw_binary) {
+ /* copy all remaining bytes to output buffer. */
+ memcpy(out, in, inleft);
+ sa_ntext += inleft;
+ bytebuf_ntext = 0;
+ } else {
+ int ei;
+ /*If there was data encoded in kanji mode, assume it's SJIS.*/
+ if (has_kanji)
+ enc_list_mtf(enc_list, sjis_cd);
+ /*Otherwise check for the UTF-8 BOM.
+ UTF-8 is rarely specified with ECI, and few decoders
+ currently support doing so, so this is the best way for
+ encoders to reliably indicate it.*/
+ else if (inleft >= 3 &&
+ in[0] == (char)0xEF &&
+ in[1] == (char)0xBB &&
+ in[2] == (char)0xBF) {
+ in += 3;
+ inleft -= 3;
+ /*Actually try converting (to check validity).*/
+ err = utf8_cd == (iconv_t)-1 ||
+ iconv(utf8_cd, &in, &inleft, &out,
+ &outleft) == (size_t)-1;
+ if (!err) {
+ sa_ntext = out - sa_text;
+ enc_list_mtf(enc_list, utf8_cd);
+ bytebuf_ntext = 0;
+ }
+ in = bytebuf_text;
+ inleft = bytebuf_ntext;
+ out = sa_text + sa_ntext;
+ outleft = sa_ctext - sa_ntext;
+ }
+ /*If the text is 8-bit clean, prefer UTF-8 over SJIS, since
+ SJIS will corrupt the backslashes used for DoCoMo formats.*/
+ else if (text_is_ascii((unsigned char *)in,
+ inleft)) {
+ enc_list_mtf(enc_list, utf8_cd);
+ }
+ /* Check if it's big5 encoding. */
+ else if (text_is_big5((unsigned char *)in,
+ inleft)) {
+ enc_list_mtf(enc_list, big5_cd);
+ }
+
+ /*Try our list of encodings.*/
+ for (ei = 0; ei < ENC_LIST_SIZE; ei++)
+ if (enc_list[ei] != (iconv_t)-1) {
+ /*According to the 2005 version of the standard,
+ ISO/IEC 8859-1 (one hyphen) is supposed to be used, but
+ reality is not always so (and in the 2000 version of the
+ standard, it was JIS8/SJIS that was the default).
+ It's got an invalid range that is used often with SJIS
+ and UTF-8, though, which makes detection easier.
+ However, iconv() does not properly reject characters in
+ those ranges, since ISO-8859-1 (two hyphens) defines a
+ number of seldom-used control code characters there.
+ So if we see any of those characters, move this
+ conversion to the end of the list.*/
+ if (ei < 3 &&
+ enc_list[ei] == latin1_cd &&
+ !text_is_latin1(
+ (unsigned char *)in,
+ inleft)) {
+ int ej;
+ for (ej = ei + 1;
+ ej < ENC_LIST_SIZE; ej++)
+ enc_list[ej - 1] =
+ enc_list[ej];
+ enc_list[3] = latin1_cd;
+ }
+ err = iconv(enc_list[ei], &in,
+ &inleft, &out,
+ &outleft) == (size_t)-1;
+ if (!err) {
+ sa_ntext = out - sa_text;
+ enc_list_mtf(enc_list,
+ enc_list[ei]);
+ break;
+ }
+ in = bytebuf_text;
+ inleft = bytebuf_ntext;
+ out = sa_text + sa_ntext;
+ outleft = sa_ctext - sa_ntext;
+ }
+ }
+ }
+ /*We were actually given a character set; use it.
+ The spec says that in this case, data should be treated as if it
+ came from the given character set even when encoded in kanji
+ mode.*/
+ else {
+ err = eci_cd == (iconv_t)-1 ||
+ iconv(eci_cd, &in, &inleft, &out,
+ &outleft) == (size_t)-1;
+ if (!err)
+ sa_ntext = out - sa_text;
+ }
+ bytebuf_ntext = 0;
+ }
+ }
+ if (k == qrdataj->nentries)
+ break;
+
+ entry = qrdataj->entries + k;
+ switch (entry->mode) {
+ case QR_MODE_NUM: {
+ if (sa_ctext - sa_ntext >=
+ (size_t)entry->payload.data.len) {
+ memcpy(sa_text + sa_ntext, entry->payload.data.buf,
+ entry->payload.data.len * sizeof(*sa_text));
+ sa_ntext += entry->payload.data.len;
+ } else
+ err = 1;
+ } break;
+ case QR_MODE_ALNUM: {
+ char *p;
+ in = (char *)entry->payload.data.buf;
+ inleft = entry->payload.data.len;
+ /*FNC1 uses '%' as an escape character.*/
+ if (fnc1)
+ for (;;) {
+ size_t plen;
+ char c;
+ p = memchr(in, '%', inleft * sizeof(*in));
+ if (p == NULL)
+ break;
+ plen = p - in;
+ if (sa_ctext - sa_ntext < plen + 1)
+ break;
+ memcpy(sa_text + sa_ntext, in,
+ plen * sizeof(*in));
+ sa_ntext += plen;
+ /*Two '%'s is a literal '%'*/
+ if (plen + 1 < inleft && p[1] == '%') {
+ c = '%';
+ plen++;
+ p++;
+ }
+ /*One '%' is the ASCII group separator.*/
+ else
+ c = 0x1D;
+ sa_text[sa_ntext++] = c;
+ inleft -= plen + 1;
+ in = p + 1;
+ }
+ else
+ p = NULL;
+ if (p != NULL || sa_ctext - sa_ntext < inleft)
+ err = 1;
+ else {
+ memcpy(sa_text + sa_ntext, in,
+ inleft * sizeof(*sa_text));
+ sa_ntext += inleft;
+ }
+ } break;
+ /* DONE: This handles a multi-byte sequence split between
+ multiple data blocks. */
+ case QR_MODE_BYTE:
+ case QR_MODE_KANJI: {
+ // copy byte to bytebuf
+ in = (char *)entry->payload.data.buf;
+ inleft = entry->payload.data.len;
+ memcpy(bytebuf_text + bytebuf_ntext, in,
+ inleft * sizeof(*bytebuf_text));
+ bytebuf_ntext += inleft;
+ } break;
+ /*Check to see if a character set was specified.*/
+ case QR_MODE_ECI: {
+ const char *enc;
+ char buf[16];
+ unsigned cur_eci;
+ cur_eci = entry->payload.eci;
+ if (cur_eci <= QR_ECI_ISO8859_16 && cur_eci != 14) {
+ if (cur_eci != QR_ECI_GLI0 &&
+ cur_eci != QR_ECI_CP437) {
+ sprintf(buf, "ISO8859-%i",
+ QR_MAXI(cur_eci, 3) - 2);
+ enc = buf;
+ }
+ /*Note that CP437 requires an iconv compiled with
+ --enable-extra-encodings, and thus may not be available.*/
+ else
+ enc = "CP437";
+ } else if (cur_eci == QR_ECI_SJIS)
+ enc = "SJIS";
+ else if (cur_eci == QR_ECI_UTF8)
+ enc = "UTF-8";
+ /*Don't know what this ECI code specifies, but not an encoding that
+ we recognize.*/
+ else
+ continue;
+ eci = cur_eci;
+ eci_cd = iconv_open("UTF-8", enc);
+ } break;
+ /*Silence stupid compiler warnings.*/
+ default:
+ break;
+ }
+ }
+ /*If eci should be reset between codes, do so.*/
+ if (eci <= QR_ECI_GLI1) {
+ eci = -1;
+ if (eci_cd != (iconv_t)-1) {
+ iconv_close(eci_cd);
+ eci_cd = (iconv_t)-1;
+ }
+ }
+ }
+
+ free(bytebuf_text);
+
+ if (eci_cd != (iconv_t)-1)
+ iconv_close(eci_cd);
+ if (!err) {
+ zbar_symbol_t *sa_sym;
+ sa_text[sa_ntext++] = '\0';
+ if (sa_ctext + 1 > sa_ntext) {
+ sa_text =
+ (char *)realloc(sa_text, sa_ntext * sizeof(*sa_text));
+ }
+
+ if (sa_size == 1)
+ sa_sym = syms;
+ else {
+ /* cheap out w/axis aligned bbox for now */
+ int xmin = img->width, xmax = -2;
+ int ymin = img->height, ymax = -2;
+
+ /* create "virtual" container symbol for composite result */
+ sa_sym =
+ _zbar_image_scanner_alloc_sym(iscn, ZBAR_QRCODE, 0);
+ sa_sym->syms = _zbar_symbol_set_create();
+ sa_sym->syms->head = syms;
+
+ /* fixup data references */
+ for (; syms; syms = syms->next) {
+ int next;
+ _zbar_symbol_refcnt(syms, 1);
+ if (syms->type == ZBAR_PARTIAL)
+ sa_sym->type = ZBAR_PARTIAL;
+ else
+ for (j = 0; j < syms->npts; j++) {
+ int u = syms->pts[j].x;
+ if (xmin >= u)
+ xmin = u - 1;
+ if (xmax <= u)
+ xmax = u + 1;
+ u = syms->pts[j].y;
+ if (ymin >= u)
+ ymin = u - 1;
+ if (ymax <= u)
+ ymax = u + 1;
+ }
+ syms->data = sa_text + syms->datalen;
+ next = (syms->next) ? syms->next->datalen : sa_ntext;
+ if (next > syms->datalen)
+ syms->datalen = next - syms->datalen - 1;
+ else {
+ zprintf(
+ 1, "Assertion `next > syms->datalen' failed\n");
+ syms->datalen = 0;
+ }
+ }
+ if (xmax >= -1) {
+ sym_add_point(sa_sym, xmin, ymin);
+ sym_add_point(sa_sym, xmin, ymax);
+ sym_add_point(sa_sym, xmax, ymax);
+ sym_add_point(sa_sym, xmax, ymin);
+ }
+ }
+ sa_sym->data = sa_text;
+ sa_sym->data_alloc = sa_ntext;
+ sa_sym->datalen = sa_ntext - 1;
+ sa_sym->modifiers = fnc1;
+
+ _zbar_image_scanner_add_sym(iscn, sa_sym);
+ } else {
+ _zbar_image_scanner_recycle_syms(iscn, syms);
+ free(sa_text);
+ }
+ }
+ if (utf8_cd != (iconv_t)-1)
+ iconv_close(utf8_cd);
+ if (sjis_cd != (iconv_t)-1)
+ iconv_close(sjis_cd);
+ if (latin1_cd != (iconv_t)-1)
+ iconv_close(latin1_cd);
+ if (big5_cd != (iconv_t)-1)
+ iconv_close(big5_cd);
+ free(mark);
+ return ntext;
+}