summaryrefslogtreecommitdiffstats
path: root/src/UCAux.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 16:37:15 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 16:37:15 +0000
commitae5d181b854d3ccb373b6bc01b4869e44ff4d87a (patch)
tree91f59efb48c56a84cc798e012fccb667b63d3fee /src/UCAux.c
parentInitial commit. (diff)
downloadlynx-ae5d181b854d3ccb373b6bc01b4869e44ff4d87a.tar.xz
lynx-ae5d181b854d3ccb373b6bc01b4869e44ff4d87a.zip
Adding upstream version 2.9.0dev.12.upstream/2.9.0dev.12upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/UCAux.c')
-rw-r--r--src/UCAux.c800
1 files changed, 800 insertions, 0 deletions
diff --git a/src/UCAux.c b/src/UCAux.c
new file mode 100644
index 0000000..44495a6
--- /dev/null
+++ b/src/UCAux.c
@@ -0,0 +1,800 @@
+/*
+ * $LynxId: UCAux.c,v 1.58 2021/07/01 23:34:24 tom Exp $
+ */
+#include <HTUtils.h>
+
+#include <HTCJK.h>
+#include <UCMap.h>
+#include <UCDefs.h>
+#include <HTStream.h>
+#include <UCAux.h>
+#include <LYCharSets.h>
+#include <LYCurses.h>
+#include <LYStrings.h>
+
+/*
+ * Report whether the charset with handle `from' can be used as the source
+ * of a translation to Unicode.  Negative handles and the "x-transparent"
+ * pseudo-charset cannot; CJK encodings cannot either unless the build
+ * defines USE_JAPANESEUTF8_SUPPORT.  Everything else is accepted.
+ */
+BOOL UCCanUniTranslateFrom(int from)
+{
+    BOOL usable = YES;
+
+    if (from < 0) {
+        usable = NO;
+    }
+#ifndef USE_JAPANESEUTF8_SUPPORT
+    else if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
+        usable = NO;
+    }
+#endif
+    else if (strcmp(LYCharSet_UC[from].MIMEname, "x-transparent") == 0) {
+        usable = NO;
+    }
+
+    return usable;
+}
+
+/*
+ * Report whether Unicode can be translated to the charset with handle `to'.
+ * Only the validity of the handle is checked: any non-negative handle is
+ * accepted.
+ */
+BOOL UCCanTranslateUniTo(int to)
+{
+    if (to >= 0) {
+        return YES;             /* well at least some characters... */
+    }
+    return NO;
+}
+
+/*
+ * Report whether text in charset `from' can be translated to charset `to'
+ * (both are charset handles).
+ *
+ * Identical handles always succeed, a negative handle on either side fails.
+ * Latin-1 on either side, or UTF-8 output, is decided by the one-sided
+ * Unicode checks.  The "x-transparent" pseudo-charset on either side, or
+ * "us-ascii" input, is accepted.  CJK input is accepted only for the
+ * euc-jp/shift_jis pair; all remaining combinations are accepted.
+ */
+BOOL UCCanTranslateFromTo(int from,
+                          int to)
+{
+    const char *src_mime;
+    const char *dst_mime;
+    int src_is_japanese;
+    int dst_is_japanese;
+
+    if (from == to)
+        return YES;
+    if (from < 0 || to < 0)
+        return NO;
+
+    if (from == LATIN1)
+        return UCCanTranslateUniTo(to);
+    if (to == LATIN1 || LYCharSet_UC[to].enc == UCT_ENC_UTF8)
+        return UCCanUniTranslateFrom(from);
+
+    src_mime = LYCharSet_UC[from].MIMEname;
+    dst_mime = LYCharSet_UC[to].MIMEname;
+
+    if (strcmp(src_mime, "x-transparent") == 0
+        || strcmp(dst_mime, "x-transparent") == 0) {
+        return YES;             /* ??? */
+    }
+    if (strcmp(src_mime, "us-ascii") == 0) {
+        return YES;
+    }
+
+    if (LYCharSet_UC[from].enc != UCT_ENC_CJK) {
+        return YES;             /* others YES */
+    }
+
+    /*
+     * CJK mode may be off (i.e., !IS_CJK_TTY) because the current document
+     * is not CJK, but the check may be for capability in relation to another
+     * document, for which CJK mode might be turned on when retrieved.  Thus,
+     * when the from charset is CJK, check if the to charset is CJK, and
+     * return NO or YES in relation to that. - FM
+     */
+    if (LYCharSet_UC[to].enc != UCT_ENC_CJK)
+        return NO;
+
+    dst_is_japanese = (strcmp(dst_mime, "euc-jp") == 0
+                       || strcmp(dst_mime, "shift_jis") == 0);
+    src_is_japanese = (strcmp(src_mime, "euc-jp") == 0
+                       || strcmp(src_mime, "shift_jis") == 0);
+
+    /*
+     * The euc-cn and euc-kr charsets were handled by the (from == to) test
+     * at the top, so only the Japanese pair needs checking here. - FM
+     */
+    if (dst_is_japanese && src_is_japanese)
+        return YES;
+    return NO;
+}
+
+/*
+ * Returns YES if no translation is necessary (because the charsets are
+ * equal, are equivalent, etc.), NO otherwise.
+ *
+ * `from' and `to' are charset handles.  The checks are applied in order:
+ *   - identical handles need no translation;
+ *   - 7-bit, "x-transparent" and "us-ascii" input passes through as is;
+ *   - input that is a subset of Latin-1 needs no translation to Latin-1;
+ *   - an "x-transparent" target takes anything;
+ *   - a UTF-8 target always needs translation;
+ *   - Latin-1 input needs no translation to a target that is a superset
+ *     of Latin-1;
+ *   - Japanese CJK input (euc-jp/shift_jis) is converted internally by
+ *     lynx when the display is in Japanese CJK mode.
+ */
+BOOL UCNeedNotTranslate(int from,
+                        int to)
+{
+    const char *fromname;
+    const char *toname;
+
+    if (from == to)
+        return YES;
+    if (from < 0)
+        return NO;              /* ??? */
+    if (LYCharSet_UC[from].enc == UCT_ENC_7BIT) {
+        return YES;             /* Only 7bit chars. */
+    }
+    fromname = LYCharSet_UC[from].MIMEname;
+    if (!strcmp(fromname, "x-transparent") ||
+        !strcmp(fromname, "us-ascii")) {
+        return YES;
+    }
+    if (to < 0)
+        return NO;              /* ??? */
+    if (to == LATIN1) {
+        if (LYCharSet_UC[from].codepoints & (UCT_CP_SUBSETOF_LAT1))
+            return YES;
+    }
+    toname = LYCharSet_UC[to].MIMEname;
+    if (!strcmp(toname, "x-transparent")) {
+        return YES;
+    }
+    if (LYCharSet_UC[to].enc == UCT_ENC_UTF8) {
+        return NO;
+    }
+    if (from == LATIN1) {
+        /*
+         * Mirror of the (to == LATIN1) test above: what matters is whether
+         * the *target* contains all of Latin-1, so the target's flags are
+         * examined, not those of Latin-1 itself.
+         */
+        if (LYCharSet_UC[to].codepoints & (UCT_CP_SUPERSETOF_LAT1))
+            return YES;
+    }
+    if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
+        if (!IS_CJK_TTY)        /* Use that global flag, for now. */
+            return NO;
+        if (HTCJK == JAPANESE &&
+            (!strcmp(fromname, "euc-jp") ||
+             !strcmp(fromname, "shift_jis")))
+            return YES;         /* translate internally by lynx, no unicode */
+        return NO;              /* If not handled by (from == to) above. */
+    }
+    return NO;
+}
+
+/*
+ * The idea here is that any stage of the stream pipe which is interested
+ * in some charset dependent processing will call this function.
+ * Given input and output charsets, this function will set various flags
+ * in a UCTransParams structure that _suggest_ to the caller what to do.
+ *
+ * Should be called once when a stage starts processing text (and the
+ * input and output charsets are known), or whenever one of input or
+ * output charsets has changed (e.g., by SGML.c stage after HTML.c stage
+ * has processed a META tag).
+ * The global flags (LYRawMode, HTPassEightBitRaw etc.) are currently
+ * not taken into account here (except for HTCJK, somewhat), it's still
+ * up to the caller to do something about them. - KW
+ *
+ * Parameters:
+ *   pT      structure which receives the flags
+ *   cs_in   handle of the input charset (compared with LATIN1 and cs_out,
+ *           and passed to UCCanUniTranslateFrom())
+ *   p_in    description of the input charset; its MIMEname, enc,
+ *           codepoints and like8859 members are read
+ *   cs_out  handle of the output charset (compared with cs_in and passed
+ *           to UCCanTranslateUniTo())
+ *   p_out   description of the output charset; its MIMEname and enc
+ *           members are read
+ *
+ * p_in and p_out are dereferenced unconditionally and must not be NULL.
+ *
+ * NOTE(review): not every member is assigned on every path.  The
+ * "transparent" branch leaves trans_to_uni and trans_from_uni alone, and
+ * the CJK branch leaves strip_raw_char_in alone, so those keep whatever
+ * the caller had in *pT (presumably cleared beforehand with
+ * UCTransParams_clear() - confirm against callers).  They are still
+ * printed by the trace statements at the end.
+ */
+void UCSetTransParams(UCTransParams * pT, int cs_in,
+ const LYUCcharset *p_in,
+ int cs_out,
+ const LYUCcharset *p_out)
+{
+ CTRACE((tfp, "UCSetTransParams: from %s(%d) to %s(%d)\n",
+ p_in->MIMEname, UCGetLYhndl_byMIME(p_in->MIMEname),
+ p_out->MIMEname, UCGetLYhndl_byMIME(p_out->MIMEname)));
+
+ /*
+ * Initialize this element to FALSE, and set it TRUE below if we're dealing
+ * with VISCII. - FM
+ */
+ pT->trans_C0_to_uni = FALSE;
+
+ /*
+ * The "transparent" display character set is a "super raw mode". - FM
+ */
+ pT->transp = (BOOL) (!strcmp(p_in->MIMEname, "x-transparent") ||
+ !strcmp(p_out->MIMEname, "x-transparent"));
+
+ /*
+ * UCS-2 is handled as a special case in SGML_write().
+ */
+ pT->ucs_mode = 0;
+
+ if (pT->transp) {
+ /*
+ * Set up the structure for "transparent". - FM
+ */
+ pT->do_cjk = FALSE;
+ pT->decode_utf8 = FALSE;
+ pT->output_utf8 = FALSE; /* We may, but won't know about it. - KW */
+ pT->do_8bitraw = TRUE;
+ pT->use_raw_char_in = TRUE;
+ pT->strip_raw_char_in = FALSE;
+ pT->pass_160_173_raw = TRUE;
+ pT->repl_translated_C0 = (BOOL) (p_out->enc == UCT_ENC_8BIT_C0);
+ pT->trans_C0_to_uni = (BOOL) (p_in->enc == UCT_ENC_8BIT_C0 ||
+ p_out->enc == UCT_ENC_8BIT_C0);
+ } else {
+ /*
+ * Initialize local flags. - FM
+ */
+ BOOL intm_ucs = FALSE;
+ BOOL use_ucs = FALSE;
+
+ /*
+ * Set this element if we want to treat the input as CJK. - FM
+ */
+ pT->do_cjk = (BOOL) ((p_in->enc == UCT_ENC_CJK) &&
+ (
+ IS_CJK_TTY
+#ifdef EXP_CHINESEUTF8_SUPPORT
+ || !strcmp(p_in->MIMEname, "euc-cn")
+ || !strcmp(p_in->MIMEname, "big5")
+ || !strcmp(p_in->MIMEname, "euc-kr")
+#endif
+ )
+ );
+ /*
+ * Set these elements based on whether we are dealing with UTF-8. - FM
+ */
+ pT->decode_utf8 = (BOOL) (p_in->enc == UCT_ENC_UTF8);
+ pT->output_utf8 = (BOOL) (p_out->enc == UCT_ENC_UTF8);
+ if (pT->do_cjk) {
+ /*
+ * Set up the structure for a CJK input with
+ * a CJK output (IS_CJK_TTY). - FM
+ */
+ pT->trans_to_uni = FALSE;
+#ifdef EXP_CHINESEUTF8_SUPPORT
+ if (!strcmp(p_in->MIMEname, "euc-cn") ||
+ !strcmp(p_in->MIMEname, "big5") ||
+ !strcmp(p_in->MIMEname, "euc-kr")) {
+ pT->trans_to_uni = (BOOL) UCCanUniTranslateFrom(cs_in);
+ }
+#endif
+ pT->do_8bitraw = FALSE;
+ pT->pass_160_173_raw = TRUE;
+ pT->use_raw_char_in = FALSE; /* Not used for CJK. - KW */
+ pT->repl_translated_C0 = FALSE;
+ pT->trans_from_uni = FALSE; /* Not used for CJK. - KW */
+ } else {
+ /*
+ * Set up for all other charset combinations. The intm_ucs flag is
+ * set TRUE if the input charset is iso-8859-1 or UTF-8, or largely
+ * equivalent to them, i.e., if we have UCS without having to do a
+ * table translation.
+ */
+ intm_ucs = (BOOL) (cs_in == LATIN1 || pT->decode_utf8 ||
+ (p_in->codepoints &
+ (UCT_CP_SUBSETOF_LAT1 | UCT_CP_SUBSETOF_UCS2)));
+ /*
+ * pT->trans_to_uni is set TRUE if we do not have that as input
+ * already, and we can translate to Unicode. Note that UTF-8
+ * always is converted to Unicode in functions that use the
+ * transformation structure, so it is treated as already Unicode
+ * here.
+ */
+ pT->trans_to_uni = (BOOL) (!intm_ucs &&
+ UCCanUniTranslateFrom(cs_in));
+ /*
+ * We set this if we are translating to Unicode and what normally
+ * are low value control characters in fact are encoding octets for
+ * the input charset (presently, this applies to VISCII). - FM
+ */
+ pT->trans_C0_to_uni = (BOOL) (pT->trans_to_uni &&
+ p_in->enc == UCT_ENC_8BIT_C0);
+ /*
+ * We set this, presently, for VISCII. - FM
+ */
+ pT->repl_translated_C0 = (BOOL) (p_out->enc == UCT_ENC_8BIT_C0);
+ /*
+ * Currently unused for any charset combination.
+ * Should always be FALSE
+ */
+ pT->strip_raw_char_in = FALSE;
+ /*
+ * use_ucs should be set TRUE if we have or will create Unicode
+ * values for input octets or UTF multibytes. - FM
+ */
+ use_ucs = (BOOL) (intm_ucs || pT->trans_to_uni);
+ /*
+ * This is set TRUE if use_ucs was set FALSE. It is complementary
+ * to the HTPassEightBitRaw flag, which is set TRUE or FALSE
+ * elsewhere based on the raw mode setting in relation to the
+ * current Display Character Set. - FM
+ */
+ pT->do_8bitraw = (BOOL) (!use_ucs);
+ /*
+ * This is set TRUE when 160 and 173 should not be treated as nbsp
+ * and shy, respectively. - FM
+ */
+ pT->pass_160_173_raw = (BOOL) (!use_ucs &&
+ !(p_in->like8859 & UCT_R_8859SPECL));
+ /*
+ * This is set when the input and output charsets match, and they
+ * are not ones which should go through a Unicode translation
+ * process anyway. - FM
+ */
+ pT->use_raw_char_in = (BOOL) (!pT->output_utf8 &&
+ cs_in == cs_out &&
+ !pT->trans_C0_to_uni);
+ /*
+ * This should be set TRUE when we expect to have done translation
+ * to Unicode or had the equivalent as input, can translate it to
+ * our output charset, and normally want to do so. The latter
+ * depends on the pT->do_8bitraw and pT->use_raw_char_in values set
+ * above, but also on HTPassEightBitRaw in any functions which use
+ * the transformation structure.. - FM
+ */
+ pT->trans_from_uni = (BOOL) (use_ucs && !pT->do_8bitraw &&
+ !pT->use_raw_char_in &&
+ UCCanTranslateUniTo(cs_out));
+ }
+ }
+ CTRACE((tfp, "UCSetTransParams (done):\n"));
+ CTRACE((tfp, " transp: %d\n", pT->transp));
+ CTRACE((tfp, " do_cjk: %d\n", pT->do_cjk));
+ CTRACE((tfp, " decode_utf8: %d\n", pT->decode_utf8));
+ CTRACE((tfp, " output_utf8: %d\n", pT->output_utf8));
+ CTRACE((tfp, " do_8bitraw: %d\n", pT->do_8bitraw));
+ CTRACE((tfp, " use_raw_char_in: %d\n", pT->use_raw_char_in));
+ CTRACE((tfp, " strip_raw_char_in: %d\n", pT->strip_raw_char_in));
+ CTRACE((tfp, " pass_160_173_raw: %d\n", pT->pass_160_173_raw));
+ CTRACE((tfp, " trans_to_uni: %d\n", pT->trans_to_uni));
+ CTRACE((tfp, " trans_C0_to_uni: %d\n", pT->trans_C0_to_uni));
+ CTRACE((tfp, " repl_translated_C0: %d\n", pT->repl_translated_C0));
+ CTRACE((tfp, " trans_from_uni: %d\n", pT->trans_from_uni));
+}
+
+/*
+ * This function initializes the transformation structure by setting all
+ * its flag elements to FALSE, and ucs_mode (which UCSetTransParams() also
+ * resets) to 0. - KW
+ *
+ * pT must point to a valid structure; it is not checked for NULL.
+ */
+void UCTransParams_clear(UCTransParams * pT)
+{
+    pT->transp = FALSE;
+    pT->do_cjk = FALSE;
+    pT->decode_utf8 = FALSE;
+    pT->output_utf8 = FALSE;
+    pT->do_8bitraw = FALSE;
+    pT->use_raw_char_in = FALSE;
+    pT->strip_raw_char_in = FALSE;
+    pT->pass_160_173_raw = FALSE;
+    pT->trans_to_uni = FALSE;
+    pT->trans_C0_to_uni = FALSE;
+    pT->repl_translated_C0 = FALSE;
+    pT->trans_from_uni = FALSE;
+    /* Not a flag, but part of the structure: leave no stale mode behind. */
+    pT->ucs_mode = 0;
+}
+
+/*
+ * If terminal is in UTF-8 mode, it probably cannot understand box drawing
+ * chars as the 8-bit (n)curses handles them. (This may also be true for other
+ * display character sets, but isn't currently checked.) In that case set the
+ * chars for horizontal and vertical drawing chars to displayable ASCII chars
+ * if '0' was requested. They'll stay as they are otherwise. -KW, TD
+ *
+ * If we're able to obtain a character set based on the locale settings,
+ * assume that the user has setup $TERM and the fonts already so line-drawing
+ * works.
+ *
+ * Parameters:
+ *   cset       handle of the display character set; when negative none of
+ *              the checks are made and the inputs are passed through
+ *   pvert_out  receives the character to use for vertical lines
+ *   phori_out  receives the character to use for horizontal lines
+ *   vert_in    requested vertical character, 0 to let this function choose
+ *   hori_in    requested horizontal character, 0 to let this function choose
+ *
+ * When the line-drawing characters are judged unusable, a 0 request is
+ * replaced by '|' (vertical) or '-' (horizontal).
+ *
+ * With EXP_CHARTRANS_AUTOSWITCH and a terminfo-capable curses, the terminal's
+ * "acsc" string is read once and the result of comparing it with the
+ * charset's mapping of the Unicode box-drawing characters is cached for the
+ * most recently checked cset (static data, so not reentrant).
+ *
+ * NOTE(review): without WIDEC_CURSES a UTF-8 cset sets fix_lines TRUE at the
+ * top, but the cached branch stores last_result = FALSE for UTF8_handle, so
+ * a later call with the same cset overwrites fix_lines with FALSE.  The
+ * first and later calls then appear to disagree (assuming UTF8_handle is
+ * the handle whose enc is UCT_ENC_UTF8) - confirm whether that is intended.
+ */
+void UCSetBoxChars(int cset,
+ int *pvert_out,
+ int *phori_out,
+ int vert_in,
+ int hori_in)
+{
+ BOOL fix_lines = FALSE;
+
+ if (cset >= 0) {
+#ifndef WIDEC_CURSES
+ if (LYCharSet_UC[cset].enc == UCT_ENC_UTF8) {
+ fix_lines = TRUE;
+ }
+#endif
+ /*
+ * If we've identified a charset that works, require it.
+ * This is important if we have loaded a font, which would
+ * confuse curses.
+ */
+ /* US-ASCII vs Latin-1 is safe (usually) */
+ if ((cset == US_ASCII
+ || cset == LATIN1)
+ && (linedrawing_char_set == US_ASCII
+ || linedrawing_char_set == LATIN1)) {
+#if (defined(FANCY_CURSES) && defined(A_ALTCHARSET)) || defined(USE_SLANG)
+ vert_in = 0;
+ hori_in = 0;
+#else
+ ;
+#endif
+ }
+#ifdef EXP_CHARTRANS_AUTOSWITCH
+#if defined(NCURSES_VERSION) || defined(HAVE_TIGETSTR)
+ else {
+ static BOOL first = TRUE;
+ static int last_cset = -99;
+ static BOOL last_result = TRUE;
+ /* *INDENT-OFF* */
+ static struct {
+ int mapping;
+ UCode_t internal;
+ int external;
+ } table[] = {
+ { 'j', 0x2518, 0 }, /* BOX DRAWINGS LIGHT UP AND LEFT */
+ { 'k', 0x2510, 0 }, /* BOX DRAWINGS LIGHT DOWN AND LEFT */
+ { 'l', 0x250c, 0 }, /* BOX DRAWINGS LIGHT DOWN AND RIGHT */
+ { 'm', 0x2514, 0 }, /* BOX DRAWINGS LIGHT UP AND RIGHT */
+ { 'n', 0x253c, 0 }, /* BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
+ { 'q', 0x2500, 0 }, /* BOX DRAWINGS LIGHT HORIZONTAL */
+ { 't', 0x251c, 0 }, /* BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
+ { 'u', 0x2524, 0 }, /* BOX DRAWINGS LIGHT VERTICAL AND LEFT */
+ { 'v', 0x2534, 0 }, /* BOX DRAWINGS LIGHT UP AND HORIZONTAL */
+ { 'w', 0x252c, 0 }, /* BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
+ { 'x', 0x2502, 0 }, /* BOX DRAWINGS LIGHT VERTICAL */
+ };
+ /* *INDENT-ON* */
+
+ unsigned n;
+
+ if (first) {
+ static char acsc_name[] = "acsc";
+ char *map = tigetstr(acsc_name);
+
+ if (map != 0) {
+ CTRACE((tfp, "build terminal line-drawing map\n"));
+ while (map[0] != 0 && map[1] != 0) { /* the string is walked as (name, glyph) byte pairs */
+ for (n = 0; n < TABLESIZE(table); ++n) {
+ if (table[n].mapping == map[0]) {
+ table[n].external = UCH(map[1]);
+ CTRACE((tfp,
+ " map[%c] %#" PRI_UCode_t " -> %#x\n",
+ table[n].mapping,
+ CAST_UCode_t (table[n].internal),
+ (unsigned)table[n].external));
+ break;
+ }
+ }
+ map += 2;
+ }
+ }
+ first = FALSE;
+ }
+
+ if (cset == last_cset) {
+ fix_lines = last_result; /* reuse the cached verdict */
+ } else if (cset == UTF8_handle) {
+ last_result = FALSE;
+ last_cset = cset;
+ } else {
+ CTRACE((tfp, "check terminal line-drawing map\n"));
+ for (n = 0; n < TABLESIZE(table); ++n) {
+ int test = UCTransUniChar(table[n].internal, cset);
+
+ if (test != table[n].external) {
+ CTRACE((tfp,
+ "line-drawing map %c mismatch (have %#x, want %#x)\n",
+ table[n].mapping,
+ (unsigned) test,
+ (unsigned) table[n].external));
+ fix_lines = TRUE;
+ break;
+ }
+ }
+ last_result = fix_lines;
+ last_cset = cset;
+ }
+ }
+#else
+ else if (cset != linedrawing_char_set && linedrawing_char_set >= 0) {
+ fix_lines = TRUE;
+ }
+#endif
+#endif
+ }
+ if (fix_lines) {
+ if (!vert_in)
+ vert_in = '|';
+ if (!hori_in)
+ hori_in = '-';
+ }
+ *pvert_out = vert_in;
+ *phori_out = hori_in;
+}
+
+/*
+ * Write the UTF-8 encoding of a Unicode value to an output stream.
+ *
+ * target is the output HTStream* (it can also be a HTStructured* via
+ * typecast), myPutc is that stream's put_character method, and code is
+ * the Unicode value.  Each octet of the encoding is handed to myPutc in
+ * order and YES is returned.  Nothing is written and NO is returned when
+ * code is below 128 (the caller should handle it, no conversion being
+ * needed) or above 0x7fffffff (considered invalid).
+ *
+ * [Could be used more generally, but is currently only used for &#nnnnn
+ * stuff - generation of UTF8 from 8-bit encoded charsets not yet done
+ * by SGML.c etc.]
+ */
+#define PUTC(ch) ((*myPutc)(target, (char)(ch)))
+#define PUTC2(ch) ((*myPutc)(target,(char)(0x80|(0x3f &(ch)))))
+
+BOOL UCPutUtf8_charstring(HTStream *target, putc_func_t *myPutc, UCode_t code)
+{
+    int lead;                   /* marker bits of the first octet */
+    int trailing;               /* number of continuation octets */
+
+    if (code < 128) {
+        return NO;              /* indicate to caller we didn't handle it */
+    }
+
+    if (code < 0x800L) {
+        lead = 0xc0;
+        trailing = 1;
+    } else if (code < 0x10000L) {
+        lead = 0xe0;
+        trailing = 2;
+    } else if (code < 0x200000L) {
+        lead = 0xf0;
+        trailing = 3;
+    } else if (code < 0x4000000L) {
+        lead = 0xf8;
+        trailing = 4;
+    } else if (code <= 0x7fffffffL) {
+        lead = 0xfc;
+        trailing = 5;
+    } else {
+        return NO;
+    }
+
+    /* First octet carries the marker plus the most significant bits... */
+    PUTC(lead | (code >> (6 * trailing)));
+    /* ...and each following octet the next lower group of six bits. */
+    while (trailing-- > 0) {
+        PUTC2(code >> (6 * trailing));
+    }
+    return YES;
+}
+
+/*
+ * This function converts a Unicode (UCode_t) value to a UTF-8 character,
+ * which is stored as a NUL-terminated string in the buffer received as an
+ * argument.  The buffer should be large enough to hold at least seven
+ * characters (up to six encoding octets plus the terminating NUL), but
+ * should be declared as 8 to minimize byte alignment problems with some
+ * compilers. - FM
+ *
+ * Returns YES when the encoding was stored.  Returns NO when buffer is
+ * NULL, or - after storing an empty string - when code is zero, negative
+ * or larger than 0x7fffffff.
+ *
+ * Values below 0x80 are stored as a single octet: UTF-8 requires the
+ * shortest form, so they must not be written as a two-octet sequence.
+ */
+BOOL UCConvertUniToUtf8(UCode_t code, char *buffer)
+{
+    char *ch = buffer;
+
+    if (!ch)
+        return NO;
+
+    if (code <= 0 || code > 0x7fffffffL) {
+        *ch = '\0';
+        return NO;
+    }
+
+    if (code < 0x80L) {
+        /* ASCII range: the octet is the code itself. */
+        *ch++ = (char) code;
+        *ch = '\0';
+    } else if (code < 0x800L) {
+        *ch++ = (char) (0xc0 | (code >> 6));
+        *ch++ = (char) (0x80 | (0x3f & (code)));
+        *ch = '\0';
+    } else if (code < 0x10000L) {
+        *ch++ = (char) (0xe0 | (code >> 12));
+        *ch++ = (char) (0x80 | (0x3f & (code >> 6)));
+        *ch++ = (char) (0x80 | (0x3f & (code)));
+        *ch = '\0';
+    } else if (code < 0x200000L) {
+        *ch++ = (char) (0xf0 | (code >> 18));
+        *ch++ = (char) (0x80 | (0x3f & (code >> 12)));
+        *ch++ = (char) (0x80 | (0x3f & (code >> 6)));
+        *ch++ = (char) (0x80 | (0x3f & (code)));
+        *ch = '\0';
+    } else if (code < 0x4000000L) {
+        *ch++ = (char) (0xf8 | (code >> 24));
+        *ch++ = (char) (0x80 | (0x3f & (code >> 18)));
+        *ch++ = (char) (0x80 | (0x3f & (code >> 12)));
+        *ch++ = (char) (0x80 | (0x3f & (code >> 6)));
+        *ch++ = (char) (0x80 | (0x3f & (code)));
+        *ch = '\0';
+    } else {
+        *ch++ = (char) (0xfc | (code >> 30));
+        *ch++ = (char) (0x80 | (0x3f & (code >> 24)));
+        *ch++ = (char) (0x80 | (0x3f & (code >> 18)));
+        *ch++ = (char) (0x80 | (0x3f & (code >> 12)));
+        *ch++ = (char) (0x80 | (0x3f & (code >> 6)));
+        *ch++ = (char) (0x80 | (0x3f & (code)));
+        *ch = '\0';
+    }
+    return YES;
+}
+
+/*
+ * Decode one character from a UTF-8 encoded string into its UCS code.
+ *
+ * On entry:
+ *	*ppuni points to the first byte of the encoded character.
+ * On exit:
+ *	For a valid multibyte sequence, *ppuni has been moved forward to the
+ *	last byte of that sequence and the UCS value is returned.
+ *	For a byte in the ASCII range, that byte's value is returned and
+ *	*ppuni is left where it was.
+ *	For an invalid sequence (bad start byte, or a following byte that is
+ *	not a continuation byte), -1 is returned and *ppuni is unchanged.
+ */
+UCode_t UCGetUniFromUtf8String(const char **ppuni)
+{
+    const char *start = *ppuni;
+    int lead = *start;
+    UCode_t value;
+    int trailing;
+    int k;
+
+    if ((lead & 0x80) == 0) {
+        return (UCode_t) *start;        /* ASCII range character */
+    }
+    if ((lead & 0x40) == 0) {
+        return (-1);            /* not a valid UTF-8 start */
+    }
+
+    /* The lead byte gives the sequence length and the topmost data bits. */
+    if ((lead & 0xe0) == 0xc0) {
+        trailing = 1;
+        value = (lead & 0x1f);
+    } else if ((lead & 0xf0) == 0xe0) {
+        trailing = 2;
+        value = (lead & 0x0f);
+    } else if ((lead & 0xf8) == 0xf0) {
+        trailing = 3;
+        value = (lead & 0x07);
+    } else if ((lead & 0xfc) == 0xf8) {
+        trailing = 4;
+        value = (lead & 0x03);
+    } else if ((lead & 0xfe) == 0xfc) {
+        trailing = 5;
+        value = (lead & 0x01);
+    } else {                    /* garbage */
+        return (-1);
+    }
+
+    /*
+     * Each continuation byte must look like 10xxxxxx and contributes six
+     * more bits.  Bytes are examined in order, so nothing past the first
+     * offending byte is read.
+     */
+    for (k = 1; k <= trailing; k++) {
+        if ((start[k] & 0xc0) != 0x80) {
+            return (-1);
+        }
+        value = (value << 6) | (start[k] & 0x3f);
+    }
+
+    *ppuni = start + trailing;
+    return value;
+}
+
+/*
+ * Combine UTF-8 into Unicode. Incomplete characters are either ignored, or
+ * returned as the UCS replacement character.
+ *
+ * The function is fed one input byte per call and keeps its progress in *me
+ * (utf_count: continuation bytes still expected, utf_char: value collected
+ * so far, utf_buf/utf_buf_p: copy of the bytes of the current sequence).
+ *
+ * Parameters:
+ *   me        decoder state, updated on every call
+ *   c_in_out  on entry the next input byte; when a character is completed
+ *             and its value is below 256, it is overwritten with that value
+ *   result    receives the decoded value when a sequence is completed;
+ *             0x200e and 0x200f are reported as 0.  It is not written for
+ *             bytes in the ASCII range nor while a sequence is incomplete.
+ *
+ * Returns:
+ *   dUTF8_ok    a byte in the ASCII range was seen (state is reset), or a
+ *               multibyte character was completed successfully
+ *   dUTF8_more  a sequence was started or continued and needs more bytes
+ *   dUTF8_err   the byte cannot start a sequence, or the sequence which was
+ *               just completed is rejected (*result is then UCS_REPL)
+ *
+ * A byte above 127 which is not an expected continuation byte starts a new
+ * sequence; whatever had been collected before is dropped.
+ */
+dUTF8 HTDecodeUTF8(UTFDecodeState * me, int *c_in_out, UCode_t *result)
+{
+ dUTF8 rc = dUTF8_ok;
+ int c = *c_in_out;
+ unsigned uc = UCH(c);
+
+ if (TOASCII(uc) > 127) {
+ /*
+ * continue a multibyte character...
+ */
+ if (me->utf_count > 0 && (TOASCII(c) & 0xc0) == 0x80) {
+ if (me->utf_count <= 0) { /* cannot be true: utf_count > 0 was tested just above */
+ me->utf_char = UCS_REPL;
+ } else if (me->utf_count == 1) {
+ int limit = (int) (me->utf_buf_p - me->utf_buf) + 1;
+ int maybe = 0;
+
+ /*
+ * Check for overlong sequences (from comment in xterm):
+ * 1100000x 10xxxxxx
+ * 11100000 100xxxxx 10xxxxxx
+ * 11110000 1000xxxx 10xxxxxx 10xxxxxx
+ * 11111000 10000xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 11111100 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * "limit" is the number of bytes buffered so far plus the
+ * current (last) one, i.e., the length of the sequence.
+ *
+ * NOTE(review): the masks applied to utf_buf[1] below (0xf0,
+ * 0xf8, 0xfd) do not correspond to the patterns listed above
+ * for 3-, 4- and 5-byte sequences (which would be 0xe0, 0xf0
+ * and 0xf8), and 6-byte sequences have no case - confirm.
+ *
+ * NOTE(review): the loop after the switch first looks at
+ * utf_buf[limit - 1], the slot into which the current byte is
+ * stored only further down, so it appears to test a leftover
+ * byte rather than the current one - confirm.
+ */
+ switch (limit) {
+ case 2:
+ maybe = (UCH(me->utf_buf[0]) & 0xfe) == 0xc0;
+ break;
+ case 3:
+ maybe = ((UCH(me->utf_buf[0]) == 0xe0) &&
+ (UCH(me->utf_buf[1]) & 0xf0) == 0x80);
+ break;
+ case 4:
+ maybe = ((UCH(me->utf_buf[0]) == 0xf0) &&
+ (UCH(me->utf_buf[1]) & 0xf8) == 0x80);
+ break;
+ case 5:
+ maybe = ((UCH(me->utf_buf[0]) == 0xf8) &&
+ (UCH(me->utf_buf[1]) & 0xfd) == 0x80);
+ break;
+ }
+ if (maybe) {
+ while (limit-- > 2) {
+ if ((UCH(me->utf_buf[limit]) & 0xc0) != 0x80) {
+ maybe = 0;
+ break;
+ }
+ }
+ if (maybe) {
+ me->utf_char = UCS_REPL;
+ }
+ }
+ }
+ if (me->utf_char == UCS_REPL) { /* an earlier byte already spoiled this sequence */
+ rc = dUTF8_err;
+ } else if (me->utf_char || ((uc & 0x7f) >> (7 - me->utf_count))) {
+ me->utf_char = (me->utf_char << 6) | (TOASCII(c) & 0x3f);
+ if ((me->utf_char >= 0xd800 &&
+ me->utf_char <= 0xdfff) ||
+ (me->utf_char == 0xfffe) ||
+ (me->utf_char == UCS_HIDE)) {
+ me->utf_char = UCS_REPL;
+ rc = dUTF8_err;
+ }
+ } else {
+ me->utf_char = UCS_REPL;
+ rc = dUTF8_err;
+ }
+ me->utf_count--;
+ *(me->utf_buf_p) = (char) c;
+ (me->utf_buf_p)++;
+
+ if (me->utf_count == 0) {
+ *(me->utf_buf_p) = '\0';
+ *result = me->utf_char;
+ if (*result < 256) {
+ *c_in_out = UCH(*result & 0xff);
+ }
+ switch (*result) {
+ case 0x200e: /* left-to-right mark */
+ case 0x200f: /* right-to-left mark */
+ /* lynx does not use these */
+ *result = '\0';
+ break;
+ }
+ } else {
+ rc = dUTF8_more; /* also replaces an error noted above; it is reported once the sequence ends */
+ }
+ } else {
+ /*
+ * begin a multibyte character
+ */
+ rc = dUTF8_more;
+ me->utf_buf_p = me->utf_buf;
+ *(me->utf_buf_p) = (char) c;
+ (me->utf_buf_p)++;
+ if ((uc & 0xe0) == 0xc0) {
+ me->utf_count = 1;
+ me->utf_char = (uc & 0x1f);
+ } else if ((uc & 0xf0) == 0xe0) {
+ me->utf_count = 2;
+ me->utf_char = (uc & 0x0f);
+ } else if ((uc & 0xf8) == 0xf0) {
+ me->utf_count = 3;
+ me->utf_char = (uc & 0x07);
+ } else if ((uc & 0xfc) == 0xf8) {
+ me->utf_count = 4;
+ me->utf_char = (uc & 0x03);
+ } else if ((uc & 0xfe) == 0xfc) {
+ me->utf_count = 5;
+ me->utf_char = (uc & 0x01);
+ } else {
+ me->utf_count = 0;
+ me->utf_buf_p = me->utf_buf;
+ *(me->utf_buf_p) = '\0';
+ rc = dUTF8_err;
+ }
+ }
+ } else {
+ me->utf_count = 0;
+ me->utf_buf_p = me->utf_buf;
+ *(me->utf_buf_p) = '\0';
+ }
+
+#if 0
+ if (rc != dUTF8_ok) {
+ CTRACE((tfp, "UTF8 %#x ->%#x %s\n",
+ uc, UCH(*c_in_out),
+ (rc == dUTF8_err) ? "err" : "more"));
+ } else {
+ if (*result > 127) {
+ CTRACE((tfp, "UTF8 %#x == %#x\n", uc, (int) *result));
+ } else if (c != UCS_REPL && !isspace(c)) {
+ CTRACE((tfp, "CHAR %#x == %c (%#x)\n", uc, uc, (int) *result));
+ }
+ }
+#endif
+ return rc;
+}