diff options
Diffstat (limited to 'src/LYCharSets.c')
-rw-r--r-- | src/LYCharSets.c | 1157 |
1 files changed, 1157 insertions, 0 deletions
diff --git a/src/LYCharSets.c b/src/LYCharSets.c new file mode 100644 index 0000000..94b7a04 --- /dev/null +++ b/src/LYCharSets.c @@ -0,0 +1,1157 @@ +/* + * $LynxId: LYCharSets.c,v 1.71 2021/06/29 22:01:12 tom Exp $ + */ +#include <HTUtils.h> +#include <HTCJK.h> +#include <HTMLDTD.h> + +#include <LYGlobalDefs.h> +#include <UCMap.h> +#include <UCdomap.h> +#include <UCDefs.h> +#include <LYCharSets.h> +#include <GridText.h> +#include <LYCurses.h> +#include <LYStrings.h> + +#include <LYLeaks.h> + +HTkcode kanji_code = NOKANJI; +BOOLEAN LYHaveCJKCharacterSet = FALSE; +BOOLEAN DisplayCharsetMatchLocale = TRUE; +BOOL force_old_UCLYhndl_on_reload = FALSE; +int forced_UCLYhdnl; +int LYNumCharsets = 0; /* Will be initialized later by UC_Register. */ +int current_char_set = -1; /* will be initialized later in LYMain.c */ +int linedrawing_char_set = -1; +STRING2PTR p_entity_values = NULL; /* Pointer, for HTML_put_entity() */ + + /* obsolete and probably not used(???) */ + /* will be initialized in HTMLUseCharacterSet */ +#ifdef USE_CHARSET_CHOICE +charset_subset_t charset_subsets[MAXCHARSETS]; +BOOL custom_display_charset = FALSE; +BOOL custom_assumed_doc_charset = FALSE; + +#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN +int display_charset_map[MAXCHARSETS]; +int assumed_doc_charset_map[MAXCHARSETS]; + +const char *display_charset_choices[MAXCHARSETS + 1]; +const char *assumed_charset_choices[MAXCHARSETS + 1]; +int displayed_display_charset_idx; +#endif +#endif /* USE_CHARSET_CHOICE */ + +/* + * New character sets now declared with UCInit() in UCdomap.c + * + * INSTRUCTIONS for adding new character sets which do not have + * Unicode tables now in UCdomap.h + * + * + * [We hope you need not correct/add old-style mapping below as in ISO_LATIN1[] + * or SevenBitApproximations[] any more - it works now via new chartrans + * mechanism, but kept for compatibility only: we should cleanup the stuff, + * but this is not so easy...] 
+ *
+ * Currently we only declare some charset's properties here (such as MIME
+ * names, etc.), it does not include real mapping.
+ *
+ * There is a place marked "Add your new character sets HERE" in this file.
+ * Make up a character set and add it in the same style as the ISO_LATIN1 set
+ * below, giving it a unique name.
+ *
+ * Add the name of the set to LYCharSets. Similarly add the appropriate
+ * information to the tables below: LYchar_set_names, LYCharSet_UC,
+ * LYlowest_eightbit. These 4 tables all MUST have the same order. (And this
+ * is the order you will see in Lynx Options Menu, which is why few
+ * unicode-based charsets are listed here).
+ *
+ */
+
+/* Entity values -- for ISO Latin 1 local representation
+ *
+ * This MUST match exactly the table referred to in the DTD!
+ *
+ * Each entry is the replacement string for the entity named at the end of
+ * its comment.  Most entries are the single ISO-8859-1 byte, written as an
+ * octal escape.  The dashes (emdash, endash, mdash, ndash) and trade use an
+ * ASCII stand-in instead.  nbsp, the em/en/thin spaces and shy are stored as
+ * the control codes \001, \002 and \007 -- presumably internal placeholders
+ * that are interpreted elsewhere in Lynx (TODO confirm), which is why they
+ * are marked "NEVER CHANGE THIS".
+ */
+static const char *ISO_Latin1[] =
+{
+    "\306", /* capital AE diphthong (ligature) (Æ) - AElig */
+    "\301", /* capital A, acute accent (Á) - Aacute */
+    "\302", /* capital A, circumflex accent (Â) - Acirc */
+    "\300", /* capital A, grave accent (À) - Agrave */
+    "\305", /* capital A, ring - Aring (Å) */
+    "\303", /* capital A, tilde - Atilde (Ã) */
+    "\304", /* capital A, dieresis or umlaut mark (Ä) - Auml */
+    "\307", /* capital C, cedilla - Ccedil (Ç) */
+    "\320", /* capital Eth or D with stroke (Ð) - Dstrok */
+    "\320", /* capital Eth, Icelandic (Ð) - ETH */
+    "\311", /* capital E, acute accent (É) - Eacute */
+    "\312", /* capital E, circumflex accent (Ê) - Ecirc */
+    "\310", /* capital E, grave accent (È) - Egrave */
+    "\313", /* capital E, dieresis or umlaut mark (Ë) - Euml */
+    "\315", /* capital I, acute accent (Í) - Iacute */
+    "\316", /* capital I, circumflex accent (Î) - Icirc */
+    "\314", /* capital I, grave accent (Ì) - Igrave */
+    "\317", /* capital I, dieresis or umlaut mark (Ï) - Iuml */
+    "\321", /* capital N, tilde (Ñ) - Ntilde */
+    "\323", /* capital O, acute accent (Ó) - Oacute */
+    "\324", /* capital O, circumflex accent (Ô) - Ocirc */
+    "\322", /* capital O, grave accent (Ò) - Ograve */
+    "\330", /* capital O, slash (Ø) - Oslash */
+    "\325", /* capital O, tilde (Õ) - Otilde */
+    "\326", /* capital O, dieresis or umlaut mark (Ö) - Ouml */
+    "\336", /* capital THORN, Icelandic (Þ) - THORN */
+    "\332", /* capital U, acute accent (Ú) - Uacute */
+    "\333", /* capital U, circumflex accent (Û) - Ucirc */
+    "\331", /* capital U, grave accent (Ù) - Ugrave */
+    "\334", /* capital U, dieresis or umlaut mark (Ü) - Uuml */
+    "\335", /* capital Y, acute accent (Ý) - Yacute */
+    "\341", /* small a, acute accent (á) - aacute */
+    "\342", /* small a, circumflex accent (â) - acirc */
+    "\264", /* spacing acute (´) - acute */
+    "\346", /* small ae diphthong (ligature) (æ) - aelig */
+    "\340", /* small a, grave accent (à) - agrave */
+    "\046", /* ampersand (&) - amp */
+    "\345", /* small a, ring (å) - aring */
+    "\343", /* small a, tilde (ã) - atilde */
+    "\344", /* small a, dieresis or umlaut mark (ä) - auml */
+    "\246", /* broken vertical bar (¦) - brkbar */
+    "\246", /* broken vertical bar (¦) - brvbar */
+    "\347", /* small c, cedilla (ç) - ccedil */
+    "\270", /* spacing cedilla (¸) - cedil */
+    "\242", /* cent sign (¢) - cent */
+    "\251", /* copyright sign (©) - copy */
+    "\244", /* currency sign (¤) - curren */
+    "\260", /* degree sign (°) - deg */
+    "\250", /* spacing dieresis (¨) - die */
+    "\367", /* division sign (÷) - divide */
+    "\351", /* small e, acute accent (é) - eacute */
+    "\352", /* small e, circumflex accent (ê) - ecirc */
+    "\350", /* small e, grave accent (è) - egrave */
+    "-", /* dash the width of emsp - emdash */
+    "\002", /* emsp, em space - not collapsed NEVER CHANGE THIS - emsp */
+    "-", /* dash the width of ensp - endash */
+    "\002", /* ensp, en space - not collapsed NEVER CHANGE THIS - ensp */
+    "\360", /* small eth, Icelandic (ð) - eth */
+    "\353", /* small e, dieresis or umlaut mark (ë) - euml */
+    "\275", /* fraction 1/2 (½) - frac12 */
+    "\274", /* fraction 1/4 (¼) - frac14 */
+    "\276", /* fraction 3/4 (¾) - frac34 */
+    "\076", /* greater than (>) - gt */
+    "\257", /* spacing macron (¯) - hibar */
+    "\355", /* small i, acute accent (í) - iacute */
+    "\356", /* small i, circumflex accent (î) - icirc */
+    "\241", /* inverted exclamation mark (¡) - iexcl */
+    "\354", /* small i, grave accent (ì) - igrave */
+    "\277", /* inverted question mark (¿) - iquest */
+    "\357", /* small i, dieresis or umlaut mark (ï) - iuml */
+    "\253", /* angle quotation mark, left («) - laquo */
+    "\074", /* less than (<) - lt */
+    "\257", /* spacing macron (¯) - macr */
+    "-", /* dash the width of emsp - mdash */
+    "\265", /* micro sign (µ) - micro */
+    "\267", /* middle dot (·) - middot */
+    "\001", /* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
+    "-", /* dash the width of ensp - ndash */
+    "\254", /* negation sign (¬) - not */
+    "\361", /* small n, tilde (ñ) - ntilde */
+    "\363", /* small o, acute accent (ó) - oacute */
+    "\364", /* small o, circumflex accent (ô) - ocirc */
+    "\362", /* small o, grave accent (ò) - ograve */
+    "\252", /* feminine ordinal indicator (ª) - ordf */
+    "\272", /* masculine ordinal indicator (º) - ordm */
+    "\370", /* small o, slash (ø) - oslash */
+    "\365", /* small o, tilde (õ) - otilde */
+    "\366", /* small o, dieresis or umlaut mark (ö) - ouml */
+    "\266", /* paragraph sign (¶) - para */
+    "\261", /* plus-or-minus sign (±) - plusmn */
+    "\243", /* pound sign (£) - pound */
+    "\042", /* quote '"' (") - quot */
+    "\273", /* angle quotation mark, right (») - raquo */
+    "\256", /* circled R registered sign (®) - reg */
+    "\247", /* section sign (§) - sect */
+    "\007", /* soft hyphen (U+00AD) NEVER CHANGE THIS - shy */
+    "\271", /* superscript 1 (¹) - sup1 */
+    "\262", /* superscript 2 (²) - sup2 */
+    "\263", /* superscript 3 (³) - sup3 */
+    "\337", /* small sharp s, German (sz ligature) (ß) - szlig */
+    "\002", /* thin space - not collapsed NEVER CHANGE THIS - thinsp */
+    "\376", /* small thorn, Icelandic (þ) - thorn */
+    "\327", /* multiplication sign (×) - times */
+    "(TM)", /* circled TM trade mark sign (™) - trade */
+    "\372", /* small u, acute accent (ú) - uacute */
+    "\373", /* small u, circumflex accent (û) - ucirc */
+    "\371", /* small u, grave accent (ù) - ugrave */
+    "\250", /* spacing dieresis (¨) - uml */
+    "\374", /* small u, dieresis or umlaut mark (ü) - uuml */
+    "\375", /* small y, acute accent (ý) - yacute */
+    "\245", /* yen sign (¥) - yen */
+    "\377", /* small y, dieresis or umlaut mark (ÿ) - yuml */
+};
+
+/* Entity values -- 7 bit character approximations
+ *
+ * This MUST match exactly the table referred to in the DTD!
+ *
+ * The entities appear in the same order as in ISO_Latin1 above, but every
+ * replacement is made of 7-bit characters only.  When LY_UMLAUT is defined
+ * the umlauted vowels are written as two letters (Ae, Oe, Ue, ae, oe, ue)
+ * instead of the bare vowel.  The \001, \002 and \007 placeholder entries
+ * are the same as in ISO_Latin1.
+ */
+const char *SevenBitApproximations[] =
+{
+    "AE", /* capital AE diphthong (ligature) (Æ) - AElig */
+    "A", /* capital A, acute accent (Á) - Aacute */
+    "A", /* capital A, circumflex accent (Â) - Acirc */
+    "A", /* capital A, grave accent (À) - Agrave */
+    "A", /* capital A, ring - Aring (Å) */
+    "A", /* capital A, tilde - Atilde (Ã) */
+#ifdef LY_UMLAUT
+    "Ae", /* capital A, dieresis or umlaut mark (Ä) - Auml */
+#else
+    "A", /* capital A, dieresis or umlaut mark (Ä) - Auml */
+#endif /* LY_UMLAUT */
+    "C", /* capital C, cedilla (Ç) - Ccedil */
+    "Dj", /* capital D with stroke (Ð) - Dstrok */
+    "DH", /* capital Eth, Icelandic (Ð) - ETH */
+    "E", /* capital E, acute accent (É) - Eacute */
+    "E", /* capital E, circumflex accent (Ê) - Ecirc */
+    "E", /* capital E, grave accent (È) - Egrave */
+    "E", /* capital E, dieresis or umlaut mark (Ë) - Euml */
+    "I", /* capital I, acute accent (Í) - Iacute */
+    "I", /* capital I, circumflex accent (Î) - Icirc */
+    "I", /* capital I, grave accent (Ì) - Igrave */
+    "I", /* capital I, dieresis or umlaut mark (Ï) - Iuml */
+    "N", /* capital N, tilde - Ntilde (Ñ) */
+    "O", /* capital O, acute accent (Ó) - Oacute */
+    "O", /* capital O, circumflex accent (Ô) - Ocirc */
+    "O", /* capital O, grave accent (Ò) - Ograve */
+    "O", /* capital O, slash (Ø) - Oslash */
+    "O", /* capital O, tilde (Õ) - Otilde */
+#ifdef LY_UMLAUT
+    "Oe", /* capital O, dieresis or umlaut mark (Ö) - Ouml */
+#else
+    "O", /* capital O, dieresis or umlaut mark (Ö) - Ouml */
+#endif /* LY_UMLAUT */
+    "P", /* capital THORN, Icelandic (Þ) - THORN */
+    "U", /* capital U, acute accent (Ú) - Uacute */
+    "U", /* capital U, circumflex accent (Û) - Ucirc */
+    "U", /* capital U, grave accent (Ù) - Ugrave */
+#ifdef LY_UMLAUT
+    "Ue", /* capital U, dieresis or umlaut mark (Ü) - Uuml */
+#else
+    "U", /* capital U, dieresis or umlaut mark (Ü) - Uuml */
+#endif /* LY_UMLAUT */
+    "Y", /* capital Y, acute accent (Ý) - Yacute */
+    "a", /* small a, acute accent (á) - aacute */
+    "a", /* small a, circumflex accent (â) - acirc */
+    "'", /* spacing acute (´) - acute */
+    "ae", /* small ae diphthong (ligature) (æ) - aelig */
+    "`a", /* small a, grave accent (à) - agrave */
+    "&", /* ampersand (&) - amp */
+    "a", /* small a, ring (å) - aring */
+    "a", /* small a, tilde (ã) - atilde */
+#ifdef LY_UMLAUT
+    "ae", /* small a, dieresis or umlaut mark (ä) - auml */
+#else
+    "a", /* small a, dieresis or umlaut mark (ä) - auml */
+#endif /* LY_UMLAUT */
+    "|", /* broken vertical bar (¦) - brkbar */
+    "|", /* broken vertical bar (¦) - brvbar */
+    "c", /* small c, cedilla (ç) - ccedil */
+    ",", /* spacing cedilla (¸) - cedil */
+    "-c-", /* cent sign (¢) - cent */
+    "(c)", /* copyright sign (©) - copy */
+    "CUR", /* currency sign (¤) - curren */
+    "DEG", /* degree sign (°) - deg */
+    "\042", /* spacing dieresis (¨) - die */
+    "/", /* division sign (÷) - divide */
+    "e", /* small e, acute accent (é) - eacute */
+    "e", /* small e, circumflex accent (ê) - ecirc */
+    "e", /* small e, grave accent (è) - egrave */
+    "-", /* dash the width of emsp - emdash */
+    "\002", /* emsp NEVER CHANGE THIS - emsp */
+    "-", /* dash the width of ensp - endash */
+    "\002", /* ensp NEVER CHANGE THIS - ensp */
+    "dh", /* small eth, Icelandic eth (ð) */
+    "e", /* small e, dieresis or umlaut mark (ë) - euml */
+    " 1/2", /* fraction 1/2 (½) - frac12 */
+    " 1/4", /* fraction 1/4 (¼) - frac14 */
+    " 3/4", /* fraction 3/4 (¾) - frac34 */
+    ">", /* greater than (>) - gt */
+    "-", /* spacing macron (¯) - hibar */
+    "i", /* small i, acute accent (í) - iacute */
+    "i", /* small i, circumflex accent (î) - icirc */
+    "!", /* inverted exclamation mark (¡) - iexcl */
+    "`i", /* small i, grave accent (ì) - igrave */
+    "?", /* inverted question mark (¿) - iquest */
+    "i", /* small i, dieresis or umlaut mark (ï) - iuml */
+    "<<", /* angle quotation mark, left («) - laquo */
+    "<", /* less than - lt (<) */
+    "-", /* spacing macron (¯) - macr */
+    "-", /* dash the width of emsp - mdash */
+    "u", /* micro sign (µ) - micro */
+    ".", /* middle dot (·) - middot */
+    "\001", /* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
+    "-", /* dash the width of ensp - ndash */
+    "NOT", /* negation sign (¬) - not */
+    "n", /* small n, tilde (ñ) - ntilde */
+    "o", /* small o, acute accent (ó) - oacute */
+    "o", /* small o, circumflex accent (ô) - ocirc */
+    "o", /* small o, grave accent (ò) - ograve */
+    "-a", /* feminine ordinal indicator (ª) - ordf */
+    "-o", /* masculine ordinal indicator (º) - ordm */
+    "o", /* small o, slash (ø) - oslash */
+    "o", /* small o, tilde (õ) - otilde */
+#ifdef LY_UMLAUT
+    "oe", /* small o, dieresis or umlaut mark (ö) - ouml */
+#else
+    "o", /* small o, dieresis or umlaut mark (ö) - ouml */
+#endif /* LY_UMLAUT */
+    "P:", /* paragraph sign (¶) - para */
+    "+-", /* plus-or-minus sign (±) - plusmn */
+    "-L-", /* pound sign (£) - pound */
+    "\"", /* quote '"' (") - quot */
+    ">>", /* angle quotation mark, right (») - raquo */
+    "(R)", /* circled R registered sign (®) - reg */
+    "S:", /* section sign (§) - sect */
+    "\007", /* soft hyphen (U+00AD) NEVER CHANGE THIS - shy */
+    "^1", /* superscript 1 (¹) - sup1 */
+    "^2", /* superscript 2 (²) - sup2 */
+    "^3", /* superscript 3 (³) - sup3 */
+    "ss", /* small sharp s, German (sz ligature) (ß) - szlig */
+    "\002", /* thin space - not collapsed NEVER CHANGE THIS - thinsp */
+    "p", /* small thorn, Icelandic (þ) - thorn */
+    "*", /* multiplication sign (×) - times */
+    "(TM)", /* circled TM trade mark sign (™) - trade */
+    "u", /* small u, acute accent (ú) - uacute */
+    "u", /* small u, circumflex accent (û) - ucirc */
+    "u", /* small u, grave accent (ù) - ugrave */
+    "\042", /* spacing dieresis (¨) - uml */
+#ifdef LY_UMLAUT
+    "ue", /* small u, dieresis or umlaut mark (ü) - uuml */
+#else
+    "u", /* small u, dieresis or umlaut mark (ü) - uuml */
+#endif /* LY_UMLAUT */
+    "y", /* small y, acute accent (ý) - yacute */
+    "YEN", /* yen sign (¥) - yen */
+    "y", /* small y, dieresis or umlaut mark (ÿ) - yuml */
+};
+
+/*
+ * Add your new character sets HERE (but only if you can't construct Unicode
+ * tables for them). - FM
+ */
+
+/*
+ * Add the array name to LYCharSets
+ *
+ * Only the first two slots are given initializers here.
+ */
+STRING2PTR LYCharSets[MAXCHARSETS] =
+{
+    ISO_Latin1, /* ISO Latin 1 */
+    SevenBitApproximations, /* 7 Bit Approximations */
+};
+
+/*
+ * Add the name that the user will see below. The order of LYCharSets and
+ * LYchar_set_names MUST be the same
+ *
+ * The list is terminated by a null pointer, hence the extra array slot.
+ */
+const char *LYchar_set_names[MAXCHARSETS + 1] =
+{
+    "Western (ISO-8859-1)",
+    "7 bit approximations (US-ASCII)",
+    (char *) 0
+};
+
+/*
+ * Associate additional pieces of info with each of the charsets listed above.
+ * Will be automatically modified (and extended) by charset translations which
+ * are loaded using the chartrans mechanism. Most important piece of info to
+ * put here is a MIME charset name. Used for chartrans (see UCDefs.h). The
+ * order of LYCharSets and LYCharSet_UC MUST be the same.
+ *
+ * Note that most of the charsets added by the new mechanism in src/chrtrans
+ * don't show up here at all. They don't have to.
+ */
+LYUCcharset LYCharSet_UC[MAXCHARSETS] =
+{
+    /*
+     * Zero position placeholder and HTMLGetEntityUCValue() reference. - FM
+     */
+    {-1, "iso-8859-1", UCT_ENC_8BIT, 0,
+     UCT_REP_IS_LAT1,
+     UCT_CP_IS_LAT1, UCT_R_LAT1, UCT_R_LAT1},
+
+    /*
+     * Placeholders for Unicode tables. 
- FM + */ + {-1, "us-ascii", UCT_ENC_7BIT, 0, + UCT_REP_SUBSETOF_LAT1, + UCT_CP_SUBSETOF_LAT1, UCT_R_ASCII, UCT_R_ASCII}, + +}; + +/* + * Add the code of the the lowest character with the high bit set that can be + * directly displayed. The order of LYCharSets and LYlowest_eightbit MUST be + * the same. + * + * (If charset have chartrans unicode table, LYlowest_eightbit will be + * verified/modified anyway.) + */ +int LYlowest_eightbit[MAXCHARSETS] = +{ + 160, /* ISO Latin 1 */ + 999, /* 7 bit approximations */ +}; + +/* + * Function to set the handling of selected character sets based on the current + * LYUseDefaultRawMode value. - FM + */ +void HTMLSetCharacterHandling(int i) +{ + int chndl = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset); + BOOLEAN LYRawMode_flag = LYRawMode; + int UCLYhndl_for_unspec_flag = UCLYhndl_for_unspec; + + if (LYCharSet_UC[i].enc != UCT_ENC_CJK) { + HTCJK = NOCJK; + kanji_code = NOKANJI; + if (i == chndl) + LYRawMode = LYUseDefaultRawMode; + else + LYRawMode = (BOOL) (!LYUseDefaultRawMode); + + HTPassEightBitNum = (BOOL) ((LYCharSet_UC[i].codepoints & UCT_CP_SUPERSETOF_LAT1) + || (LYCharSet_UC[i].like8859 & UCT_R_HIGH8BIT)); + + if (LYRawMode) { + HTPassEightBitRaw = (BOOL) (LYlowest_eightbit[i] <= 160); + } else { + HTPassEightBitRaw = FALSE; + } + if (LYRawMode || i == chndl) { + HTPassHighCtrlRaw = (BOOL) (LYlowest_eightbit[i] <= 130); + } else { + HTPassHighCtrlRaw = FALSE; + } + + HTPassHighCtrlNum = FALSE; + + } else { /* CJK encoding: */ + const char *mime = LYCharSet_UC[i].MIMEname; + + if (!strcmp(mime, "euc-cn")) { + HTCJK = CHINESE; + kanji_code = EUC; + } else if (!strcmp(mime, "euc-jp")) { + HTCJK = JAPANESE; + kanji_code = EUC; + } else if (!strcmp(mime, "shift_jis")) { + HTCJK = JAPANESE; + kanji_code = SJIS; + } else if (!strcmp(mime, "euc-kr")) { + HTCJK = KOREAN; + kanji_code = EUC; + } else if (!strcmp(mime, "big5")) { + HTCJK = TAIPEI; + kanji_code = EUC; + } + + /* for any CJK: */ + if (!LYUseDefaultRawMode) + HTCJK 
= NOCJK; + LYRawMode = (BOOL) (IS_CJK_TTY ? TRUE : FALSE); + HTPassEightBitRaw = FALSE; + HTPassEightBitNum = FALSE; + HTPassHighCtrlRaw = (BOOL) (IS_CJK_TTY ? TRUE : FALSE); + HTPassHighCtrlNum = FALSE; + } + + /* + * Comment for coding below: + * UCLYhndl_for_unspec is "current" state with LYRawMode, but + * UCAssume_MIMEcharset is independent from LYRawMode: holds the history + * and may be changed from 'O'ptions menu only. - LP + */ + if (LYRawMode) { + UCLYhndl_for_unspec = i; /* UCAssume_MIMEcharset not changed! */ + } else { + if (chndl != i && + (LYCharSet_UC[i].enc != UCT_ENC_CJK || + LYCharSet_UC[chndl].enc != UCT_ENC_CJK)) { + UCLYhndl_for_unspec = chndl; /* fall to UCAssume_MIMEcharset */ + } else { + UCLYhndl_for_unspec = LATIN1; /* UCAssume_MIMEcharset not changed! */ + } + } + +#ifdef USE_SLANG + if (LYlowest_eightbit[i] > 191) { + /* + * Higher than this may output cntrl chars to screen. - KW + */ + SLsmg_Display_Eight_Bit = 191; + } else { + SLsmg_Display_Eight_Bit = LYlowest_eightbit[i]; + } +#endif /* USE_SLANG */ + + ena_csi(LYlowest_eightbit[current_char_set] > 155); + + /* some diagnostics */ + if (TRACE) { + if (LYRawMode_flag != LYRawMode) + CTRACE((tfp, + "HTMLSetCharacterHandling: LYRawMode changed %s -> %s\n", + (LYRawMode_flag ? "ON" : "OFF"), + (LYRawMode ? "ON" : "OFF"))); + if (UCLYhndl_for_unspec_flag != UCLYhndl_for_unspec) + CTRACE((tfp, + "HTMLSetCharacterHandling: UCLYhndl_for_unspec changed %d -> %d\n", + UCLYhndl_for_unspec_flag, + UCLYhndl_for_unspec)); + } + + return; +} + +/* + * Function to set HTCJK based on "in" and "out" charsets. 
+ */ +void Set_HTCJK(const char *inMIMEname, + const char *outMIMEname) +{ + /* need not check for synonyms: MIMEname's got from LYCharSet_UC */ + + if (LYRawMode) { + if ((!strcmp(inMIMEname, "euc-jp") || +#ifdef USE_JAPANESEUTF8_SUPPORT + !strcmp(inMIMEname, "utf-8") || +#endif + !strcmp(inMIMEname, "shift_jis")) && + (!strcmp(outMIMEname, "euc-jp") || + !strcmp(outMIMEname, "shift_jis"))) { + HTCJK = JAPANESE; + } else if (!strcmp(inMIMEname, "euc-cn") && + !strcmp(outMIMEname, "euc-cn")) { + HTCJK = CHINESE; + } else if (!strcmp(inMIMEname, "big5") && + !strcmp(outMIMEname, "big5")) { + HTCJK = TAIPEI; + } else if (!strcmp(inMIMEname, "euc-kr") && + !strcmp(outMIMEname, "euc-kr")) { + HTCJK = KOREAN; + } else { + HTCJK = NOCJK; + } + } else { + HTCJK = NOCJK; + } +} + +/* + * Function to set the LYDefaultRawMode value based on the selected character + * set. - FM + * + * Currently unused: the default value so obvious that LYUseDefaultRawMode + * utilized directly by someone's mistake. - LP + */ +static void HTMLSetRawModeDefault(int i) +{ + LYDefaultRawMode = (BOOL) (LYCharSet_UC[i].enc == UCT_ENC_CJK); + return; +} + +/* + * Function to set the LYUseDefaultRawMode value based on the selected + * character set and the current LYRawMode value. - FM + */ +void HTMLSetUseDefaultRawMode(int i, + int modeflag) +{ + if (LYCharSet_UC[i].enc != UCT_ENC_CJK) { + + int chndl = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset); + + if (i == chndl) + LYUseDefaultRawMode = (BOOLEAN) modeflag; + else + LYUseDefaultRawMode = (BOOL) (!modeflag); + } else /* CJK encoding: */ + LYUseDefaultRawMode = (BOOLEAN) modeflag; + + return; +} + +/* + * Function to set the LYHaveCJKCharacterSet value based on the selected + * character set. - FM + */ +static void HTMLSetHaveCJKCharacterSet(int i) +{ + LYHaveCJKCharacterSet = (BOOL) (LYCharSet_UC[i].enc == UCT_ENC_CJK); + return; +} + +/* + * Function to set the DisplayCharsetMatchLocale value based on the selected + * character set. 
It is used in UPPER8 for 8bit case-insensitive search by + * matching def7_uni.tbl images. - LP + */ +static void HTMLSetDisplayCharsetMatchLocale(int i) +{ + BOOLEAN match; + + if (LYHaveCJKCharacterSet) { + /* + * We have no intention to pass CJK via UCTransChar if that happened. + * Let someone from CJK correct this if necessary. + */ + DisplayCharsetMatchLocale = TRUE; /* old-style */ + return; + + } else if (strncasecomp(LYCharSet_UC[i].MIMEname, "cp", 2) || + strncasecomp(LYCharSet_UC[i].MIMEname, "windows", 7)) { + /* + * Assume dos/windows displays usually on remote terminal, hence it + * rarely matches locale. (In fact, MS Windows codepoints locale are + * never seen on UNIX). + */ + match = FALSE; + } else { + match = TRUE; /* guess, but see below */ + +#if !defined(LOCALE) + if (LYCharSet_UC[i].enc != UCT_ENC_UTF8) + /* + * Leave true for utf-8 display - the code doesn't deal very well + * with this case. - kw + */ + match = FALSE; +#else + if (UCForce8bitTOUPPER) { + /* + * Force disable locale (from lynx.cfg) + */ + match = FALSE; + } +#endif + } + + DisplayCharsetMatchLocale = match; + return; +} + +/* + * lynx 2.8/2.7.2(and more early) compatibility code: "human-readable" charset + * names changes with time so we map that history names to MIME here to get old + * lynx.cfg and (especially) .lynxrc always recognized. Please update this + * table when you change "fullname" of any present charset. 
+ */
+typedef struct _names_pairs {
+    const char *fullname; /* historical "human-readable" charset name */
+    const char *MIMEname; /* MIME charset name that old name maps to */
+} names_pairs;
+/* *INDENT-OFF* */
+static const names_pairs OLD_charset_names[] = /* several old names may share one MIME name */
+{
+    {"ISO Latin 1", "iso-8859-1"},
+    {"ISO Latin 2", "iso-8859-2"},
+    {"WinLatin1 (cp1252)", "windows-1252"},
+    {"DEC Multinational", "dec-mcs"},
+    {"Macintosh (8 bit)", "macintosh"},
+    {"NeXT character set", "next"},
+    {"KOI8-R Cyrillic", "koi8-r"},
+    {"Chinese", "euc-cn"},
+    {"Japanese (EUC)", "euc-jp"},
+    {"Japanese (SJIS)", "shift_jis"},
+    {"Korean", "euc-kr"},
+    {"Taipei (Big5)", "big5"},
+    {"Vietnamese (VISCII)", "viscii"},
+    {"7 bit approximations", "us-ascii"},
+    {"Transparent", "x-transparent"},
+    {"DosLatinUS (cp437)", "cp437"},
+    {"IBM PC character set", "cp437"},
+    {"DosLatin1 (cp850)", "cp850"},
+    {"IBM PC codepage 850", "cp850"},
+    {"DosLatin2 (cp852)", "cp852"},
+    {"PC Latin2 CP 852", "cp852"},
+    {"DosCyrillic (cp866)", "cp866"},
+    {"DosArabic (cp864)", "cp864"},
+    {"DosGreek (cp737)", "cp737"},
+    {"DosBaltRim (cp775)", "cp775"},
+    {"DosGreek2 (cp869)", "cp869"},
+    {"DosHebrew (cp862)", "cp862"},
+    {"WinLatin2 (cp1250)", "windows-1250"},
+    {"WinCyrillic (cp1251)", "windows-1251"},
+    {"WinGreek (cp1253)", "windows-1253"},
+    {"WinHebrew (cp1255)", "windows-1255"},
+    {"WinArabic (cp1256)", "windows-1256"},
+    {"WinBaltRim (cp1257)", "windows-1257"},
+    {"ISO Latin 3", "iso-8859-3"},
+    {"ISO Latin 4", "iso-8859-4"},
+    {"ISO 8859-5 Cyrillic", "iso-8859-5"},
+    {"ISO 8859-6 Arabic", "iso-8859-6"},
+    {"ISO 8859-7 Greek", "iso-8859-7"},
+    {"ISO 8859-8 Hebrew", "iso-8859-8"},
+    {"ISO-8859-8-I", "iso-8859-8"},
+    {"ISO-8859-8-E", "iso-8859-8"},
+    {"ISO 8859-9 (Latin 5)", "iso-8859-9"},
+    {"ISO 8859-10", "iso-8859-10"},
+    {"UNICODE UTF 8", "utf-8"},
+    {"RFC 1345 w/o Intro", "mnemonic+ascii+0"},
+    {"RFC 1345 Mnemonic", "mnemonic"},
+    {NULL, NULL}, /* terminated with NULL: lookups stop at a NULL fullname */
+};
+/* *INDENT-ON* */
+
+/*
+ * lynx 2.8/2.7.2 compatibility code: read "character_set" parameter from
+ * lynx.cfg and .lynxrc 
in both MIME name and "human-readable" name (old and + * new style). Returns -1 if not recognized. + */ +int UCGetLYhndl_byAnyName(char *value) +{ + int i; + + if (value == NULL) + return -1; + + LYTrimTrailing(value); + CTRACE((tfp, "UCGetLYhndl_byAnyName(%s)\n", value)); + + /* search by name */ + for (i = 0; (i < MAXCHARSETS && LYchar_set_names[i]); i++) { + if (!strcmp(value, LYchar_set_names[i])) { + return i; /* OK */ + } + } + + /* search by old name from 2.8/2.7.2 version */ + for (i = 0; (OLD_charset_names[i].fullname); i++) { + if (!strcmp(value, OLD_charset_names[i].fullname)) { + return UCGetLYhndl_byMIME(OLD_charset_names[i].MIMEname); /* OK */ + } + } + + return UCGetLYhndl_byMIME(value); /* by MIME */ +} + +/* + * Entity names -- Ordered by ISO Latin 1 value. + * --------------------------------------------- + * For conversions of DECIMAL escaped entities. + * Must be in order of ascending value. + */ +static const char *LYEntityNames[] = +{ +/* NAME DECIMAL VALUE */ + "nbsp", /* 160, non breaking space */ + "iexcl", /* 161, inverted exclamation mark */ + "cent", /* 162, cent sign */ + "pound", /* 163, pound sign */ + "curren", /* 164, currency sign */ + "yen", /* 165, yen sign */ + "brvbar", /* 166, broken vertical bar, (brkbar) */ + "sect", /* 167, section sign */ + "uml", /* 168, spacing dieresis */ + "copy", /* 169, copyright sign */ + "ordf", /* 170, feminine ordinal indicator */ + "laquo", /* 171, angle quotation mark, left */ + "not", /* 172, negation sign */ + "shy", /* 173, soft hyphen */ + "reg", /* 174, circled R registered sign */ + "hibar", /* 175, spacing macron */ + "deg", /* 176, degree sign */ + "plusmn", /* 177, plus-or-minus sign */ + "sup2", /* 178, superscript 2 */ + "sup3", /* 179, superscript 3 */ + "acute", /* 180, spacing acute (96) */ + "micro", /* 181, micro sign */ + "para", /* 182, paragraph sign */ + "middot", /* 183, middle dot */ + "cedil", /* 184, spacing cedilla */ + "sup1", /* 185, superscript 1 */ + "ordm", /* 186, 
masculine ordinal indicator */ + "raquo", /* 187, angle quotation mark, right */ + "frac14", /* 188, fraction 1/4 */ + "frac12", /* 189, fraction 1/2 */ + "frac34", /* 190, fraction 3/4 */ + "iquest", /* 191, inverted question mark */ + "Agrave", /* 192, capital A, grave accent */ + "Aacute", /* 193, capital A, acute accent */ + "Acirc", /* 194, capital A, circumflex accent */ + "Atilde", /* 195, capital A, tilde */ + "Auml", /* 196, capital A, dieresis or umlaut mark */ + "Aring", /* 197, capital A, ring */ + "AElig", /* 198, capital AE diphthong (ligature) */ + "Ccedil", /* 199, capital C, cedilla */ + "Egrave", /* 200, capital E, grave accent */ + "Eacute", /* 201, capital E, acute accent */ + "Ecirc", /* 202, capital E, circumflex accent */ + "Euml", /* 203, capital E, dieresis or umlaut mark */ + "Igrave", /* 204, capital I, grave accent */ + "Iacute", /* 205, capital I, acute accent */ + "Icirc", /* 206, capital I, circumflex accent */ + "Iuml", /* 207, capital I, dieresis or umlaut mark */ + "ETH", /* 208, capital Eth, Icelandic (or Latin2 Dstrok) */ + "Ntilde", /* 209, capital N, tilde */ + "Ograve", /* 210, capital O, grave accent */ + "Oacute", /* 211, capital O, acute accent */ + "Ocirc", /* 212, capital O, circumflex accent */ + "Otilde", /* 213, capital O, tilde */ + "Ouml", /* 214, capital O, dieresis or umlaut mark */ + "times", /* 215, multiplication sign */ + "Oslash", /* 216, capital O, slash */ + "Ugrave", /* 217, capital U, grave accent */ + "Uacute", /* 218, capital U, acute accent */ + "Ucirc", /* 219, capital U, circumflex accent */ + "Uuml", /* 220, capital U, dieresis or umlaut mark */ + "Yacute", /* 221, capital Y, acute accent */ + "THORN", /* 222, capital THORN, Icelandic */ + "szlig", /* 223, small sharp s, German (sz ligature) */ + "agrave", /* 224, small a, grave accent */ + "aacute", /* 225, small a, acute accent */ + "acirc", /* 226, small a, circumflex accent */ + "atilde", /* 227, small a, tilde */ + "auml", /* 228, small a, 
dieresis or umlaut mark */ + "aring", /* 229, small a, ring */ + "aelig", /* 230, small ae diphthong (ligature) */ + "ccedil", /* 231, small c, cedilla */ + "egrave", /* 232, small e, grave accent */ + "eacute", /* 233, small e, acute accent */ + "ecirc", /* 234, small e, circumflex accent */ + "euml", /* 235, small e, dieresis or umlaut mark */ + "igrave", /* 236, small i, grave accent */ + "iacute", /* 237, small i, acute accent */ + "icirc", /* 238, small i, circumflex accent */ + "iuml", /* 239, small i, dieresis or umlaut mark */ + "eth", /* 240, small eth, Icelandic */ + "ntilde", /* 241, small n, tilde */ + "ograve", /* 242, small o, grave accent */ + "oacute", /* 243, small o, acute accent */ + "ocirc", /* 244, small o, circumflex accent */ + "otilde", /* 245, small o, tilde */ + "ouml", /* 246, small o, dieresis or umlaut mark */ + "divide", /* 247, division sign */ + "oslash", /* 248, small o, slash */ + "ugrave", /* 249, small u, grave accent */ + "uacute", /* 250, small u, acute accent */ + "ucirc", /* 251, small u, circumflex accent */ + "uuml", /* 252, small u, dieresis or umlaut mark */ + "yacute", /* 253, small y, acute accent */ + "thorn", /* 254, small thorn, Icelandic */ + "yuml", /* 255, small y, dieresis or umlaut mark */ +}; + +/* + * Function to return the entity names of ISO-8859-1 8-bit characters. - FM + */ +const char *HTMLGetEntityName(UCode_t code) +{ +#define IntValue code + int MaxValue = (TABLESIZE(LYEntityNames) - 1); + + if (IntValue < 0 || IntValue > MaxValue) { + return ""; + } + + return LYEntityNames[IntValue]; +} + +/* + * Function to return the UCode_t (long int) value for entity names. It + * returns 0 if not found. + * + * unicode_entities[] handles all the names from old style entities[] too. + * Lynx now calls unicode_entities[] only through this function: + * HTMLGetEntityUCValue(). 
Note, we need not check for special characters here + * in function or even before it, we should check them *after* invoking this + * function, see put_special_unicodes() in SGML.c. + * + * In the future we will try to isolate all calls to entities[] in favor of new + * unicode-based chartrans scheme. - LP + */ +UCode_t HTMLGetEntityUCValue(const char *name) +{ +#include <entities.h> + + UCode_t value = 0; + size_t i, high, low; + int diff = 0; + size_t number_of_unicode_entities = TABLESIZE(unicode_entities); + + /* + * Make sure we have a non-zero length name. - FM + */ + if (isEmpty(name)) + return (value); + + /* + * Try UC_entity_info unicode_entities[]. + */ + for (low = 0, high = number_of_unicode_entities; + high > low; + diff < 0 ? (low = i + 1) : (high = i)) { + /* + * Binary search. + */ + i = (low + (high - low) / 2); + diff = AS_cmp(unicode_entities[i].name, name); /* Case sensitive! */ + if (diff == 0) { + value = unicode_entities[i].code; + break; + } + } + return (value); +} + +/* + * Original comment - + * Assume these are Microsoft code points, inflicted on us by FrontPage. - FM + * + * MS FrontPage uses syntax like ™ in 128-159 range and doesn't follow + * Unicode standards for this area. Windows-1252 codepoints are assumed here. 
+ * + * However see - + * http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#character-encodings-0 + */ +UCode_t LYcp1252ToUnicode(UCode_t code) +{ + if ((code == 1) || + (code > 127 && code < 160)) { + switch (code) { + case 1: + /* + * WHITE SMILING FACE + */ + code = 0x263a; + break; + case 128: + /* + * EURO currency sign + */ + code = 0x20ac; + break; + case 130: + /* + * SINGLE LOW-9 QUOTATION MARK (sbquo) + */ + code = 0x201a; + break; + case 131: + /* + * LATIN SMALL LETTER F WITH HOOK + */ + code = 0x192; + break; + case 132: + /* + * DOUBLE LOW-9 QUOTATION MARK (bdquo) + */ + code = 0x201e; + break; + case 133: + /* + * HORIZONTAL ELLIPSIS (hellip) + */ + code = 0x2026; + break; + case 134: + /* + * DAGGER (dagger) + */ + code = 0x2020; + break; + case 135: + /* + * DOUBLE DAGGER (Dagger) + */ + code = 0x2021; + break; + case 136: + /* + * MODIFIER LETTER CIRCUMFLEX ACCENT + */ + code = 0x2c6; + break; + case 137: + /* + * PER MILLE SIGN (permil) + */ + code = 0x2030; + break; + case 138: + /* + * LATIN CAPITAL LETTER S WITH CARON + */ + code = 0x160; + break; + case 139: + /* + * SINGLE LEFT-POINTING ANGLE QUOTATION MARK (lsaquo) + */ + code = 0x2039; + break; + case 140: + /* + * LATIN CAPITAL LIGATURE OE + */ + code = 0x152; + break; + case 142: + /* + * LATIN CAPITAL LETTER Z WITH CARON + */ + code = 0x17d; + break; + case 145: + /* + * LEFT SINGLE QUOTATION MARK (lsquo) + */ + code = 0x2018; + break; + case 146: + /* + * RIGHT SINGLE QUOTATION MARK (rsquo) + */ + code = 0x2019; + break; + case 147: + /* + * LEFT DOUBLE QUOTATION MARK (ldquo) + */ + code = 0x201c; + break; + case 148: + /* + * RIGHT DOUBLE QUOTATION MARK (rdquo) + */ + code = 0x201d; + break; + case 149: + /* + * BULLET (bull) + */ + code = 0x2022; + break; + case 150: + /* + * EN DASH (ndash) + */ + code = 0x2013; + break; + case 151: + /* + * EM DASH (mdash) + */ + code = 0x2014; + break; + case 152: + /* + * SMALL TILDE (tilde) + */ + code = 0x02dc; + 
break; + case 153: + /* + * TRADE MARK SIGN (trade) + */ + code = 0x2122; + break; + case 154: + /* + * LATIN SMALL LETTER S WITH CARON + */ + code = 0x161; + break; + case 155: + /* + * SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (rsaquo) + */ + code = 0x203a; + break; + case 156: + /* + * LATIN SMALL LIGATURE OE + */ + code = 0x153; + break; + case 158: + /* + * LATIN SMALL LETTER Z WITH CARON + */ + code = 0x17e; + break; + case 159: + /* + * LATIN CAPITAL LETTER Y WITH DIAERESIS + */ + code = 0x178; + break; + default: + /* + * Undefined (by convention, use the replacement character). + */ + code = UCS_REPL; + break; + } + } + return code; +} + +/* + * Function to select a character set and then set the character handling and + * LYHaveCJKCharacterSet flag. - FM + */ +void HTMLUseCharacterSet(int i) +{ + HTMLSetRawModeDefault(i); + p_entity_values = LYCharSets[i]; + HTMLSetCharacterHandling(i); /* set LYRawMode and CJK attributes */ + HTMLSetHaveCJKCharacterSet(i); + HTMLSetDisplayCharsetMatchLocale(i); + return; +} + +/* + * Initializer, calls initialization function for the CHARTRANS handling. - KW + */ +int LYCharSetsDeclared(void) +{ + UCInit(); + + return UCInitialized; +} + +#ifdef USE_CHARSET_CHOICE +void init_charset_subsets(void) +{ + int i, n; + int cur_display = 0; + int cur_assumed = 0; + + /* add them to displayed values */ + charset_subsets[UCLYhndl_for_unspec].hide_assumed = FALSE; + charset_subsets[current_char_set].hide_display = FALSE; + +#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN + /*all this stuff is for supporting old menu screen... 
*/ + for (i = 0; i < LYNumCharsets; ++i) { + if (charset_subsets[i].hide_display == FALSE) { + n = cur_display++; + if (i == current_char_set) + displayed_display_charset_idx = n; + display_charset_map[n] = i; + display_charset_choices[n] = LYchar_set_names[i]; + } + if (charset_subsets[i].hide_assumed == FALSE) { + n = cur_assumed++; + assumed_doc_charset_map[n] = i; + assumed_charset_choices[n] = LYCharSet_UC[i].MIMEname; + charset_subsets[i].assumed_idx = n; + } + display_charset_choices[cur_display] = NULL; + assumed_charset_choices[cur_assumed] = NULL; + } +#endif +} +#endif /* USE_CHARSET_CHOICE */ |