diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 16:37:15 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 16:37:15 +0000 |
commit | ae5d181b854d3ccb373b6bc01b4869e44ff4d87a (patch) | |
tree | 91f59efb48c56a84cc798e012fccb667b63d3fee /src/LYCharUtils.c | |
parent | Initial commit. (diff) | |
download | lynx-ae5d181b854d3ccb373b6bc01b4869e44ff4d87a.tar.xz lynx-ae5d181b854d3ccb373b6bc01b4869e44ff4d87a.zip |
Adding upstream version 2.9.0dev.12.upstream/2.9.0dev.12upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/LYCharUtils.c')
-rw-r--r-- | src/LYCharUtils.c | 3419 |
1 files changed, 3419 insertions, 0 deletions
diff --git a/src/LYCharUtils.c b/src/LYCharUtils.c new file mode 100644 index 0000000..0013989 --- /dev/null +++ b/src/LYCharUtils.c @@ -0,0 +1,3419 @@ +/* + * $LynxId: LYCharUtils.c,v 1.137 2021/10/24 00:47:08 tom Exp $ + * + * Functions associated with LYCharSets.c and the Lynx version of HTML.c - FM + * ========================================================================== + */ +#include <HTUtils.h> +#include <SGML.h> + +#define Lynx_HTML_Handler +#include <HTChunk.h> +#include <HText.h> +#include <HTStyle.h> +#include <HTMIME.h> +#include <HTML.h> + +#include <HTCJK.h> +#include <HTAtom.h> +#include <HTMLGen.h> +#include <HTParse.h> +#include <UCMap.h> +#include <UCDefs.h> +#include <UCAux.h> + +#include <LYGlobalDefs.h> +#include <LYCharUtils.h> +#include <LYCharSets.h> + +#include <HTAlert.h> +#include <HTForms.h> +#include <HTNestedList.h> +#include <GridText.h> +#include <LYStrings.h> +#include <LYUtils.h> +#include <LYMap.h> +#include <LYBookmark.h> +#include <LYCurses.h> +#include <LYCookie.h> + +#include <LYexit.h> +#include <LYLeaks.h> + +/* + * Used for nested lists. - FM + */ +int OL_CONTINUE = -29999; /* flag for whether CONTINUE is set */ +int OL_VOID = -29998; /* flag for whether a count is set */ + +static size_t count_char(const char *value, int ch) +{ + const char *found; + size_t result = 0; + + while ((*value != '\0') && (found = StrChr(value, ch)) != NULL) { + ++result; + value = (found + 1); + } + return result; +} + +/* + * This function converts any ampersands in a pre-allocated string to "&". + * If brackets is TRUE, it also converts any angle-brackets to "<" or ">". 
+ */ +void LYEntify(char **in_out, + int brackets) +{ + char *source = *in_out; + char *target; + char *result = NULL; + size_t count_AMPs = 0; + size_t count_LTs = 0; + size_t count_GTs = 0; + +#ifdef CJK_EX + enum _state { + S_text, + S_esc, + S_dollar, + S_paren, + S_nonascii_text, + S_dollar_paren + } state = S_text; + int in_sjis = 0; +#endif + + if (non_empty(source)) { + count_AMPs = count_char(*in_out, '&'); + if (brackets) { + count_LTs = count_char(*in_out, '<'); + count_GTs = count_char(*in_out, '>'); + } + + if (count_AMPs != 0 || count_LTs != 0 || count_GTs != 0) { + + target = typecallocn(char, + (strlen(*in_out) + + (4 * count_AMPs) + + (3 * count_LTs) + + (3 * count_GTs) + 1)); + + if ((result = target) == NULL) + outofmem(__FILE__, "LYEntify"); + + for (source = *in_out; *source; source++) { +#ifdef CJK_EX + if (IS_CJK_TTY) { + switch (state) { + case S_text: + if (*source == '\033') { + state = S_esc; + *target++ = *source; + continue; + } + break; + + case S_esc: + if (*source == '$') { + state = S_dollar; + } else if (*source == '(') { + state = S_paren; + } else { + state = S_text; + } + *target++ = *source; + continue; + + case S_dollar: + if (*source == '@' || *source == 'B' || *source == 'A') { + state = S_nonascii_text; + } else if (*source == '(') { + state = S_dollar_paren; + } else { + state = S_text; + } + *target++ = *source; + continue; + + case S_dollar_paren: + if (*source == 'C') { + state = S_nonascii_text; + } else { + state = S_text; + } + *target++ = *source; + continue; + + case S_paren: + if (*source == 'B' || *source == 'J' || *source == 'T') { + state = S_text; + } else if (*source == 'I') { + state = S_nonascii_text; + } else if (*source == '\033') { + state = S_esc; + } + *target++ = *source; + continue; + + case S_nonascii_text: + if (*source == '\033') + state = S_esc; + *target++ = *source; + continue; + + default: + break; + } + if (*(source + 1) != '\0' && + (IS_EUC(UCH(*source), UCH(*(source + 1))) || + 
IS_SJIS(UCH(*source), UCH(*(source + 1)), in_sjis) || + IS_BIG5(UCH(*source), UCH(*(source + 1))))) { + *target++ = *source++; + *target++ = *source; + continue; + } + } +#endif + switch (*source) { + case '&': + *target++ = '&'; + *target++ = 'a'; + *target++ = 'm'; + *target++ = 'p'; + *target++ = ';'; + break; + case '<': + if (brackets) { + *target++ = '&'; + *target++ = 'l'; + *target++ = 't'; + *target++ = ';'; + break; + } + /* FALLTHRU */ + case '>': + if (brackets) { + *target++ = '&'; + *target++ = 'g'; + *target++ = 't'; + *target++ = ';'; + break; + } + /* FALLTHRU */ + default: + *target++ = *source; + break; + } + } + *target = '\0'; + FREE(*in_out); + *in_out = result; + } + } +} + +/* + * Callers to LYEntifyTitle/LYEntifyValue do not look at the 'target' param. + * Optimize things a little by avoiding the memory allocation if not needed, + * as is usually the case. + */ +static BOOL MustEntify(const char *source) +{ + BOOL result; + +#ifdef CJK_EX + if (IS_CJK_TTY && StrChr(source, '\033') != 0) { + result = TRUE; + } else +#endif + { + size_t length = strlen(source); + size_t reject = strcspn(source, "<&>"); + + result = (BOOL) (length != reject); + } + + return result; +} + +/* + * Wrappers for LYEntify() which do not assume that the source was allocated, + * e.g., output from gettext(). 
+ */ +const char *LYEntifyTitle(char **target, const char *source) +{ + const char *result = 0; + + if (MustEntify(source)) { + StrAllocCopy(*target, source); + LYEntify(target, TRUE); + result = *target; + } else { + result = source; + } + return result; +} + +const char *LYEntifyValue(char **target, const char *source) +{ + const char *result = 0; + + if (MustEntify(source)) { + StrAllocCopy(*target, source); + LYEntify(target, FALSE); + result = *target; + } else { + result = source; + } + return result; +} + +/* + * This function trims characters <= that of a space (32), + * including HT_NON_BREAK_SPACE (1) and HT_EN_SPACE (2), + * but not ESC, from the heads of strings. - FM + */ +void LYTrimHead(char *str) +{ + const char *s = str; + + if (isEmpty(s)) + return; + + while (*s && WHITE(*s) && UCH(*s) != UCH(CH_ESC)) /* S/390 -- gil -- 1669 */ + s++; + if (s > str) { + char *ns = str; + + while (*s) { + *ns++ = *s++; + } + *ns = '\0'; + } +} + +/* + * This function trims characters <= that of a space (32), + * including HT_NON_BREAK_SPACE (1), HT_EN_SPACE (2), and + * ESC from the tails of strings. - FM + */ +void LYTrimTail(char *str) +{ + int i; + + if (isEmpty(str)) + return; + + i = (int) strlen(str) - 1; + while (i >= 0) { + if (WHITE(str[i])) + str[i] = '\0'; + else + break; + i--; + } +} + +/* + * This function should receive a pointer to the start + * of a comment. It returns a pointer to the end ('>') + * character of comment, or it's best guess if the comment + * is invalid. - FM + */ +char *LYFindEndOfComment(char *str) +{ + char *cp, *cp1; + enum comment_state { + start1, + start2, + end1, + end2 + } state; + + if (str == NULL) + /* + * We got NULL, so return NULL. - FM + */ + return NULL; + + if (StrNCmp(str, "<!--", 4)) + /* + * We don't have the start of a comment, so return the beginning of the + * string. - FM + */ + return str; + + cp = (str + 4); + if (*cp == '>') + /* + * It's an invalid comment, so + * return this end character. 
- FM + */ + return cp; + + if ((cp1 = StrChr(cp, '>')) == NULL) + /* + * We don't have an end character, so return the beginning of the + * string. - FM + */ + return str; + + if (*cp == '-') + /* + * Ugh, it's a "decorative" series of dashes, so return the next end + * character. - FM + */ + return cp1; + + /* + * OK, we're ready to start parsing. - FM + */ + state = start2; + while (*cp != '\0') { + switch (state) { + case start1: + if (*cp == '-') + state = start2; + else + /* + * Invalid comment, so return the first '>' from the start of + * the string. - FM + */ + return cp1; + break; + + case start2: + if (*cp == '-') + state = end1; + break; + + case end1: + if (*cp == '-') + state = end2; + else + /* + * Invalid comment, so return the first '>' from the start of + * the string. - FM + */ + return cp1; + break; + + case end2: + if (*cp == '>') + /* + * Valid comment, so return the end character. - FM + */ + return cp; + if (*cp == '-') { + state = start1; + } else if (!(WHITE(*cp) && UCH(*cp) != UCH(CH_ESC))) { /* S/390 -- gil -- 1686 */ + /* + * Invalid comment, so return the first '>' from the start of + * the string. - FM + */ + return cp1; + } + break; + + default: + break; + } + cp++; + } + + /* + * Invalid comment, so return the first '>' from the start of the string. + * - FM + */ + return cp1; +} + +/* + * If an HREF, itself or if resolved against a base, + * represents a file URL, and the host is defaulted, + * force in "//localhost". 
We need this until + * all the other Lynx code which performs security + * checks based on the "localhost" string is changed + * to assume "//localhost" when a host field is not + * present in file URLs - FM + */ +void LYFillLocalFileURL(char **href, + const char *base) +{ + char *temp = NULL; + + if (isEmpty(*href)) + return; + + if (!strcmp(*href, "//") || !StrNCmp(*href, "///", 3)) { + if (base != NULL && isFILE_URL(base)) { + StrAllocCopy(temp, STR_FILE_URL); + StrAllocCat(temp, *href); + StrAllocCopy(*href, temp); + } + } + if (isFILE_URL(*href)) { + if (*(*href + 5) == '\0') { + StrAllocCat(*href, "//localhost"); + } else if (!strcmp(*href, "file://")) { + StrAllocCat(*href, "localhost"); + } else if (!StrNCmp(*href, "file:///", 8)) { + StrAllocCopy(temp, (*href + 7)); + LYLocalFileToURL(href, temp); + } else if (!StrNCmp(*href, "file:/", 6) && !LYIsHtmlSep(*(*href + 6))) { + StrAllocCopy(temp, (*href + 5)); + LYLocalFileToURL(href, temp); + } + } +#if defined(USE_DOS_DRIVES) + if (LYIsDosDrive(*href)) { + /* + * If it's a local DOS path beginning with drive letter, + * add file://localhost/ prefix and go ahead. + */ + StrAllocCopy(temp, *href); + LYLocalFileToURL(href, temp); + } + + /* use below: strlen("file://localhost/") = 17 */ + if (!StrNCmp(*href, "file://localhost/", 17) + && (strlen(*href) == 19) + && LYIsDosDrive(*href + 17)) { + /* + * Terminate DOS drive letter with a slash to surf root successfully. + * Here seems a proper place to do so. + */ + LYAddPathSep(href); + } +#endif /* USE_DOS_DRIVES */ + + /* + * No path in a file://localhost URL means a + * directory listing for the current default. 
- FM + */ + if (!strcmp(*href, "file://localhost")) { + const char *temp2; + +#ifdef VMS + temp2 = HTVMS_wwwName(LYGetEnv("PATH")); +#else + char curdir[LY_MAXPATH]; + + temp2 = wwwName(Current_Dir(curdir)); +#endif /* VMS */ + if (!LYIsHtmlSep(*temp2)) + LYAddHtmlSep(href); + /* + * Check for pathological cases - current dir has chars which MUST BE + * URL-escaped - kw + */ + if (StrChr(temp2, '%') != NULL || StrChr(temp2, '#') != NULL) { + FREE(temp); + temp = HTEscape(temp2, URL_PATH); + StrAllocCat(*href, temp); + } else { + StrAllocCat(*href, temp2); + } + } +#ifdef VMS + /* + * On VMS, a file://localhost/ URL means + * a listing for the login directory. - FM + */ + if (!strcmp(*href, "file://localhost/")) + StrAllocCat(*href, (HTVMS_wwwName(Home_Dir()) + 1)); +#endif /* VMS */ + + FREE(temp); + return; +} + +void LYAddMETAcharsetToStream(HTStream *target, int disp_chndl) +{ + char *buf = 0; + + if (disp_chndl == -1) + /* + * -1 means use current_char_set. + */ + disp_chndl = current_char_set; + + if (target != 0 && disp_chndl >= 0) { + HTSprintf0(&buf, "<META %s content=\"" STR_HTML ";charset=%s\">\n", + "http-equiv=\"content-type\"", + LYCharSet_UC[disp_chndl].MIMEname); + (*target->isa->put_string) (target, buf); + FREE(buf); + } +} + +/* + * This function writes a line with a META tag to an open file, + * which will specify a charset parameter to use when the file is + * read back in. It is meant for temporary HTML files used by the + * various special pages which may show titles of documents. When those + * files are created, the title strings normally have been translated and + * expanded to the display character set, so we have to make sure they + * don't get translated again. 
+ * If the user has changed the display character set during the lifetime + * of the Lynx session (or, more exactly, during the time the title + * strings to be written were generated), they may now have different + * character encodings and there is currently no way to get it all right. + * To change this, we would have to add a variable for each string which + * keeps track of its character encoding. + * But at least we can try to ensure that reading the file after future + * display character set changes will give reasonable output. + * + * The META tag is not written if the display character set (passed as + * disp_chndl) already corresponds to the charset assumption that + * would be made when the file is read. - KW + * + * Currently this function is used for temporary files like "Lynx Info Page" + * and for one permanent - bookmarks (so it may be a problem if you change + * the display charset later: new bookmark entries may be mistranslated). + * - LP + */ +void LYAddMETAcharsetToFD(FILE *fd, int disp_chndl) +{ + if (disp_chndl == -1) + /* + * -1 means use current_char_set. + */ + disp_chndl = current_char_set; + + if (fd == NULL || disp_chndl < 0) + /* + * Should not happen. + */ + return; + + if (UCLYhndl_HTFile_for_unspec == disp_chndl) + /* + * Not need to do, so we don't. + */ + return; + + if (LYCharSet_UC[disp_chndl].enc == UCT_ENC_7BIT) + /* + * There shouldn't be any 8-bit characters in this case. + */ + return; + + /* + * In other cases we don't know because UCLYhndl_for_unspec may change + * during the lifetime of the file (by toggling raw mode or changing the + * display character set), so proceed. + */ + fprintf(fd, "<META %s content=\"" STR_HTML ";charset=%s\">\n", + "http-equiv=\"content-type\"", + LYCharSet_UC[disp_chndl].MIMEname); +} + +/* + * This function returns OL TYPE="A" strings in + * the range of " A." (1) to "ZZZ." (18278). 
/*
 * Ordered-list labels for OL TYPE="A", "a", "I" and "i". - FM
 *
 * The four public functions below each return a pointer to their own
 * static buffer, so a result is overwritten by the next call to the same
 * function.  A label is the letters followed by '.'; a label made of a
 * single letter is preceded by one space.
 */
enum {
    OL_ALPHA_LAST = 18278,	/* sequence number of "ZZZ." / "zzz." */
    OL_ROMAN_LAST = 3000	/* sequence number of "MMM." / "mmm." */
};

/*
 * Write 'count' letters from 'glyphs' plus the trailing '.' into 'out'.
 * 'out' must have room for count + 3 bytes.
 */
static char *ol_format_label(char *out, const char *glyphs, size_t count)
{
    size_t pos = 0;
    size_t i;

    if (count == 1)
	out[pos++] = ' ';
    for (i = 0; i < count; i++)
	out[pos++] = glyphs[i];
    out[pos++] = '.';
    out[pos] = '\0';
    return out;
}

/*
 * Alphabetic label: 1 is 'first_letter', 26 the 26th letter, 27 doubles
 * up ("AA"), and so on (bijective base 26).  Values below 1 are treated
 * as 1 and values above OL_ALPHA_LAST as OL_ALPHA_LAST, which keeps the
 * label at three letters or fewer.  The letters are taken to be
 * contiguous in the execution character set.
 */
static char *ol_alpha_label(char *out, int seqnum, char first_letter)
{
    char reversed[3];
    char glyphs[3];
    size_t count = 0;
    size_t i;
    int n = seqnum;

    if (n < 1)
	n = 1;
    else if (n > OL_ALPHA_LAST)
	n = OL_ALPHA_LAST;

    /* least significant letter first */
    while (n > 0) {
	n--;
	reversed[count++] = (char) (first_letter + (n % 26));
	n /= 26;
    }
    for (i = 0; i < count; i++)
	glyphs[i] = reversed[count - 1 - i];

    return ol_format_label(out, glyphs, count);
}

static const char *const ol_roman_upper[] =
{
    "M", "CM", "D", "CD", "C", "XC", "L", "XL", "X", "IX", "V", "IV", "I"
};

static const char *const ol_roman_lower[] =
{
    "m", "cm", "d", "cd", "c", "xc", "l", "xl", "x", "ix", "v", "iv", "i"
};

/*
 * Roman-numeral label built from 'symbols' (one of the two tables above).
 * Values above OL_ROMAN_LAST are treated as OL_ROMAN_LAST.  Zero and
 * negative values have no numeral, so the label is just ".".
 * The longest result is "MMDCCCLXXXVIII." (2888): 15 characters.
 */
static char *ol_roman_label(char *out, int seqnum, const char *const *symbols)
{
    static const int values[] =
    {
	1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1
    };
    char glyphs[16];
    size_t count = 0;
    size_t i;
    int remaining = (seqnum > OL_ROMAN_LAST) ? OL_ROMAN_LAST : seqnum;

    for (i = 0; i < sizeof(values) / sizeof(values[0]); i++) {
	while (remaining >= values[i]) {
	    const char *sym;

	    for (sym = symbols[i]; *sym != '\0'; sym++)
		glyphs[count++] = *sym;
	    remaining -= values[i];
	}
    }

    return ol_format_label(out, glyphs, count);
}

/*
 * This function returns OL TYPE="A" strings in
 * the range of " A." (1) to "ZZZ." (18278). - FM
 */
char *LYUppercaseA_OL_String(int seqnum)
{
    static char OLstring[8];

    return ol_alpha_label(OLstring, seqnum, 'A');
}

/*
 * This function returns OL TYPE="a" strings in
 * the range of " a." (1) to "zzz." (18278). - FM
 */
char *LYLowercaseA_OL_String(int seqnum)
{
    static char OLstring[8];

    return ol_alpha_label(OLstring, seqnum, 'a');
}

/*
 * This function returns OL TYPE="I" strings in the
 * range of " I." (1) to "MMM." (3000).- FM
 * Maximum length: 16 -TD
 */
char *LYUppercaseI_OL_String(int seqnum)
{
    static char OLstring[20];

    return ol_roman_label(OLstring, seqnum, ol_roman_upper);
}

/*
 * This function returns OL TYPE="i" strings in
 * range of " i." (1) to "mmm." (3000).- FM
 * Maximum length: 16 -TD
 */
char *LYLowercaseI_OL_String(int seqnum)
{
    static char OLstring[20];

    return ol_roman_label(OLstring, seqnum, ol_roman_lower);
}

/*
 * This function initializes the Ordered List counter.
 */
- FM + */ +void LYZero_OL_Counter(HTStructured * me) +{ + int i; + + if (!me) + return; + + for (i = 0; i < 12; i++) { + me->OL_Counter[i] = OL_VOID; + me->OL_Type[i] = '1'; + } + + me->Last_OL_Count = 0; + me->Last_OL_Type = '1'; + + return; +} + +/* + * This function is used by the HTML Structured object. - KW + */ +void LYGetChartransInfo(HTStructured * me) +{ + me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor, + UCT_STAGE_STRUCTURED); + if (me->UCLYhndl < 0) { + int chndl = HTAnchor_getUCLYhndl(me->node_anchor, UCT_STAGE_HTEXT); + + if (chndl < 0) { + chndl = current_char_set; + HTAnchor_setUCInfoStage(me->node_anchor, chndl, + UCT_STAGE_HTEXT, + UCT_SETBY_STRUCTURED); + } + HTAnchor_setUCInfoStage(me->node_anchor, chndl, + UCT_STAGE_STRUCTURED, + UCT_SETBY_STRUCTURED); + me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor, + UCT_STAGE_STRUCTURED); + } + me->UCI = HTAnchor_getUCInfoStage(me->node_anchor, + UCT_STAGE_STRUCTURED); +} + + /* as in HTParse.c, saves some calls - kw */ +static const char *hex = "0123456789ABCDEF"; + +/* + * Any raw 8-bit or multibyte characters already have been + * handled in relation to the display character set + * in SGML_character(), including named and numeric entities. + * + * This function used for translations HTML special fields inside tags + * (ALT=, VALUE=, etc.) from charset `cs_from' to charset `cs_to'. + * It also unescapes non-ASCII characters from URL (#fragments !) + * if st_URL is active. + * + * If `do_ent' is YES, it converts named entities + * and numeric character references (NCRs) to their `cs_to' replacements. + * + * Named entities converted to unicodes. NCRs (unicodes) converted + * by UCdomap.c chartrans functions. + * ???NCRs with values in the ISO-8859-1 range 160-255 may be converted + * to their HTML entity names (via old-style entities) and then translated + * according to the LYCharSets.c array for `cs_out'???. 
+ * + * Some characters (see descriptions in `put_special_unicodes' from SGML.c) + * translated in relation with the state of boolean variables + * `use_lynx_specials', `plain_space' and `hidden'. It is not clear yet: + * + * If plain_space is TRUE, nbsp (160) will be treated as an ASCII + * space (32). If hidden is TRUE, entities will be translated + * (if `do_ent' is YES) but escape sequences will be passed unaltered. + * If `hidden' is FALSE, some characters are converted to Lynx special + * codes (see `put_special_unicodes') or ASCII space if `plain_space' + * applies). @@ is `use_lynx_specials' needed, does it have any effect? @@ + * If `use_lynx_specials' is YES, translate byte values 160 and 173 + * meaning U+00A0 and U+00AD given as or converted from raw char input + * are converted to HT_NON_BREAK_SPACE and LY_SOFT_HYPHEN, respectively + * (unless input and output charset are both iso-8859-1, for compatibility + * with previous usage in HTML.c) even if `hidden' or `plain_space' is set. + * + * If `Back' is YES, the reverse is done instead i.e., Lynx special codes + * in the input are translated back to character values. + * + * If `Back' is YES, an attempt is made to use UCReverseTransChar() for + * back translation which may be more efficient. (?) + * + * If `stype' is st_URL, non-ASCII characters are URL-encoded instead. + * The sequence of bytes being URL-encoded is the raw input character if + * we couldn't translate it from `cs_in' (CJK etc.); otherwise it is the + * UTF-8 representation if either `cs_to' requires this or if the + * character's Unicode value is > 255, otherwise it should be the iso-8859-1 + * representation. + * No general URL-encoding occurs for displayable ASCII characters and + * spaces and some C0 controls valid in HTML (LF, TAB), it is expected + * that other functions will take care of that as appropriate. 
+ * + * Escape characters (0x1B, '\033') are + * - URL-encoded if `stype' is st_URL, otherwise + * - dropped if `stype' is st_other, otherwise (i.e., st_HTML) + * - passed if `hidden' is TRUE or HTCJK is set, otherwise + * - dropped. + * + * (If `stype' is st_URL or st_other most of the parameters really predefined: + * cs_from=cs_to, use_lynx_specials=plain_space=NO, and hidden=YES) + * + * + * Returns pointer to the char** passed in + * if string translated or translation unnecessary, + * NULL otherwise + * (in which case something probably went wrong.) + * + * + * In general, this somehow ugly function (KW) + * cover three functions from v.2.7.2 (FM): + * extern void LYExpandString ( + * HTStructured * me, + * char ** str); + * extern void LYUnEscapeEntities ( + * HTStructured * me, + * char ** str); + * extern void LYUnEscapeToLatinOne ( + * HTStructured * me, + * char ** str, + * BOOLEAN isURL); + */ + +char **LYUCFullyTranslateString(char **str, + int cs_from, + int cs_to, + int do_ent, + int use_lynx_specials, + int plain_space, + int hidden, + int Back, + CharUtil_st stype) +{ + char *p; + char *q, *qs; + HTChunk *chunk = NULL; + char *cp = 0; + char cpe = 0; + char *esc = NULL; + char replace_buf[64]; + int uck; + int lowest_8; + UCode_t code = 0; + BOOL output_utf8 = 0, repl_translated_C0 = 0; + size_t len; + const char *name = NULL; + BOOLEAN no_bytetrans; + UCTransParams T; + BOOL from_is_utf8 = FALSE; + char *puni = 0; + enum _state { + S_text, + S_esc, + S_dollar, + S_paren, + S_nonascii_text, + S_dollar_paren, + S_trans_byte, + S_check_ent, + S_ncr, + S_check_uni, + S_named, + S_check_name, + S_recover, + S_got_oututf8, + S_got_outstring, + S_put_urlstring, + S_got_outchar, + S_put_urlchar, + S_next_char, + S_done + } state = S_text; + enum _parsing_what { + P_text, + P_utf8, + P_hex, + P_decimal, + P_named + } what = P_text; + +#ifdef KANJI_CODE_OVERRIDE + static unsigned char sjis_1st = '\0'; + + unsigned char sjis_str[3]; +#endif + + /* + * Make 
sure we have a non-empty string. - FM + */ + if (isEmpty(*str)) + return str; + + if (cs_from < 0 || cs_to < 0) { + CTRACE((tfp, "BUG: LYUCFullyTranslateString from=%d, to=%d\n", + cs_from, cs_to)); + return str; + } + + /* + * FIXME: something's wrong with the limit checks here (clearing the + * buffer helps). + */ + memset(replace_buf, 0, sizeof(replace_buf)); + + /* + * Don't do byte translation if original AND target character sets are both + * iso-8859-1 (and we are not called to back-translate), or if we are in + * CJK mode. + */ + if (IS_CJK_TTY +#ifdef USE_JAPANESEUTF8_SUPPORT + && (strcmp(LYCharSet_UC[cs_from].MIMEname, "utf-8") != 0) + && (strcmp(LYCharSet_UC[cs_to].MIMEname, "utf-8") != 0) +#endif + ) { + no_bytetrans = TRUE; + } else if (cs_to <= 0 && cs_from == cs_to && (!Back || cs_to < 0)) { + no_bytetrans = TRUE; + } else { + /* No need to translate or examine the string any further */ + no_bytetrans = (BOOL) (!use_lynx_specials && !Back && + UCNeedNotTranslate(cs_from, cs_to)); + } + /* + * Save malloc/calloc overhead in simple case - kw + */ + if (do_ent && hidden && (stype != st_URL) && (StrChr(*str, '&') == NULL)) + do_ent = FALSE; + + /* Can't do, caller should figure out what to do... 
*/ + if (!UCCanTranslateFromTo(cs_from, cs_to)) { + if (cs_to < 0) + return NULL; + if (!do_ent && no_bytetrans) + return NULL; + no_bytetrans = TRUE; + } else if (cs_to < 0) { + do_ent = FALSE; + } + + if (!do_ent && no_bytetrans) + return str; + p = *str; + + if (!no_bytetrans) { + UCTransParams_clear(&T); + UCSetTransParams(&T, cs_from, &LYCharSet_UC[cs_from], + cs_to, &LYCharSet_UC[cs_to]); + from_is_utf8 = (BOOL) (LYCharSet_UC[cs_from].enc == UCT_ENC_UTF8); + output_utf8 = T.output_utf8; + repl_translated_C0 = T.repl_translated_C0; + puni = p; + } else if (do_ent) { + output_utf8 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8 || + HText_hasUTF8OutputSet(HTMainText)); + repl_translated_C0 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_8BIT_C0); + } + + lowest_8 = LYlowest_eightbit[cs_to]; + + /* + * Create a buffer string seven times the length of the original, so we + * have plenty of room for expansions. - FM + */ + len = strlen(p) + 16; + q = p; + + qs = q; + +/* Create the HTChunk only if we need it */ +#define CHUNK (chunk ? chunk : (chunk = HTChunkCreate2(128, len+1))) + +#define REPLACE_STRING(s) \ + if (q != qs) HTChunkPutb(CHUNK, qs, (int) (q - qs)); \ + HTChunkPuts(CHUNK, s); \ + qs = q = *str + +#define REPLACE_CHAR(c) if (q > p) { \ + HTChunkPutb(CHUNK, qs, (int) (q - qs)); \ + qs = q = *str; \ + *q++ = c; \ + } else \ + *q++ = c + + /* + * Loop through string, making conversions as needed. + * + * The while() checks for a non-'\0' char only for the normal text states + * since other states may temporarily modify p or *p (which should be + * restored before S_done!) 
- kw + */ + while (*p || (state != S_text && state != S_nonascii_text)) { + switch (state) { + case S_text: + code = UCH(*p); +#ifdef KANJI_CODE_OVERRIDE + if (HTCJK == JAPANESE && last_kcode == SJIS) { + if (sjis_1st == '\0' && (IS_SJIS_HI1(code) || IS_SJIS_HI2(code))) { + sjis_1st = UCH(code); + } else if (sjis_1st && IS_SJIS_LO(code)) { + sjis_1st = '\0'; + } else { + if (conv_jisx0201kana && 0xA1 <= code && code <= 0xDF) { + sjis_str[2] = '\0'; + JISx0201TO0208_SJIS(UCH(code), + sjis_str, sjis_str + 1); + REPLACE_STRING(sjis_str); + p++; + continue; + } + } + } +#endif + if (*p == '\033') { + if ((IS_CJK_TTY && !hidden) || stype != st_HTML) { + state = S_esc; + if (stype == st_URL) { + REPLACE_STRING("%1B"); + p++; + continue; + } else if (stype != st_HTML) { + p++; + continue; + } else { + *q++ = *p++; + continue; + } + } else if (!hidden) { + /* + * CJK handling not on, and not a hidden INPUT, so block + * escape. - FM + */ + state = S_next_char; + } else { + state = S_trans_byte; + } + } else { + state = (do_ent ? 
S_check_ent : S_trans_byte); + } + break; + + case S_esc: + if (*p == '$') { + state = S_dollar; + *q++ = *p++; + continue; + } else if (*p == '(') { + state = S_paren; + *q++ = *p++; + continue; + } else { + state = S_text; + } + break; + + case S_dollar: + if (*p == '@' || *p == 'B' || *p == 'A') { + state = S_nonascii_text; + *q++ = *p++; + continue; + } else if (*p == '(') { + state = S_dollar_paren; + *q++ = *p++; + continue; + } else { + state = S_text; + } + break; + + case S_dollar_paren: + if (*p == 'C') { + state = S_nonascii_text; + *q++ = *p++; + continue; + } else { + state = S_text; + } + break; + + case S_paren: + if (*p == 'B' || *p == 'J' || *p == 'T') { + state = S_text; + *q++ = *p++; + continue; + } else if (*p == 'I') { + state = S_nonascii_text; + *q++ = *p++; + continue; + } else { + state = S_text; + } + break; + + case S_nonascii_text: + if (*p == '\033') { + if ((IS_CJK_TTY && !hidden) || stype != st_HTML) { + state = S_esc; + if (stype == st_URL) { + REPLACE_STRING("%1B"); + p++; + continue; + } else if (stype != st_HTML) { + p++; + continue; + } + } + } + *q++ = *p++; + continue; + + case S_trans_byte: + /* character translation goes here */ + /* + * Don't do anything if we have no string, or if original AND + * target character sets are both iso-8859-1, or if we are in CJK + * mode. 
+ */ + if (*p == '\0' || no_bytetrans) { + state = S_got_outchar; + break; + } + + if (Back) { + int rev_c; + + if ((*p) == HT_NON_BREAK_SPACE || + (*p) == HT_EN_SPACE) { + if (plain_space) { + code = *p = ' '; + state = S_got_outchar; + break; + } else { + code = 160; + if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 || + (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) { + state = S_got_outchar; + break; + } else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859 + || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) { + state = S_check_uni; + break; + } else { + *(unsigned char *) p = UCH(160); + } + } + } else if ((*p) == LY_SOFT_HYPHEN) { + code = 173; + if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 || + (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) { + state = S_got_outchar; + break; + } else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859 + || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) { + state = S_check_uni; + break; + } else { + *(unsigned char *) p = UCH(173); + } +#ifdef USE_JAPANESEUTF8_SUPPORT + } else if (output_utf8) { + if ((!strcmp(LYCharSet_UC[cs_from].MIMEname, "euc-jp") && + (IS_EUC((unsigned char) (*p), + (unsigned char) (*(p + 1))))) || + (!strcmp(LYCharSet_UC[cs_from].MIMEname, "shift_jis") && + (IS_SJIS_2BYTE((unsigned char) (*p), + (unsigned char) (*(p + 1)))))) { + code = UCTransJPToUni(p, 2, cs_from); + p++; + state = S_check_uni; + break; + } +#endif + } else if (code < 127 || T.transp) { + state = S_got_outchar; + break; + } + rev_c = UCReverseTransChar(*p, cs_to, cs_from); + if (rev_c > 127) { + *p = (char) rev_c; + code = rev_c; + state = S_got_outchar; + break; + } + } else if (code < 127) { + state = S_got_outchar; + break; + } + + if (from_is_utf8) { + if (((*p) & 0xc0) == 0xc0) { + const char *pq = p; + + puni = p; + code = UCGetUniFromUtf8String(&pq); + if (code <= 0) { + code = UCH(*p); + } else { + what = P_utf8; + puni += (pq - (const char *) p); + } + } + } else if (use_lynx_specials && !Back && + (code == 160 || 
code == 173) && + (LYCharSet_UC[cs_from].enc == UCT_ENC_8859 || + (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) { + if (code == 160) + code = *p = HT_NON_BREAK_SPACE; + else if (code == 173) + code = *p = LY_SOFT_HYPHEN; + state = S_got_outchar; + break; + } else if (T.trans_to_uni) { + code = UCTransToUni(*p, cs_from); + if (code <= 0) { + /* What else can we do? */ + code = UCH(*p); + } + } else if (!T.trans_from_uni) { + state = S_got_outchar; + break; + } + /* + * Substitute Lynx special character for 160 (nbsp) if + * use_lynx_specials is set. + */ + if (use_lynx_specials && !Back && + (code == 160 || code == 173)) { + code = ((code == 160 ? HT_NON_BREAK_SPACE : LY_SOFT_HYPHEN)); + state = S_got_outchar; + break; + } + + state = S_check_uni; + break; + + case S_check_ent: + if (*p == '&') { + char *pp = p + 1; + + len = strlen(pp); + /* + * Check for a numeric entity. - FM + */ + if (*pp == '#' && len > 2 && + (*(pp + 1) == 'x' || *(pp + 1) == 'X') && + UCH(*(pp + 2)) < 127 && + isxdigit(UCH(*(pp + 2)))) { + what = P_hex; + state = S_ncr; + } else if (*pp == '#' && len > 2 && + UCH(*(pp + 1)) < 127 && + isdigit(UCH(*(pp + 1)))) { + what = P_decimal; + state = S_ncr; + } else if (UCH(*pp) < 127 && + isalpha(UCH(*pp))) { + what = P_named; + state = S_named; + } else { + state = S_trans_byte; + } + } else { + state = S_trans_byte; + } + break; + + case S_ncr: + if (what == P_hex) { + p += 3; + } else { /* P_decimal */ + p += 2; + } + cp = p; + while (*p && UCH(*p) < 127 && + (what == P_hex ? isxdigit(UCH(*p)) : + isdigit(UCH(*p)))) { + p++; + } + /* + * Save the terminator and isolate the digit(s). - FM + */ + cpe = *p; + if (*p) + *p++ = '\0'; + /* + * Show the numeric entity if the value: + * (1) Is greater than 255 and unhandled Unicode. + * (2) Is less than 32, and not valid and we don't have HTCJK set. + * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set. + * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set. 
+ */ + if (UCScanCode(&code, cp, (BOOL) (what == P_hex))) { + code = LYcp1252ToUnicode(code); + state = S_check_uni; + } else { + state = S_recover; + break; + } + break; + + case S_check_uni: + /* + * Show the numeric entity if the value: + * (2) Is less than 32, and not valid and we don't have HTCJK set. + * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set. + * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set. + */ + if ((code < 32 && + code != 9 && code != 10 && code != 13 && + !IS_CJK_TTY) || + (code == 127 && + !(HTPassHighCtrlRaw || IS_CJK_TTY)) || + (code > 127 && code < 160 && + !HTPassHighCtrlNum)) { + state = S_recover; + break; + } + /* + * Convert the value as an unsigned char, hex escaped if isURL is + * set and it's 8-bit, and then recycle the terminator if it is not + * a semicolon. - FM + */ + if (code > 159 && stype == st_URL) { + state = S_got_oututf8; + break; + } + /* + * For 160 (nbsp), use that value if it's a hidden INPUT, otherwise + * use an ASCII space (32) if plain_space is TRUE, otherwise use + * the Lynx special character. - FM + */ + if (code == 160) { + if (plain_space) { + code = ' '; + state = S_got_outchar; + break; + } else if (use_lynx_specials) { + code = HT_NON_BREAK_SPACE; + state = S_got_outchar; + break; + } else if ((hidden && !Back) + || (LYCharSet_UC[cs_to].codepoints & UCT_CP_SUPERSETOF_LAT1) + || LYCharSet_UC[cs_to].enc == UCT_ENC_8859 + || (LYCharSet_UC[cs_to].like8859 & + UCT_R_8859SPECL)) { + state = S_got_outchar; + break; + } else if ( + (LYCharSet_UC[cs_to].repertoire & UCT_REP_SUPERSETOF_LAT1)) { + ; /* nothing, may be translated later */ + } else { + code = ' '; + state = S_got_outchar; + break; + } + } + /* + * For 173 (shy), use that value if it's a hidden INPUT, otherwise + * ignore it if plain_space is TRUE, otherwise use the Lynx special + * character. 
- FM + */ + if (code == 173) { + if (plain_space) { + replace_buf[0] = '\0'; + state = S_got_outstring; + break; + } else if (Back && + !(LYCharSet_UC[cs_to].enc == UCT_ENC_8859 || + (LYCharSet_UC[cs_to].like8859 & + UCT_R_8859SPECL))) { + ; /* nothing, may be translated later */ + } else if (hidden || Back) { + state = S_got_outchar; + break; + } else if (use_lynx_specials) { + code = LY_SOFT_HYPHEN; + state = S_got_outchar; + break; + } + } + /* + * Seek a translation from the chartrans tables. + */ + if ((uck = UCTransUniChar(code, + cs_to)) >= 32 && + uck < 256 && + (uck < 127 || uck >= lowest_8)) { + code = uck; + state = S_got_outchar; + break; + } else if ((uck == -4 || + (repl_translated_C0 && + uck > 0 && uck < 32)) && + /* + * Not found; look for replacement string. + */ + UCTransUniCharStr(replace_buf, + 60, code, + cs_to, + 0) >= 0) { + state = S_got_outstring; + break; + } + if (output_utf8 && + code > 127 && code < 0x7fffffffL) { + state = S_got_oututf8; + break; + } + /* + * For 8194 (ensp), 8195 (emsp), or 8201 (thinsp), use the + * character reference if it's a hidden INPUT, otherwise use an + * ASCII space (32) if plain_space is TRUE, otherwise use the Lynx + * special character. - FM + */ + if (code == 8194 || code == 8195 || code == 8201) { + if (hidden) { + state = S_recover; + } else if (plain_space) { + code = ' '; + state = S_got_outchar; + } else { + code = HT_EN_SPACE; + state = S_got_outchar; + } + break; + /* + * Ignore 8204 (zwnj), 8205 (zwj) 8206 (lrm), and 8207 (rlm), + * for now, if we got this far without finding a representation + * for them. + */ + } else if (code == 8204 || code == 8205 || + code == 8206 || code == 8207) { + CTRACE((tfp, "LYUCFullyTranslateString: Ignoring '%" + PRI_UCode_t "'.\n", CAST_UCode_t (code))); + replace_buf[0] = '\0'; + state = S_got_outstring; + break; + /* + * Show the numeric entity if the value: (1) Is greater than + * 255 and unhandled Unicode. 
+ */ + } else if (code > 255) { + /* + * Illegal or not yet handled value. Return "&#" verbatim and + * continue from there. - FM + */ + state = S_recover; + break; + /* + * If it's ASCII, or is 8-bit but HTPassEightBitNum is set or + * the character set is "ISO Latin 1", use it's value. - FM + */ + } else if (code < 161 || + (code < 256 && + (HTPassEightBitNum || cs_to == LATIN1))) { + /* + * No conversion needed. + */ + state = S_got_outchar; + break; + + /* The following disabled section doesn't make sense any more. + * It used to make sense in the past, when S_check_named would + * look in "old style" tables in addition to what it does now. + * Disabling of going to S_check_name here prevents endless + * looping between S_check_uni and S_check_names states, which + * could occur here for Latin 1 codes for some cs_to if they + * had no translation in that cs_to. Normally all cs_to + * *should* now have valid translations via UCTransUniChar or + * UCTransUniCharStr for all Latin 1 codes, so that we would + * not get here anyway, and no loop could occur. Still, if we + * *do* get here, FALL THROUGH to case S_recover now. - kw + */ +#if 0 + /* + * If we get to here, convert and handle the character as a + * named entity. - FM + */ + } else { + name = HTMLGetEntityName(code - 160); + state = S_check_name; + break; +#endif + } + /* FALLTHRU */ + + case S_recover: + if (what == P_decimal || what == P_hex) { + /* + * Illegal or not yet handled value. Return "&#" verbatim and + * continue from there. 
- FM + */ + *q++ = '&'; + *q++ = '#'; + if (what == P_hex) + *q++ = 'x'; + if (cpe != '\0') + *(p - 1) = cpe; + p = cp; + state = S_done; + } else if (what == P_named) { + *cp = cpe; + *q++ = '&'; + state = S_done; + } else if (!T.output_utf8 && stype == st_HTML && !hidden && + !(HTPassEightBitRaw && + UCH(*p) >= lowest_8)) { + sprintf(replace_buf, "U%.2" PRI_UCode_t "", CAST_UCode_t (code)); + + state = S_got_outstring; + } else { + puni = p; + code = UCH(*p); + state = S_got_outchar; + } + break; + + case S_named: + cp = ++p; + while (*cp && UCH(*cp) < 127 && + isalnum(UCH(*cp))) + cp++; + cpe = *cp; + *cp = '\0'; + name = p; + state = S_check_name; + break; + + case S_check_name: + /* + * Seek the Unicode value for the named entity. + * + * !!!! We manually recover the case of '=' terminator which is + * commonly found on query to CGI-scripts enclosed as href= URLs + * like "somepath/?x=1&yz=2" Without this dirty fix, submission of + * such URLs was broken if &yz string happened to be a recognized + * entity name. - LP + */ + if (((code = HTMLGetEntityUCValue(name)) > 0) && + !((cpe == '=') && (stype == st_URL))) { + state = S_check_uni; + break; + } + /* + * Didn't find the entity. Return verbatim. + */ + state = S_recover; + break; + + /* * * O U T P U T S T A T E S * * */ + + case S_got_oututf8: + if (code > 255 || + (code >= 128 && LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8)) { + UCConvertUniToUtf8(code, replace_buf); + state = S_got_outstring; + } else { + state = S_got_outchar; + } + break; + case S_got_outstring: + if (what == P_decimal || what == P_hex) { + if (cpe != ';' && cpe != '\0') + *(--p) = cpe; + p--; + } else if (what == P_named) { + *cp = cpe; + p = (*cp != ';') ? 
(cp - 1) : cp; + } else if (what == P_utf8) { + p = puni; + } + if (replace_buf[0] == '\0') { + state = S_next_char; + break; + } + if (stype == st_URL) { + code = replace_buf[0]; /* assume string OK if first char is */ + if (code >= 127 || + (code < 32 && (code != 9 && code != 10 && code != 0))) { + state = S_put_urlstring; + break; + } + } + REPLACE_STRING(replace_buf); + state = S_next_char; + break; + case S_put_urlstring: + esc = HTEscape(replace_buf, URL_XALPHAS); + REPLACE_STRING(esc); + FREE(esc); + state = S_next_char; + break; + case S_got_outchar: + if (what == P_decimal || what == P_hex) { + if (cpe != ';' && cpe != '\0') + *(--p) = cpe; + p--; + } else if (what == P_named) { + *cp = cpe; + p = (*cp != ';') ? (cp - 1) : cp; + } else if (what == P_utf8) { + p = puni; + } + if (stype == st_URL && + /* Not a full HTEscape, only for 8bit and ctrl chars */ + (TOASCII(code) >= 127 || /* S/390 -- gil -- 1925 */ + (code < ' ' && (code != '\t' && code != '\n')))) { + state = S_put_urlchar; + break; + } else if (!hidden && code == 10 && *p == 10 + && q != qs && *(q - 1) == 13) { + /* + * If this is not a hidden string, and the current char is the + * LF ('\n') of a CRLF pair, drop the CR ('\r'). 
- KW + */ + *(q - 1) = *p++; + state = S_done; + break; + } + *q++ = (char) code; + state = S_next_char; + break; + case S_put_urlchar: + *q++ = '%'; + REPLACE_CHAR(hex[(TOASCII(code) >> 4) & 15]); /* S/390 -- gil -- 1944 */ + REPLACE_CHAR(hex[(TOASCII(code) & 15)]); + /* fall through */ + case S_next_char: + p++; /* fall through */ + case S_done: + state = S_text; + what = P_text; + /* for next round */ + } + } + + *q = '\0'; + if (chunk) { + HTChunkPutb(CHUNK, qs, (int) (q - qs + 1)); /* also terminates */ + if (stype == st_URL || stype == st_other) { + LYTrimHead(chunk->data); + LYTrimTail(chunk->data); + } + StrAllocCopy(*str, chunk->data); + HTChunkFree(chunk); + } else { + if (stype == st_URL || stype == st_other) { + LYTrimHead(qs); + LYTrimTail(qs); + } + } + return str; +} + +#undef REPLACE_CHAR +#undef REPLACE_STRING + +BOOL LYUCTranslateHTMLString(char **str, + int cs_from, + int cs_to, + int use_lynx_specials, + int plain_space, + int hidden, + CharUtil_st stype) +{ + BOOL ret = YES; + + /* May reallocate *str even if cs_to == 0 */ + if (!LYUCFullyTranslateString(str, cs_from, cs_to, TRUE, + use_lynx_specials, plain_space, hidden, + NO, stype)) { + ret = NO; + } + return ret; +} + +BOOL LYUCTranslateBackFormData(char **str, + int cs_from, + int cs_to, + int plain_space) +{ + char **ret; + + /* May reallocate *str */ + ret = (LYUCFullyTranslateString(str, cs_from, cs_to, FALSE, + NO, plain_space, YES, + YES, st_HTML)); + return (BOOL) (ret != NULL); +} + +/* + * Parse a parameter from an HTML META tag, i.e., the CONTENT. 
+ */ +char *LYParseTagParam(char *from, + const char *name) +{ + size_t len = strlen(name); + char *result = NULL; + char *string = from; + + do { + if ((string = StrChr(string, ';')) == NULL) + return NULL; + while (*string != '\0' && (*string == ';' || isspace(UCH(*string)))) { + string++; + } + if (strlen(string) < len) + return NULL; + } while (strncasecomp(string, name, (int) len) != 0); + string += len; + while (*string != '\0' && (isspace(UCH(*string)) || *string == '=')) { + string++; + } + + StrAllocCopy(result, string); + len = 0; + while (isprint(UCH(string[len])) && !isspace(UCH(string[len]))) { + len++; + } + result[len] = '\0'; + + /* + * Strip single quotes, just in case. + */ + if (len > 2 && result[0] == '\'' && result[len - 1] == result[0]) { + result[len - 1] = '\0'; + for (string = result; (string[0] = string[1]) != '\0'; ++string) ; + } + return result; +} + +/* + * Given a refresh-URL content string, parses the delay time and the URL + * string. Ignore the remainder of the content. + */ +void LYParseRefreshURL(char *content, + char **p_seconds, + char **p_address) +{ + char *cp; + char *cp1 = NULL; + char *Seconds = NULL; + + /* + * Look for the Seconds field. - FM + */ + cp = LYSkipBlanks(content); + if (*cp && isdigit(UCH(*cp))) { + cp1 = cp; + while (*cp1 && isdigit(UCH(*cp1))) + cp1++; + StrnAllocCopy(Seconds, cp, (size_t) (cp1 - cp)); + } + *p_seconds = Seconds; + *p_address = LYParseTagParam(content, "URL"); + + CTRACE((tfp, + "LYParseRefreshURL\n\tcontent: %s\n\tseconds: %s\n\taddress: %s\n", + content, NonNull(*p_seconds), NonNull(*p_address))); +} + +/* + * This function processes META tags in HTML streams. 
- FM + */ +void LYHandleMETA(HTStructured * me, const BOOL *present, + STRING2PTR value, + char **include GCC_UNUSED) +{ + char *http_equiv = NULL, *name = NULL, *content = NULL, *charset = NULL; + char *href = NULL, *id_string = NULL, *temp = NULL; + char *cp, *cp0, *cp1 = NULL; + int url_type = 0; + + if (!me || !present) + return; + + /* + * Load the attributes for possible use by Lynx. - FM + */ + if (present[HTML_META_HTTP_EQUIV] && + non_empty(value[HTML_META_HTTP_EQUIV])) { + StrAllocCopy(http_equiv, value[HTML_META_HTTP_EQUIV]); + convert_to_spaces(http_equiv, TRUE); + LYUCTranslateHTMLString(&http_equiv, me->tag_charset, me->tag_charset, + NO, NO, YES, st_other); + if (*http_equiv == '\0') { + FREE(http_equiv); + } + } + if (present[HTML_META_NAME] && + non_empty(value[HTML_META_NAME])) { + StrAllocCopy(name, value[HTML_META_NAME]); + convert_to_spaces(name, TRUE); + LYUCTranslateHTMLString(&name, me->tag_charset, me->tag_charset, + NO, NO, YES, st_other); + if (*name == '\0') { + FREE(name); + } + } + if (present[HTML_META_CONTENT] && + non_empty(value[HTML_META_CONTENT])) { + /* + * Technically, we should be creating a comma-separated list, but META + * tags come one at a time, and we'll handle (or ignore) them as each + * is received. Also, at this point, we only trim leading and trailing + * blanks from the CONTENT value, without translating any named + * entities or numeric character references, because how we should do + * that depends on what type of information it contains, and whether or + * not any of it might be sent to the screen. 
- FM + */ + StrAllocCopy(content, value[HTML_META_CONTENT]); + convert_to_spaces(content, FALSE); + LYTrimHead(content); + LYTrimTail(content); + if (*content == '\0') { + FREE(content); + } + } + if (present[HTML_META_CHARSET] && + non_empty(value[HTML_META_CHARSET])) { + StrAllocCopy(charset, value[HTML_META_CHARSET]); + convert_to_spaces(charset, TRUE); + LYUCTranslateHTMLString(&charset, me->tag_charset, me->tag_charset, + NO, NO, YES, st_other); + if (*charset == '\0') { + FREE(charset); + } + } + CTRACE((tfp, + "LYHandleMETA: HTTP-EQUIV=\"%s\" NAME=\"%s\" CONTENT=\"%s\" CHARSET=\"%s\"\n", + NONNULL(http_equiv), + NONNULL(name), + NONNULL(content), + NONNULL(charset))); + + /* + * Check for a text/html Content-Type with a charset directive, if we + * didn't already set the charset via a server's header. - AAC & FM + */ + if (isEmpty(me->node_anchor->charset) && + (charset || + (!strcasecomp(NonNull(http_equiv), "Content-Type") && content))) { + LYUCcharset *p_in = NULL; + LYUCcharset *p_out = NULL; + + if (charset) { + LYLowerCase(charset); + } else { + LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset, + NO, NO, YES, st_other); + LYLowerCase(content); + } + + if ((cp1 = charset) != NULL || + (cp1 = strstr(content, "charset")) != NULL) { + BOOL chartrans_ok = NO; + char *cp3 = NULL, *cp4; + int chndl; + + if (!charset) + cp1 += 7; + while (*cp1 == ' ' || *cp1 == '=' || *cp1 == '"') + cp1++; + + StrAllocCopy(cp3, cp1); /* copy to mutilate more */ + for (cp4 = cp3; (*cp4 != '\0' && *cp4 != '"' && + *cp4 != ';' && *cp4 != ':' && + !WHITE(*cp4)); cp4++) { + ; /* do nothing */ + } + *cp4 = '\0'; + cp4 = cp3; + chndl = UCGetLYhndl_byMIME(cp3); + +#ifdef CAN_SWITCH_DISPLAY_CHARSET + /* Allow a switch to a more suitable display charset */ + if (Switch_Display_Charset(chndl, SWITCH_DISPLAY_CHARSET_MAYBE)) { + /* UCT_STAGE_STRUCTURED and UCT_STAGE_HTEXT + should have the same setting for UCInfoStage. 
*/ + HTAnchor_getUCInfoStage(me->node_anchor, UCT_STAGE_STRUCTURED); + + me->outUCLYhndl = current_char_set; + HTAnchor_setUCInfoStage(me->node_anchor, + current_char_set, + UCT_STAGE_HTEXT, + UCT_SETBY_MIME); /* highest priority! */ + HTAnchor_setUCInfoStage(me->node_anchor, + current_char_set, + UCT_STAGE_STRUCTURED, + UCT_SETBY_MIME); /* highest priority! */ + me->outUCI = HTAnchor_getUCInfoStage(me->node_anchor, + UCT_STAGE_HTEXT); + /* The SGML stage will be reset in change_chartrans_handling */ + } +#endif + + if (UCCanTranslateFromTo(chndl, current_char_set)) { + chartrans_ok = YES; + StrAllocCopy(me->node_anchor->charset, cp4); + HTAnchor_setUCInfoStage(me->node_anchor, chndl, + UCT_STAGE_PARSER, + UCT_SETBY_STRUCTURED); + } else if (chndl < 0) { + /* + * Got something but we don't recognize it. + */ + chndl = UCLYhndl_for_unrec; + if (chndl < 0) /* UCLYhndl_for_unrec not defined :-( */ + chndl = UCLYhndl_for_unspec; /* always >= 0 */ + if (UCCanTranslateFromTo(chndl, current_char_set)) { + chartrans_ok = YES; + HTAnchor_setUCInfoStage(me->node_anchor, chndl, + UCT_STAGE_PARSER, + UCT_SETBY_STRUCTURED); + } + } + if (chartrans_ok) { + p_in = HTAnchor_getUCInfoStage(me->node_anchor, + UCT_STAGE_PARSER); + p_out = HTAnchor_setUCInfoStage(me->node_anchor, + current_char_set, + UCT_STAGE_HTEXT, + UCT_SETBY_DEFAULT); + if (!p_out) { + /* + * Try again. 
+ */ + p_out = HTAnchor_getUCInfoStage(me->node_anchor, + UCT_STAGE_HTEXT); + } + if (!strcmp(p_in->MIMEname, "x-transparent")) { + HTPassEightBitRaw = TRUE; + HTAnchor_setUCInfoStage(me->node_anchor, + HTAnchor_getUCLYhndl(me->node_anchor, + UCT_STAGE_HTEXT), + UCT_STAGE_PARSER, + UCT_SETBY_DEFAULT); + } + if (!strcmp(p_out->MIMEname, "x-transparent")) { + HTPassEightBitRaw = TRUE; + HTAnchor_setUCInfoStage(me->node_anchor, + HTAnchor_getUCLYhndl(me->node_anchor, + UCT_STAGE_PARSER), + UCT_STAGE_HTEXT, + UCT_SETBY_DEFAULT); + } + if ((p_in->enc != UCT_ENC_CJK) +#ifdef USE_JAPANESEUTF8_SUPPORT + && (p_in->enc != UCT_ENC_UTF8) +#endif + ) { + HTCJK = NOCJK; + if (!(p_in->codepoints & + UCT_CP_SUBSETOF_LAT1) && + chndl == current_char_set) { + HTPassEightBitRaw = TRUE; + } + } else if (p_out->enc == UCT_ENC_CJK) { + Set_HTCJK(p_in->MIMEname, p_out->MIMEname); + } + LYGetChartransInfo(me); + /* + * Update the chartrans info homologously to a Content-Type + * MIME header with a charset parameter. - FM + */ + if (me->UCLYhndl != chndl) { + HTAnchor_setUCInfoStage(me->node_anchor, chndl, + UCT_STAGE_MIME, + UCT_SETBY_STRUCTURED); + HTAnchor_setUCInfoStage(me->node_anchor, chndl, + UCT_STAGE_PARSER, + UCT_SETBY_STRUCTURED); + me->inUCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor, + UCT_STAGE_PARSER); + me->inUCI = HTAnchor_getUCInfoStage(me->node_anchor, + UCT_STAGE_PARSER); + } + UCSetTransParams(&me->T, + me->inUCLYhndl, me->inUCI, + me->outUCLYhndl, me->outUCI); + } else { + /* + * Cannot translate. If according to some heuristic the given + * charset and the current display character both are likely to + * be like ISO-8859 in structure, pretend we have some kind of + * match. 
+ */ + BOOL given_is_8859 = (BOOL) (!StrNCmp(cp4, "iso-8859-", 9) && + isdigit(UCH(cp4[9]))); + BOOL given_is_8859like = (BOOL) (given_is_8859 + || !StrNCmp(cp4, "windows-", 8) + || !StrNCmp(cp4, "cp12", 4) + || !StrNCmp(cp4, "cp-12", 5)); + BOOL given_and_display_8859like = (BOOL) (given_is_8859like && + (strstr(LYchar_set_names[current_char_set], + "ISO-8859") || + strstr(LYchar_set_names[current_char_set], + "windows-"))); + + if (given_is_8859) { + cp1 = &cp4[10]; + while (*cp1 && + isdigit(UCH((*cp1)))) + cp1++; + *cp1 = '\0'; + } + if (given_and_display_8859like) { + StrAllocCopy(me->node_anchor->charset, cp4); + HTPassEightBitRaw = TRUE; + } + HTAlert(*cp4 ? cp4 : me->node_anchor->charset); + + } + FREE(cp3); + + if (me->node_anchor->charset) { + CTRACE((tfp, + "LYHandleMETA: New charset: %s\n", + me->node_anchor->charset)); + } + } + /* + * Set the kcode element based on the charset. - FM + */ + HText_setKcode(me->text, me->node_anchor->charset, p_in); + } + + /* + * Make sure we have META name/value pairs to handle. - FM + */ + if (!(http_equiv || name) || !content) + goto free_META_copies; + + /* + * Check for a no-cache Pragma + * or Cache-Control directive. - FM + */ + if (!strcasecomp(NonNull(http_equiv), "Pragma") || + !strcasecomp(NonNull(http_equiv), "Cache-Control")) { + LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset, + NO, NO, YES, st_other); + if (!strcasecomp(content, "no-cache")) { + me->node_anchor->no_cache = TRUE; + HText_setNoCache(me->text); + } + + /* + * If we didn't get a Cache-Control MIME header, and the META has one, + * convert to lowercase, store it in the anchor element, and if we + * haven't yet set no_cache, check whether we should. 
- FM + */ + if ((!me->node_anchor->cache_control) && + !strcasecomp(NonNull(http_equiv), "Cache-Control")) { + LYLowerCase(content); + StrAllocCopy(me->node_anchor->cache_control, content); + if (me->node_anchor->no_cache == FALSE) { + cp0 = content; + while ((cp = strstr(cp0, "no-cache")) != NULL) { + cp += 8; + while (*cp != '\0' && WHITE(*cp)) + cp++; + if (*cp == '\0' || *cp == ';') { + me->node_anchor->no_cache = TRUE; + HText_setNoCache(me->text); + break; + } + cp0 = cp; + } + if (me->node_anchor->no_cache == TRUE) + goto free_META_copies; + cp0 = content; + while ((cp = strstr(cp0, "max-age")) != NULL) { + cp += 7; + while (*cp != '\0' && WHITE(*cp)) + cp++; + if (*cp == '=') { + cp++; + while (*cp != '\0' && WHITE(*cp)) + cp++; + if (isdigit(UCH(*cp))) { + cp0 = cp; + while (isdigit(UCH(*cp))) + cp++; + if (*cp0 == '0' && cp == (cp0 + 1)) { + me->node_anchor->no_cache = TRUE; + HText_setNoCache(me->text); + break; + } + } + } + cp0 = cp; + } + } + } + + /* + * Check for an Expires directive. - FM + */ + } else if (!strcasecomp(NonNull(http_equiv), "Expires")) { + /* + * If we didn't get an Expires MIME header, store it in the anchor + * element, and if we haven't yet set no_cache, check whether we + * should. Note that we don't accept a Date header via META tags, + * because it's likely to be untrustworthy, but do check for a Date + * header from a server when making the comparison. - FM + */ + LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset, + NO, NO, YES, st_other); + StrAllocCopy(me->node_anchor->expires, content); + if (me->node_anchor->no_cache == FALSE) { + if (!strcmp(content, "0")) { + /* + * The value is zero, which we treat as an absolute no-cache + * directive. - FM + */ + me->node_anchor->no_cache = TRUE; + HText_setNoCache(me->text); + } else if (me->node_anchor->date != NULL) { + /* + * We have a Date header, so check if the value is less than or + * equal to that. 
- FM + */ + if (LYmktime(content, TRUE) <= + LYmktime(me->node_anchor->date, TRUE)) { + me->node_anchor->no_cache = TRUE; + HText_setNoCache(me->text); + } + } else if (LYmktime(content, FALSE) == 0) { + /* + * We don't have a Date header, and the value is in past for + * us. - FM + */ + me->node_anchor->no_cache = TRUE; + HText_setNoCache(me->text); + } + } + + /* + * Check for a Refresh directive. - FM + */ + } else if (!strcasecomp(NonNull(http_equiv), "Refresh")) { + char *Seconds = NULL; + + LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset, + NO, NO, YES, st_other); + LYParseRefreshURL(content, &Seconds, &href); + + if (Seconds) { + if (href) { + /* + * We found a URL field, so check it out. - FM + */ + if (!LYLegitimizeHREF(me, &href, TRUE, FALSE)) { + /* + * The specs require a complete URL, but this is a + * Netscapism, so don't expect the author to know that. - + * FM + */ + HTUserMsg(REFRESH_URL_NOT_ABSOLUTE); + /* + * Use the document's address as the base. - FM + */ + if (*href != '\0') { + temp = HTParse(href, + me->node_anchor->address, PARSE_ALL); + StrAllocCopy(href, temp); + FREE(temp); + } else { + StrAllocCopy(href, me->node_anchor->address); + HText_setNoCache(me->text); + } + + } else { + /* + * Check whether to fill in localhost. - FM + */ + LYFillLocalFileURL(&href, + (me->inBASE ? + me->base_href : me->node_anchor->address)); + } + + /* + * Set the no_cache flag if the Refresh URL is the same as the + * document's address. - FM + */ + if (!strcmp(href, me->node_anchor->address)) { + HText_setNoCache(me->text); + } + } else { + /* + * We didn't find a URL field, so use the document's own + * address and set the no_cache flag. - FM + */ + StrAllocCopy(href, me->node_anchor->address); + HText_setNoCache(me->text); + } + /* + * Check for an anchor in http or https URLs. - FM + */ + cp = NULL; + /* id_string seems to be used wrong below if given. + not that it matters much. avoid setting it here. 
- kw */ + if (track_internal_links && + (StrNCmp(href, "http", 4) == 0) && + (cp = StrChr(href, '#')) != NULL) { + StrAllocCopy(id_string, cp); + *cp = '\0'; + } + if (me->inA) { + /* + * Ugh! The META tag, which is a HEAD element, is in an + * Anchor, which is BODY element. All we can do is close the + * Anchor and cross our fingers. - FM + */ + if (me->inBoldA == TRUE && me->inBoldH == FALSE) + HText_appendCharacter(me->text, LY_BOLD_END_CHAR); + me->inBoldA = FALSE; + HText_endAnchor(me->text, me->CurrentANum); + me->inA = FALSE; + me->CurrentANum = 0; + } + me->CurrentA = HTAnchor_findChildAndLink + ( + me->node_anchor, /* Parent */ + id_string, /* Tag */ + href, /* Address */ + (HTLinkType *) 0); /* Type */ + if (id_string) + *cp = '#'; + FREE(id_string); + LYEnsureSingleSpace(me); + if (me->inUnderline == FALSE) + HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR); + HTML_put_string(me, "REFRESH("); + HTML_put_string(me, Seconds); + HTML_put_string(me, " sec):"); + FREE(Seconds); + if (me->inUnderline == FALSE) + HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR); + HTML_put_character(me, ' '); + me->in_word = NO; + HText_beginAnchor(me->text, me->inUnderline, me->CurrentA); + if (me->inBoldH == FALSE) + HText_appendCharacter(me->text, LY_BOLD_START_CHAR); + HTML_put_string(me, href); + FREE(href); + if (me->inBoldH == FALSE) + HText_appendCharacter(me->text, LY_BOLD_END_CHAR); + HText_endAnchor(me->text, 0); + LYEnsureSingleSpace(me); + } + + /* + * Check for a suggested filename via a Content-Disposition with a + * filename=name.suffix in it, if we don't already have it via a server + * header. - FM + */ + } else if (isEmpty(me->node_anchor->SugFname) && + !strcasecomp((http_equiv ? 
+ http_equiv : ""), "Content-Disposition")) { + cp = content; + while (*cp != '\0' && strncasecomp(cp, "filename", 8)) + cp++; + if (*cp != '\0') { + cp = LYSkipBlanks(cp + 8); + if (*cp == '=') + cp++; + cp = LYSkipBlanks(cp); + if (*cp != '\0') { + StrAllocCopy(me->node_anchor->SugFname, cp); + if (*me->node_anchor->SugFname == '"') { + if ((cp = StrChr((me->node_anchor->SugFname + 1), + '"')) != NULL) { + *(cp + 1) = '\0'; + HTMIME_TrimDoubleQuotes(me->node_anchor->SugFname); + if (isEmpty(me->node_anchor->SugFname)) { + FREE(me->node_anchor->SugFname); + } + } else { + FREE(me->node_anchor->SugFname); + } + } +#if defined(UNIX) && !defined(DOSPATH) + /* + * If blanks are not legal for local filenames, replace them + * with underscores. + */ + if ((cp = me->node_anchor->SugFname) != NULL) { + while (*cp != '\0') { + if (isspace(UCH(*cp))) + *cp = '_'; + ++cp; + } + } +#endif + } + } + /* + * Check for a Set-Cookie directive. - AK + */ + } else if (!strcasecomp(NonNull(http_equiv), "Set-Cookie")) { + /* + * This will need to be updated when Set-Cookie/Set-Cookie2 handling is + * finalized. For now, we'll still assume "historical" cookies in META + * directives. - FM + */ + url_type = is_url(me->inBASE ? + me->base_href : me->node_anchor->address); + if (url_type == HTTP_URL_TYPE || url_type == HTTPS_URL_TYPE) { + LYSetCookie(content, + NULL, + (me->inBASE ? + me->base_href : me->node_anchor->address)); + } + } + + /* + * Free the copies. - FM + */ + free_META_copies: + FREE(http_equiv); + FREE(name); + FREE(content); + FREE(charset); +} + +/* + * This function handles P elements in HTML streams. + * If start is TRUE it handles a start tag, and if + * FALSE, an end tag. We presently handle start + * and end tags identically, but this can lead to + * a different number of blank lines between the + * current paragraph and subsequent text when a P + * end tag is present or not in the markup. 
- FM
 *
 * align_idx is the index of the ALIGN attribute in present[] / value[]
 * (a negative index means the attribute is not consulted).  include is
 * unused.  On return me->inP is FALSE and me->in_word is NO.
 */
void LYHandlePlike(HTStructured * me, const BOOL *present,
                   STRING2PTR value,
                   char **include GCC_UNUSED,
                   int align_idx,
                   int start)
{
    /*
     * FIG content should be a true block, which like P inherits the current
     * style.  APPLET is like character elements or an ALT attribute, unless
     * its content contains a block element.  So a P start tag met inside
     * either one flags that content as a block.
     */
    if (start && me->inFIG) {
        me->inFIGwithP = TRUE;
    }
    if (start && me->inAPPLET) {
        me->inAPPLETwithP = TRUE;
    }

    UPDATE_STYLE;

    if (me->List_Nesting_Level >= 0) {
        /*
         * Inside a list: P asks for one blank line, if not already present.
         * Nothing is emitted unless paragraph text has been seen.
         */
        if (me->inP) {
            const BOOL wants_blank_line =
                (BOOL) (me->inFIG || me->inAPPLET ||
                        me->inCAPTION || me->inCREDIT ||
                        me->sp->style->spaceAfter > 0 ||
                        (start && me->sp->style->spaceBefore > 0));

            if (wants_blank_line) {
                LYEnsureDoubleSpace(me);
            } else {
                LYEnsureSingleSpace(me);
            }
        }
    } else if (me->sp->tag_number == HTML_ADDRESS) {
        /*
         * Inside an ADDRESS: P merely starts a new line, if one is needed.
         */
        if (!HText_LastLineEmpty(me->text, FALSE)) {
            HText_setLastChar(me->text, ' ');   /* absorb white space */
            HText_appendCharacter(me->text, '\r');
        }
    } else {
        if (!start) {
            /* end tag: spacing depends on the style's spaceAfter */
            if (me->sp->style->spaceAfter > 0) {
                LYEnsureDoubleSpace(me);
            } else {
                LYEnsureSingleSpace(me);
            }
        } else if (!me->inLABEL || me->inP) {
            /* start tag, except directly after a LABEL with no text yet */
            HText_appendParagraph(me->text);
        }
        me->inLABEL = FALSE;
    }
    me->in_word = NO;

    /*
     * Pick the base alignment for the paragraph.
     */
    if (LYoverride_default_alignment(me)) {
        me->sp->style->alignment = LYstyles(me->sp->tag_number)->alignment;
    } else {
        const BOOL div_style_in_list =
            (BOOL) (me->List_Nesting_Level >= 0 &&
                    (me->sp->style->id == ST_DivCenter ||
                     me->sp->style->id == ST_DivLeft ||
                     me->sp->style->id == ST_DivRight));
        const BOOL plain_style_outside_div =
            (BOOL) ((me->Division_Level < 0) &&
                    (me->sp->style->id == ST_Normal ||
                     me->sp->style->id == ST_Preformatted));

        if (div_style_in_list || plain_style_outside_div) {
            me->sp->style->alignment = HT_LEFT;
        } else {
            me->sp->style->alignment = (short) me->current_default_alignment;
        }
    }

    /*
     * An ALIGN attribute on a start tag overrides that.  "center" and
     * "right" are not honoured in a list before any paragraph text.
     */
    if (start && align_idx >= 0 &&
        present && present[align_idx] && value[align_idx]) {
        const char *wanted = value[align_idx];
        const BOOL may_shift =
            (BOOL) (me->List_Nesting_Level < 0 || me->inP);

        if (may_shift && !strcasecomp(wanted, "center")) {
            me->sp->style->alignment = HT_CENTER;
        } else if (may_shift && !strcasecomp(wanted, "right")) {
            me->sp->style->alignment = HT_RIGHT;
        } else if (!strcasecomp(wanted, "left") ||
                   !strcasecomp(wanted, "justify")) {
            me->sp->style->alignment = HT_LEFT;
        }
    }

    /*
     * Mark that we are starting a new paragraph and don't have any of its
     * text yet.
     */
    me->inP = FALSE;
}

/*
 * This function handles SELECT elements in HTML streams.
 * If start is TRUE it handles a start tag, and if FALSE,
 * an end tag.
- FM
 */
/*
 * Parameters of LYHandleSELECT as used below:
 *
 *   me       the HTML structured-stream state that is read and updated.
 *   present  per-attribute presence flags (indexed by HTML_SELECT_*);
 *            may be NULL.
 *   value    per-attribute values, indexed like present.
 *   include  unused (marked GCC_UNUSED).
 *   start    compared against TRUE: exactly TRUE selects the start-tag
 *            path, any other value selects the end-tag path.
 */
void LYHandleSELECT(HTStructured * me, const BOOL *present,
                    STRING2PTR value,
                    char **include GCC_UNUSED,
                    int start)
{
    int i;

    if (start == TRUE) {
        char *name = NULL;
        BOOLEAN multiple = NO;
        char *size = NULL;      /* never assigned here: SIZE is ignored below */

        /*
         * Initialize the disable attribute.
         */
        me->select_disabled = FALSE;

        /*
         * Check for unclosed TEXTAREA.
         */
        if (me->inTEXTAREA) {
            if (LYBadHTML(me)) {
                LYShowBadHTML("Bad HTML: Missing TEXTAREA end tag\n");
            }
        }

        /*
         * Set to know we are in a select tag.
         */
        me->inSELECT = TRUE;

        /*
         * Build the field name: empty when NAME is absent or empty, a plain
         * copy when it holds no '&', and an unescaped copy otherwise.
         */
        if (!(present && present[HTML_SELECT_NAME] &&
              non_empty(value[HTML_SELECT_NAME]))) {
            StrAllocCopy(name, "");
        } else if (StrChr(value[HTML_SELECT_NAME], '&') == NULL) {
            StrAllocCopy(name, value[HTML_SELECT_NAME]);
        } else {
            StrAllocCopy(name, value[HTML_SELECT_NAME]);
            UNESCAPE_FIELDNAME_TO_STD(&name);
        }
        if (present && present[HTML_SELECT_MULTIPLE])
            multiple = YES;
        if (present && present[HTML_SELECT_DISABLED])
            me->select_disabled = TRUE;
        if (present && present[HTML_SELECT_SIZE] &&
            non_empty(value[HTML_SELECT_SIZE])) {
            /*
             * Let the size be determined by the number of OPTIONs. - FM
             */
            CTRACE((tfp, "LYHandleSELECT: Ignoring SIZE=\"%s\" for SELECT.\n",
                    value[HTML_SELECT_SIZE]));
        }

        /*
         * Suspend bold/underline emphasis unless this is a MULTIPLE select
         * shown with popups enabled. needBoldH records that bold must be
         * resumed by the end-tag path; underline is resumed there based on
         * me->Underline_Level.
         */
        if (me->inBoldH == TRUE &&
            (multiple == NO || LYSelectPopups == FALSE)) {
            HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
            me->inBoldH = FALSE;
            me->needBoldH = TRUE;
        }
        if (me->inUnderline == TRUE &&
            (multiple == NO || LYSelectPopups == FALSE)) {
            HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
            me->inUnderline = FALSE;
        }

        if ((multiple == NO && LYSelectPopups == TRUE) &&
            (me->sp[0].tag_number == HTML_PRE || me->inPRE == TRUE ||
             !me->sp->style->freeFormat) &&
            HText_LastLineSize(me->text, FALSE) > (LYcolLimit - 7)) {
            /*
             * Force a newline when we're using a popup in a PRE block and are
             * within 7 columns from the right margin. This will allow for the
             * '[' popup designator and help avoid a wrap in the underscore
             * placeholder for the retracted popup entry in the HText
             * structure. - FM
             */
            HTML_put_character(me, '\n');
            me->in_word = NO;
        }

        LYCheckForID(me, present, value, (int) HTML_SELECT_ID);

        HText_beginSelect(name, ATTR_CS_IN, multiple, size);
        FREE(name);
        FREE(size);

        me->first_option = TRUE;
    } else {
        /*
         * Handle end tag.
         */
        char *ptr;

        /*
         * Make sure we had a select start tag.
         */
        if (!me->inSELECT) {
            if (LYBadHTML(me)) {
                LYShowBadHTML("Bad HTML: Unmatched SELECT end tag\n");
            }
            return;
        }

        /*
         * Set to know that we are no longer in a select tag.
         */
        me->inSELECT = FALSE;

        /*
         * Clear the disable attribute.
         */
        me->select_disabled = FALSE;

        /*
         * Finish the data off.
         */
        HTChunkTerminate(&me->option);
        /*
         * Finish the previous option.
         */
        ptr = HText_setLastOptionValue(me->text,
                                       me->option.data,
                                       me->LastOptionValue,
                                       LAST_ORDER,
                                       me->LastOptionChecked,
                                       me->UCLYhndl,
                                       ATTR_CS_IN);
        FREE(me->LastOptionValue);

        me->LastOptionChecked = FALSE;

        if (HTCurSelectGroupType == F_CHECKBOX_TYPE ||
            LYSelectPopups == FALSE) {
            /*
             * Start a newline after the last checkbox/button option.
             */
            LYEnsureSingleSpace(me);
        } else {
            /*
             * Output popup box with the default option to screen, but use
             * non-breaking spaces for output.
             */
            if (ptr &&
                me->sp[0].tag_number == HTML_PRE && strlen(ptr) > 6) {
                /*
                 * The code inadequately handles OPTION fields in PRE tags.
                 * We'll put up a minimum of 6 characters, and if any more
                 * would exceed the wrap column, we'll ignore them.
                 *
                 * NOTE(review): the loop that follows this block still
                 * appends every remaining character of ptr, so nothing in
                 * this function itself drops the excess - confirm where
                 * (or whether) the truncation happens.
                 */
                for (i = 0; i < 6; i++) {
                    if (*ptr == ' ')
                        HText_appendCharacter(me->text, HT_NON_BREAK_SPACE);
                    else
                        HText_appendCharacter(me->text, *ptr);
                    ptr++;
                }
            }
            /* Emit the (rest of the) option text; tolerates a NULL ptr. */
            for (; non_empty(ptr); ptr++) {
                if (*ptr == ' ')
                    HText_appendCharacter(me->text, HT_NON_BREAK_SPACE);
                else
                    HText_appendCharacter(me->text, *ptr);
            }
            /*
             * Add end option character.
             */
            if (!me->first_option) {
                HText_appendCharacter(me->text, ']');
                HText_setLastChar(me->text, ']');
                me->in_word = YES;
            }
        }
        HTChunkClear(&me->option);

        /*
         * Resume the emphasis that the start-tag path suspended.
         */
        if (me->Underline_Level > 0 && me->inUnderline == FALSE) {
            HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
            me->inUnderline = TRUE;
        }
        if (me->needBoldH == TRUE && me->inBoldH == FALSE) {
            HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
            me->inBoldH = TRUE;
            me->needBoldH = FALSE;
        }
    }
}

/*
 * This function strips white characters and
 * generally fixes up attribute values that
 * were received from the SGML parser and
 * are to be treated as partial or absolute
 * URLs. - FM
 *
 *   me           stream state; supplies the base URL (me->base_href when
 *                me->inBASE is set, else me->node_anchor->address).
 *   href         in/out: address of the string to fix up; it is edited in
 *                place and may be reallocated.
 *   force_slash  when non-zero, a bare "." or ".." gets a trailing '/'
 *                unless the base is a file URL.
 *   strip_dots   when non-zero (and LYStripDotDotURLs is set), leading
 *                "/.." path elements are stripped for http/https bases.
 *
 * Returns the value of is_url(*href), or 0 when the function returns early
 * because of a missing/empty argument or an href that became empty.
 */
int LYLegitimizeHREF(HTStructured * me, char **href,
                     int force_slash,
                     int strip_dots)
{
    int url_type = 0;
    char *p = NULL;
    char *pound = NULL;
    const char *Base = NULL;

    if (!me || !href || isEmpty(*href))
        return (url_type);

    if (!LYTrimStartfile(*href)) {
        /*
         * Collapse spaces in the actual URL, but just protect against tabs or
         * newlines in the fragment, if present. This seeks to cope with
         * atrocities inflicted on the Web by authoring tools such as
         * Frontpage. - FM
         */

        /* Before working on spaces check if we have any, usually none. */
        p = LYSkipNonBlanks(*href);

        if (*p) {               /* p == first space character */
            /* no reallocs below, all converted in place */

            pound = findPoundSelector(*href);

            if (pound != NULL && pound < p) {
                /* The first blank is already inside the fragment. */
                convert_to_spaces(p, FALSE);    /* done */

            } else {
                if (pound != NULL)
                    *pound = '\0';      /* mark */

                /*
                 * No blanks really belong in the HREF,
                 * but if it refers to an actual file,
                 * it may actually have blanks in the name.
                 * Try to accommodate. See also HTParse().
                 */
                if (LYRemoveNewlines(p) || StrChr(p, '\t') != 0) {
                    LYRemoveBlanks(p);  /* a compromise... */
                }

                if (pound != NULL) {
                    p = StrChr(p, '\0');
                    *pound = '#';       /* restore */
                    convert_to_spaces(pound, FALSE);
                    /*
                     * If blanks were removed, the URL part now ends before
                     * the fragment; slide the fragment down to close the gap.
                     */
                    if (p < pound) {
                        int n;

                        for (n = 0; (p[n] = pound[n]) != '\0'; ++n) ;
                    }
                }
            }
        }
    }
    if (**href == '\0')
        return (url_type);

    TRANSLATE_AND_UNESCAPE_TO_STD(href);

    Base = me->inBASE ?
        me->base_href : me->node_anchor->address;

    url_type = is_url(*href);
    if (!url_type && force_slash && **href == '.' &&
        (!strcmp(*href, ".") || !strcmp(*href, "..")) &&
        !isFILE_URL(Base)) {
        /*
         * The Fielding RFC/ID for resolving partial HREFs says that a slash
         * should be on the end of the preceding symbolic element for "." and
         * "..", but all tested browsers only do that for an explicit "./" or
         * "../", so we'll respect the RFC/ID only if force_slash was TRUE and
         * it's not a file URL. - FM
         */
        StrAllocCat(*href, "/");
    }
    if ((!url_type && LYStripDotDotURLs && strip_dots && **href == '.') &&
        !strncasecomp(Base, "http", 4)) {
        /*
         * We will be resolving a partial reference versus an http or https
         * URL, and it has lead dots, which may be retained when resolving via
         * HTParse(), but the request would fail if the first element of the
         * resultant path is two dots, because no http or https server accepts
         * such paths, and the current URL draft, likely to become an RFC, says
         * that it's optional for the UA to strip them as a form of error
         * recovery. So we will, recursively, for http/https URLs, like the
         * "major market browsers" which made this problem so common on the
         * Web, but we'll also issue a message about it, such that the bad
         * partial reference might get corrected by the document provider. -
         * FM
         */
        char *temp = NULL, *path = NULL, *cp;
        const char *str = "";

        temp = HTParse(*href, Base, PARSE_ALL);
        path = HTParse(temp, "", PARSE_PATH + PARSE_PUNCTUATION);
        if (!StrNCmp(path, "/..", 3)) {
            cp = (path + 3);
            if (LYIsHtmlSep(*cp) || *cp == '\0') {
                /* Base[4] == 's' distinguishes "https" from "http". */
                if (Base[4] == 's') {
                    str = "s";
                }
                CTRACE((tfp,
                        "LYLegitimizeHREF: Bad value '%s' for http%s URL.\n",
                        *href, str));
                CTRACE((tfp, " Stripping lead dots.\n"));
                /* Show the user message only once per me->inBadHREF flag. */
                if (!me->inBadHREF) {
                    HTUserMsg(BAD_PARTIAL_REFERENCE);
                    me->inBadHREF = TRUE;
                }
            }
            if (*cp == '\0') {
                StrAllocCopy(*href, "/");
            } else if (LYIsHtmlSep(*cp)) {
                /*
                 * Skip each further leading "/.." element; a final "/.."
                 * with nothing after it is cut down to "/".
                 */
                while (!StrNCmp(cp, "/..", 3)) {
                    if (*(cp + 3) == '/') {
                        cp += 3;
                        continue;
                    } else if (*(cp + 3) == '\0') {
                        *(cp + 1) = '\0';
                        *(cp + 2) = '\0';
                    }
                    break;
                }
                StrAllocCopy(*href, cp);
            }
        }
        FREE(temp);
        FREE(path);
    }
    return (url_type);
}

/*
 * This function checks for a Content-Base header,
 * and if not present, a Content-Location header
 * which is an absolute URL, and sets the BASE
 * accordingly. If set, it will be replaced by
 * any BASE tag in the HTML stream, itself.
- FM + */ +void LYCheckForContentBase(HTStructured * me) +{ + char *cp = NULL; + BOOL present[HTML_BASE_ATTRIBUTES]; + const char *value[HTML_BASE_ATTRIBUTES]; + int i; + + if (!(me && me->node_anchor)) + return; + + if (me->node_anchor->content_base != NULL) { + /* + * We have a Content-Base value. Use it if it's non-zero length. - FM + */ + if (*me->node_anchor->content_base == '\0') + return; + StrAllocCopy(cp, me->node_anchor->content_base); + LYRemoveBlanks(cp); + } else if (me->node_anchor->content_location != NULL) { + /* + * We didn't have a Content-Base value, but do have a Content-Location + * value. Use it if it's an absolute URL. - FM + */ + if (*me->node_anchor->content_location == '\0') + return; + StrAllocCopy(cp, me->node_anchor->content_location); + LYRemoveBlanks(cp); + if (!is_url(cp)) { + FREE(cp); + return; + } + } else { + /* + * We had neither a Content-Base nor Content-Location value. - FM + */ + return; + } + + /* + * If we collapsed to a zero-length value, ignore it. - FM + */ + if (*cp == '\0') { + FREE(cp); + return; + } + + /* + * Pass the value to HTML_start_element as the HREF of a BASE tag. - FM + */ + for (i = 0; i < HTML_BASE_ATTRIBUTES; i++) + present[i] = NO; + present[HTML_BASE_HREF] = YES; + value[HTML_BASE_HREF] = (const char *) cp; + (*me->isa->start_element) (me, HTML_BASE, present, value, + 0, 0); + FREE(cp); +} + +/* + * This function creates NAMEd Anchors if a non-zero-length NAME + * or ID attribute was present in the tag. - FM + */ +void LYCheckForID(HTStructured * me, const BOOL *present, + STRING2PTR value, + int attribute) +{ + HTChildAnchor *ID_A = NULL; + char *temp = NULL; + + if (!(me && me->text)) + return; + + if (present && present[attribute] + && non_empty(value[attribute])) { + /* + * Translate any named or numeric character references. 
- FM + */ + StrAllocCopy(temp, value[attribute]); + LYUCTranslateHTMLString(&temp, me->tag_charset, me->tag_charset, + NO, NO, YES, st_URL); + + /* + * Create the link if we still have a non-zero-length string. - FM + */ + if ((temp[0] != '\0') && + (ID_A = HTAnchor_findChildAndLink + ( + me->node_anchor, /* Parent */ + temp, /* Tag */ + NULL, /* Address */ + (HTLinkType *) 0))) { /* Type */ + HText_beginAnchor(me->text, me->inUnderline, ID_A); + HText_endAnchor(me->text, 0); + } + FREE(temp); + } +} + +/* + * This function creates a NAMEd Anchor for the ID string + * passed to it directly as an argument. It assumes the + * does not need checking for character references. - FM + */ +void LYHandleID(HTStructured * me, const char *id) +{ + HTChildAnchor *ID_A = NULL; + + if (!(me && me->text) || + isEmpty(id)) + return; + + /* + * Create the link if we still have a non-zero-length string. - FM + */ + if ((ID_A = HTAnchor_findChildAndLink + ( + me->node_anchor, /* Parent */ + id, /* Tag */ + NULL, /* Address */ + (HTLinkType *) 0)) != NULL) { /* Type */ + HText_beginAnchor(me->text, me->inUnderline, ID_A); + HText_endAnchor(me->text, 0); + } +} + +/* + * This function checks whether we want to override + * the current default alignment for paragraphs and + * instead use that specified in the element's style + * sheet. - FM + */ +BOOLEAN LYoverride_default_alignment(HTStructured * me) +{ + if (!me) + return NO; + + switch (me->sp[0].tag_number) { + case HTML_BLOCKQUOTE: + case HTML_BQ: + case HTML_NOTE: + case HTML_FN: + case HTML_ADDRESS: + me->sp->style->alignment = HT_LEFT; + return YES; + + default: + break; + } + return NO; +} + +/* + * This function inserts newlines if needed to create double spacing, + * and sets the left margin for subsequent text to the second line + * indentation of the current style. 
- FM + */ +void LYEnsureDoubleSpace(HTStructured * me) +{ + if (!me || !me->text) + return; + + if (!HText_LastLineEmpty(me->text, FALSE)) { + HText_setLastChar(me->text, ' '); /* absorb white space */ + HText_appendCharacter(me->text, '\r'); + HText_appendCharacter(me->text, '\r'); + } else if (!HText_PreviousLineEmpty(me->text, FALSE)) { + HText_setLastChar(me->text, ' '); /* absorb white space */ + HText_appendCharacter(me->text, '\r'); + } else if (me->List_Nesting_Level >= 0) { + HText_NegateLineOne(me->text); + } + me->in_word = NO; + return; +} + +/* + * This function inserts a newline if needed to create single spacing, + * and sets the left margin for subsequent text to the second line + * indentation of the current style. - FM + */ +void LYEnsureSingleSpace(HTStructured * me) +{ + if (!me || !me->text) + return; + + if (!HText_LastLineEmpty(me->text, FALSE)) { + HText_setLastChar(me->text, ' '); /* absorb white space */ + HText_appendCharacter(me->text, '\r'); + } else if (me->List_Nesting_Level >= 0) { + HText_NegateLineOne(me->text); + } + me->in_word = NO; + return; +} + +/* + * This function resets paragraph alignments for block + * elements which do not have a defined style sheet. - FM + */ +void LYResetParagraphAlignment(HTStructured * me) +{ + if (!me) + return; + + if (me->List_Nesting_Level >= 0 || + ((me->Division_Level < 0) && + (me->sp->style->id == ST_Normal || + me->sp->style->id == ST_Preformatted))) { + me->sp->style->alignment = HT_LEFT; + } else { + me->sp->style->alignment = (short) me->current_default_alignment; + } + return; +} + +/* + * This example function checks whether the given anchor has + * an address with a file scheme, and if so, loads it into the + * the SGML parser's context->url element, which was passed as + * the second argument. The handle_comment() calling function in + * SGML.c then calls LYDoCSI() in LYUtils.c to insert HTML markup + * into the corresponding stream, homologously to an SSI by an + * HTTP server. 
- FM + * + * For functions similar to this but which depend on details of + * the HTML handler's internal data, the calling interface should + * be changed, and functions in SGML.c would have to make sure not + * to call such functions inappropriately (e.g., calling a function + * specific to the Lynx_HTML_Handler when SGML.c output goes to + * some other HTStructured object like in HTMLGen.c), or the new + * functions could be added to the SGML.h interface. + */ +BOOLEAN LYCheckForCSI(HTParentAnchor *anchor, + char **url) +{ + if (!(anchor && anchor->address)) + return FALSE; + + if (!isFILE_URL(anchor->address)) + return FALSE; + + if (!LYisLocalHost(anchor->address)) + return FALSE; + + StrAllocCopy(*url, anchor->address); + return TRUE; +} + +/* + * This function is called from the SGML parser to look at comments + * and see whether we should collect some info from them. Currently + * it only looks for comments with Message-Id and Subject info, in the + * exact form generated by MHonArc for archived mailing list. If found, + * the info is stored in the document's HTParentAnchor. It can later be + * used for generating a mail response. + * + * We are extra picky here because there isn't any official definition + * for these kinds of comments - we might (and still can) misinterpret + * arbitrary comments as something they aren't. + * + * If something doesn't look right, for example invalid characters, the + * strings are not stored. Mail responses will use something else as + * the subject, probably the document URL, and will not have an + * In-Reply-To header. + * + * All this is a hack - to do this the right way, mailing list archivers + * would have to agree on some better mechanism to make this kind of info + * from original mail headers available, for example using LINK. 
- kw + */ +BOOLEAN LYCommentHacks(HTParentAnchor *anchor, + const char *comment) +{ + const char *cp; + size_t len; + + if (comment == NULL) + return FALSE; + + if (!(anchor && anchor->address)) + return FALSE; + + if (StrNCmp(comment, "!--X-Message-Id: ", 17) == 0) { + char *messageid = NULL; + char *p; + + for (cp = comment + 17; *cp; cp++) { + if (UCH(*cp) >= 127 || !isgraph(UCH(*cp))) { + break; + } + } + if (strcmp(cp, " --")) { + return FALSE; + } + cp = comment + 17; + StrAllocCopy(messageid, cp); + /* This should be ok - message-id should only contain 7-bit ASCII */ + if (!LYUCTranslateHTMLString(&messageid, 0, 0, NO, NO, YES, st_URL)) + return FALSE; + for (p = messageid; *p; p++) { + if (UCH(*p) >= 127 || !isgraph(UCH(*p))) { + break; + } + } + if (strcmp(p, " --")) { + FREE(messageid); + return FALSE; + } + if ((p = StrChr(messageid, '@')) == NULL || p[1] == '\0') { + FREE(messageid); + return FALSE; + } + p = messageid; + if ((len = strlen(p)) >= 8 && !strcmp(&p[len - 3], " --")) { + p[len - 3] = '\0'; + } else { + FREE(messageid); + return FALSE; + } + if (HTAnchor_setMessageID(anchor, messageid)) { + FREE(messageid); + return TRUE; + } else { + FREE(messageid); + return FALSE; + } + } + if (StrNCmp(comment, "!--X-Subject: ", 14) == 0) { + char *subject = NULL; + char *p; + + for (cp = comment + 14; *cp; cp++) { + if (UCH(*cp) >= 127 || !isprint(UCH(*cp))) { + return FALSE; + } + } + cp = comment + 14; + StrAllocCopy(subject, cp); + /* @@@ + * This may not be the right thing for the subject - but mail + * subjects shouldn't contain 8-bit characters in raw form anyway. + * We have to unescape character entities, since that's what MHonArc + * seems to generate. But if after that there are 8-bit characters + * the string is rejected. We would probably not know correctly + * what charset to assume anyway - the mail sender's can differ from + * the archive's. 
And the code for sending mail cannot deal well + * with 8-bit characters - we should not put them in the Subject + * header in raw form, but don't have MIME encoding implemented. + * Someone may want to do more about this... - kw + */ + if (!LYUCTranslateHTMLString(&subject, 0, 0, NO, YES, NO, st_HTML)) + return FALSE; + for (p = subject; *p; p++) { + if (UCH(*p) >= 127 || !isprint(UCH(*p))) { + FREE(subject); + return FALSE; + } + } + p = subject; + if ((len = strlen(p)) >= 4 && !strcmp(&p[len - 3], " --")) { + p[len - 3] = '\0'; + } else { + FREE(subject); + return FALSE; + } + if (HTAnchor_setSubject(anchor, subject)) { + FREE(subject); + return TRUE; + } else { + FREE(subject); + return FALSE; + } + } + + return FALSE; +} + + /* + * Create the Title with any left-angle-brackets converted to < entities + * and any ampersands converted to & entities. - FM + * + * Convert 8-bit letters to &#xUUUU to avoid dependencies from display + * character set which may need changing. Do NOT convert any 8-bit chars + * if we have CJK display. - LP + */ +void LYformTitle(char **dst, + const char *src) +{ + if (HTCJK == JAPANESE) { + char *tmp_buffer = NULL; + + if ((tmp_buffer = (char *) malloc(strlen(src) + 1)) == 0) + outofmem(__FILE__, "LYformTitle"); + + switch (kanji_code) { /* 1997/11/22 (Sat) 09:28:00 */ + case EUC: + TO_EUC((const unsigned char *) src, (unsigned char *) tmp_buffer); + break; + case SJIS: + TO_SJIS((const unsigned char *) src, (unsigned char *) tmp_buffer); + break; + default: + CTRACE((tfp, "\nLYformTitle: kanji_code is an unexpected value.")); + strcpy(tmp_buffer, src); + break; + } + StrAllocCopy(*dst, tmp_buffer); + FREE(tmp_buffer); + } else { + StrAllocCopy(*dst, src); + } +} |