summaryrefslogtreecommitdiffstats
path: root/src/LYCharUtils.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/LYCharUtils.c')
-rw-r--r--src/LYCharUtils.c3410
1 files changed, 3410 insertions, 0 deletions
diff --git a/src/LYCharUtils.c b/src/LYCharUtils.c
new file mode 100644
index 0000000..12c50a0
--- /dev/null
+++ b/src/LYCharUtils.c
@@ -0,0 +1,3410 @@
+/*
+ * $LynxId: LYCharUtils.c,v 1.131 2018/03/05 22:32:14 tom Exp $
+ *
+ * Functions associated with LYCharSets.c and the Lynx version of HTML.c - FM
+ * ==========================================================================
+ */
+#include <HTUtils.h>
+#include <SGML.h>
+
+#define Lynx_HTML_Handler
+#include <HTChunk.h>
+#include <HText.h>
+#include <HTStyle.h>
+#include <HTMIME.h>
+#include <HTML.h>
+
+#include <HTCJK.h>
+#include <HTAtom.h>
+#include <HTMLGen.h>
+#include <HTParse.h>
+#include <UCMap.h>
+#include <UCDefs.h>
+#include <UCAux.h>
+
+#include <LYGlobalDefs.h>
+#include <LYCharUtils.h>
+#include <LYCharSets.h>
+
+#include <HTAlert.h>
+#include <HTForms.h>
+#include <HTNestedList.h>
+#include <GridText.h>
+#include <LYStrings.h>
+#include <LYUtils.h>
+#include <LYMap.h>
+#include <LYBookmark.h>
+#include <LYCurses.h>
+#include <LYCookie.h>
+
+#include <LYexit.h>
+#include <LYLeaks.h>
+
+/*
+ * Used for nested lists. - FM
+ *
+ * Marker values for ordered-list bookkeeping.  LYZero_OL_Counter() in this
+ * file fills every OL_Counter[] slot with OL_VOID.  The large negative
+ * values are presumably chosen so they cannot collide with a real list
+ * count -- TODO confirm against the callers in HTML.c.
+ */
+int OL_CONTINUE = -29999; /* flag for whether CONTINUE is set */
+int OL_VOID = -29998; /* flag for whether a count is set */
+
+/*
+ * Return how many times the character 'ch' occurs in the NUL-terminated
+ * string 'value'.
+ */
+static size_t count_char(const char *value, int ch)
+{
+    size_t total = 0;
+    const char *hit;
+
+    for (;;) {
+	if (*value == '\0')
+	    break;
+	hit = StrChr(value, ch);
+	if (hit == NULL)
+	    break;
+	total++;
+	/* resume the search just past the match */
+	value = hit + 1;
+    }
+    return total;
+}
+
+/*
+ * This function converts any ampersands in a pre-allocated string to "&amp;".
+ * If brackets is TRUE, it also converts any angle-brackets to "&lt;" or "&gt;".
+ *
+ * in_out   - address of the string pointer.  When at least one character has
+ *            to be replaced, a new buffer is allocated, the old string is
+ *            released with FREE() and *in_out is pointed at the new buffer.
+ *            When nothing needs replacing (or the string is empty), *in_out
+ *            is left untouched.
+ * brackets - non-zero to also entify '<' and '>'.
+ *
+ * With CJK_EX defined and a CJK display active, bytes that belong to an
+ * escape sequence or to a two-byte character are copied through verbatim
+ * rather than being examined for '&', '<' or '>'.
+ */
+void LYEntify(char **in_out,
+ int brackets)
+{
+ char *source = *in_out;
+ char *target;
+ char *result = NULL;
+ size_t count_AMPs = 0;
+ size_t count_LTs = 0;
+ size_t count_GTs = 0;
+
+#ifdef CJK_EX
+ /* States of the escape-sequence scanner used in the copy loop below. */
+ enum _state {
+ S_text,
+ S_esc,
+ S_dollar,
+ S_paren,
+ S_nonascii_text,
+ S_dollar_paren
+ } state = S_text;
+ /* Passed to IS_SJIS() below; it is never assigned in this function. */
+ int in_sjis = 0;
+#endif
+
+ if (non_empty(source)) {
+ /* First pass: count what must be expanded so the buffer can be sized. */
+ count_AMPs = count_char(*in_out, '&');
+ if (brackets) {
+ count_LTs = count_char(*in_out, '<');
+ count_GTs = count_char(*in_out, '>');
+ }
+
+ if (count_AMPs != 0 || count_LTs != 0 || count_GTs != 0) {
+
+ /*
+ * "&amp;" is 4 bytes longer than "&"; "&lt;" and "&gt;" are each
+ * 3 bytes longer than the character they replace; +1 for the NUL.
+ */
+ target = typecallocn(char,
+ (strlen(*in_out)
+ + (4 * count_AMPs)
+ + (3 * count_LTs)
+ + (3 * count_GTs) + 1));
+
+ if ((result = target) == NULL)
+ outofmem(__FILE__, "LYEntify");
+
+ /* Second pass: copy, expanding the special characters. */
+ for (source = *in_out; *source; source++) {
+#ifdef CJK_EX
+ if (IS_CJK_TTY) {
+ /*
+ * Every state except S_text copies the current byte unchanged
+ * and moves on to the next one.  S_text only does that for ESC;
+ * for any other byte it breaks out of the switch and continues
+ * with the two-byte check below.
+ */
+ switch (state) {
+ case S_text:
+ if (*source == '\033') {
+ state = S_esc;
+ *target++ = *source;
+ continue;
+ }
+ break;
+
+ case S_esc:
+ if (*source == '$') {
+ state = S_dollar;
+ } else if (*source == '(') {
+ state = S_paren;
+ } else {
+ state = S_text;
+ }
+ *target++ = *source;
+ continue;
+
+ case S_dollar:
+ if (*source == '@' || *source == 'B' || *source == 'A') {
+ state = S_nonascii_text;
+ } else if (*source == '(') {
+ state = S_dollar_paren;
+ } else {
+ state = S_text;
+ }
+ *target++ = *source;
+ continue;
+
+ case S_dollar_paren:
+ if (*source == 'C') {
+ state = S_nonascii_text;
+ } else {
+ state = S_text;
+ }
+ *target++ = *source;
+ continue;
+
+ case S_paren:
+ if (*source == 'B' || *source == 'J' || *source == 'T') {
+ state = S_text;
+ } else if (*source == 'I') {
+ state = S_nonascii_text;
+ } else if (*source == '\033') {
+ state = S_esc;
+ }
+ *target++ = *source;
+ continue;
+
+ case S_nonascii_text:
+ if (*source == '\033')
+ state = S_esc;
+ *target++ = *source;
+ continue;
+
+ default:
+ break;
+ }
+ /*
+ * A lead/trail byte pair recognized by IS_EUC, IS_SJIS or IS_BIG5
+ * is copied as a unit, so its second byte is never examined for
+ * '&', '<' or '>'.
+ */
+ if (*(source + 1) != '\0' &&
+ (IS_EUC(UCH(*source), UCH(*(source + 1))) ||
+ IS_SJIS(UCH(*source), UCH(*(source + 1)), in_sjis) ||
+ IS_BIG5(UCH(*source), UCH(*(source + 1))))) {
+ *target++ = *source++;
+ *target++ = *source;
+ continue;
+ }
+ }
+#endif
+ switch (*source) {
+ case '&':
+ *target++ = '&';
+ *target++ = 'a';
+ *target++ = 'm';
+ *target++ = 'p';
+ *target++ = ';';
+ break;
+ case '<':
+ if (brackets) {
+ *target++ = '&';
+ *target++ = 'l';
+ *target++ = 't';
+ *target++ = ';';
+ break;
+ }
+ /*
+ * brackets is FALSE here, so the test in the '>' case fails as
+ * well and control reaches 'default', copying '<' unchanged.
+ */
+ /* FALLTHRU */
+ case '>':
+ if (brackets) {
+ *target++ = '&';
+ *target++ = 'g';
+ *target++ = 't';
+ *target++ = ';';
+ break;
+ }
+ /* FALLTHRU */
+ default:
+ *target++ = *source;
+ break;
+ }
+ }
+ *target = '\0';
+ /* Hand the expanded copy back to the caller, releasing the original. */
+ FREE(*in_out);
+ *in_out = result;
+ }
+ }
+}
+
+/*
+ * Tell whether 'source' would be altered by LYEntify().
+ *
+ * Callers of LYEntifyTitle/LYEntifyValue do not look at the 'target' param,
+ * so checking first lets them skip the allocation in the usual case where
+ * nothing has to be replaced.
+ *
+ * Returns TRUE if the string contains '<', '&' or '>' -- or, with CJK_EX on a
+ * CJK display, if it contains an ESC character -- and FALSE otherwise.
+ */
+static BOOL MustEntify(const char *source)
+{
+#ifdef CJK_EX
+    if (IS_CJK_TTY && StrChr(source, '\033') != 0)
+	return TRUE;
+#endif
+    /* strcspn() stops at the first special char, or at the terminating NUL */
+    return (BOOL) (source[strcspn(source, "<&>")] != '\0');
+}
+
+/*
+ * Wrappers for LYEntify() which do not assume that the source was allocated,
+ * e.g., output from gettext().
+ *
+ * LYEntifyTitle entifies '&', '<' and '>'.  If 'source' needs no change it
+ * is returned as-is and '*target' is not touched; otherwise 'source' is
+ * copied into '*target', entified there, and '*target' is returned.
+ */
+const char *LYEntifyTitle(char **target, const char *source)
+{
+    if (!MustEntify(source))
+	return source;
+
+    StrAllocCopy(*target, source);
+    LYEntify(target, TRUE);
+    return *target;
+}
+
+/*
+ * Like LYEntifyTitle, but LYEntify() is called with brackets FALSE, so only
+ * '&' is converted.  If 'source' contains none of '<', '&', '>' it is
+ * returned as-is and '*target' is not touched; otherwise the (possibly
+ * entified) copy held in '*target' is returned.
+ */
+const char *LYEntifyValue(char **target, const char *source)
+{
+    if (!MustEntify(source))
+	return source;
+
+    StrAllocCopy(*target, source);
+    LYEntify(target, FALSE);
+    return *target;
+}
+
+/*
+ * Strip leading characters with codes <= that of a space (32) from 'str',
+ * in place.  This includes HT_NON_BREAK_SPACE (1) and HT_EN_SPACE (2), but
+ * an ESC is kept and stops the trimming. - FM
+ */
+void LYTrimHead(char *str)
+{
+    size_t skip = 0;
+
+    if (isEmpty(str))
+	return;
+
+    /* S/390 -- gil -- 1669 */
+    while (str[skip] != '\0'
+	   && WHITE(str[skip])
+	   && UCH(str[skip]) != UCH(CH_ESC))
+	skip++;
+
+    /* slide the remainder (including its NUL) down to the start */
+    if (skip != 0)
+	memmove(str, str + skip, strlen(str + skip) + 1);
+}
+
+/*
+ * Strip trailing characters with codes <= that of a space (32) from 'str',
+ * in place.  This includes HT_NON_BREAK_SPACE (1), HT_EN_SPACE (2), and
+ * ESC. - FM
+ */
+void LYTrimTail(char *str)
+{
+    char *end;
+
+    if (isEmpty(str))
+	return;
+
+    /* walk back from the terminator, overwriting each white char with NUL */
+    for (end = str + strlen(str); end > str && WHITE(end[-1]); --end)
+	end[-1] = '\0';
+}
+
+/*
+ * This function should receive a pointer to the start of a comment.  It
+ * returns a pointer to the end ('>') character of the comment, or its best
+ * guess if the comment is invalid. - FM
+ *
+ * Returns:
+ *   NULL              if str is NULL,
+ *   str               if str does not begin with "<!--" or holds no '>'
+ *                     after that prefix,
+ *   the first '>'     for "<!-->", for a comment whose text starts with a
+ *                     further dash, and for any comment found to be invalid,
+ *   the closing '>'   for a valid comment.
+ */
+char *LYFindEndOfComment(char *str)
+{
+    enum comment_state {
+	start1,
+	start2,
+	end1,
+	end2
+    } state = start2;
+    char *scan;
+    char *first_gt;
+
+    if (str == NULL)
+	return NULL;
+
+    /* Not the start of a comment: hand back the beginning of the string. */
+    if (StrNCmp(str, "<!--", 4) != 0)
+	return str;
+
+    /* "<!-->" is an invalid comment; its '>' is the end character. */
+    scan = str + 4;
+    if (*scan == '>')
+	return scan;
+
+    /* No end character at all: hand back the beginning of the string. */
+    first_gt = StrChr(scan, '>');
+    if (first_gt == NULL)
+	return str;
+
+    /* A "decorative" series of dashes: settle for the next end character. */
+    if (*scan == '-')
+	return first_gt;
+
+    /*
+     * Parse the comment body.  Whenever the dash pattern turns out to be
+     * invalid, the first '>' after the opening delimiter is returned.
+     */
+    for (; *scan != '\0'; scan++) {
+	const int is_dash = (*scan == '-');
+
+	if (state == start2) {
+	    /* inside comment text: wait for the first closing dash */
+	    if (is_dash)
+		state = end1;
+	} else if (state == start1) {
+	    /* one dash seen after a closed "--": need its partner */
+	    if (!is_dash)
+		return first_gt;
+	    state = start2;
+	} else if (state == end1) {
+	    /* one closing dash seen: need the second */
+	    if (!is_dash)
+		return first_gt;
+	    state = end2;
+	} else {		/* end2: "--" seen, expect '>', '-' or blanks */
+	    if (*scan == '>')
+		return scan;
+	    if (is_dash) {
+		state = start1;
+	    } else if (!WHITE(*scan) || UCH(*scan) == UCH(CH_ESC)) {	/* S/390 -- gil -- 1686 */
+		return first_gt;
+	    }
+	}
+    }
+
+    /* Ran off the end of the string without a valid close. */
+    return first_gt;
+}
+
+/*
+ * If an HREF, itself or if resolved against a base,
+ * represents a file URL, and the host is defaulted,
+ * force in "//localhost". We need this until
+ * all the other Lynx code which performs security
+ * checks based on the "localhost" string is changed
+ * to assume "//localhost" when a host field is not
+ * present in file URLs - FM
+ *
+ * href - address of an allocated string; it is rewritten in place (through
+ *        the StrAlloc* helpers and LYLocalFileToURL) and is left untouched
+ *        when empty.
+ * base - URL the HREF is relative to; may be NULL.  It is only consulted
+ *        when the HREF is exactly "//" or starts with "///".
+ */
+void LYFillLocalFileURL(char **href,
+ const char *base)
+{
+ /* Scratch string shared by all steps below; released before returning. */
+ char *temp = NULL;
+
+ if (isEmpty(*href))
+ return;
+
+ /*
+ * A scheme-less "//" or "///..." reference inherits the scheme from a
+ * file: base, i.e. STR_FILE_URL is prepended to it.
+ */
+ if (!strcmp(*href, "//") || !StrNCmp(*href, "///", 3)) {
+ if (base != NULL && isFILE_URL(base)) {
+ StrAllocCopy(temp, STR_FILE_URL);
+ StrAllocCat(temp, *href);
+ StrAllocCopy(*href, temp);
+ }
+ }
+ if (isFILE_URL(*href)) {
+ if (*(*href + 5) == '\0') {
+ /* nothing follows the first five characters (presumably "file:") */
+ StrAllocCat(*href, "//localhost");
+ } else if (!strcmp(*href, "file://")) {
+ StrAllocCat(*href, "localhost");
+ } else if (!StrNCmp(*href, "file:///", 8)) {
+ /* empty host: keep the path, which starts at the third slash */
+ StrAllocCopy(temp, (*href + 7));
+ LYLocalFileToURL(href, temp);
+ } else if (!StrNCmp(*href, "file:/", 6) && !LYIsHtmlSep(*(*href + 6))) {
+ /* "file:/path" with a single slash: the path starts right after "file:" */
+ StrAllocCopy(temp, (*href + 5));
+ LYLocalFileToURL(href, temp);
+ }
+ }
+#if defined(USE_DOS_DRIVES)
+ if (LYIsDosDrive(*href)) {
+ /*
+ * If it's a local DOS path beginning with drive letter,
+ * add file://localhost/ prefix and go ahead.
+ */
+ StrAllocCopy(temp, *href);
+ LYLocalFileToURL(href, temp);
+ }
+
+ /* use below: strlen("file://localhost/") = 17 */
+ /* total length 19 = the 17-character prefix plus two more characters */
+ if (!StrNCmp(*href, "file://localhost/", 17)
+ && (strlen(*href) == 19)
+ && LYIsDosDrive(*href + 17)) {
+ /*
+ * Terminate DOS drive letter with a slash to surf root successfully.
+ * Here seems a proper place to do so.
+ */
+ LYAddPathSep(href);
+ }
+#endif /* USE_DOS_DRIVES */
+
+ /*
+ * No path in a file://localhost URL means a
+ * directory listing for the current default. - FM
+ */
+ if (!strcmp(*href, "file://localhost")) {
+ const char *temp2;
+
+#ifdef VMS
+ temp2 = HTVMS_wwwName(LYGetEnv("PATH"));
+#else
+ char curdir[LY_MAXPATH];
+
+ temp2 = wwwName(Current_Dir(curdir));
+#endif /* VMS */
+ /* make sure exactly one separator sits between host and path */
+ if (!LYIsHtmlSep(*temp2))
+ LYAddHtmlSep(href);
+ /*
+ * Check for pathological cases - current dir has chars which MUST BE
+ * URL-escaped - kw
+ */
+ if (StrChr(temp2, '%') != NULL || StrChr(temp2, '#') != NULL) {
+ /* drop any earlier scratch copy before reusing temp for the escaped path */
+ FREE(temp);
+ temp = HTEscape(temp2, URL_PATH);
+ StrAllocCat(*href, temp);
+ } else {
+ StrAllocCat(*href, temp2);
+ }
+ }
+#ifdef VMS
+ /*
+ * On VMS, a file://localhost/ URL means
+ * a listing for the login directory. - FM
+ */
+ if (!strcmp(*href, "file://localhost/"))
+ StrAllocCat(*href, (HTVMS_wwwName(Home_Dir()) + 1));
+#endif /* VMS */
+
+ FREE(temp);
+ return;
+}
+
+/*
+ * Write a META content-type line naming a charset to the stream 'target'.
+ *
+ * disp_chndl is an index into LYCharSet_UC[]; -1 means use current_char_set.
+ * Nothing is written if target is null or the resulting index is negative.
+ */
+void LYAddMETAcharsetToStream(HTStream *target, int disp_chndl)
+{
+    char *line = 0;
+    int chndl = (disp_chndl == -1) ? current_char_set : disp_chndl;
+
+    if (target == 0 || chndl < 0)
+	return;
+
+    HTSprintf0(&line, "<META %s content=\"" STR_HTML ";charset=%s\">\n",
+	       "http-equiv=\"content-type\"",
+	       LYCharSet_UC[chndl].MIMEname);
+    (*target->isa->put_string) (target, line);
+    FREE(line);
+}
+
+/*
+ * This function writes a line with a META tag to an open file,
+ * which will specify a charset parameter to use when the file is
+ * read back in. It is meant for temporary HTML files used by the
+ * various special pages which may show titles of documents. When those
+ * files are created, the title strings normally have been translated and
+ * expanded to the display character set, so we have to make sure they
+ * don't get translated again.
+ * If the user has changed the display character set during the lifetime
+ * of the Lynx session (or, more exactly, during the time the title
+ * strings to be written were generated), they may now have different
+ * character encodings and there is currently no way to get it all right.
+ * To change this, we would have to add a variable for each string which
+ * keeps track of its character encoding.
+ * But at least we can try to ensure that reading the file after future
+ * display character set changes will give reasonable output.
+ *
+ * The META tag is not written if the display character set (passed as
+ * disp_chndl) already corresponds to the charset assumption that
+ * would be made when the file is read. - KW
+ *
+ * Currently this function is used for temporary files like "Lynx Info Page"
+ * and for one permanent - bookmarks (so it may be a problem if you change
+ * the display charset later: new bookmark entries may be mistranslated).
+ * - LP
+ */
+void LYAddMETAcharsetToFD(FILE *fd, int disp_chndl)
+{
+    /* -1 means use current_char_set. */
+    const int chndl = (disp_chndl == -1) ? current_char_set : disp_chndl;
+
+    /* Should not happen. */
+    if (fd == NULL || chndl < 0)
+	return;
+
+    /*
+     * No tag is needed when the charset already equals the one assumed for
+     * unspecified files, or when it is a 7-bit charset (there shouldn't be
+     * any 8-bit characters in that case).
+     */
+    if (UCLYhndl_HTFile_for_unspec == chndl
+	|| LYCharSet_UC[chndl].enc == UCT_ENC_7BIT)
+	return;
+
+    /*
+     * In other cases we don't know because UCLYhndl_for_unspec may change
+     * during the lifetime of the file (by toggling raw mode or changing the
+     * display character set), so proceed.
+     */
+    fprintf(fd, "<META %s content=\"" STR_HTML ";charset=%s\">\n",
+	    "http-equiv=\"content-type\"",
+	    LYCharSet_UC[chndl].MIMEname);
+}
+
+/*
+ * This function returns OL TYPE="A" strings in
+ * the range of " A." (1) to "ZZZ." (18278). - FM
+ *
+ * Values below 1 yield " A." and values above 18278 yield "ZZZ.".  One-letter
+ * labels are padded with a leading blank.  The result lives in a static
+ * buffer which is overwritten by the next call.
+ */
+char *LYUppercaseA_OL_String(int seqnum)
+{
+    static char OLstring[8];
+    const int base = 64;	/* code just below the first letter */
+    char *out = OLstring;
+
+    if (seqnum >= 18279) {
+	strcpy(OLstring, "ZZZ.");
+	return OLstring;
+    }
+
+    if (seqnum >= 703) {
+	/* three letters */
+	const int hi = (seqnum - 27) / 676;
+
+	*out++ = (char) (hi + base);
+	*out++ = (char) ((seqnum - hi * 676 - 1) / 26 + base);
+	*out++ = (char) ((seqnum - 1) % 26 + 1 + base);
+    } else if (seqnum >= 27) {
+	/* two letters */
+	*out++ = (char) ((seqnum - 1) / 26 + base);
+	*out++ = (char) ((seqnum - 1) % 26 + 1 + base);
+    } else {
+	/* one letter, blank-padded; anything below 1 counts as 1 */
+	*out++ = ' ';
+	*out++ = (char) ((seqnum <= 1 ? 1 : seqnum) + base);
+    }
+    *out++ = '.';
+    *out = '\0';
+    return OLstring;
+}
+
+/*
+ * This function returns OL TYPE="a" strings in
+ * the range of " a." (1) to "zzz." (18278). - FM
+ *
+ * Values below 1 yield " a." and values above 18278 yield "zzz.".  One-letter
+ * labels are padded with a leading blank.  The result lives in a static
+ * buffer which is overwritten by the next call.
+ */
+char *LYLowercaseA_OL_String(int seqnum)
+{
+    static char OLstring[8];
+    const int base = 96;	/* code just below the first letter */
+    char *out = OLstring;
+
+    if (seqnum >= 18279) {
+	strcpy(OLstring, "zzz.");
+	return OLstring;
+    }
+
+    if (seqnum >= 703) {
+	/* three letters */
+	const int hi = (seqnum - 27) / 676;
+
+	*out++ = (char) (hi + base);
+	*out++ = (char) ((seqnum - hi * 676 - 1) / 26 + base);
+	*out++ = (char) ((seqnum - 1) % 26 + 1 + base);
+    } else if (seqnum >= 27) {
+	/* two letters */
+	*out++ = (char) ((seqnum - 1) / 26 + base);
+	*out++ = (char) ((seqnum - 1) % 26 + 1 + base);
+    } else {
+	/* one letter, blank-padded; anything below 1 counts as 1 */
+	*out++ = ' ';
+	*out++ = (char) ((seqnum <= 1 ? 1 : seqnum) + base);
+    }
+    *out++ = '.';
+    *out = '\0';
+    return OLstring;
+}
+
+/*
+ * This function returns OL TYPE="I" strings in the
+ * range of " I." (1) to "MMM." (3000).- FM
+ * Maximum length: 16 -TD
+ *
+ * Numbers written with a single Roman symbol (1, 5, 10, 50, 100, 500, 1000)
+ * get a leading blank.  Anything from 3000 up yields "MMM.", and values
+ * below 1 yield just ".".  The result lives in a static buffer which is
+ * overwritten by the next call.
+ */
+char *LYUppercaseI_OL_String(int seqnum)
+{
+    /* symbols in descending order, subtractive pairs included */
+    static const struct {
+	int value;
+	const char *digits;
+    } roman[] = {
+	{ 1000, "M" },
+	{ 900, "CM" },
+	{ 500, "D" },
+	{ 400, "CD" },
+	{ 100, "C" },
+	{ 90, "XC" },
+	{ 50, "L" },
+	{ 40, "XL" },
+	{ 10, "X" },
+	{ 9, "IX" },
+	{ 5, "V" },
+	{ 4, "IV" },
+	{ 1, "I" }
+    };
+    static char OLstring[20];
+    const size_t limit = sizeof(roman) / sizeof(roman[0]);
+    int rest = seqnum;
+    size_t n;
+
+    if (seqnum >= 3000) {
+	strcpy(OLstring, "MMM.");
+	return OLstring;
+    }
+
+    /* a value that is exactly one single-letter symbol is blank-padded */
+    for (n = 0; n < limit; n++) {
+	if (roman[n].value == seqnum && roman[n].digits[1] == '\0') {
+	    OLstring[0] = ' ';
+	    OLstring[1] = roman[n].digits[0];
+	    OLstring[2] = '.';
+	    OLstring[3] = '\0';
+	    return OLstring;
+	}
+    }
+
+    /* greedy conversion: keep taking the largest symbol that still fits */
+    OLstring[0] = '\0';
+    for (n = 0; n < limit; n++) {
+	while (rest >= roman[n].value) {
+	    strcat(OLstring, roman[n].digits);
+	    rest -= roman[n].value;
+	}
+    }
+    strcat(OLstring, ".");
+
+    return OLstring;
+}
+
+/*
+ * This function returns OL TYPE="i" strings in
+ * range of " i." (1) to "mmm." (3000).- FM
+ * Maximum length: 16 -TD
+ *
+ * Numbers written with a single Roman symbol (1, 5, 10, 50, 100, 500, 1000)
+ * get a leading blank.  Anything from 3000 up yields "mmm.", and values
+ * below 1 yield just ".".  The result lives in a static buffer which is
+ * overwritten by the next call.
+ */
+char *LYLowercaseI_OL_String(int seqnum)
+{
+    /* symbols in descending order, subtractive pairs included */
+    static const struct {
+	int value;
+	const char *digits;
+    } roman[] = {
+	{ 1000, "m" },
+	{ 900, "cm" },
+	{ 500, "d" },
+	{ 400, "cd" },
+	{ 100, "c" },
+	{ 90, "xc" },
+	{ 50, "l" },
+	{ 40, "xl" },
+	{ 10, "x" },
+	{ 9, "ix" },
+	{ 5, "v" },
+	{ 4, "iv" },
+	{ 1, "i" }
+    };
+    static char OLstring[20];
+    const size_t limit = sizeof(roman) / sizeof(roman[0]);
+    int rest = seqnum;
+    size_t n;
+
+    if (seqnum >= 3000) {
+	strcpy(OLstring, "mmm.");
+	return OLstring;
+    }
+
+    /* a value that is exactly one single-letter symbol is blank-padded */
+    for (n = 0; n < limit; n++) {
+	if (roman[n].value == seqnum && roman[n].digits[1] == '\0') {
+	    OLstring[0] = ' ';
+	    OLstring[1] = roman[n].digits[0];
+	    OLstring[2] = '.';
+	    OLstring[3] = '\0';
+	    return OLstring;
+	}
+    }
+
+    /* greedy conversion: keep taking the largest symbol that still fits */
+    OLstring[0] = '\0';
+    for (n = 0; n < limit; n++) {
+	while (rest >= roman[n].value) {
+	    strcat(OLstring, roman[n].digits);
+	    rest -= roman[n].value;
+	}
+    }
+    strcat(OLstring, ".");
+
+    return OLstring;
+}
+
+/*
+ * This function initializes the Ordered List counter. - FM
+ *
+ * All 12 OL_Counter slots are set to OL_VOID and all 12 OL_Type slots to
+ * '1'; Last_OL_Count becomes 0 and Last_OL_Type '1'.  A null 'me' is
+ * ignored.
+ */
+void LYZero_OL_Counter(HTStructured * me)
+{
+    if (me) {
+	int level = 12;
+
+	while (level-- > 0) {
+	    me->OL_Type[level] = '1';
+	    me->OL_Counter[level] = OL_VOID;
+	}
+
+	me->Last_OL_Type = '1';
+	me->Last_OL_Count = 0;
+    }
+}
+
+/*
+ * This function is used by the HTML Structured object. - KW
+ *
+ * It loads me->UCLYhndl and me->UCI from the STRUCTURED stage of the node
+ * anchor.  If that stage has no charset handle yet, one is set first: the
+ * HTEXT stage's handle is used, or current_char_set (which is then also
+ * stored for the HTEXT stage) if that is missing too.
+ */
+void LYGetChartransInfo(HTStructured * me)
+{
+    me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
+					UCT_STAGE_STRUCTURED);
+
+    if (me->UCLYhndl < 0) {
+	int fallback = HTAnchor_getUCLYhndl(me->node_anchor, UCT_STAGE_HTEXT);
+
+	if (fallback < 0) {
+	    /* nothing recorded for HTEXT either: use the display charset */
+	    fallback = current_char_set;
+	    HTAnchor_setUCInfoStage(me->node_anchor, fallback,
+				    UCT_STAGE_HTEXT,
+				    UCT_SETBY_STRUCTURED);
+	}
+	HTAnchor_setUCInfoStage(me->node_anchor, fallback,
+				UCT_STAGE_STRUCTURED,
+				UCT_SETBY_STRUCTURED);
+	/* re-read the handle now that the stage has been set */
+	me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
+					    UCT_STAGE_STRUCTURED);
+    }
+
+    me->UCI = HTAnchor_getUCInfoStage(me->node_anchor,
+				      UCT_STAGE_STRUCTURED);
+}
+
+ /*
+  * Upper-case hexadecimal digits, indexable by value 0..15.
+  * As in HTParse.c, saves some calls - kw
+  */
+static const char *hex = "0123456789ABCDEF";
+
+/*
+ * Any raw 8-bit or multibyte characters already have been
+ * handled in relation to the display character set
+ * in SGML_character(), including named and numeric entities.
+ *
+ * This function is used to translate HTML special fields inside tags
+ * (ALT=, VALUE=, etc.) from charset `cs_from' to charset `cs_to'.
+ * It also unescapes non-ASCII characters from URLs (#fragments!)
+ * if st_URL is active.
+ *
+ * If `do_ent' is YES, it converts named entities
+ * and numeric character references (NCRs) to their `cs_to' replacements.
+ *
+ * Named entities converted to unicodes. NCRs (unicodes) converted
+ * by UCdomap.c chartrans functions.
+ * ???NCRs with values in the ISO-8859-1 range 160-255 may be converted
+ * to their HTML entity names (via old-style entities) and then translated
+ * according to the LYCharSets.c array for `cs_out'???.
+ *
+ * Some characters (see descriptions in `put_special_unicodes' from SGML.c)
+ * translated in relation with the state of boolean variables
+ * `use_lynx_specials', `plain_space' and `hidden'. It is not clear yet:
+ *
+ * If plain_space is TRUE, nbsp (160) will be treated as an ASCII
+ * space (32). If hidden is TRUE, entities will be translated
+ * (if `do_ent' is YES) but escape sequences will be passed unaltered.
+ * If `hidden' is FALSE, some characters are converted to Lynx special
+ * codes (see `put_special_unicodes') or ASCII space if `plain_space'
+ * applies). @@ is `use_lynx_specials' needed, does it have any effect? @@
+ * If `use_lynx_specials' is YES, translate byte values 160 and 173
+ * meaning U+00A0 and U+00AD given as or converted from raw char input
+ * are converted to HT_NON_BREAK_SPACE and LY_SOFT_HYPHEN, respectively
+ * (unless input and output charset are both iso-8859-1, for compatibility
+ * with previous usage in HTML.c) even if `hidden' or `plain_space' is set.
+ *
+ * If `Back' is YES, the reverse is done instead i.e., Lynx special codes
+ * in the input are translated back to character values.
+ *
+ * If `Back' is YES, an attempt is made to use UCReverseTransChar() for
+ * back translation which may be more efficient. (?)
+ *
+ * If `stype' is st_URL, non-ASCII characters are URL-encoded instead.
+ * The sequence of bytes being URL-encoded is the raw input character if
+ * we couldn't translate it from `cs_in' (CJK etc.); otherwise it is the
+ * UTF-8 representation if either `cs_to' requires this or if the
+ * character's Unicode value is > 255, otherwise it should be the iso-8859-1
+ * representation.
+ * No general URL-encoding occurs for displayable ASCII characters and
+ * spaces and some C0 controls valid in HTML (LF, TAB), it is expected
+ * that other functions will take care of that as appropriate.
+ *
+ * Escape characters (0x1B, '\033') are
+ * - URL-encoded if `stype' is st_URL, otherwise
+ * - dropped if `stype' is st_other, otherwise (i.e., st_HTML)
+ * - passed if `hidden' is TRUE or HTCJK is set, otherwise
+ * - dropped.
+ *
+ * (If `stype' is st_URL or st_other most of the parameters really predefined:
+ * cs_from=cs_to, use_lynx_specials=plain_space=NO, and hidden=YES)
+ *
+ *
+ * Returns pointer to the char** passed in
+ * if string translated or translation unnecessary,
+ * NULL otherwise
+ * (in which case something probably went wrong.)
+ *
+ *
+ * In general, this somewhat ugly function (KW)
+ * covers three functions from v.2.7.2 (FM):
+ * extern void LYExpandString (
+ * HTStructured * me,
+ * char ** str);
+ * extern void LYUnEscapeEntities (
+ * HTStructured * me,
+ * char ** str);
+ * extern void LYUnEscapeToLatinOne (
+ * HTStructured * me,
+ * char ** str,
+ * BOOLEAN isURL);
+ */
+
+char **LYUCFullyTranslateString(char **str,
+ int cs_from,
+ int cs_to,
+ int do_ent,
+ int use_lynx_specials,
+ int plain_space,
+ int hidden,
+ int Back,
+ CharUtil_st stype)
+{
+ char *p;
+ char *q, *qs;
+ HTChunk *chunk = NULL;
+ char *cp = 0;
+ char cpe = 0;
+ char *esc = NULL;
+ char replace_buf[64];
+ int uck;
+ int lowest_8;
+ UCode_t code = 0;
+ BOOL output_utf8 = 0, repl_translated_C0 = 0;
+ size_t len;
+ const char *name = NULL;
+ BOOLEAN no_bytetrans;
+ UCTransParams T;
+ BOOL from_is_utf8 = FALSE;
+ char *puni = 0;
+ enum _state {
+ S_text,
+ S_esc,
+ S_dollar,
+ S_paren,
+ S_nonascii_text,
+ S_dollar_paren,
+ S_trans_byte,
+ S_check_ent,
+ S_ncr,
+ S_check_uni,
+ S_named,
+ S_check_name,
+ S_recover,
+ S_got_oututf8,
+ S_got_outstring,
+ S_put_urlstring,
+ S_got_outchar,
+ S_put_urlchar,
+ S_next_char,
+ S_done
+ } state = S_text;
+ enum _parsing_what {
+ P_text,
+ P_utf8,
+ P_hex,
+ P_decimal,
+ P_named
+ } what = P_text;
+
+#ifdef KANJI_CODE_OVERRIDE
+ static unsigned char sjis_1st = '\0';
+
+ unsigned char sjis_str[3];
+#endif
+
+ /*
+ * Make sure we have a non-empty string. - FM
+ */
+ if (isEmpty(*str))
+ return str;
+
+ /*
+ * FIXME: something's wrong with the limit checks here (clearing the
+ * buffer helps).
+ */
+ memset(replace_buf, 0, sizeof(replace_buf));
+
+ /*
+ * Don't do byte translation if original AND target character sets are both
+ * iso-8859-1 (and we are not called to back-translate), or if we are in
+ * CJK mode.
+ */
+ if (IS_CJK_TTY
+#ifdef EXP_JAPANESEUTF8_SUPPORT
+ && (strcmp(LYCharSet_UC[cs_from].MIMEname, "utf-8") != 0)
+ && (strcmp(LYCharSet_UC[cs_to].MIMEname, "utf-8") != 0)
+#endif
+ ) {
+ no_bytetrans = TRUE;
+ } else if (cs_to <= 0 && cs_from == cs_to && (!Back || cs_to < 0)) {
+ no_bytetrans = TRUE;
+ } else {
+ /* No need to translate or examine the string any further */
+ no_bytetrans = (BOOL) (!use_lynx_specials && !Back &&
+ UCNeedNotTranslate(cs_from, cs_to));
+ }
+ /*
+ * Save malloc/calloc overhead in simple case - kw
+ */
+ if (do_ent && hidden && (stype != st_URL) && (StrChr(*str, '&') == NULL))
+ do_ent = FALSE;
+
+ /* Can't do, caller should figure out what to do... */
+ if (!UCCanTranslateFromTo(cs_from, cs_to)) {
+ if (cs_to < 0)
+ return NULL;
+ if (!do_ent && no_bytetrans)
+ return NULL;
+ no_bytetrans = TRUE;
+ } else if (cs_to < 0) {
+ do_ent = FALSE;
+ }
+
+ if (!do_ent && no_bytetrans)
+ return str;
+ p = *str;
+
+ if (!no_bytetrans) {
+ UCTransParams_clear(&T);
+ UCSetTransParams(&T, cs_from, &LYCharSet_UC[cs_from],
+ cs_to, &LYCharSet_UC[cs_to]);
+ from_is_utf8 = (BOOL) (LYCharSet_UC[cs_from].enc == UCT_ENC_UTF8);
+ output_utf8 = T.output_utf8;
+ repl_translated_C0 = T.repl_translated_C0;
+ puni = p;
+ } else if (do_ent) {
+ output_utf8 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8 ||
+ HText_hasUTF8OutputSet(HTMainText));
+ repl_translated_C0 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_8BIT_C0);
+ }
+
+ lowest_8 = LYlowest_eightbit[cs_to];
+
+ /*
+ * Create a buffer string seven times the length of the original, so we
+ * have plenty of room for expansions. - FM
+ */
+ len = strlen(p) + 16;
+ q = p;
+
+ qs = q;
+
+/* Create the HTChunk only if we need it */
+#define CHUNK (chunk ? chunk : (chunk = HTChunkCreate2(128, len+1)))
+
+#define REPLACE_STRING(s) \
+ if (q != qs) HTChunkPutb(CHUNK, qs, (int) (q - qs)); \
+ HTChunkPuts(CHUNK, s); \
+ qs = q = *str
+
+#define REPLACE_CHAR(c) if (q > p) { \
+ HTChunkPutb(CHUNK, qs, (int) (q - qs)); \
+ qs = q = *str; \
+ *q++ = c; \
+ } else \
+ *q++ = c
+
+ /*
+ * Loop through string, making conversions as needed.
+ *
+ * The while() checks for a non-'\0' char only for the normal text states
+ * since other states may temporarily modify p or *p (which should be
+ * restored before S_done!) - kw
+ */
+ while (*p || (state != S_text && state != S_nonascii_text)) {
+ switch (state) {
+ case S_text:
+ code = UCH(*p);
+#ifdef KANJI_CODE_OVERRIDE
+ if (HTCJK == JAPANESE && last_kcode == SJIS) {
+ if (sjis_1st == '\0' && (IS_SJIS_HI1(code) || IS_SJIS_HI2(code))) {
+ sjis_1st = UCH(code);
+ } else if (sjis_1st && IS_SJIS_LO(code)) {
+ sjis_1st = '\0';
+ } else {
+ if (conv_jisx0201kana && 0xA1 <= code && code <= 0xDF) {
+ sjis_str[2] = '\0';
+ JISx0201TO0208_SJIS(UCH(code),
+ sjis_str, sjis_str + 1);
+ REPLACE_STRING(sjis_str);
+ p++;
+ continue;
+ }
+ }
+ }
+#endif
+ if (*p == '\033') {
+ if ((IS_CJK_TTY && !hidden) || stype != st_HTML) {
+ state = S_esc;
+ if (stype == st_URL) {
+ REPLACE_STRING("%1B");
+ p++;
+ continue;
+ } else if (stype != st_HTML) {
+ p++;
+ continue;
+ } else {
+ *q++ = *p++;
+ continue;
+ }
+ } else if (!hidden) {
+ /*
+ * CJK handling not on, and not a hidden INPUT, so block
+ * escape. - FM
+ */
+ state = S_next_char;
+ } else {
+ state = S_trans_byte;
+ }
+ } else {
+ state = (do_ent ? S_check_ent : S_trans_byte);
+ }
+ break;
+
+ case S_esc:
+ if (*p == '$') {
+ state = S_dollar;
+ *q++ = *p++;
+ continue;
+ } else if (*p == '(') {
+ state = S_paren;
+ *q++ = *p++;
+ continue;
+ } else {
+ state = S_text;
+ }
+ break;
+
+ case S_dollar:
+ if (*p == '@' || *p == 'B' || *p == 'A') {
+ state = S_nonascii_text;
+ *q++ = *p++;
+ continue;
+ } else if (*p == '(') {
+ state = S_dollar_paren;
+ *q++ = *p++;
+ continue;
+ } else {
+ state = S_text;
+ }
+ break;
+
+ case S_dollar_paren:
+ if (*p == 'C') {
+ state = S_nonascii_text;
+ *q++ = *p++;
+ continue;
+ } else {
+ state = S_text;
+ }
+ break;
+
+ case S_paren:
+ if (*p == 'B' || *p == 'J' || *p == 'T') {
+ state = S_text;
+ *q++ = *p++;
+ continue;
+ } else if (*p == 'I') {
+ state = S_nonascii_text;
+ *q++ = *p++;
+ continue;
+ } else {
+ state = S_text;
+ }
+ break;
+
+ case S_nonascii_text:
+ if (*p == '\033') {
+ if ((IS_CJK_TTY && !hidden) || stype != st_HTML) {
+ state = S_esc;
+ if (stype == st_URL) {
+ REPLACE_STRING("%1B");
+ p++;
+ continue;
+ } else if (stype != st_HTML) {
+ p++;
+ continue;
+ }
+ }
+ }
+ *q++ = *p++;
+ continue;
+
+ case S_trans_byte:
+ /* character translation goes here */
+ /*
+ * Don't do anything if we have no string, or if original AND
+ * target character sets are both iso-8859-1, or if we are in CJK
+ * mode.
+ */
+ if (*p == '\0' || no_bytetrans) {
+ state = S_got_outchar;
+ break;
+ }
+
+ if (Back) {
+ int rev_c;
+
+ if ((*p) == HT_NON_BREAK_SPACE ||
+ (*p) == HT_EN_SPACE) {
+ if (plain_space) {
+ code = *p = ' ';
+ state = S_got_outchar;
+ break;
+ } else {
+ code = 160;
+ if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
+ (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) {
+ state = S_got_outchar;
+ break;
+ } else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859
+ || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
+ state = S_check_uni;
+ break;
+ } else {
+ *(unsigned char *) p = UCH(160);
+ }
+ }
+ } else if ((*p) == LY_SOFT_HYPHEN) {
+ code = 173;
+ if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
+ (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) {
+ state = S_got_outchar;
+ break;
+ } else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859
+ || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
+ state = S_check_uni;
+ break;
+ } else {
+ *(unsigned char *) p = UCH(173);
+ }
+#ifdef EXP_JAPANESEUTF8_SUPPORT
+ } else if (output_utf8) {
+ if ((!strcmp(LYCharSet_UC[cs_from].MIMEname, "euc-jp") &&
+ (IS_EUC((unsigned char) (*p),
+ (unsigned char) (*(p + 1))))) ||
+ (!strcmp(LYCharSet_UC[cs_from].MIMEname, "shift_jis") &&
+ (IS_SJIS_2BYTE((unsigned char) (*p),
+ (unsigned char) (*(p + 1)))))) {
+ code = UCTransJPToUni(p, 2, cs_from);
+ p++;
+ state = S_check_uni;
+ break;
+ }
+#endif
+ } else if (code < 127 || T.transp) {
+ state = S_got_outchar;
+ break;
+ }
+ rev_c = UCReverseTransChar(*p, cs_to, cs_from);
+ if (rev_c > 127) {
+ *p = (char) rev_c;
+ code = rev_c;
+ state = S_got_outchar;
+ break;
+ }
+ } else if (code < 127) {
+ state = S_got_outchar;
+ break;
+ }
+
+ if (from_is_utf8) {
+ if (((*p) & 0xc0) == 0xc0) {
+ const char *pq = p;
+
+ puni = p;
+ code = UCGetUniFromUtf8String(&pq);
+ if (code <= 0) {
+ code = UCH(*p);
+ } else {
+ what = P_utf8;
+ puni += (pq - (const char *) p);
+ }
+ }
+ } else if (use_lynx_specials && !Back &&
+ (code == 160 || code == 173) &&
+ (LYCharSet_UC[cs_from].enc == UCT_ENC_8859 ||
+ (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
+ if (code == 160)
+ code = *p = HT_NON_BREAK_SPACE;
+ else if (code == 173)
+ code = *p = LY_SOFT_HYPHEN;
+ state = S_got_outchar;
+ break;
+ } else if (T.trans_to_uni) {
+ code = UCTransToUni(*p, cs_from);
+ if (code <= 0) {
+ /* What else can we do? */
+ code = UCH(*p);
+ }
+ } else if (!T.trans_from_uni) {
+ state = S_got_outchar;
+ break;
+ }
+ /*
+ * Substitute the Lynx special character for 160 (nbsp) or 173 (shy)
+ * if use_lynx_specials is set and we are not translating back.
+ */
+ if (use_lynx_specials && !Back &&
+ (code == 160 || code == 173)) {
+ code = ((code == 160 ? HT_NON_BREAK_SPACE : LY_SOFT_HYPHEN));
+ state = S_got_outchar;
+ break;
+ }
+
+ state = S_check_uni;
+ break;
+
+ case S_check_ent:
+ if (*p == '&') {
+ char *pp = p + 1;
+
+ len = strlen(pp);
+ /*
+ * Check for a numeric entity. - FM
+ */
+ if (*pp == '#' && len > 2 &&
+ (*(pp + 1) == 'x' || *(pp + 1) == 'X') &&
+ UCH(*(pp + 2)) < 127 &&
+ isxdigit(UCH(*(pp + 2)))) {
+ what = P_hex;
+ state = S_ncr;
+ } else if (*pp == '#' && len > 2 &&
+ UCH(*(pp + 1)) < 127 &&
+ isdigit(UCH(*(pp + 1)))) {
+ what = P_decimal;
+ state = S_ncr;
+ } else if (UCH(*pp) < 127 &&
+ isalpha(UCH(*pp))) {
+ what = P_named;
+ state = S_named;
+ } else {
+ state = S_trans_byte;
+ }
+ } else {
+ state = S_trans_byte;
+ }
+ break;
+
+ case S_ncr:
+ if (what == P_hex) {
+ p += 3;
+ } else { /* P_decimal */
+ p += 2;
+ }
+ cp = p;
+ while (*p && UCH(*p) < 127 &&
+ (what == P_hex ? isxdigit(UCH(*p)) :
+ isdigit(UCH(*p)))) {
+ p++;
+ }
+ /*
+ * Save the terminator and isolate the digit(s). - FM
+ */
+ cpe = *p;
+ if (*p)
+ *p++ = '\0';
+ /*
+ * Show the numeric entity if the value:
+ * (1) Is greater than 255 and unhandled Unicode.
+ * (2) Is less than 32, and not valid and we don't have HTCJK set.
+ * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set.
+ * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set.
+ */
+ if (UCScanCode(&code, cp, (BOOL) (what == P_hex))) {
+ code = LYcp1252ToUnicode(code);
+ state = S_check_uni;
+ } else {
+ state = S_recover;
+ break;
+ }
+ break;
+
+ case S_check_uni:
+ /*
+ * Show the numeric entity if the value:
+ * (2) Is less than 32, and not valid and we don't have HTCJK set.
+ * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set.
+ * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set.
+ */
+ if ((code < 32 &&
+ code != 9 && code != 10 && code != 13 &&
+ !IS_CJK_TTY) ||
+ (code == 127 &&
+ !(HTPassHighCtrlRaw || IS_CJK_TTY)) ||
+ (code > 127 && code < 160 &&
+ !HTPassHighCtrlNum)) {
+ state = S_recover;
+ break;
+ }
+ /*
+ * Convert the value as an unsigned char, hex escaped if isURL is
+ * set and it's 8-bit, and then recycle the terminator if it is not
+ * a semicolon. - FM
+ */
+ if (code > 159 && stype == st_URL) {
+ state = S_got_oututf8;
+ break;
+ }
+ /*
+ * For 160 (nbsp), use that value if it's a hidden INPUT, otherwise
+ * use an ASCII space (32) if plain_space is TRUE, otherwise use
+ * the Lynx special character. - FM
+ */
+ if (code == 160) {
+ if (plain_space) {
+ code = ' ';
+ state = S_got_outchar;
+ break;
+ } else if (use_lynx_specials) {
+ code = HT_NON_BREAK_SPACE;
+ state = S_got_outchar;
+ break;
+ } else if ((hidden && !Back)
+ || (LYCharSet_UC[cs_to].codepoints & UCT_CP_SUPERSETOF_LAT1)
+ || LYCharSet_UC[cs_to].enc == UCT_ENC_8859
+ || (LYCharSet_UC[cs_to].like8859 &
+ UCT_R_8859SPECL)) {
+ state = S_got_outchar;
+ break;
+ } else if (
+ (LYCharSet_UC[cs_to].repertoire & UCT_REP_SUPERSETOF_LAT1)) {
+ ; /* nothing, may be translated later */
+ } else {
+ code = ' ';
+ state = S_got_outchar;
+ break;
+ }
+ }
+ /*
+ * For 173 (shy), use that value if it's a hidden INPUT, otherwise
+ * ignore it if plain_space is TRUE, otherwise use the Lynx special
+ * character. - FM
+ */
+ if (code == 173) {
+ if (plain_space) {
+ replace_buf[0] = '\0';
+ state = S_got_outstring;
+ break;
+ } else if (Back &&
+ !(LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
+ (LYCharSet_UC[cs_to].like8859 &
+ UCT_R_8859SPECL))) {
+ ; /* nothing, may be translated later */
+ } else if (hidden || Back) {
+ state = S_got_outchar;
+ break;
+ } else if (use_lynx_specials) {
+ code = LY_SOFT_HYPHEN;
+ state = S_got_outchar;
+ break;
+ }
+ }
+ /*
+ * Seek a translation from the chartrans tables.
+ */
+ if ((uck = UCTransUniChar(code,
+ cs_to)) >= 32 &&
+ uck < 256 &&
+ (uck < 127 || uck >= lowest_8)) {
+ code = uck;
+ state = S_got_outchar;
+ break;
+ } else if ((uck == -4 ||
+ (repl_translated_C0 &&
+ uck > 0 && uck < 32)) &&
+ /*
+ * Not found; look for replacement string.
+ */
+ UCTransUniCharStr(replace_buf,
+ 60, code,
+ cs_to,
+ 0) >= 0) {
+ state = S_got_outstring;
+ break;
+ }
+ if (output_utf8 &&
+ code > 127 && code < 0x7fffffffL) {
+ state = S_got_oututf8;
+ break;
+ }
+ /*
+ * For 8194 (ensp), 8195 (emsp), or 8201 (thinsp), use the
+ * character reference if it's a hidden INPUT, otherwise use an
+ * ASCII space (32) if plain_space is TRUE, otherwise use the Lynx
+ * special character. - FM
+ */
+ if (code == 8194 || code == 8195 || code == 8201) {
+ if (hidden) {
+ state = S_recover;
+ } else if (plain_space) {
+ code = ' ';
+ state = S_got_outchar;
+ } else {
+ code = HT_EN_SPACE;
+ state = S_got_outchar;
+ }
+ break;
+ /*
+ * Ignore 8204 (zwnj), 8205 (zwj) 8206 (lrm), and 8207 (rlm),
+ * for now, if we got this far without finding a representation
+ * for them.
+ */
+ } else if (code == 8204 || code == 8205 ||
+ code == 8206 || code == 8207) {
+ CTRACE((tfp, "LYUCFullyTranslateString: Ignoring '%"
+ PRI_UCode_t "'.\n", code));
+ replace_buf[0] = '\0';
+ state = S_got_outstring;
+ break;
+ /*
+ * Show the numeric entity if the value: (1) Is greater than
+ * 255 and unhandled Unicode.
+ */
+ } else if (code > 255) {
+ /*
+ * Illegal or not yet handled value. Return "&#" verbatim and
+ * continue from there. - FM
+ */
+ state = S_recover;
+ break;
+ /*
+ * If it's ASCII, or is 8-bit but HTPassEightBitNum is set or
+ * the character set is "ISO Latin 1", use its value. - FM
+ */
+ } else if (code < 161 ||
+ (code < 256 &&
+ (HTPassEightBitNum || cs_to == LATIN1))) {
+ /*
+ * No conversion needed.
+ */
+ state = S_got_outchar;
+ break;
+
+ /* The following disabled section doesn't make sense any more.
+ * It used to make sense in the past, when S_check_named would
+ * look in "old style" tables in addition to what it does now.
+ * Disabling of going to S_check_name here prevents endless
+ * looping between S_check_uni and S_check_names states, which
+ * could occur here for Latin 1 codes for some cs_to if they
+ * had no translation in that cs_to. Normally all cs_to
+ * *should* now have valid translations via UCTransUniChar or
+ * UCTransUniCharStr for all Latin 1 codes, so that we would
+ * not get here anyway, and no loop could occur. Still, if we
+ * *do* get here, FALL THROUGH to case S_recover now. - kw
+ */
+#if 0
+ /*
+ * If we get to here, convert and handle the character as a
+ * named entity. - FM
+ */
+ } else {
+ name = HTMLGetEntityName(code - 160);
+ state = S_check_name;
+ break;
+#endif
+ }
+ /* FALLTHRU */
+
+ case S_recover:
+ if (what == P_decimal || what == P_hex) {
+ /*
+ * Illegal or not yet handled value. Return "&#" verbatim and
+ * continue from there. - FM
+ */
+ *q++ = '&';
+ *q++ = '#';
+ if (what == P_hex)
+ *q++ = 'x';
+ if (cpe != '\0')
+ *(p - 1) = cpe;
+ p = cp;
+ state = S_done;
+ } else if (what == P_named) {
+ *cp = cpe;
+ *q++ = '&';
+ state = S_done;
+ } else if (!T.output_utf8 && stype == st_HTML && !hidden &&
+ !(HTPassEightBitRaw &&
+ UCH(*p) >= lowest_8)) {
+ sprintf(replace_buf, "U%.2" PRI_UCode_t "", code);
+
+ state = S_got_outstring;
+ } else {
+ puni = p;
+ code = UCH(*p);
+ state = S_got_outchar;
+ }
+ break;
+
+ case S_named:
+ cp = ++p;
+ while (*cp && UCH(*cp) < 127 &&
+ isalnum(UCH(*cp)))
+ cp++;
+ cpe = *cp;
+ *cp = '\0';
+ name = p;
+ state = S_check_name;
+ break;
+
+ case S_check_name:
+ /*
+ * Seek the Unicode value for the named entity.
+ *
+ * !!!! We manually recover the case of '=' terminator which is
+ * commonly found on query to CGI-scripts enclosed as href= URLs
+ * like "somepath/?x=1&yz=2" Without this dirty fix, submission of
+ * such URLs was broken if &yz string happened to be a recognized
+ * entity name. - LP
+ */
+ if (((code = HTMLGetEntityUCValue(name)) > 0) &&
+ !((cpe == '=') && (stype == st_URL))) {
+ state = S_check_uni;
+ break;
+ }
+ /*
+ * Didn't find the entity. Return verbatim.
+ */
+ state = S_recover;
+ break;
+
+ /* * * O U T P U T S T A T E S * * */
+
+ case S_got_oututf8:
+ if (code > 255 ||
+ (code >= 128 && LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8)) {
+ UCConvertUniToUtf8(code, replace_buf);
+ state = S_got_outstring;
+ } else {
+ state = S_got_outchar;
+ }
+ break;
+ case S_got_outstring:
+ if (what == P_decimal || what == P_hex) {
+ if (cpe != ';' && cpe != '\0')
+ *(--p) = cpe;
+ p--;
+ } else if (what == P_named) {
+ *cp = cpe;
+ p = (*cp != ';') ? (cp - 1) : cp;
+ } else if (what == P_utf8) {
+ p = puni;
+ }
+ if (replace_buf[0] == '\0') {
+ state = S_next_char;
+ break;
+ }
+ if (stype == st_URL) {
+ code = replace_buf[0]; /* assume string OK if first char is */
+ if (code >= 127 ||
+ (code < 32 && (code != 9 && code != 10 && code != 0))) {
+ state = S_put_urlstring;
+ break;
+ }
+ }
+ REPLACE_STRING(replace_buf);
+ state = S_next_char;
+ break;
+ case S_put_urlstring:
+ esc = HTEscape(replace_buf, URL_XALPHAS);
+ REPLACE_STRING(esc);
+ FREE(esc);
+ state = S_next_char;
+ break;
+ case S_got_outchar:
+ if (what == P_decimal || what == P_hex) {
+ if (cpe != ';' && cpe != '\0')
+ *(--p) = cpe;
+ p--;
+ } else if (what == P_named) {
+ *cp = cpe;
+ p = (*cp != ';') ? (cp - 1) : cp;
+ } else if (what == P_utf8) {
+ p = puni;
+ }
+ if (stype == st_URL &&
+ /* Not a full HTEscape, only for 8bit and ctrl chars */
+ (TOASCII(code) >= 127 || /* S/390 -- gil -- 1925 */
+ (code < ' ' && (code != '\t' && code != '\n')))) {
+ state = S_put_urlchar;
+ break;
+ } else if (!hidden && code == 10 && *p == 10
+ && q != qs && *(q - 1) == 13) {
+ /*
+ * If this is not a hidden string, and the current char is the
+ * LF ('\n') of a CRLF pair, drop the CR ('\r'). - KW
+ */
+ *(q - 1) = *p++;
+ state = S_done;
+ break;
+ }
+ *q++ = (char) code;
+ state = S_next_char;
+ break;
+ case S_put_urlchar:
+ *q++ = '%';
+ REPLACE_CHAR(hex[(TOASCII(code) >> 4) & 15]); /* S/390 -- gil -- 1944 */
+ REPLACE_CHAR(hex[(TOASCII(code) & 15)]);
+ /* fall through */
+ case S_next_char:
+ p++; /* fall through */
+ case S_done:
+ state = S_text;
+ what = P_text;
+ /* for next round */
+ }
+ }
+
+ *q = '\0';
+ if (chunk) {
+ HTChunkPutb(CHUNK, qs, (int) (q - qs + 1)); /* also terminates */
+ if (stype == st_URL || stype == st_other) {
+ LYTrimHead(chunk->data);
+ LYTrimTail(chunk->data);
+ }
+ StrAllocCopy(*str, chunk->data);
+ HTChunkFree(chunk);
+ } else {
+ if (stype == st_URL || stype == st_other) {
+ LYTrimHead(qs);
+ LYTrimTail(qs);
+ }
+ }
+ return str;
+}
+
+#undef REPLACE_CHAR
+#undef REPLACE_STRING
+
+/*
+ * Convenience wrapper around LYUCFullyTranslateString() for strings taken
+ * from HTML markup.  The caller's use_lynx_specials, plain_space, hidden and
+ * stype settings are passed through unchanged; the fourth argument is fixed
+ * at TRUE and the eighth at NO.
+ *
+ * Returns YES if the translator handed back a non-NULL pointer, otherwise NO.
+ * Note that *str may be reallocated even if cs_to == 0.
+ */
+BOOL LYUCTranslateHTMLString(char **str,
+                             int cs_from,
+                             int cs_to,
+                             int use_lynx_specials,
+                             int plain_space,
+                             int hidden,
+                             CharUtil_st stype)
+{
+    if (LYUCFullyTranslateString(str, cs_from, cs_to, TRUE,
+                                 use_lynx_specials, plain_space, hidden,
+                                 NO, stype)) {
+        return YES;
+    }
+    return NO;
+}
+
+/*
+ * Convenience wrapper around LYUCFullyTranslateString() for form data that
+ * is translated "back": the fourth argument is FALSE, use_lynx_specials is
+ * NO, hidden is YES, the eighth argument is YES and the string type is
+ * st_HTML.  Only plain_space is chosen by the caller.
+ *
+ * Returns non-zero if the translator handed back a non-NULL pointer.
+ * Note that *str may be reallocated.
+ */
+BOOL LYUCTranslateBackFormData(char **str,
+                               int cs_from,
+                               int cs_to,
+                               int plain_space)
+{
+    char **translated = LYUCFullyTranslateString(str, cs_from, cs_to, FALSE,
+                                                 NO, plain_space, YES,
+                                                 YES, st_HTML);
+    BOOL succeeded = (BOOL) (translated != NULL);
+
+    return succeeded;
+}
+
+/*
+ * Parse a parameter from an HTML META tag, i.e., the CONTENT.
+ *
+ * Looks in 'from' for a ';'-introduced parameter whose text starts with
+ * 'name' (compared without regard to case), skips any blanks and '=' that
+ * follow the name, and returns a newly allocated copy of the value, cut off
+ * at the first character that is not printable or is whitespace.  A value
+ * longer than two characters that is wrapped in single quotes has those
+ * quotes removed.
+ *
+ * Returns NULL if no such parameter is found; otherwise the caller owns the
+ * returned string.
+ */
+char *LYParseTagParam(char *from,
+                      const char *name)
+{
+    const size_t name_len = strlen(name);
+    char *scan = from;
+    char *param = NULL;
+    size_t param_len;
+
+    /*
+     * Step from one ';' to the next until the text after it begins with
+     * the wanted name.
+     */
+    for (;;) {
+        scan = StrChr(scan, ';');
+        if (scan == NULL)
+            return NULL;
+        while (*scan == ';' || (*scan != '\0' && isspace(UCH(*scan)))) {
+            scan++;
+        }
+        if (strlen(scan) < name_len)
+            return NULL;
+        if (strncasecomp(scan, name, (int) name_len) == 0)
+            break;
+    }
+
+    /*
+     * Move past the name and whatever separates it from its value.
+     */
+    scan += name_len;
+    while (*scan != '\0' && (*scan == '=' || isspace(UCH(*scan)))) {
+        scan++;
+    }
+
+    /*
+     * Copy the remainder, then truncate the copy at the end of the value.
+     */
+    StrAllocCopy(param, scan);
+    for (param_len = 0;
+         isprint(UCH(scan[param_len])) && !isspace(UCH(scan[param_len]));
+         param_len++) {
+        ;
+    }
+    param[param_len] = '\0';
+
+    /*
+     * Strip single quotes, just in case.
+     */
+    if (param_len > 2 && param[0] == '\'' && param[param_len - 1] == '\'') {
+        char *dst = param;
+
+        param[param_len - 1] = '\0';
+        /* shift everything, terminator included, one place to the left */
+        do {
+            dst[0] = dst[1];
+        } while (*dst++ != '\0');
+    }
+    return param;
+}
+
+/*
+ * Split a Refresh CONTENT string into its delay and its target.
+ *
+ * *p_seconds receives a newly allocated copy of the run of digits found
+ * after any leading blanks, or NULL if the content does not start with a
+ * digit.  *p_address receives whatever LYParseTagParam() returns for the
+ * "URL" parameter, which may be NULL.  The rest of the content is ignored.
+ */
+void LYParseRefreshURL(char *content,
+                       char **p_seconds,
+                       char **p_address)
+{
+    char *delay = NULL;
+    char *digits = LYSkipBlanks(content);
+
+    /*
+     * Look for the Seconds field. - FM
+     */
+    if (*digits != '\0' && isdigit(UCH(*digits))) {
+        char *past = digits;
+
+        do {
+            past++;
+        } while (*past != '\0' && isdigit(UCH(*past)));
+        StrnAllocCopy(delay, digits, (size_t) (past - digits));
+    }
+
+    *p_seconds = delay;
+    *p_address = LYParseTagParam(content, "URL");
+
+    CTRACE((tfp,
+            "LYParseRefreshURL\n\tcontent: %s\n\tseconds: %s\n\taddress: %s\n",
+            content, NonNull(*p_seconds), NonNull(*p_address)));
+}
+
+/*
+ * This function processes META tags in HTML streams. - FM
+ *
+ * The HTTP-EQUIV, NAME, CONTENT and CHARSET attributes are copied out of
+ * 'value' (when flagged in 'present'), and then:
+ *   - a CHARSET attribute, or a Content-Type CONTENT containing "charset",
+ *     is used to set up character translation, unless the anchor already
+ *     has a charset;
+ *   - Pragma / Cache-Control and Expires directives may set the document's
+ *     no_cache flag;
+ *   - Refresh emits a "REFRESH(n sec):" line with a link to the target;
+ *   - Content-Disposition may supply a suggested filename;
+ *   - Set-Cookie is passed to LYSetCookie() for http and https documents.
+ *
+ * Nothing is done if 'me' or 'present' is NULL.  Every string allocated
+ * here is released before returning.
+ */
+void LYHandleMETA(HTStructured * me, const BOOL *present,
+                  STRING2PTR value,
+                  char **include GCC_UNUSED)
+{
+    char *http_equiv = NULL, *name = NULL, *content = NULL, *charset = NULL;
+    char *href = NULL, *id_string = NULL, *temp = NULL;
+    char *cp, *cp0, *cp1 = NULL;
+    int url_type = 0;
+
+    if (!me || !present)
+        return;
+
+    /*
+     * Load the attributes for possible use by Lynx. - FM
+     */
+    if (present[HTML_META_HTTP_EQUIV] &&
+        non_empty(value[HTML_META_HTTP_EQUIV])) {
+        StrAllocCopy(http_equiv, value[HTML_META_HTTP_EQUIV]);
+        convert_to_spaces(http_equiv, TRUE);
+        LYUCTranslateHTMLString(&http_equiv, me->tag_charset, me->tag_charset,
+                                NO, NO, YES, st_other);
+        if (*http_equiv == '\0') {
+            FREE(http_equiv);
+        }
+    }
+    if (present[HTML_META_NAME] &&
+        non_empty(value[HTML_META_NAME])) {
+        StrAllocCopy(name, value[HTML_META_NAME]);
+        convert_to_spaces(name, TRUE);
+        LYUCTranslateHTMLString(&name, me->tag_charset, me->tag_charset,
+                                NO, NO, YES, st_other);
+        if (*name == '\0') {
+            FREE(name);
+        }
+    }
+    if (present[HTML_META_CONTENT] &&
+        non_empty(value[HTML_META_CONTENT])) {
+        /*
+         * Technically, we should be creating a comma-separated list, but META
+         * tags come one at a time, and we'll handle (or ignore) them as each
+         * is received.  Also, at this point, we only trim leading and trailing
+         * blanks from the CONTENT value, without translating any named
+         * entities or numeric character references, because how we should do
+         * that depends on what type of information it contains, and whether or
+         * not any of it might be sent to the screen. - FM
+         */
+        StrAllocCopy(content, value[HTML_META_CONTENT]);
+        convert_to_spaces(content, FALSE);
+        LYTrimHead(content);
+        LYTrimTail(content);
+        if (*content == '\0') {
+            FREE(content);
+        }
+    }
+    if (present[HTML_META_CHARSET] &&
+        non_empty(value[HTML_META_CHARSET])) {
+        StrAllocCopy(charset, value[HTML_META_CHARSET]);
+        convert_to_spaces(charset, TRUE);
+        LYUCTranslateHTMLString(&charset, me->tag_charset, me->tag_charset,
+                                NO, NO, YES, st_other);
+        if (*charset == '\0') {
+            FREE(charset);
+        }
+    }
+    CTRACE((tfp,
+            "LYHandleMETA: HTTP-EQUIV=\"%s\" NAME=\"%s\" CONTENT=\"%s\" CHARSET=\"%s\"\n",
+            NONNULL(http_equiv),
+            NONNULL(name),
+            NONNULL(content),
+            NONNULL(charset)));
+
+    /*
+     * Check for a text/html Content-Type with a charset directive, if we
+     * didn't already set the charset via a server's header. - AAC & FM
+     */
+    if (isEmpty(me->node_anchor->charset) &&
+        (charset ||
+         (!strcasecomp(NonNull(http_equiv), "Content-Type") && content))) {
+        LYUCcharset *p_in = NULL;
+        LYUCcharset *p_out = NULL;
+
+        if (charset) {
+            LYLowerCase(charset);
+        } else {
+            LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
+                                    NO, NO, YES, st_other);
+            LYLowerCase(content);
+        }
+
+        if ((cp1 = charset) != NULL ||
+            (cp1 = strstr(content, "charset")) != NULL) {
+            BOOL chartrans_ok = NO;
+            char *cp3 = NULL, *cp4;
+            int chndl;
+
+            /* when found inside CONTENT, step over the word "charset" */
+            if (!charset)
+                cp1 += 7;
+            while (*cp1 == ' ' || *cp1 == '=' || *cp1 == '"')
+                cp1++;
+
+            StrAllocCopy(cp3, cp1);     /* copy to mutilate more */
+            for (cp4 = cp3; (*cp4 != '\0' && *cp4 != '"' &&
+                             *cp4 != ';' && *cp4 != ':' &&
+                             !WHITE(*cp4)); cp4++) {
+                ;               /* do nothing */
+            }
+            *cp4 = '\0';
+            cp4 = cp3;
+            chndl = UCGetLYhndl_byMIME(cp3);
+
+#ifdef CAN_SWITCH_DISPLAY_CHARSET
+            /* Allow a switch to a more suitable display charset */
+            if (Switch_Display_Charset(chndl, SWITCH_DISPLAY_CHARSET_MAYBE)) {
+                /* UCT_STAGE_STRUCTURED and UCT_STAGE_HTEXT
+                   should have the same setting for UCInfoStage. */
+                HTAnchor_getUCInfoStage(me->node_anchor, UCT_STAGE_STRUCTURED);
+
+                me->outUCLYhndl = current_char_set;
+                HTAnchor_setUCInfoStage(me->node_anchor,
+                                        current_char_set,
+                                        UCT_STAGE_HTEXT,
+                                        UCT_SETBY_MIME);        /* highest priority! */
+                HTAnchor_setUCInfoStage(me->node_anchor,
+                                        current_char_set,
+                                        UCT_STAGE_STRUCTURED,
+                                        UCT_SETBY_MIME);        /* highest priority! */
+                me->outUCI = HTAnchor_getUCInfoStage(me->node_anchor,
+                                                     UCT_STAGE_HTEXT);
+                /* The SGML stage will be reset in change_chartrans_handling */
+            }
+#endif
+
+            if (UCCanTranslateFromTo(chndl, current_char_set)) {
+                chartrans_ok = YES;
+                StrAllocCopy(me->node_anchor->charset, cp4);
+                HTAnchor_setUCInfoStage(me->node_anchor, chndl,
+                                        UCT_STAGE_PARSER,
+                                        UCT_SETBY_STRUCTURED);
+            } else if (chndl < 0) {
+                /*
+                 * Got something but we don't recognize it.
+                 */
+                chndl = UCLYhndl_for_unrec;
+                if (chndl < 0)  /* UCLYhndl_for_unrec not defined :-( */
+                    chndl = UCLYhndl_for_unspec;        /* always >= 0 */
+                if (UCCanTranslateFromTo(chndl, current_char_set)) {
+                    chartrans_ok = YES;
+                    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
+                                            UCT_STAGE_PARSER,
+                                            UCT_SETBY_STRUCTURED);
+                }
+            }
+            if (chartrans_ok) {
+                p_in = HTAnchor_getUCInfoStage(me->node_anchor,
+                                               UCT_STAGE_PARSER);
+                p_out = HTAnchor_setUCInfoStage(me->node_anchor,
+                                                current_char_set,
+                                                UCT_STAGE_HTEXT,
+                                                UCT_SETBY_DEFAULT);
+                if (!p_out) {
+                    /*
+                     * Try again.
+                     */
+                    p_out = HTAnchor_getUCInfoStage(me->node_anchor,
+                                                    UCT_STAGE_HTEXT);
+                }
+                if (!strcmp(p_in->MIMEname, "x-transparent")) {
+                    HTPassEightBitRaw = TRUE;
+                    HTAnchor_setUCInfoStage(me->node_anchor,
+                                            HTAnchor_getUCLYhndl(me->node_anchor,
+                                                                 UCT_STAGE_HTEXT),
+                                            UCT_STAGE_PARSER,
+                                            UCT_SETBY_DEFAULT);
+                }
+                if (!strcmp(p_out->MIMEname, "x-transparent")) {
+                    HTPassEightBitRaw = TRUE;
+                    HTAnchor_setUCInfoStage(me->node_anchor,
+                                            HTAnchor_getUCLYhndl(me->node_anchor,
+                                                                 UCT_STAGE_PARSER),
+                                            UCT_STAGE_HTEXT,
+                                            UCT_SETBY_DEFAULT);
+                }
+                if ((p_in->enc != UCT_ENC_CJK)
+#ifdef EXP_JAPANESEUTF8_SUPPORT
+                    && (p_in->enc != UCT_ENC_UTF8)
+#endif
+                    ) {
+                    HTCJK = NOCJK;
+                    if (!(p_in->codepoints &
+                          UCT_CP_SUBSETOF_LAT1) &&
+                        chndl == current_char_set) {
+                        HTPassEightBitRaw = TRUE;
+                    }
+                } else if (p_out->enc == UCT_ENC_CJK) {
+                    Set_HTCJK(p_in->MIMEname, p_out->MIMEname);
+                }
+                LYGetChartransInfo(me);
+                /*
+                 * Update the chartrans info homologously to a Content-Type
+                 * MIME header with a charset parameter. - FM
+                 */
+                if (me->UCLYhndl != chndl) {
+                    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
+                                            UCT_STAGE_MIME,
+                                            UCT_SETBY_STRUCTURED);
+                    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
+                                            UCT_STAGE_PARSER,
+                                            UCT_SETBY_STRUCTURED);
+                    me->inUCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
+                                                          UCT_STAGE_PARSER);
+                    me->inUCI = HTAnchor_getUCInfoStage(me->node_anchor,
+                                                        UCT_STAGE_PARSER);
+                }
+                UCSetTransParams(&me->T,
+                                 me->inUCLYhndl, me->inUCI,
+                                 me->outUCLYhndl, me->outUCI);
+            } else {
+                /*
+                 * Cannot translate.  If according to some heuristic the given
+                 * charset and the current display character both are likely to
+                 * be like ISO-8859 in structure, pretend we have some kind of
+                 * match.
+                 */
+                BOOL given_is_8859 = (BOOL) (!StrNCmp(cp4, "iso-8859-", 9) &&
+                                             isdigit(UCH(cp4[9])));
+                BOOL given_is_8859like = (BOOL) (given_is_8859
+                                                 || !StrNCmp(cp4, "windows-", 8)
+                                                 || !StrNCmp(cp4, "cp12", 4)
+                                                 || !StrNCmp(cp4, "cp-12", 5));
+                BOOL given_and_display_8859like = (BOOL) (given_is_8859like &&
+                                                          (strstr(LYchar_set_names[current_char_set],
+                                                                  "ISO-8859") ||
+                                                           strstr(LYchar_set_names[current_char_set],
+                                                                  "windows-")));
+
+                if (given_is_8859) {
+                    /* cut the name off after the digits of "iso-8859-N" */
+                    cp1 = &cp4[10];
+                    while (*cp1 &&
+                           isdigit(UCH((*cp1))))
+                        cp1++;
+                    *cp1 = '\0';
+                }
+                if (given_and_display_8859like) {
+                    StrAllocCopy(me->node_anchor->charset, cp4);
+                    HTPassEightBitRaw = TRUE;
+                }
+                HTAlert(*cp4 ? cp4 : me->node_anchor->charset);
+
+            }
+            FREE(cp3);
+
+            if (me->node_anchor->charset) {
+                CTRACE((tfp,
+                        "LYHandleMETA: New charset: %s\n",
+                        me->node_anchor->charset));
+            }
+        }
+        /*
+         * Set the kcode element based on the charset. - FM
+         */
+        HText_setKcode(me->text, me->node_anchor->charset, p_in);
+    }
+
+    /*
+     * Make sure we have META name/value pairs to handle. - FM
+     */
+    if (!(http_equiv || name) || !content)
+        goto free_META_copies;
+
+    /*
+     * Check for a no-cache Pragma
+     * or Cache-Control directive. - FM
+     */
+    if (!strcasecomp(NonNull(http_equiv), "Pragma") ||
+        !strcasecomp(NonNull(http_equiv), "Cache-Control")) {
+        LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
+                                NO, NO, YES, st_other);
+        if (!strcasecomp(content, "no-cache")) {
+            me->node_anchor->no_cache = TRUE;
+            HText_setNoCache(me->text);
+        }
+
+        /*
+         * If we didn't get a Cache-Control MIME header, and the META has one,
+         * convert to lowercase, store it in the anchor element, and if we
+         * haven't yet set no_cache, check whether we should. - FM
+         */
+        if ((!me->node_anchor->cache_control) &&
+            !strcasecomp(NonNull(http_equiv), "Cache-Control")) {
+            LYLowerCase(content);
+            StrAllocCopy(me->node_anchor->cache_control, content);
+            if (me->node_anchor->no_cache == FALSE) {
+                cp0 = content;
+                while ((cp = strstr(cp0, "no-cache")) != NULL) {
+                    cp += 8;
+                    while (*cp != '\0' && WHITE(*cp))
+                        cp++;
+                    if (*cp == '\0' || *cp == ';') {
+                        me->node_anchor->no_cache = TRUE;
+                        HText_setNoCache(me->text);
+                        break;
+                    }
+                    cp0 = cp;
+                }
+                if (me->node_anchor->no_cache == TRUE)
+                    goto free_META_copies;
+                cp0 = content;
+                while ((cp = strstr(cp0, "max-age")) != NULL) {
+                    cp += 7;
+                    while (*cp != '\0' && WHITE(*cp))
+                        cp++;
+                    if (*cp == '=') {
+                        cp++;
+                        while (*cp != '\0' && WHITE(*cp))
+                            cp++;
+                        if (isdigit(UCH(*cp))) {
+                            cp0 = cp;
+                            while (isdigit(UCH(*cp)))
+                                cp++;
+                            /* a max-age of exactly "0" means no caching */
+                            if (*cp0 == '0' && cp == (cp0 + 1)) {
+                                me->node_anchor->no_cache = TRUE;
+                                HText_setNoCache(me->text);
+                                break;
+                            }
+                        }
+                    }
+                    cp0 = cp;
+                }
+            }
+        }
+
+        /*
+         * Check for an Expires directive. - FM
+         */
+    } else if (!strcasecomp(NonNull(http_equiv), "Expires")) {
+        /*
+         * If we didn't get an Expires MIME header, store it in the anchor
+         * element, and if we haven't yet set no_cache, check whether we
+         * should.  Note that we don't accept a Date header via META tags,
+         * because it's likely to be untrustworthy, but do check for a Date
+         * header from a server when making the comparison. - FM
+         */
+        LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
+                                NO, NO, YES, st_other);
+        StrAllocCopy(me->node_anchor->expires, content);
+        if (me->node_anchor->no_cache == FALSE) {
+            if (!strcmp(content, "0")) {
+                /*
+                 * The value is zero, which we treat as an absolute no-cache
+                 * directive. - FM
+                 */
+                me->node_anchor->no_cache = TRUE;
+                HText_setNoCache(me->text);
+            } else if (me->node_anchor->date != NULL) {
+                /*
+                 * We have a Date header, so check if the value is less than or
+                 * equal to that. - FM
+                 */
+                if (LYmktime(content, TRUE) <=
+                    LYmktime(me->node_anchor->date, TRUE)) {
+                    me->node_anchor->no_cache = TRUE;
+                    HText_setNoCache(me->text);
+                }
+            } else if (LYmktime(content, FALSE) == 0) {
+                /*
+                 * We don't have a Date header, and the value is in past for
+                 * us. - FM
+                 */
+                me->node_anchor->no_cache = TRUE;
+                HText_setNoCache(me->text);
+            }
+        }
+
+        /*
+         * Check for a Refresh directive. - FM
+         */
+    } else if (!strcasecomp(NonNull(http_equiv), "Refresh")) {
+        char *Seconds = NULL;
+
+        LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
+                                NO, NO, YES, st_other);
+        /*
+         * Note that href may come back allocated even when Seconds does
+         * not; in that case it is released at free_META_copies.
+         */
+        LYParseRefreshURL(content, &Seconds, &href);
+
+        if (Seconds) {
+            if (href) {
+                /*
+                 * We found a URL field, so check it out. - FM
+                 */
+                if (!LYLegitimizeHREF(me, &href, TRUE, FALSE)) {
+                    /*
+                     * The specs require a complete URL, but this is a
+                     * Netscapism, so don't expect the author to know that. -
+                     * FM
+                     */
+                    HTUserMsg(REFRESH_URL_NOT_ABSOLUTE);
+                    /*
+                     * Use the document's address as the base. - FM
+                     */
+                    if (*href != '\0') {
+                        temp = HTParse(href,
+                                       me->node_anchor->address, PARSE_ALL);
+                        StrAllocCopy(href, temp);
+                        FREE(temp);
+                    } else {
+                        StrAllocCopy(href, me->node_anchor->address);
+                        HText_setNoCache(me->text);
+                    }
+
+                } else {
+                    /*
+                     * Check whether to fill in localhost. - FM
+                     */
+                    LYFillLocalFileURL(&href,
+                                       (me->inBASE ?
+                                        me->base_href : me->node_anchor->address));
+                }
+
+                /*
+                 * Set the no_cache flag if the Refresh URL is the same as the
+                 * document's address. - FM
+                 */
+                if (!strcmp(href, me->node_anchor->address)) {
+                    HText_setNoCache(me->text);
+                }
+            } else {
+                /*
+                 * We didn't find a URL field, so use the document's own
+                 * address and set the no_cache flag. - FM
+                 */
+                StrAllocCopy(href, me->node_anchor->address);
+                HText_setNoCache(me->text);
+            }
+            /*
+             * Check for an anchor in http or https URLs. - FM
+             */
+            cp = NULL;
+            /* id_string seems to be used wrong below if given.
+               not that it matters much.  avoid setting it here. - kw */
+            if (track_internal_links &&
+                (StrNCmp(href, "http", 4) == 0) &&
+                (cp = StrChr(href, '#')) != NULL) {
+                StrAllocCopy(id_string, cp);
+                *cp = '\0';
+            }
+            if (me->inA) {
+                /*
+                 * Ugh!  The META tag, which is a HEAD element, is in an
+                 * Anchor, which is BODY element.  All we can do is close the
+                 * Anchor and cross our fingers. - FM
+                 */
+                if (me->inBoldA == TRUE && me->inBoldH == FALSE)
+                    HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
+                me->inBoldA = FALSE;
+                HText_endAnchor(me->text, me->CurrentANum);
+                me->inA = FALSE;
+                me->CurrentANum = 0;
+            }
+            me->CurrentA = HTAnchor_findChildAndLink
+                (
+                    me->node_anchor,    /* Parent */
+                    id_string,  /* Tag */
+                    href,       /* Address */
+                    (HTLinkType *) 0);  /* Type */
+            /* put back the '#' that was overwritten to split off the fragment */
+            if (id_string)
+                *cp = '#';
+            FREE(id_string);
+            LYEnsureSingleSpace(me);
+            if (me->inUnderline == FALSE)
+                HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
+            HTML_put_string(me, "REFRESH(");
+            HTML_put_string(me, Seconds);
+            HTML_put_string(me, " sec):");
+            FREE(Seconds);
+            if (me->inUnderline == FALSE)
+                HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
+            HTML_put_character(me, ' ');
+            me->in_word = NO;
+            HText_beginAnchor(me->text, me->inUnderline, me->CurrentA);
+            if (me->inBoldH == FALSE)
+                HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
+            HTML_put_string(me, href);
+            FREE(href);
+            if (me->inBoldH == FALSE)
+                HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
+            HText_endAnchor(me->text, 0);
+            LYEnsureSingleSpace(me);
+        }
+
+        /*
+         * Check for a suggested filename via a Content-Disposition with a
+         * filename=name.suffix in it, if we don't already have it via a server
+         * header. - FM
+         */
+    } else if (isEmpty(me->node_anchor->SugFname) &&
+               !strcasecomp((http_equiv ?
+                             http_equiv : ""), "Content-Disposition")) {
+        cp = content;
+        while (*cp != '\0' && strncasecomp(cp, "filename", 8))
+            cp++;
+        if (*cp != '\0') {
+            cp = LYSkipBlanks(cp + 8);
+            if (*cp == '=')
+                cp++;
+            cp = LYSkipBlanks(cp);
+            if (*cp != '\0') {
+                StrAllocCopy(me->node_anchor->SugFname, cp);
+                if (*me->node_anchor->SugFname == '"') {
+                    if ((cp = StrChr((me->node_anchor->SugFname + 1),
+                                     '"')) != NULL) {
+                        *(cp + 1) = '\0';
+                        HTMIME_TrimDoubleQuotes(me->node_anchor->SugFname);
+                        if (isEmpty(me->node_anchor->SugFname)) {
+                            FREE(me->node_anchor->SugFname);
+                        }
+                    } else {
+                        /* opening quote without a closing one: discard */
+                        FREE(me->node_anchor->SugFname);
+                    }
+                }
+#if defined(UNIX) && !defined(DOSPATH)
+                /*
+                 * If blanks are not legal for local filenames, replace them
+                 * with underscores.
+                 */
+                if ((cp = me->node_anchor->SugFname) != NULL) {
+                    while (*cp != '\0') {
+                        if (isspace(UCH(*cp)))
+                            *cp = '_';
+                        ++cp;
+                    }
+                }
+#endif
+            }
+        }
+        /*
+         * Check for a Set-Cookie directive. - AK
+         */
+    } else if (!strcasecomp(NonNull(http_equiv), "Set-Cookie")) {
+        /*
+         * This will need to be updated when Set-Cookie/Set-Cookie2 handling is
+         * finalized.  For now, we'll still assume "historical" cookies in META
+         * directives. - FM
+         */
+        url_type = is_url(me->inBASE ?
+                          me->base_href : me->node_anchor->address);
+        if (url_type == HTTP_URL_TYPE || url_type == HTTPS_URL_TYPE) {
+            LYSetCookie(content,
+                        NULL,
+                        (me->inBASE ?
+                         me->base_href : me->node_anchor->address));
+        }
+    }
+
+    /*
+     * Free the copies. - FM
+     */
+  free_META_copies:
+    FREE(http_equiv);
+    FREE(name);
+    FREE(content);
+    FREE(charset);
+    /*
+     * A Refresh CONTENT that carries a URL but no leading delay leaves href
+     * allocated, because the code that consumes and frees it only runs when
+     * a delay was found.  On every other path href is already NULL here.
+     */
+    FREE(href);
+}
+
+/*
+ * Handler for P and P-like elements in HTML streams.  'start' is TRUE for a
+ * start tag and FALSE for an end tag.  Both are handled almost identically,
+ * which can lead to a different number of blank lines between the current
+ * paragraph and subsequent text depending on whether a P end tag is present
+ * in the markup. - FM
+ *
+ * 'align_idx' is the index of the ALIGN attribute within 'present' and
+ * 'value'; a negative index means there is no such attribute to consult.
+ */
+void LYHandlePlike(HTStructured * me, const BOOL *present,
+                   STRING2PTR value,
+                   char **include GCC_UNUSED,
+                   int align_idx,
+                   int start)
+{
+    /*
+     * FIG content should be a true block, which like P inherits the current
+     * style.  APPLET is like character elements or an ALT attribute, unless
+     * its content contains a block element.  Seeing a P inside either one
+     * flags that content to be treated as a block. - FM
+     */
+    if (start && me->inFIG)
+        me->inFIGwithP = TRUE;
+    if (start && me->inAPPLET)
+        me->inAPPLETwithP = TRUE;
+
+    UPDATE_STYLE;
+
+    if (me->List_Nesting_Level >= 0) {
+        /*
+         * Inside a list: P asks for one blank line, if not already present.
+         * Attributes are handled further down, with the "second line"
+         * margins. - FM
+         */
+        if (me->inP) {
+            BOOL want_double = (BOOL) (me->inFIG || me->inAPPLET ||
+                                       me->inCAPTION || me->inCREDIT ||
+                                       me->sp->style->spaceAfter > 0 ||
+                                       (start &&
+                                        me->sp->style->spaceBefore > 0));
+
+            if (want_double)
+                LYEnsureDoubleSpace(me);
+            else
+                LYEnsureSingleSpace(me);
+        }
+    } else if (me->sp[0].tag_number == HTML_ADDRESS) {
+        /*
+         * Inside an ADDRESS: P just starts a new line if one is needed. - FM
+         */
+        if (!HText_LastLineEmpty(me->text, FALSE)) {
+            HText_setLastChar(me->text, ' ');   /* absorb white space */
+            HText_appendCharacter(me->text, '\r');
+        }
+    } else {
+        if (!start) {
+            if (me->sp->style->spaceAfter > 0)
+                LYEnsureDoubleSpace(me);
+            else
+                LYEnsureSingleSpace(me);
+        } else if (me->inP || !me->inLABEL) {
+            HText_appendParagraph(me->text);
+        }
+        me->inLABEL = FALSE;
+    }
+    me->in_word = NO;
+
+    /*
+     * Choose the alignment that applies when no ALIGN attribute is given.
+     */
+    if (LYoverride_default_alignment(me)) {
+        me->sp->style->alignment = LYstyles(me->sp[0].tag_number)->alignment;
+    } else {
+        BOOL div_in_list = (BOOL) (me->List_Nesting_Level >= 0 &&
+                                   (me->sp->style->id == ST_DivCenter ||
+                                    me->sp->style->id == ST_DivLeft ||
+                                    me->sp->style->id == ST_DivRight));
+        BOOL plain_outside_div = (BOOL) ((me->Division_Level < 0) &&
+                                         (me->sp->style->id == ST_Normal ||
+                                          me->sp->style->id == ST_Preformatted));
+
+        if (div_in_list || plain_outside_div)
+            me->sp->style->alignment = HT_LEFT;
+        else
+            me->sp->style->alignment = (short) me->current_default_alignment;
+    }
+
+    /*
+     * An explicit ALIGN on a start tag overrides that choice, except that
+     * "center" and "right" are not honored in a list before the paragraph
+     * has any text.
+     */
+    if (start && align_idx >= 0 &&
+        present && present[align_idx] && value[align_idx]) {
+        const char *wanted = value[align_idx];
+        BOOL may_shift = (BOOL) (me->List_Nesting_Level < 0 || me->inP);
+
+        if (may_shift && !strcasecomp(wanted, "center")) {
+            me->sp->style->alignment = HT_CENTER;
+        } else if (may_shift && !strcasecomp(wanted, "right")) {
+            me->sp->style->alignment = HT_RIGHT;
+        } else if (!strcasecomp(wanted, "left") ||
+                   !strcasecomp(wanted, "justify")) {
+            me->sp->style->alignment = HT_LEFT;
+        }
+    }
+
+    /*
+     * Mark that we are starting a new paragraph and don't have any of its
+     * text yet - FM
+     */
+    me->inP = FALSE;
+}
+
+/*
+ * This function handles SELECT elements in HTML streams.
+ * If start is TRUE it handles a start tag, and if FALSE,
+ * an end tag. - FM
+ *
+ * Start tag: clears me->select_disabled, reports (but does not repair) an
+ * unclosed TEXTAREA, sets me->inSELECT, derives the field name from the
+ * NAME attribute (an empty string if absent or empty), records MULTIPLE
+ * and DISABLED, ends any active bold/underline unless this is a MULTIPLE
+ * select shown as a popup, may force a newline before a popup in
+ * preformatted text, creates an anchor for the ID attribute, and finally
+ * calls HText_beginSelect(). The SIZE attribute is only traced.
+ *
+ * End tag: reports "Bad HTML" and returns if no SELECT is open. Otherwise
+ * it clears me->inSELECT and me->select_disabled, finishes the last
+ * OPTION through HText_setLastOptionValue(), and then either starts a new
+ * line (checkbox group, or popups disabled) or writes the returned text
+ * followed by ']'. Underline and bold are restored afterwards.
+ *
+ * The include argument is not used.
+ */
+void LYHandleSELECT(HTStructured * me, const BOOL *present,
+ STRING2PTR value,
+ char **include GCC_UNUSED,
+ int start)
+{
+ int i;
+
+ if (start == TRUE) {
+ char *name = NULL;
+ BOOLEAN multiple = NO;
+ char *size = NULL; /* never assigned here; passed on as NULL */
+
+ /*
+ * Initialize the disable attribute.
+ */
+ me->select_disabled = FALSE;
+
+ /*
+ * Check for unclosed TEXTAREA. This only reports the problem;
+ * me->inTEXTAREA is left as it was.
+ */
+ if (me->inTEXTAREA) {
+ if (LYBadHTML(me)) {
+ LYShowBadHTML("Bad HTML: Missing TEXTAREA end tag\n");
+ }
+ }
+
+ /*
+ * Set to know we are in a select tag.
+ */
+ me->inSELECT = TRUE;
+
+ if (!(present && present[HTML_SELECT_NAME] &&
+ non_empty(value[HTML_SELECT_NAME]))) {
+ StrAllocCopy(name, ""); /* no usable NAME attribute */
+ } else if (StrChr(value[HTML_SELECT_NAME], '&') == NULL) {
+ StrAllocCopy(name, value[HTML_SELECT_NAME]); /* nothing to unescape */
+ } else {
+ StrAllocCopy(name, value[HTML_SELECT_NAME]);
+ UNESCAPE_FIELDNAME_TO_STD(&name); /* name contains '&' */
+ }
+ if (present && present[HTML_SELECT_MULTIPLE])
+ multiple = YES;
+ if (present && present[HTML_SELECT_DISABLED])
+ me->select_disabled = TRUE;
+ if (present && present[HTML_SELECT_SIZE] &&
+ non_empty(value[HTML_SELECT_SIZE])) {
+ /*
+ * Let the size be determined by the number of OPTIONs. - FM
+ * The attribute value is only written to the trace log.
+ */
+ CTRACE((tfp, "LYHandleSELECT: Ignoring SIZE=\"%s\" for SELECT.\n",
+ value[HTML_SELECT_SIZE]));
+ }
+
+ if (me->inBoldH == TRUE &&
+ (multiple == NO || LYSelectPopups == FALSE)) {
+ HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
+ me->inBoldH = FALSE;
+ me->needBoldH = TRUE; /* the end-tag branch turns bold back on */
+ }
+ if (me->inUnderline == TRUE &&
+ (multiple == NO || LYSelectPopups == FALSE)) {
+ HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
+ me->inUnderline = FALSE;
+ }
+
+ if ((multiple == NO && LYSelectPopups == TRUE) &&
+ (me->sp[0].tag_number == HTML_PRE || me->inPRE == TRUE ||
+ !me->sp->style->freeFormat) &&
+ HText_LastLineSize(me->text, FALSE) > (LYcolLimit - 7)) {
+ /*
+ * Force a newline when we're using a popup in a PRE block and are
+ * within 7 columns from the right margin. This will allow for the
+ * '[' popup designator and help avoid a wrap in the underscore
+ * placeholder for the retracted popup entry in the HText
+ * structure. - FM
+ */
+ HTML_put_character(me, '\n');
+ me->in_word = NO;
+ }
+
+ LYCheckForID(me, present, value, (int) HTML_SELECT_ID);
+
+ HText_beginSelect(name, ATTR_CS_IN, multiple, size);
+ FREE(name);
+ FREE(size);
+
+ me->first_option = TRUE;
+ } else {
+ /*
+ * Handle end tag.
+ */
+ char *ptr;
+
+ /*
+ * Make sure we had a select start tag.
+ */
+ if (!me->inSELECT) {
+ if (LYBadHTML(me)) {
+ LYShowBadHTML("Bad HTML: Unmatched SELECT end tag\n");
+ }
+ return;
+ }
+
+ /*
+ * Set to know that we are no longer in a select tag.
+ */
+ me->inSELECT = FALSE;
+
+ /*
+ * Clear the disable attribute.
+ */
+ me->select_disabled = FALSE;
+
+ /*
+ * Finish the data off.
+ */
+ HTChunkTerminate(&me->option);
+ /*
+ * Finish the previous option. The returned pointer, if any, is the
+ * text written out below for the popup case.
+ */
+ ptr = HText_setLastOptionValue(me->text,
+ me->option.data,
+ me->LastOptionValue,
+ LAST_ORDER,
+ me->LastOptionChecked,
+ me->UCLYhndl,
+ ATTR_CS_IN);
+ FREE(me->LastOptionValue);
+
+ me->LastOptionChecked = FALSE;
+
+ if (HTCurSelectGroupType == F_CHECKBOX_TYPE ||
+ LYSelectPopups == FALSE) {
+ /*
+ * Start a newline after the last checkbox/button option.
+ */
+ LYEnsureSingleSpace(me);
+ } else {
+ /*
+ * Output popup box with the default option to screen, but use
+ * non-breaking spaces for output.
+ */
+ if (ptr &&
+ me->sp[0].tag_number == HTML_PRE && strlen(ptr) > 6) {
+ /*
+ * The code inadequately handles OPTION fields in PRE tags.
+ * We'll put up a minimum of 6 characters, and if any more
+ * would exceed the wrap column, we'll ignore them.
+ *
+ * NOTE(review): as written, the loop after this block still
+ * emits every remaining character, so nothing is dropped
+ * here - confirm whether the excess is meant to be
+ * suppressed elsewhere.
+ */
+ for (i = 0; i < 6; i++) {
+ if (*ptr == ' ')
+ HText_appendCharacter(me->text, HT_NON_BREAK_SPACE);
+ else
+ HText_appendCharacter(me->text, *ptr);
+ ptr++;
+ }
+ }
+ for (; non_empty(ptr); ptr++) {
+ if (*ptr == ' ')
+ HText_appendCharacter(me->text, HT_NON_BREAK_SPACE);
+ else
+ HText_appendCharacter(me->text, *ptr);
+ }
+ /*
+ * Add end option character. Skipped while me->first_option is
+ * still set.
+ */
+ if (!me->first_option) {
+ HText_appendCharacter(me->text, ']');
+ HText_setLastChar(me->text, ']');
+ me->in_word = YES;
+ }
+ }
+ HTChunkClear(&me->option);
+
+ if (me->Underline_Level > 0 && me->inUnderline == FALSE) {
+ HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
+ me->inUnderline = TRUE;
+ }
+ if (me->needBoldH == TRUE && me->inBoldH == FALSE) {
+ HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
+ me->inBoldH = TRUE;
+ me->needBoldH = FALSE;
+ }
+ }
+}
+
+/*
+ * This function strips white characters and generally fixes up attribute
+ * values that were received from the SGML parser and are to be treated as
+ * partial or absolute URLs. - FM
+ *
+ * me           parsing context; supplies the base URL (me->base_href when
+ *              me->inBASE is set, otherwise me->node_anchor->address) and
+ *              the inBadHREF "already warned" flag.
+ * href         in/out string.  It is edited in place and may be reallocated
+ *              by the translation and StrAlloc* steps.
+ * force_slash  if TRUE, a value that is exactly "." or ".." gets a trailing
+ *              "/" appended, unless the base is a file URL.
+ * strip_dots   if TRUE (and LYStripDotDotURLs is set), leading "/.."
+ *              elements that survive resolution against an http/https base
+ *              are stripped, and the user is told once per document.
+ *
+ * Returns the is_url() result for the cleaned-up value.  Returns 0 if me or
+ * href is NULL, or if the value is empty before or after blank handling.
+ */
+int LYLegitimizeHREF(HTStructured * me, char **href,
+                     int force_slash,
+                     int strip_dots)
+{
+    int url_type = 0;
+    char *p = NULL;
+    char *pound = NULL;
+    const char *Base = NULL;
+
+    if (!me || !href || isEmpty(*href))
+        return (url_type);
+
+    if (!LYTrimStartfile(*href)) {
+        /*
+         * Collapse spaces in the actual URL, but just protect against tabs or
+         * newlines in the fragment, if present.  This seeks to cope with
+         * atrocities inflicted on the Web by authoring tools such as
+         * Frontpage. - FM
+         */
+
+        /* Before working on spaces check if we have any, usually none. */
+        p = LYSkipNonBlanks(*href);
+
+        if (*p) {               /* p == first space character */
+            /* no reallocs below, all converted in place */
+
+            pound = findPoundSelector(*href);
+
+            if (pound != NULL && pound < p) {
+                /* All blanks are inside the fragment. */
+                convert_to_spaces(p, FALSE);    /* done */
+
+            } else {
+                if (pound != NULL)
+                    *pound = '\0';      /* mark: hide the fragment for now */
+
+                /*
+                 * No blanks really belong in the HREF,
+                 * but if it refers to an actual file,
+                 * it may actually have blanks in the name.
+                 * Try to accommodate.  See also HTParse().
+                 */
+                if (LYRemoveNewlines(p) || StrChr(p, '\t') != 0) {
+                    LYRemoveBlanks(p);  /* a compromise... */
+                }
+
+                if (pound != NULL) {
+                    p = StrChr(p, '\0');        /* end of the URL part */
+                    *pound = '#';               /* restore */
+                    convert_to_spaces(pound, FALSE);
+                    /*
+                     * Close the gap left by removed blanks.  Source and
+                     * destination lie in the same buffer and may overlap,
+                     * so memmove() is required; strcpy() is undefined for
+                     * overlapping objects.
+                     */
+                    if (p < pound)
+                        memmove(p, pound, strlen(pound) + 1);
+                }
+            }
+        }
+    }
+    if (**href == '\0')
+        return (url_type);
+
+    TRANSLATE_AND_UNESCAPE_TO_STD(href);
+
+    Base = me->inBASE ?
+        me->base_href : me->node_anchor->address;
+
+    url_type = is_url(*href);
+    if (!url_type && force_slash && **href == '.' &&
+        (!strcmp(*href, ".") || !strcmp(*href, "..")) &&
+        !isFILE_URL(Base)) {
+        /*
+         * The Fielding RFC/ID for resolving partial HREFs says that a slash
+         * should be on the end of the preceding symbolic element for "." and
+         * "..", but all tested browsers only do that for an explicit "./" or
+         * "../", so we'll respect the RFC/ID only if force_slash was TRUE and
+         * it's not a file URL. - FM
+         */
+        StrAllocCat(*href, "/");
+    }
+    if ((!url_type && LYStripDotDotURLs && strip_dots && **href == '.') &&
+        !strncasecomp(Base, "http", 4)) {
+        /*
+         * We will be resolving a partial reference versus an http or https
+         * URL, and it has lead dots, which may be retained when resolving via
+         * HTParse(), but the request would fail if the first element of the
+         * resultant path is two dots, because no http or https server accepts
+         * such paths, and the current URL draft, likely to become an RFC, says
+         * that it's optional for the UA to strip them as a form of error
+         * recovery.  So we will, recursively, for http/https URLs, like the
+         * "major market browsers" which made this problem so common on the
+         * Web, but we'll also issue a message about it, such that the bad
+         * partial reference might get corrected by the document provider. -
+         * FM
+         */
+        char *temp = NULL, *path = NULL, *cp;
+        const char *str = "";
+
+        temp = HTParse(*href, Base, PARSE_ALL);
+        path = HTParse(temp, "", PARSE_PATH + PARSE_PUNCTUATION);
+        if (!StrNCmp(path, "/..", 3)) {
+            cp = (path + 3);
+            if (LYIsHtmlSep(*cp) || *cp == '\0') {
+                if (Base[4] == 's') {
+                    str = "s";  /* only used for the trace message */
+                }
+                CTRACE((tfp,
+                        "LYLegitimizeHREF: Bad value '%s' for http%s URL.\n",
+                        *href, str));
+                CTRACE((tfp, " Stripping lead dots.\n"));
+                if (!me->inBadHREF) {
+                    /* Warn only once; the flag remembers that we did. */
+                    HTUserMsg(BAD_PARTIAL_REFERENCE);
+                    me->inBadHREF = TRUE;
+                }
+            }
+            if (*cp == '\0') {
+                /* The resolved path was exactly "/..". */
+                StrAllocCopy(*href, "/");
+            } else if (LYIsHtmlSep(*cp)) {
+                /* Skip any further "/.." elements at the front. */
+                while (!StrNCmp(cp, "/..", 3)) {
+                    if (*(cp + 3) == '/') {
+                        cp += 3;
+                        continue;
+                    } else if (*(cp + 3) == '\0') {
+                        /* A trailing "/.." collapses to "/". */
+                        *(cp + 1) = '\0';
+                        *(cp + 2) = '\0';
+                    }
+                    break;
+                }
+                StrAllocCopy(*href, cp);
+            }
+        }
+        FREE(temp);
+        FREE(path);
+    }
+    return (url_type);
+}
+
+/*
+ * Establish a document BASE from the HTTP headers stored on the anchor.
+ *
+ * A Content-Base value is preferred.  If there is none, a Content-Location
+ * value is used, but only when it is an absolute URL.  Blanks are removed
+ * from a private copy of the value; if nothing is left, or the header was
+ * empty to begin with, nothing happens.  Otherwise the value is fed to the
+ * start_element handler as the HREF of a synthetic BASE tag, so a real BASE
+ * tag later in the HTML stream can still replace it. - FM
+ */
+void LYCheckForContentBase(HTStructured * me)
+{
+    char *base = NULL;
+    const char *header;
+    int must_be_absolute;
+    BOOL present[HTML_BASE_ATTRIBUTES];
+    const char *value[HTML_BASE_ATTRIBUTES];
+    int n = 0;
+
+    if (me == NULL || me->node_anchor == NULL)
+        return;
+
+    /*
+     * Pick the header to use: Content-Base first, then Content-Location.
+     */
+    if (me->node_anchor->content_base != NULL) {
+        header = me->node_anchor->content_base;
+        must_be_absolute = 0;
+    } else if (me->node_anchor->content_location != NULL) {
+        header = me->node_anchor->content_location;
+        must_be_absolute = 1;
+    } else {
+        return;
+    }
+
+    /*
+     * A zero-length header is ignored.
+     */
+    if (*header == '\0')
+        return;
+
+    StrAllocCopy(base, header);
+    LYRemoveBlanks(base);
+
+    /*
+     * Content-Location only counts when it is an absolute URL.
+     */
+    if (must_be_absolute && !is_url(base)) {
+        FREE(base);
+        return;
+    }
+
+    /*
+     * Ignore a value that collapsed to nothing.
+     */
+    if (*base == '\0') {
+        FREE(base);
+        return;
+    }
+
+    /*
+     * Hand the value to the start-element handler as <BASE HREF=...>.
+     */
+    while (n < HTML_BASE_ATTRIBUTES) {
+        present[n] = NO;
+        n++;
+    }
+    present[HTML_BASE_HREF] = YES;
+    value[HTML_BASE_HREF] = (const char *) base;
+    (*me->isa->start_element) (me, HTML_BASE, present, value,
+                               0, 0);
+    FREE(base);
+}
+
+/*
+ * Create a NAMEd anchor at the current text position when the given
+ * attribute (a NAME or ID) is present with a non-empty value. - FM
+ *
+ * The value is copied, character references in it are translated, and if
+ * the result is still non-empty a child anchor is looked up or created and
+ * emitted as an empty begin/end anchor pair.
+ */
+void LYCheckForID(HTStructured * me, const BOOL *present,
+                  STRING2PTR value,
+                  int attribute)
+{
+    HTChildAnchor *named = NULL;
+    char *tag = NULL;
+
+    if (me == NULL || me->text == NULL)
+        return;
+
+    /*
+     * Nothing to do unless the attribute is there and has a value.
+     */
+    if (present == NULL || !present[attribute]
+        || !non_empty(value[attribute]))
+        return;
+
+    /*
+     * Work on a copy, with named or numeric character references resolved.
+     */
+    StrAllocCopy(tag, value[attribute]);
+    LYUCTranslateHTMLString(&tag, me->tag_charset, me->tag_charset,
+                            NO, NO, YES, st_URL);
+
+    /*
+     * Only a non-empty result gets an anchor.
+     */
+    if (tag[0] != '\0') {
+        named = HTAnchor_findChildAndLink(me->node_anchor,     /* parent */
+                                          tag,                 /* tag */
+                                          NULL,                /* address */
+                                          (HTLinkType *) 0);   /* type */
+        if (named) {
+            HText_beginAnchor(me->text, me->inUnderline, named);
+            HText_endAnchor(me->text, 0);
+        }
+    }
+    FREE(tag);
+}
+
+/*
+ * Create a NAMEd anchor for an ID string that is passed in directly.
+ * The string is used as given: no character-reference translation is
+ * done here. - FM
+ *
+ * Does nothing if there is no context, no text object, or the id is
+ * NULL or empty.
+ */
+void LYHandleID(HTStructured * me, const char *id)
+{
+    HTChildAnchor *named;
+
+    if (me == NULL || me->text == NULL || isEmpty(id))
+        return;
+
+    named = HTAnchor_findChildAndLink(me->node_anchor,         /* parent */
+                                      id,                      /* tag */
+                                      NULL,                    /* address */
+                                      (HTLinkType *) 0);       /* type */
+    if (named == NULL)
+        return;
+
+    /*
+     * Emit an empty anchor so the position can be targeted.
+     */
+    HText_beginAnchor(me->text, me->inUnderline, named);
+    HText_endAnchor(me->text, 0);
+}
+
+/*
+ * Decide whether the element on top of the stack should ignore the current
+ * default paragraph alignment. - FM
+ *
+ * For BLOCKQUOTE, BQ, NOTE, FN and ADDRESS the style's alignment is set to
+ * HT_LEFT as a side effect and YES is returned.  For every other element,
+ * and for a NULL context, NO is returned and nothing is changed.
+ */
+BOOLEAN LYoverride_default_alignment(HTStructured * me)
+{
+    int tag;
+
+    if (me == NULL)
+        return NO;
+
+    tag = me->sp[0].tag_number;
+    if (tag == HTML_BLOCKQUOTE ||
+        tag == HTML_BQ ||
+        tag == HTML_NOTE ||
+        tag == HTML_FN ||
+        tag == HTML_ADDRESS) {
+        me->sp->style->alignment = HT_LEFT;
+        return YES;
+    }
+    return NO;
+}
+
+/*
+ * Make sure the text ends with a blank line, adding only as many line
+ * breaks as are missing. - FM
+ *
+ * Two breaks are added if the last line has content, one if only the
+ * previous line has content.  If both are already empty and we are inside
+ * a list, HText_NegateLineOne() is called instead.  me->in_word is cleared
+ * in every case.
+ */
+void LYEnsureDoubleSpace(HTStructured * me)
+{
+    int breaks_needed = 0;
+
+    if (me == NULL || me->text == NULL)
+        return;
+
+    if (!HText_LastLineEmpty(me->text, FALSE)) {
+        breaks_needed = 2;
+    } else if (!HText_PreviousLineEmpty(me->text, FALSE)) {
+        breaks_needed = 1;
+    }
+
+    if (breaks_needed > 0) {
+        HText_setLastChar(me->text, ' ');       /* absorb white space */
+        while (breaks_needed-- > 0) {
+            HText_appendCharacter(me->text, '\r');
+        }
+    } else if (me->List_Nesting_Level >= 0) {
+        HText_NegateLineOne(me->text);
+    }
+
+    me->in_word = NO;
+}
+
+/*
+ * Make sure the text is positioned at the start of a line. - FM
+ *
+ * A line break is added if the last line has content.  If it is already
+ * empty and we are inside a list, HText_NegateLineOne() is called instead.
+ * me->in_word is cleared in every case.
+ */
+void LYEnsureSingleSpace(HTStructured * me)
+{
+    if (me == NULL || me->text == NULL)
+        return;
+
+    if (HText_LastLineEmpty(me->text, FALSE)) {
+        if (me->List_Nesting_Level >= 0) {
+            HText_NegateLineOne(me->text);
+        }
+    } else {
+        HText_setLastChar(me->text, ' ');       /* absorb white space */
+        HText_appendCharacter(me->text, '\r');
+    }
+
+    me->in_word = NO;
+}
+
+/*
+ * Reset the paragraph alignment of the current style, for block elements
+ * which do not have a defined style sheet. - FM
+ *
+ * The alignment becomes HT_LEFT inside a list, or outside any division
+ * when the current style is Normal or Preformatted.  In all other cases it
+ * becomes the current default alignment.
+ */
+void LYResetParagraphAlignment(HTStructured * me)
+{
+    int use_left = 0;
+
+    if (me == NULL)
+        return;
+
+    if (me->List_Nesting_Level >= 0) {
+        use_left = 1;
+    } else if (me->Division_Level < 0) {
+        if (me->sp->style->id == ST_Normal ||
+            me->sp->style->id == ST_Preformatted) {
+            use_left = 1;
+        }
+    }
+
+    if (use_left) {
+        me->sp->style->alignment = HT_LEFT;
+    } else {
+        me->sp->style->alignment = (short) me->current_default_alignment;
+    }
+}
+
+/*
+ * This example function checks whether the given anchor has an address
+ * with a file scheme on the local host, and if so, copies that address
+ * into the SGML parser's context->url element, which was passed as the
+ * second argument.  The handle_comment() calling function in SGML.c then
+ * calls LYDoCSI() in LYUtils.c to insert HTML markup into the
+ * corresponding stream, homologously to an SSI by an HTTP server. - FM
+ *
+ * Returns TRUE when *url was set, FALSE otherwise (including a NULL anchor
+ * or an anchor without an address).
+ *
+ * For functions similar to this but which depend on details of the HTML
+ * handler's internal data, the calling interface should be changed, and
+ * functions in SGML.c would have to make sure not to call such functions
+ * inappropriately (e.g., calling a function specific to the
+ * Lynx_HTML_Handler when SGML.c output goes to some other HTStructured
+ * object like in HTMLGen.c), or the new functions could be added to the
+ * SGML.h interface.
+ */
+BOOLEAN LYCheckForCSI(HTParentAnchor *anchor,
+                      char **url)
+{
+    if (anchor == NULL || anchor->address == NULL)
+        return FALSE;
+
+    if (isFILE_URL(anchor->address) && LYisLocalHost(anchor->address)) {
+        StrAllocCopy(*url, anchor->address);
+        return TRUE;
+    }
+    return FALSE;
+}
+
+/*
+ * This function is called from the SGML parser to look at comments
+ * and see whether we should collect some info from them.  Currently
+ * it only looks for comments with Message-Id and Subject info, in the
+ * exact form generated by MHonArc for archived mailing list.  If found,
+ * the info is stored in the document's HTParentAnchor.  It can later be
+ * used for generating a mail response.
+ *
+ * We are extra picky here because there isn't any official definition
+ * for these kinds of comments - we might (and still can) misinterpret
+ * arbitrary comments as something they aren't.
+ *
+ * If something doesn't look right, for example invalid characters, the
+ * strings are not stored.  Mail responses will use something else as
+ * the subject, probably the document URL, and will not have an
+ * In-Reply-To header.
+ *
+ * All this is a hack - to do this the right way, mailing list archivers
+ * would have to agree on some better mechanism to make this kind of info
+ * from original mail headers available, for example using LINK. - kw
+ *
+ * anchor   document anchor that receives the value; must have an address.
+ * comment  comment text as passed by the parser, expected to start with
+ *          "!--X-Message-Id: " or "!--X-Subject: " and end with " --".
+ *
+ * Returns TRUE only if a value was recognized and the anchor setter
+ * reported success.  The working copy of the value is freed on every path.
+ */
+BOOLEAN LYCommentHacks(HTParentAnchor *anchor,
+                       const char *comment)
+{
+    const char *cp;
+    size_t len;
+
+    if (comment == NULL)
+        return FALSE;
+
+    if (!(anchor && anchor->address))
+        return FALSE;
+
+    if (StrNCmp(comment, "!--X-Message-Id: ", 17) == 0) {
+        char *messageid = NULL;
+        char *p;
+        BOOLEAN stored = FALSE;
+
+        /*
+         * The raw value must be graphic 7-bit ASCII right up to the
+         * closing " --".
+         */
+        for (cp = comment + 17; *cp; cp++) {
+            if (UCH(*cp) >= 127 || !isgraph(UCH(*cp))) {
+                break;
+            }
+        }
+        if (strcmp(cp, " --")) {
+            return FALSE;
+        }
+        cp = comment + 17;
+        StrAllocCopy(messageid, cp);
+        /* This should be ok - message-id should only contain 7-bit ASCII */
+        if (!LYUCTranslateHTMLString(&messageid, 0, 0, NO, NO, YES, st_URL)) {
+            FREE(messageid);    /* do not leak the copy on failure */
+            return FALSE;
+        }
+        /*
+         * Apply the same character check to the translated value.
+         */
+        for (p = messageid; *p; p++) {
+            if (UCH(*p) >= 127 || !isgraph(UCH(*p))) {
+                break;
+            }
+        }
+        if (strcmp(p, " --")) {
+            FREE(messageid);
+            return FALSE;
+        }
+        /*
+         * Require an '@' that is not the last character.
+         */
+        if ((p = StrChr(messageid, '@')) == NULL || p[1] == '\0') {
+            FREE(messageid);
+            return FALSE;
+        }
+        /*
+         * Cut off the trailing " --" before storing.
+         */
+        p = messageid;
+        if ((len = strlen(p)) >= 8 && !strcmp(&p[len - 3], " --")) {
+            p[len - 3] = '\0';
+        } else {
+            FREE(messageid);
+            return FALSE;
+        }
+        if (HTAnchor_setMessageID(anchor, messageid)) {
+            stored = TRUE;
+        }
+        FREE(messageid);
+        return stored;
+    }
+    if (StrNCmp(comment, "!--X-Subject: ", 14) == 0) {
+        char *subject = NULL;
+        char *p;
+        BOOLEAN stored = FALSE;
+
+        /*
+         * Every character of the raw value, including the closing " --",
+         * must be printable 7-bit ASCII.
+         */
+        for (cp = comment + 14; *cp; cp++) {
+            if (UCH(*cp) >= 127 || !isprint(UCH(*cp))) {
+                return FALSE;
+            }
+        }
+        cp = comment + 14;
+        StrAllocCopy(subject, cp);
+        /* @@@
+         * This may not be the right thing for the subject - but mail
+         * subjects shouldn't contain 8-bit characters in raw form anyway.
+         * We have to unescape character entities, since that's what MHonArc
+         * seems to generate.  But if after that there are 8-bit characters
+         * the string is rejected.  We would probably not know correctly
+         * what charset to assume anyway - the mail sender's can differ from
+         * the archive's.  And the code for sending mail cannot deal well
+         * with 8-bit characters - we should not put them in the Subject
+         * header in raw form, but don't have MIME encoding implemented.
+         * Someone may want to do more about this... - kw
+         */
+        if (!LYUCTranslateHTMLString(&subject, 0, 0, NO, YES, NO, st_HTML)) {
+            FREE(subject);      /* do not leak the copy on failure */
+            return FALSE;
+        }
+        for (p = subject; *p; p++) {
+            if (UCH(*p) >= 127 || !isprint(UCH(*p))) {
+                FREE(subject);
+                return FALSE;
+            }
+        }
+        /*
+         * Cut off the trailing " --" before storing.
+         */
+        p = subject;
+        if ((len = strlen(p)) >= 4 && !strcmp(&p[len - 3], " --")) {
+            p[len - 3] = '\0';
+        } else {
+            FREE(subject);
+            return FALSE;
+        }
+        if (HTAnchor_setSubject(anchor, subject)) {
+            stored = TRUE;
+        }
+        FREE(subject);
+        return stored;
+    }
+
+    return FALSE;
+}
+
+/*
+ * Store a copy of the title text src in *dst.
+ *
+ * When the CJK mode is JAPANESE, src is first converted into a temporary
+ * buffer of strlen(src) + 1 bytes: with TO_EUC for EUC, with TO_SJIS for
+ * SJIS, and copied unchanged (with a trace message) for any other
+ * kanji_code.  In every other mode src is copied as it is.  This function
+ * itself does no entity or 8-bit character conversion.
+ */
+void LYformTitle(char **dst,
+                 const char *src)
+{
+    char *converted = NULL;
+
+    if (HTCJK != JAPANESE) {
+        StrAllocCopy(*dst, src);
+        return;
+    }
+
+    converted = (char *) malloc(strlen(src) + 1);
+    if (converted == 0)
+        outofmem(__FILE__, "LYformTitle");
+
+    if (kanji_code == EUC) {
+        TO_EUC((const unsigned char *) src, (unsigned char *) converted);
+    } else if (kanji_code == SJIS) {
+        TO_SJIS((const unsigned char *) src, (unsigned char *) converted);
+    } else {
+        CTRACE((tfp, "\nLYformTitle: kanji_code is an unexpected value."));
+        strcpy(converted, src);
+    }
+
+    StrAllocCopy(*dst, converted);
+    FREE(converted);
+}