summaryrefslogtreecommitdiffstats
path: root/WWW/Library/Implementation/SGML.c
diff options
context:
space:
mode:
Diffstat (limited to 'WWW/Library/Implementation/SGML.c')
-rw-r--r--WWW/Library/Implementation/SGML.c5167
1 files changed, 5167 insertions, 0 deletions
diff --git a/WWW/Library/Implementation/SGML.c b/WWW/Library/Implementation/SGML.c
new file mode 100644
index 0000000..c256685
--- /dev/null
+++ b/WWW/Library/Implementation/SGML.c
@@ -0,0 +1,5167 @@
+/*
+ * $LynxId: SGML.c,v 1.186 2023/10/25 22:35:29 tom Exp $
+ *
+ * General SGML Parser code SGML.c
+ * ========================
+ *
+ * This module implements an HTStream object. To parse an
+ * SGML file, create this object which is a parser. The object
+ * is (currently) created by being passed a DTD structure,
+ * and a target HTStructured object at which to throw the parsed stuff.
+ *
+ * 6 Feb 93 Binary searches used. Interface modified.
+ */
+
+#define HTSTREAM_INTERNAL 1
+
+#include <HTUtils.h>
+
+#include <SGML.h>
+#include <HTMLDTD.h>
+#include <HTAccess.h>
+#include <UCAux.h>
+
+#include <HTChunk.h>
+#include <HTUtils.h>
+
+#include <LYCharSets.h>
+#include <LYCharVals.h> /* S/390 -- gil -- 0635 */
+#include <LYGlobalDefs.h>
+#include <LYStrings.h>
+#include <LYLeaks.h>
+#include <LYUtils.h>
+
+#ifdef USE_COLOR_STYLE
+# include <LYStyle.h>
+#endif
+#ifdef USE_PRETTYSRC
+# include <LYPrettySrc.h>
+#endif
+
+/* a global variable doesn't work with info-stages which convert encoding */
+#if defined(EXP_CHINESEUTF8_SUPPORT)
+#undef IS_CJK_TTY
+#define IS_CJK_TTY me->T.do_cjk
+#endif
+
+#define AssumeCP1252(me) \
+ (((me)->inUCLYhndl == LATIN1 \
+ || (me)->inUCLYhndl == US_ASCII) \
+ && html5_charsets)
+
+#define INVALID (-1)
+
+static int sgml_offset;
+
+#ifdef USE_PRETTYSRC
+
+static char *entity_string; /* this is used for printing entity name.
+
+ Unconditionally added since redundant assignments don't hurt much */
+
+/*
+ * No-op character sink: takes a stream and a character and does nothing
+ * with either.  handle_entity() hands this to UCPutUtf8_charstring() in
+ * place of the target's put_character action when psrc_view is set.
+ */
+static void fake_put_character(HTStream *p GCC_UNUSED,
+ int c GCC_UNUSED)
+{
+}
+
+#define START TRUE
+#define STOP FALSE
+
+#define PUTS_TR(x) psrc_convert_string = TRUE; PUTS(x)
+
+#endif
+
+/*
+ * my_casecomp() - case-insensitive string compare, optimized by testing the
+ * first character before calling AS_casecomp(); NOT_ASCII ok.
+ *
+ * The arguments are parenthesized so that pointer expressions such as
+ * (p + 1) expand correctly.  Each argument is still evaluated more than
+ * once, so do not pass expressions with side effects.
+ */
+#define my_casecomp(a,b) ((TOUPPER(*(a)) == TOUPPER(*(b))) ? \
+    AS_casecomp((a),(b)) : \
+    (TOASCII(TOUPPER(*(a))) - TOASCII(TOUPPER(*(b)))))
+
+/* ...used for comments and attributes value like href... */
+#define HTChunkPutUtf8Char(ch,x) \
+ { \
+ if ((TOASCII(x) < 128) && (ch->size < ch->allocated)) \
+ ch->data[ch->size++] = (char)x; \
+ else \
+ (HTChunkPutUtf8Char)(ch,x); \
+ }
+
+#define PUTS(str) ((*me->actions->put_string)(me->target, str))
+#define PUTC(ch) ((*me->actions->put_character)(me->target, (char) ch))
+#define PUTUTF8(code) (UCPutUtf8_charstring((HTStream *)me->target, \
+ (putc_func_t*)(me->actions->put_character), code))
+
+#ifdef USE_PRETTYSRC
+#define PRETTYSRC_PUTC(c) if (psrc_view) PUTC(c)
+#else
+#define PRETTYSRC_PUTC(c) /* nothing */
+#endif
+
+/*the following macros are used for pretty source view. */
+#define IS_C(attr) (attr.type == HTMLA_CLASS)
+
+#if defined(USE_JAPANESEUTF8_SUPPORT)
+# define UTF8_TTY_ISO2022JP (me->T.output_utf8)
+#else
+# define UTF8_TTY_ISO2022JP 0
+#endif
+
+HTCJKlang HTCJK = NOCJK; /* CJK enum value. */
+BOOL HTPassEightBitRaw = FALSE; /* Pass 161-172,174-255 raw. */
+BOOL HTPassEightBitNum = FALSE; /* Pass ^ numeric entities raw. */
+BOOL HTPassHighCtrlRaw = FALSE; /* Pass 127-160,173,&#127; raw. */
+BOOL HTPassHighCtrlNum = FALSE; /* Pass &#128;-&#159; raw. */
+
+/* The State (context) of the parser
+ *
+ * This is passed with each call to make the parser reentrant
+ */
+
+#define MAX_ATTRIBUTES 36 /* Max number of attributes per element */
+
+/* Element Stack
+ * -------------
+ * This allows us to return down the stack reselecting styles.
+ * As we return, attribute values will be garbage in general.
+ */
+typedef struct _HTElement HTElement;
+struct _HTElement {
+ HTElement *next; /* Previously nested element or 0 */
+ HTTag *tag; /* The tag at this level */
+};
+
+typedef enum {
+ S_text = 0
+ ,S_attr
+ ,S_attr_gap
+ ,S_comment
+ ,S_cro
+ ,S_doctype
+ ,S_dollar
+ ,S_dollar_dq
+ ,S_dollar_paren
+ ,S_dollar_paren_dq
+ ,S_dollar_paren_sq
+ ,S_dollar_sq
+ ,S_dquoted
+ ,S_end
+ ,S_entity
+ ,S_equals
+ ,S_ero
+ ,S_esc
+ ,S_esc_dq
+ ,S_esc_sq
+ ,S_exclamation
+ ,S_in_kanji
+ ,S_incro
+ ,S_junk_tag
+ ,S_litteral
+ ,S_marked
+ ,S_nonascii_text
+ ,S_nonascii_text_dq
+ ,S_nonascii_text_sq
+ ,S_paren
+ ,S_paren_dq
+ ,S_paren_sq
+ ,S_pcdata
+ ,S_pi
+ ,S_script
+ ,S_sgmlatt
+ ,S_sgmlele
+ ,S_sgmlent
+ ,S_squoted
+ ,S_tag
+ ,S_tag_gap
+ ,S_tagname_slash
+ ,S_value
+} sgml_state;
+
+/* Internal Context Data Structure
+ * -------------------------------
+ */
+struct _HTStream {
+
+ const HTStreamClass *isa; /* inherited from HTStream */
+
+ const SGML_dtd *dtd;
+ const HTStructuredClass *actions; /* target class */
+ HTStructured *target; /* target object */
+
+ HTTag *current_tag;
+ HTTag *slashedtag;
+ const HTTag *unknown_tag;
+ BOOL extended_html; /* xhtml */
+ BOOL strict_xml; /* xml */
+ BOOL inSELECT;
+ BOOL no_lynx_specialcodes;
+ int current_attribute_number;
+ HTChunk *string;
+ int leading_spaces;
+ int trailing_spaces;
+ HTElement *element_stack;
+ sgml_state state;
+ unsigned char kanji_buf;
+#ifdef CALLERDATA
+ void *callerData;
+#endif /* CALLERDATA */
+ BOOL present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */
+ char *value[MAX_ATTRIBUTES]; /* NULL, or strings alloc'd with StrAllocCopy_extra() */
+
+ BOOL lead_exclamation;
+ BOOL first_dash;
+ BOOL end_comment;
+ BOOL doctype_bracket;
+ BOOL first_bracket;
+ BOOL second_bracket;
+ BOOL isHex;
+
+ HTParentAnchor *node_anchor;
+ LYUCcharset *inUCI; /* pointer to anchor UCInfo */
+ int inUCLYhndl; /* charset we are fed */
+ LYUCcharset *outUCI; /* anchor UCInfo for target */
+ int outUCLYhndl; /* charset for target */
+ UTFDecodeState U;
+ UCTransParams T;
+ int current_tag_charset; /* charset to pass attributes */
+
+ char *recover;
+ int recover_index;
+ char *include;
+ char *active_include;
+ int include_index;
+ char *url;
+ char *csi;
+ int csi_index;
+#ifdef USE_PRETTYSRC
+ BOOL cur_attr_is_href;
+ BOOL cur_attr_is_name;
+#endif
+};
+
+#ifdef NO_LYNX_TRACE
+#define state_name(n) "state"
+#else
+/*
+ * Map a parser state to its printable name, for trace output.
+ * Returns "?" for any value that is not one of the sgml_state enumerators.
+ */
+static const char *state_name(sgml_state n)
+{
+    /* *INDENT-OFF* */
+    switch (n) {
+    case S_text:              return "S_text";
+    case S_attr:              return "S_attr";
+    case S_attr_gap:          return "S_attr_gap";
+    case S_comment:           return "S_comment";
+    case S_cro:               return "S_cro";
+    case S_doctype:           return "S_doctype";
+    case S_dollar:            return "S_dollar";
+    case S_dollar_dq:         return "S_dollar_dq";
+    case S_dollar_paren:      return "S_dollar_paren";
+    case S_dollar_paren_dq:   return "S_dollar_paren_dq";
+    case S_dollar_paren_sq:   return "S_dollar_paren_sq";
+    case S_dollar_sq:         return "S_dollar_sq";
+    case S_dquoted:           return "S_dquoted";
+    case S_end:               return "S_end";
+    case S_entity:            return "S_entity";
+    case S_equals:            return "S_equals";
+    case S_ero:               return "S_ero";
+    case S_esc:               return "S_esc";
+    case S_esc_dq:            return "S_esc_dq";
+    case S_esc_sq:            return "S_esc_sq";
+    case S_exclamation:       return "S_exclamation";
+    case S_in_kanji:          return "S_in_kanji";
+    case S_incro:             return "S_incro";
+    case S_junk_tag:          return "S_junk_tag";
+    case S_litteral:          return "S_litteral";
+    case S_marked:            return "S_marked";
+    case S_nonascii_text:     return "S_nonascii_text";
+    case S_nonascii_text_dq:  return "S_nonascii_text_dq";
+    case S_nonascii_text_sq:  return "S_nonascii_text_sq";
+    case S_paren:             return "S_paren";
+    case S_paren_dq:          return "S_paren_dq";
+    case S_paren_sq:          return "S_paren_sq";
+    case S_pcdata:            return "S_pcdata";
+    case S_pi:                return "S_pi";
+    case S_script:            return "S_script";
+    case S_sgmlatt:           return "S_sgmlatt";
+    case S_sgmlele:           return "S_sgmlele";
+    case S_sgmlent:           return "S_sgmlent";
+    case S_squoted:           return "S_squoted";
+    case S_tag:               return "S_tag";
+    case S_tag_gap:           return "S_tag_gap";
+    case S_tagname_slash:     return "S_tagname_slash";
+    case S_value:             return "S_value";
+    }
+    /* *INDENT-ON* */
+
+    return "?";
+}
+#endif
+
+/* storage for Element Stack */
+#define DEPTH 10
+static HTElement pool[DEPTH];
+static int depth = 0;
+
+/*
+ * Hand out the next element-stack cell.  The first DEPTH cells come from the
+ * static pool; deeper nesting falls back to malloc(), whose result (possibly
+ * NULL) is returned as-is.  The depth counter is bumped in either case.
+ */
+static HTElement *pool_alloc(void)
+{
+    HTElement *cell;
+
+    if (depth < DEPTH)
+        cell = &pool[depth];
+    else
+        cell = (HTElement *) malloc(sizeof(HTElement));
+    depth++;
+    return cell;
+}
+
+/*
+ * Give back the most recently allocated element-stack cell.  A cell handed
+ * out while the depth exceeded DEPTH was malloc'd and is released with
+ * FREE(); a cell from the static pool needs no release.  The depth counter
+ * is decremented in either case.
+ */
+static void pool_free(HTElement * e)
+{
+    if (depth-- > DEPTH) {
+        FREE(e);
+    }
+}
+
+#ifdef USE_PRETTYSRC
+
+/*
+ * Walk the tagspec chain registered for a lexeme and replay it on the
+ * target: each entry either starts or ends one element.
+ *
+ * me      parser context (supplies the target and its action table)
+ * lexeme  which lexeme's markup to apply
+ * start   nonzero selects the lexeme_start chain, zero the lexeme_end chain
+ */
+static void HTMLSRC_apply_markup(HTStream *me,
+                                 HTlexeme lexeme,
+                                 int start)
+{
+    HT_tagspec *spec;
+
+    if (start)
+        spec = lexeme_start[lexeme];
+    else
+        spec = lexeme_end[lexeme];
+
+    for (; spec; spec = spec->next) {
+#ifdef USE_COLOR_STYLE
+        if (spec->start) {
+            current_tag_style = spec->style;
+            force_current_tag_style = TRUE;
+            forced_classname = spec->class_name;
+            force_classname = TRUE;
+        }
+#endif
+        CTRACE((tfp, spec->start ? "SRCSTART %d\n" : "SRCSTOP %d\n", (int) lexeme));
+        if (!spec->start) {
+            (*me->actions->end_element) (me->target,
+                                         (int) spec->element,
+                                         &me->include);
+        } else {
+            (*me->actions->start_element) (me->target,
+                                           (int) spec->element,
+                                           spec->present,
+                                           (STRING2PTR) spec->value,
+                                           me->current_tag_charset,
+                                           &me->include);
+        }
+    }
+}
+
+#define PSRCSTART(x) HTMLSRC_apply_markup(me,HTL_##x,START)
+#define PSRCSTOP(x) HTMLSRC_apply_markup(me,HTL_##x,STOP)
+
+#define attr_is_href me->cur_attr_is_href
+#define attr_is_name me->cur_attr_is_name
+#endif
+
+/*
+ * Set up character translation for the parser.
+ *
+ * me      parser context; its outUCI, outUCLYhndl, T and
+ *         current_tag_charset fields are written here
+ * anchor  anchor whose per-stage charset info is consulted and updated
+ * chndl   charset handle for the output side; when negative, a default is
+ *         looked up (STRUCTURED stage, then HTEXT stage, then
+ *         current_char_set) and recorded on the anchor
+ *
+ * After the optional defaulting step, the in->out translation parameters
+ * are recomputed and current_tag_charset is chosen from them.
+ */
+static void set_chartrans_handling(HTStream *me,
+ HTParentAnchor *anchor,
+ int chndl)
+{
+ if (chndl < 0) {
+ /*
+ * Nothing was set for the parser in earlier stages, so the HTML
+ * parser's UCLYhndl should still be its default. - FM
+ */
+ chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_STRUCTURED);
+ if (chndl < 0)
+ /*
+ * That wasn't set either, so seek the HText default. - FM
+ */
+ chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_HTEXT);
+ if (chndl < 0)
+ /*
+ * That wasn't set either, so assume the current display character
+ * set. - FM
+ */
+ chndl = current_char_set;
+ /*
+ * Try to set the HText and HTML stages' chartrans info with the
+ * default lock level (will not be changed if it was set previously
+ * with a higher lock level). - FM
+ */
+ HTAnchor_setUCInfoStage(anchor, chndl,
+ UCT_STAGE_HTEXT,
+ UCT_SETBY_DEFAULT);
+ HTAnchor_setUCInfoStage(anchor, chndl,
+ UCT_STAGE_STRUCTURED,
+ UCT_SETBY_DEFAULT);
+ /*
+ * Get the chartrans info for output to the HTML parser. - FM
+ */
+ me->outUCI = HTAnchor_getUCInfoStage(anchor,
+ UCT_STAGE_STRUCTURED);
+ /*
+ * NOTE(review): this lookup uses me->node_anchor while every other
+ * call in this branch uses the 'anchor' parameter; presumably the two
+ * are the same object at every call site - confirm.
+ */
+ me->outUCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
+ UCT_STAGE_STRUCTURED);
+ }
+ /*
+ * Set the in->out transformation parameters. - FM
+ */
+ UCSetTransParams(&me->T,
+ me->inUCLYhndl, me->inUCI,
+ me->outUCLYhndl, me->outUCI);
+ /*
+ * This is intended for passing the SGML parser's input charset as an
+ * argument in each call to the HTML parser's start tag function, but it
+ * would be better to call a Lynx_HTML_parser function to set an element in
+ * its HTStructured object, itself, if this were needed. - FM
+ */
+ /*
+ * The "else" that closes the #ifndef section below attaches to the
+ * if-chain that follows it; when USE_JAPANESEUTF8_SUPPORT is defined the
+ * CJK test is compiled out and the chain starts at me->T.transp.
+ */
+#ifndef USE_JAPANESEUTF8_SUPPORT
+ if (IS_CJK_TTY) {
+ me->current_tag_charset = -1;
+ } else
+#endif
+ if (me->T.transp) {
+ me->current_tag_charset = me->inUCLYhndl;
+ } else if (me->T.decode_utf8) {
+ me->current_tag_charset = me->inUCLYhndl;
+ } else if (me->T.do_8bitraw ||
+ me->T.use_raw_char_in) {
+ me->current_tag_charset = me->inUCLYhndl;
+ } else if (me->T.output_utf8 ||
+ me->T.trans_from_uni) {
+ me->current_tag_charset = UCGetLYhndl_byMIME("utf-8");
+ } else {
+ me->current_tag_charset = LATIN1;
+ }
+}
+
+/*
+ * Re-read the PARSER-stage charset from the node anchor and, if it differs
+ * from the charset the parser is currently fed, switch over to it and
+ * recompute the translation setup.  Nothing happens when the handle is
+ * negative, unchanged, or has no charset info attached (e.g. a META asked
+ * for an unknown charset).
+ */
+static void change_chartrans_handling(HTStream *me)
+{
+    HTParentAnchor *anchor = me->node_anchor;
+    int parser_hndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_PARSER);
+    LYUCcharset *parser_UCI;
+
+    if (parser_hndl < 0 || parser_hndl == me->inUCLYhndl)
+        return;                 /* nothing changed */
+
+    parser_UCI = HTAnchor_getUCInfoStage(anchor, UCT_STAGE_PARSER);
+    if (!parser_UCI)
+        return;                 /* unknown charset: ignore the change */
+
+    me->outUCI = HTAnchor_getUCInfoStage(anchor, UCT_STAGE_STRUCTURED);
+    me->outUCLYhndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_STRUCTURED);
+    me->inUCI = parser_UCI;
+    me->inUCLYhndl = parser_hndl;
+
+    set_chartrans_handling(me, anchor, me->outUCLYhndl);
+}
+
+#ifdef USE_COLOR_STYLE
+#include <AttrList.h>
+static int current_is_class = 0;
+#endif
+
+/* Handle Attribute
+ * ----------------
+ */
+/* PUBLIC const char * SGML_default = ""; ?? */
+
+/*
+ * Look up attribute name 's' among the attributes of me->current_tag.
+ *
+ * On a match, me->current_attribute_number is set to the attribute's index.
+ * Outside pretty-source view the attribute is also marked present and its
+ * stored value is cleared; in pretty-source view only the href/name flags
+ * are updated.  If nothing matches, me->current_attribute_number becomes
+ * INVALID.  For the unknown tag no lookup is done at all.
+ *
+ * The lookup is a case-insensitive binary search; it presumes the tag's
+ * attribute table is ordered by name - TODO confirm against the DTD tables.
+ */
+static void handle_attribute_name(HTStream *me, const char *s)
+{
+ HTTag *tag = me->current_tag;
+ const attr *attributes = tag->attributes;
+ int high, low, i, diff;
+
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ attr_is_href = FALSE;
+ attr_is_name = FALSE;
+ }
+#endif
+ /*
+ * Ignore unknown tag. - KW
+ */
+ if (tag == me->unknown_tag) {
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ me->current_attribute_number = 1; /* anything !=INVALID */
+#endif
+ return;
+ }
+
+ /*
+ * Binary search for attribute name.
+ */
+ /*
+ * 'diff' is assigned in the loop body before the update expression reads
+ * it, so it is never used uninitialized.
+ */
+ for (low = 0, high = tag->number_of_attributes;
+ high > low;
+ diff < 0 ? (low = i + 1) : (high = i)) {
+ i = (low + (high - low) / 2);
+ diff = my_casecomp(attributes[i].name, s);
+ if (diff == 0) { /* success: found it */
+ me->current_attribute_number = i;
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ attr_is_name = (BOOL) (attributes[i].type == HTMLA_ANAME);
+ attr_is_href = (BOOL) (attributes[i].type == HTMLA_HREF);
+ } else
+#endif
+ {
+ me->present[i] = YES;
+ Clear_extra(me->value[i]);
+#ifdef USE_COLOR_STYLE
+# ifdef USE_PRETTYSRC
+ current_is_class = IS_C(attributes[i]);
+# else
+ current_is_class = (!strcasecomp("class", s));
+# endif
+ CTRACE((tfp, "SGML: found attribute %s, %d\n", s, current_is_class));
+#endif
+ }
+ return;
+ }
+ /* if */
+ } /* for */
+
+ CTRACE((tfp, "SGML: Unknown attribute %s for tag %s\n",
+ s, NonNull(me->current_tag->name)));
+ me->current_attribute_number = INVALID; /* Invalid */
+}
+
+/* Handle attribute value
+ * ----------------------
+ */
+/*
+ * Store 's' as the value of the attribute most recently selected by
+ * handle_attribute_name(), then deselect it.  If no valid attribute is
+ * selected the value is dropped (and traced).
+ */
+static void handle_attribute_value(HTStream *me, const char *s)
+{
+    int attr_no = me->current_attribute_number;
+
+    /* can't have two assignments to one attribute */
+    me->current_attribute_number = INVALID;
+
+    if (attr_no == INVALID) {
+        CTRACE((tfp, "SGML: Attribute value %s ***ignored\n", s));
+        return;
+    }
+
+    StrAllocCopy_extra(me->value[attr_no], s);
+#ifdef USE_COLOR_STYLE
+    if (!current_is_class) {
+        CTRACE((tfp, "SGML: attribute value is '%s'\n", s));
+    } else {
+        StrNCpy(class_string, s, TEMPSTRINGSIZE);
+        CTRACE((tfp, "SGML: class is '%s'\n", s));
+    }
+#endif
+}
+
+/*
+ * Translate some Unicodes to Lynx special codes and output them.
+ * Special codes are those whose output depends on parsing.
+ *
+ * Additional issues, such as handling bidirectional text if necessary,
+ * may be dealt with from here: zwnj (8204), zwj (8205), lrm (8206), rlm (8207)
+ * - currently they are ignored in SGML.c and LYCharUtils.c
+ * but also in UCdomap.c because they are non-printable...
+ *
+ */
+/*
+ * If 'code' is one of the Unicode values Lynx represents with a special
+ * internal character (nbsp, shy, ensp, thinsp, emsp), emit that character
+ * and return YES; otherwise return NO without output.
+ *
+ * Returns NO unconditionally when the current element asked for no Lynx
+ * special codes (Tgf_nolyspcl).  In pretty-source view a recognized code
+ * still returns YES, but nothing is written.
+ */
+static BOOL put_special_unicodes(HTStream *me, UCode_t code)
+{
+    int special;
+
+    /*
+     * We were asked by a "DTD" flag to not generate lynx specials. - kw
+     */
+    if (me->no_lynx_specialcodes)
+        return NO;
+
+    if (code == CH_NBSP) {      /* S/390 -- gil -- 0657 */
+        special = HT_NON_BREAK_SPACE;
+    } else if (code == CH_SHY) {
+        special = LY_SOFT_HYPHEN;
+    } else if (code == 8194 || code == 8201 || code == 8195) {
+        /*
+         * ensp, thinsp and emsp all map to a single HT_EN_SPACE.  Using the
+         * Lynx special overrides the chartrans tables for those spaces with
+         * one plain space, but lets line wrapping treat them as word
+         * delimiters.
+         */
+        special = HT_EN_SPACE;
+    } else {
+        /*
+         * Not one of ours: nothing done.
+         */
+        return NO;
+    }
+
+#ifdef USE_PRETTYSRC
+    if (!psrc_view)
+#endif
+        PUTC(special);
+
+    /*
+     * We have handled it.
+     */
+    return YES;
+}
+
+#ifdef USE_PRETTYSRC
+/*
+ * Pretty-source view: write the entity reference "&name" (name taken from
+ * entity_string), followed by the terminator when it is nonzero, wrapped in
+ * the markup configured for the 'entity' lexeme.
+ */
+static void put_pretty_entity(HTStream *me, int term)
+{
+    PSRCSTART(entity);
+
+    PUTC('&');
+    PUTS(entity_string);
+    if (term != 0) {
+        PUTC(term);
+    }
+
+    PSRCSTOP(entity);
+}
+
+/*
+ * Pretty-source view: write a numeric character reference built from
+ * entity_string ("&#x...;" when me->isHex is set, "&#...;" otherwise),
+ * wrapped in the markup configured for the 'entity' lexeme.
+ */
+static void put_pretty_number(HTStream *me)
+{
+    const char *lead_in = (me->isHex ? "&#x" : "&#");
+
+    PSRCSTART(entity);
+
+    PUTS(lead_in);
+    PUTS(entity_string);
+    PUTC(';');
+
+    PSRCSTOP(entity);
+}
+#endif /* USE_PRETTYSRC */
+
+/* Handle entity
+ * -------------
+ *
+ * On entry,
+ * s contains the entity name zero terminated
+ * Bugs:
+ * If the entity name is unknown, the terminator is treated as
+ * a printable non-special character in all cases, even if it is '<'
+ * Bug-fix:
+ * Modified SGML_character() so we only come here with terminator
+ * as '\0' and check a FoundEntity flag. -- Foteos Macrides
+ *
+ * Modified more (for use with Lynx character translation code):
+ */
+static char replace_buf[64]; /* buffer for replacement strings */
+static BOOL FoundEntity = FALSE;
+
+/*
+ * Output the expansion of the named entity held in me->string.
+ *
+ * me    parser context; me->string->data holds the zero-terminated name
+ * term  terminator character to echo after an unrecognized entity, or
+ *       '\0' for none (in pretty-source view it is also echoed after a
+ *       recognized one)
+ *
+ * The file-level flag FoundEntity is set to TRUE when the entity was
+ * recognized and dealt with, and left FALSE when the name was written out
+ * literally as "&name".
+ *
+ * Strategies are tried in this order: Lynx special codes, a single-byte
+ * translation for the output charset, a replacement string, UTF-8 output,
+ * plain ASCII, and finally silently dropping zwnj/zwj/lrm/rlm.
+ */
+static void handle_entity(HTStream *me, int term)
+{
+    UCode_t code;
+    long uck = -1;
+    const char *s = me->string->data;
+
+    /*
+     * Handle all entities normally. - FM
+     */
+    FoundEntity = FALSE;
+    if ((code = HTMLGetEntityUCValue(s)) != 0) {
+        /*
+         * We got a Unicode value for the entity name.  Check for special
+         * Unicodes. - FM
+         */
+        if (put_special_unicodes(me, code)) {
+#ifdef USE_PRETTYSRC
+            if (psrc_view) {
+                put_pretty_entity(me, term);
+            }
+#endif
+            FoundEntity = TRUE;
+            return;
+        }
+        /*
+         * Seek a translation from the chartrans tables.
+         */
+        if ((uck = UCTransUniChar(code, me->outUCLYhndl)) >= 32 &&
+/* =============== work in ASCII below here ===============  S/390 -- gil -- 0672 */
+            uck < 256 &&
+            (uck < 127 ||
+             uck >= LYlowest_eightbit[me->outUCLYhndl])) {
+#ifdef USE_PRETTYSRC
+            if (psrc_view) {
+                put_pretty_entity(me, term);
+            } else
+#endif
+                PUTC(FROMASCII((char) uck));
+            FoundEntity = TRUE;
+            return;
+        } else if ((uck == -4 ||
+                    (me->T.repl_translated_C0 &&
+                     uck > 0 && uck < 32)) &&
+        /*
+         * Not found; look for replacement string.  The assignment is
+         * parenthesized so that uck receives the function's return value
+         * rather than the 0/1 result of the comparison.
+         */
+                   (uck = UCTransUniCharStr(replace_buf, 60, code,
+                                            me->outUCLYhndl, 0)) >= 0) {
+#ifdef USE_PRETTYSRC
+            if (psrc_view) {
+                put_pretty_entity(me, term);
+            } else
+#endif
+                PUTS(replace_buf);
+            FoundEntity = TRUE;
+            return;
+        }
+        /*
+         * If we're displaying UTF-8, try that now. - FM
+         */
+#ifndef USE_PRETTYSRC
+        if (me->T.output_utf8 && PUTUTF8(code)) {
+            FoundEntity = TRUE;
+            return;
+        }
+#else
+        /*
+         * In pretty-source view the UTF-8 bytes go to a no-op sink; only
+         * the success/failure result of the conversion is used.
+         */
+        if (me->T.output_utf8 && (psrc_view
+                                  ? (UCPutUtf8_charstring((HTStream *) me->target,
+                                                          (putc_func_t *) (fake_put_character),
+                                                          code))
+                                  : PUTUTF8(code))) {
+
+            if (psrc_view) {
+                put_pretty_entity(me, term);
+            }
+
+            FoundEntity = TRUE;
+            return;
+        }
+#endif
+        /*
+         * If it's safe ASCII, use it. - FM
+         */
+        if (code >= 32 && code < 127) {
+#ifdef USE_PRETTYSRC
+            if (psrc_view) {
+                put_pretty_entity(me, term);
+            } else
+#endif
+
+                PUTC(FROMASCII((char) code));
+            FoundEntity = TRUE;
+            return;
+        }
+/* =============== work in ASCII above here ===============  S/390 -- gil -- 0682 */
+        /*
+         * Ignore zwnj (8204) and zwj (8205), if we get to here.  Note that
+         * zwnj may have been handled as <WBR> by the calling function. - FM
+         */
+        if (!strcmp(s, "zwnj") ||
+            !strcmp(s, "zwj")) {
+            CTRACE((tfp, "handle_entity: Ignoring '%s'.\n", s));
+#ifdef USE_PRETTYSRC
+            if (psrc_view) {
+                put_pretty_entity(me, term);
+            }
+#endif
+            FoundEntity = TRUE;
+            return;
+        }
+        /*
+         * Ignore lrm (8206), and rlm (8207), if we get to here. - FM
+         */
+        if (!strcmp(s, "lrm") ||
+            !strcmp(s, "rlm")) {
+            CTRACE((tfp, "handle_entity: Ignoring '%s'.\n", s));
+#ifdef USE_PRETTYSRC
+            if (psrc_view) {
+                put_pretty_entity(me, term);
+            }
+#endif
+            FoundEntity = TRUE;
+            return;
+        }
+    }
+
+    /*
+     * If entity string not found, display as text.
+     */
+#ifdef USE_PRETTYSRC
+    if (psrc_view)
+        PSRCSTART(badseq);
+#endif
+    /* S/390 -- gil -- 0695 */
+    CTRACE((tfp, "SGML: Unknown entity '%s' %" PRI_UCode_t " %ld\n", s, code, uck));
+    PUTC('&');
+    PUTS(s);
+    if (term != '\0')
+        PUTC(term);
+#ifdef USE_PRETTYSRC
+    if (psrc_view)
+        PSRCSTOP(badseq);
+#endif
+}
+
+/* Handle comment
+ * --------------
+ */
+/*
+ * Process a completed comment held in me->string.  A comment beginning
+ * "!--#" is passed to LYDoCSI() when no CSI result is already pending and
+ * LYCheckForCSI() returns TRUE for this document; every other comment goes
+ * to LYCommentHacks().
+ */
+static void handle_comment(HTStream *me)
+{
+    const char *text = me->string->data;
+
+    CTRACE((tfp, "SGML Comment:\n<%s>\n", text));
+
+    if (me->csi != NULL ||
+        StrNCmp(text, "!--#", 4) != 0 ||
+        LYCheckForCSI(me->node_anchor, &me->url) != TRUE) {
+        LYCommentHacks(me->node_anchor, me->string->data);
+        return;
+    }
+
+    LYDoCSI(me->url, text, &me->csi);
+}
+
+/* Handle identifier
+ * -----------------
+ */
+/*
+ * Process a completed identifier held in me->string.  Currently this only
+ * writes the text to the trace log.
+ */
+static void handle_identifier(HTStream *me)
+{
+    CTRACE((tfp, "SGML Identifier:\n<%s>\n", me->string->data));
+}
+
+/* Handle doctype
+ * --------------
+ */
+/*
+ * Process a completed DOCTYPE declaration held in me->string.  If the text
+ * contains "DTD XHTML " the parser is switched to extended-HTML (XHTML)
+ * mode; otherwise nothing changes.
+ */
+static void handle_doctype(HTStream *me)
+{
+    const char *decl = me->string->data;
+
+    CTRACE((tfp, "SGML Doctype:\n<%s>\n", decl));
+
+    if (strstr(decl, "DTD XHTML ") == NULL)
+        return;
+
+    CTRACE((tfp, "...processing extended HTML\n"));
+    me->extended_html = TRUE;
+}
+
+/* Handle marked
+ * -------------
+ */
+/*
+ * Process a completed marked section held in me->string.
+ *
+ * "![INCLUDE[" sections: the chunk is cut off three bytes before its
+ * recorded size and everything after the 10-byte prefix is appended to
+ * me->include.
+ * "![CDATA[" sections: the bytes after the 8-byte prefix (size - 11 of
+ * them) are handed to the target's put_block action.
+ * Any other marked section is ignored.
+ */
+static void handle_marked(HTStream *me)
+{
+    HTChunk *chunk = me->string;
+    char *text = chunk->data;
+
+    CTRACE((tfp, "SGML Marked Section:\n<%s>\n", text));
+
+    if (StrNCmp(text, "![INCLUDE[", 10) == 0) {
+        text[chunk->size - 3] = '\0';
+        StrAllocCat(me->include, text + 10);
+        /* @@@ This needs to take charset into account! @@@
+           the wrong assumptions will be made about the data's
+           charset once it is in include - kw */
+    } else if (StrNCmp(text, "![CDATA[", 8) == 0) {
+        (*me->actions->put_block) (me->target,
+                                   text + 8,
+                                   chunk->size - 11);
+    }
+}
+
+/* Handle processing instruction
+ * -----------------------------
+ */
+/*
+ * Process a completed processing instruction held in me->string.
+ *
+ * Only "<?xml ...?>" is acted on: it turns on strict XML mode and, unless
+ * UTF-8 decoding is already active, turns it on when the declaration
+ * carries encoding=utf-8 (case-insensitive; if the value is quoted, the
+ * matching quote must follow immediately).
+ */
+static void handle_processing_instruction(HTStream *me)
+{
+    const char *pi = me->string->data;
+    const char *enc;
+    char quote = 0;
+
+    CTRACE((tfp, "SGML Processing instruction:\n<%s>\n", pi));
+
+    if (StrNCmp(pi, "?xml ", 5) != 0)
+        return;
+
+    me->strict_xml = TRUE;
+
+    /*
+     * Switch to UTF-8 if the encoding is explicitly "utf-8".
+     */
+    if (me->T.decode_utf8)
+        return;                 /* already decoding UTF-8 */
+
+    enc = strstr(pi, "encoding=");
+    if (enc == NULL)
+        return;
+
+    enc += 9;                   /* skip over "encoding=" */
+    if (*enc == '"' || *enc == '\'')
+        quote = *enc++;
+
+    if (strncasecomp(enc, "utf-8", 5) != 0)
+        return;
+    if (quote != 0 && enc[5] != quote)
+        return;                 /* e.g. "utf-80": not exactly utf-8 */
+
+    CTRACE((tfp, "...Use UTF-8 for XML\n"));
+    me->T.decode_utf8 = TRUE;
+}
+
+/* Handle sgmlent
+ * --------------
+ */
+/*
+ * Process a completed SGML entity declaration held in me->string.
+ * Currently this only writes the text to the trace log.
+ */
+static void handle_sgmlent(HTStream *me)
+{
+    CTRACE((tfp, "SGML Entity Declaration:\n<%s>\n", me->string->data));
+}
+
+/* Handle sgmlele
+ * --------------
+ */
+/*
+ * Process a completed SGML element declaration held in me->string.
+ * Currently this only writes the text to the trace log.
+ */
+static void handle_sgmlele(HTStream *me)
+{
+    CTRACE((tfp, "SGML Element Declaration:\n<%s>\n", me->string->data));
+}
+
+/* Handle sgmlatt
+ * --------------
+ */
+/*
+ * Process a completed SGML attribute declaration held in me->string.
+ * Currently this only writes the text to the trace log.
+ */
+static void handle_sgmlatt(HTStream *me)
+{
+    CTRACE((tfp, "SGML Attribute Declaration:\n<%s>\n", me->string->data));
+}
+
+/*
+ * Convenience macros - tags (elements) are identified sometimes by an int or
+ * enum value ('TAGNUM'), sometimes by a pointer to HTTag ('TAGP'). - kw
+ */
+#define TAGNUM_OF_TAGP(t) (HTMLElement) (t - me->dtd->tags)
+#define TAGP_OF_TAGNUM(e) (me->dtd->tags + e)
+
+/*
+ * The following implement special knowledge about OBJECT. As long as
+ * HTML_OBJECT is the only tag for which an alternative variant exist, they can
+ * be simple macros. - kw
+ */
+/* does 'TAGNUM' e have an alternative (variant) parsing mode? */
+#define HAS_ALT_TAGNUM(e) (e == HTML_OBJECT)
+
+/* return 'TAGNUM' of the alternative mode for 'TAGNUM' e, if any. */
+#define ALT_TAGNUM(e) ((e == HTML_OBJECT) ? HTML_ALT_OBJECT : e)
+
+/* return 'TAGNUM' of the normal mode for 'TAGNUM' e which may be alt. */
+#define NORMAL_TAGNUM(e) (((int)(e) >= HTML_ELEMENTS) ? HTML_OBJECT : (HTMLElement)e)
+
+/* More convenience stuff. - kw */
+#define ALT_TAGP_OF_TAGNUM(e) TAGP_OF_TAGNUM(ALT_TAGNUM(e))
+#define NORMAL_TAGP_OF_TAGNUM(e) TAGP_OF_TAGNUM(NORMAL_TAGNUM(e))
+
+#define ALT_TAGP(t) ALT_TAGP_OF_TAGNUM(TAGNUM_OF_TAGP(t))
+#define NORMAL_TAGP(t) NORMAL_TAGP_OF_TAGNUM(TAGNUM_OF_TAGP(t))
+
+#define IsTagAlias(a,b) (((a) == (b)) || ((a) - (a)->alias == (b) - (b)->alias))
+
+/*
+ * Loose containment test: may new_tag appear inside stacked_tag?
+ *
+ * 'direct' selects the direct (contains/contained) or the indirect
+ * (icontains/icontained) class masks.  The answer is YES when the two tags'
+ * classes overlap the relevant masks at all; tags that are aliases of each
+ * other are tested against Tgc_same instead.  If either tag is NULL the
+ * answer is YES.
+ */
+static BOOL element_valid_within(HTTag * new_tag, HTTag * stacked_tag, int direct)
+{
+    TagClass outer_allows, inner_allows;
+
+    if (!stacked_tag || !new_tag)
+        return YES;
+
+    if (direct) {
+        outer_allows = stacked_tag->contains;
+        inner_allows = new_tag->contained;
+    } else {
+        outer_allows = stacked_tag->icontains;
+        inner_allows = new_tag->icontained;
+    }
+
+    if (IsTagAlias(new_tag, stacked_tag)) {
+        return (BOOL) ((Tgc_same & outer_allows) &&
+                       (Tgc_same & inner_allows));
+    }
+    return (BOOL) ((new_tag->tagclass & outer_allows) &&
+                   (stacked_tag->tagclass & inner_allows));
+}
+
+/*
+ * Strict containment test: like element_valid_within(), except that for
+ * non-alias tags every class bit of new_tag must be covered by the outer
+ * mask and every class bit of stacked_tag by the inner mask, rather than
+ * merely overlapping.  If either tag is NULL the answer is YES.
+ */
+static BOOL element_really_within(HTTag * new_tag, HTTag * stacked_tag, int direct)
+{
+    TagClass outer_allows, inner_allows;
+
+    if (!stacked_tag || !new_tag)
+        return YES;
+
+    if (direct) {
+        outer_allows = stacked_tag->contains;
+        inner_allows = new_tag->contained;
+    } else {
+        outer_allows = stacked_tag->icontains;
+        inner_allows = new_tag->icontained;
+    }
+
+    if (IsTagAlias(new_tag, stacked_tag)) {
+        return (BOOL) ((Tgc_same & outer_allows) &&
+                       (Tgc_same & inner_allows));
+    }
+    return (BOOL) (((new_tag->tagclass & outer_allows) == new_tag->tagclass) &&
+                   ((stacked_tag->tagclass & inner_allows) == stacked_tag->tagclass));
+}
+
+typedef enum {
+ close_NO = 0,
+ close_error = 1,
+ close_valid = 2
+} canclose_t;
+
+/*
+ * Decide whether encountering new_tag may implicitly close stacked_tag.
+ *
+ * Returns close_NO    if there is no stacked tag, or new_tag's canclose
+ *                     mask does not cover it;
+ *         close_valid if the stacked tag has the Tgf_endO flag;
+ *         close_error if new_tag's canclose mask covers the stacked tag's
+ *                     class (or Tgc_same, for aliases of the same tag).
+ */
+static canclose_t can_close(HTTag * new_tag, HTTag * stacked_tag)
+{
+    TagClass stacked_class;
+
+    if (!stacked_tag)
+        return close_NO;
+
+    if (stacked_tag->flags & Tgf_endO)
+        return close_valid;
+
+    if (IsTagAlias(new_tag, stacked_tag))
+        stacked_class = Tgc_same;
+    else
+        stacked_class = stacked_tag->tagclass;
+
+    if (stacked_class & new_tag->canclose)
+        return close_error;
+    return close_NO;
+}
+
+/*
+ * Pop the top element off the element stack, telling the target that the
+ * element ended (not done in pretty-source view).  Leaving a SELECT clears
+ * the inSELECT flag, and no_lynx_specialcodes is refreshed from the new top
+ * of the stack.  Does nothing when the stack is empty.
+ */
+static void do_close_stacked(HTStream *me)
+{
+    HTElement *top = me->element_stack;
+    HTMLElement tagnum;
+
+    if (top == NULL)
+        return;                 /* stack was empty */
+
+    if (me->inSELECT && strcasecomp(top->tag->name, "SELECT") == 0)
+        me->inSELECT = FALSE;
+
+    tagnum = NORMAL_TAGNUM(TAGNUM_OF_TAGP(top->tag));
+#ifdef USE_PRETTYSRC
+    if (!psrc_view)             /* Don't actually pass call on if viewing psrc - kw */
+#endif
+        (*me->actions->end_element) (me->target,
+                                     (int) tagnum,
+                                     &me->include);
+
+    me->element_stack = top->next;
+    pool_free(top);
+
+    if (me->element_stack != NULL) {
+        me->no_lynx_specialcodes =
+            (BOOL) (me->element_stack->tag->flags & Tgf_nolyspcl);
+    } else {
+        me->no_lynx_specialcodes = NO;
+    }
+}
+
+/*
+ * Find old_tag on the element stack.  Returns the 1-based position counted
+ * from the top of the stack of the first entry that is an alias of old_tag
+ * or is its alternative-mode variant, or 0 if there is none.
+ */
+static int is_on_stack(HTStream *me, HTTag * old_tag)
+{
+    HTElement *entry = me->element_stack;
+    int position = 0;
+
+    while (entry != NULL) {
+        position++;
+        if (IsTagAlias(entry->tag, old_tag) ||
+            entry->tag == ALT_TAGP(old_tag)) {
+            return position;
+        }
+        entry = entry->next;
+    }
+    return 0;
+}
+
+/* End element
+ * -----------
+ */
+/*
+ * Handle an end tag for old_tag.
+ *
+ * With the new DTD logic (!Old_DTD) the function first tries to close
+ * intervening stacked elements that can_close() allows, and gives up
+ * (ignoring the end tag) when old_tag is not on the stack or is still
+ * buried below other elements.  Then, common to both modes: end tags other
+ * than </SELECT> are ignored inside a SELECT block, end tags for EMPTY
+ * elements are rejected, and the top stack element is ended via the
+ * target's end_element action.  Depending on the status that action
+ * returns, the element is reopened, switched to its alternative content
+ * model, or popped from the stack.
+ */
+static void end_element(HTStream *me, HTTag * old_tag)
+{
+ BOOL extra_action_taken = NO;
+ canclose_t canclose_check = close_valid;
+ int stackpos = is_on_stack(me, old_tag);
+ BOOL direct_container = YES;
+
+ if (!Old_DTD) {
+ /*
+ * If the tag has aliases and only the next table entry (old_tag + 1)
+ * really fits inside the current top of stack, use that entry instead.
+ */
+ if (old_tag->aliases) {
+ if (me->element_stack) {
+ if (!element_really_within(old_tag,
+ me->element_stack->tag,
+ direct_container) &&
+ element_really_within(old_tag + 1,
+ me->element_stack->tag,
+ direct_container)) {
+ ++old_tag;
+ }
+ }
+ }
+ /*
+ * Close stacked elements above old_tag (stackpos > 1), or - at most
+ * until one element has been closed - when old_tag is not on the stack
+ * at all (stackpos == 0), for as long as can_close() permits.
+ */
+ while (canclose_check != close_NO &&
+ me->element_stack &&
+ (stackpos > 1 || (!extra_action_taken && stackpos == 0))) {
+ if (stackpos == 0 && (old_tag->flags & Tgf_startO) &&
+ element_valid_within(old_tag, me->element_stack->tag, YES)) {
+ CTRACE((tfp, "SGML: </%s> ignored\n", old_tag->name));
+ return;
+ }
+ canclose_check = can_close(old_tag, me->element_stack->tag);
+ if (canclose_check != close_NO) {
+ CTRACE((tfp, "SGML: End </%s> \t<- %s end </%s>\n",
+ me->element_stack->tag->name,
+ ((canclose_check == close_valid)
+ ? "supplied,"
+ : "***forced by"),
+ old_tag->name));
+ do_close_stacked(me);
+ extra_action_taken = YES;
+ stackpos = is_on_stack(me, old_tag);
+ }
+ }
+
+ if (stackpos == 0 && old_tag->contents != SGML_EMPTY) {
+ CTRACE((tfp, "SGML: Still open %s, ***no open %s for </%s>\n",
+ me->element_stack ?
+ me->element_stack->tag->name : "none",
+ old_tag->name,
+ old_tag->name));
+ return;
+ }
+ if (stackpos > 1) {
+ CTRACE((tfp,
+ "SGML: Nesting <%s>...<%s> \t<- ***invalid end </%s>\n",
+ old_tag->name,
+ me->element_stack ?
+ me->element_stack->tag->name : "none",
+ old_tag->name));
+ return;
+ }
+ }
+ /* Now let the non-extended code deal with the rest. - kw */
+
+ /*
+ * If we are in a SELECT block, ignore anything but a SELECT end tag. - FM
+ */
+ if (me->inSELECT) {
+ if (!strcasecomp(old_tag->name, "SELECT")) {
+ /*
+ * Turn off the inSELECT flag and fall through. - FM
+ */
+ me->inSELECT = FALSE;
+ } else {
+ /*
+ * Ignore the end tag. - FM
+ */
+ CTRACE((tfp, "SGML: ***Ignoring end tag </%s> in SELECT block.\n",
+ old_tag->name));
+ return;
+ }
+ }
+ /*
+ * Handle the end tag. - FM
+ */
+ CTRACE((tfp, "SGML: End </%s>\n", old_tag->name));
+ if (old_tag->contents == SGML_EMPTY) {
+ CTRACE((tfp, "SGML: ***Illegal end tag </%s> found.\n",
+ old_tag->name));
+ return;
+ }
+#ifdef WIND_DOWN_STACK
+ while (me->element_stack) /* Loop is error path only */
+#else
+ if (me->element_stack) /* Substitute and remove one stack element */
+#endif /* WIND_DOWN_STACK */
+ {
+ int status = HT_OK;
+ HTMLElement e;
+ HTElement *N = me->element_stack;
+ /* compare against the normal-mode tag when the stacked one differs */
+ HTTag *t = (N->tag != old_tag) ? NORMAL_TAGP(N->tag) : N->tag;
+
+ if (old_tag != t) { /* Mismatch: syntax error */
+ if (me->element_stack->next) { /* This is not the last level */
+ CTRACE((tfp,
+ "SGML: Found </%s> when expecting </%s>. </%s> ***assumed.\n",
+ old_tag->name, t->name, t->name));
+ } else { /* last level */
+ CTRACE((tfp,
+ "SGML: Found </%s> when expecting </%s>. </%s> ***Ignored.\n",
+ old_tag->name, t->name, old_tag->name));
+ return; /* Ignore */
+ }
+ }
+
+ e = NORMAL_TAGNUM(TAGNUM_OF_TAGP(t));
+ CTRACE2(TRACE_SGML, (tfp, "tagnum(%p) = %d\n", (void *) t, (int) e));
+ /* in pretty-source view the call is skipped and status stays HT_OK */
+#ifdef USE_PRETTYSRC
+ if (!psrc_view) /* Don't actually pass call on if viewing psrc - kw */
+#endif
+ status = (*me->actions->end_element) (me->target,
+ (int) e,
+ &me->include);
+ if (status == HT_PARSER_REOPEN_ELT) {
+ CTRACE((tfp, "SGML: Restart <%s>\n", t->name));
+ (*me->actions->start_element) (me->target,
+ (int) e,
+ NULL,
+ NULL,
+ me->current_tag_charset,
+ &me->include);
+ } else if (status == HT_PARSER_OTHER_CONTENT) {
+ CTRACE((tfp, "SGML: Continue with other content model for <%s>\n", t->name));
+ me->element_stack->tag = ALT_TAGP_OF_TAGNUM(e);
+ } else {
+ me->element_stack = N->next; /* Remove from stack */
+ pool_free(N);
+ }
+ me->no_lynx_specialcodes =
+ (BOOL) (me->element_stack
+ ? (me->element_stack->tag->flags & Tgf_nolyspcl)
+ : NO);
+#ifdef WIND_DOWN_STACK
+ if (IsTagAlias(old_tag, t))
+ return; /* Correct sequence */
+#else
+ return;
+#endif /* WIND_DOWN_STACK */
+
+ /* Syntax error path only */
+
+ }
+ CTRACE((tfp, "SGML: Extra end tag </%s> found and ignored.\n",
+ old_tag->name));
+}
+
+/* Start an element
+ * ----------------
+ *
+ * Handles the start tag held in me->current_tag.
+ *
+ * When Old_DTD is not set, the tag is first reconciled with the element
+ * stack: the next table entry (new_tag + 1) may be chosen for a tag
+ * that has aliases, stacked elements are closed via do_close_stacked()
+ * while can_close() permits it, a tag that is still not valid inside a
+ * Tgf_strict stack top is dropped, and a Tgf_mafse tag without any
+ * present attribute that could not close anything is handed to
+ * end_element() instead.
+ *
+ * After that, start tags inside a SELECT block are filtered, the
+ * target's start_element action is called, and the tag is pushed on
+ * the element stack unless its content model is SGML_EMPTY.  A META
+ * tag whose content model is SGML_EMPTY triggers
+ * change_chartrans_handling().
+*/
+static void start_element(HTStream *me)
+{
+ int status;
+ HTTag *new_tag = me->current_tag;
+ HTMLElement e = TAGNUM_OF_TAGP(new_tag); /* taken before any alias switch below */
+ BOOL ok = FALSE; /* TRUE: form-related tag seen inside SELECT */
+
+ BOOL valid = YES; /* last element_valid_within() result */
+ BOOL direct_container = YES; /* passed to the *_within() checks;
+ * cleared after a close_error close */
+ BOOL extra_action_taken = NO; /* YES once do_close_stacked() has run */
+ canclose_t canclose_check = close_valid;
+
+ if (!Old_DTD) {
+ if (new_tag->aliases) { /* use new_tag + 1 if only that entry
+ * really fits within the stack top */
+ if (me->element_stack) {
+ if (!element_really_within(new_tag,
+ me->element_stack->tag,
+ direct_container) &&
+ element_really_within(new_tag + 1,
+ me->element_stack->tag,
+ direct_container)) {
+ ++new_tag;
+ }
+ }
+ }
+ while (me->element_stack && /* loop ends when the stack is empty,
+ * new_tag is valid within the stack top,
+ * or the last can_close() result no
+ * longer permits another round */
+ (canclose_check == close_valid ||
+ (canclose_check == close_error &&
+ IsTagAlias(new_tag, me->element_stack->tag))) &&
+ !(valid = element_valid_within(new_tag,
+ me->element_stack->tag,
+ direct_container))) {
+ canclose_check = can_close(new_tag, me->element_stack->tag);
+ if (canclose_check != close_NO) {
+ CTRACE((tfp, "SGML: End </%s> \t<- %s start <%s>\n",
+ me->element_stack->tag->name,
+ ((canclose_check == close_valid)
+ ? "supplied,"
+ : "***forced by"),
+ new_tag->name));
+ do_close_stacked(me);
+ extra_action_taken = YES;
+ if (canclose_check == close_error)
+ direct_container = NO;
+ } else {
+ CTRACE((tfp,
+ "SGML: Still open %s \t<- ***invalid start <%s>\n",
+ me->element_stack->tag->name,
+ new_tag->name));
+ }
+ }
+ if (me->element_stack && !valid && /* Tgf_strict stack top: re-check,
+ * then drop the invalid start tag */
+ (me->element_stack->tag->flags & Tgf_strict) &&
+ !(valid = element_valid_within(new_tag,
+ me->element_stack->tag,
+ direct_container))) {
+ CTRACE((tfp, "SGML: Still open %s \t<- ***ignoring start <%s>\n",
+ me->element_stack->tag->name,
+ new_tag->name));
+ return;
+ }
+
+ if (me->element_stack && /* nothing was closed, closing was
+ * refused, and the tag is invalid here */
+ !extra_action_taken &&
+ (canclose_check == close_NO) &&
+ !valid && (new_tag->flags & Tgf_mafse)) {
+ BOOL has_attributes = NO;
+ int i = 0;
+
+ for (; i < new_tag->number_of_attributes && !has_attributes; i++) /* stops at the first nonzero me->present[] entry */
+ has_attributes = me->present[i];
+ if (!has_attributes) {
+ CTRACE((tfp,
+ "SGML: Still open %s, ***converting invalid <%s> to </%s>\n",
+ me->element_stack->tag->name,
+ new_tag->name,
+ new_tag->name));
+ end_element(me, new_tag);
+ return;
+ }
+ }
+
+ if (me->element_stack && /* trace only: no action is taken here */
+ (canclose_check == close_error) &&
+ !element_valid_within(new_tag,
+ me->element_stack->tag,
+ direct_container)) {
+ CTRACE((tfp, "SGML: Still open %s \t<- ***invalid start <%s>\n",
+ me->element_stack->tag->name,
+ new_tag->name));
+ }
+ }
+ /* Fall through to the non-extended code - kw */
+
+ /*
+ * If we are not in a SELECT block, check if this is a SELECT start tag.
+ * Otherwise (i.e., we are in a SELECT block) accept only OPTION as valid,
+ * terminate the SELECT block if it is any other form-related element, and
+ * otherwise ignore it. - FM
+ */
+ if (!me->inSELECT) {
+ /*
+ * We are not in a SELECT block, so check if this starts one. - FM
+ * (frequent case!)
+ */
+ /* my_casecomp() - optimized by the first character */
+ if (!my_casecomp(new_tag->name, "SELECT")) {
+ /*
+ * Set the inSELECT flag and fall through. - FM
+ */
+ me->inSELECT = TRUE;
+ }
+ } else {
+ /*
+ * We are in a SELECT block. - FM
+ */
+ if (strcasecomp(new_tag->name, "OPTION")) {
+ /*
+ * Ugh, it is not an OPTION. - FM
+ * The switch below uses e, i.e. the tag number computed at
+ * function entry.
+ */
+ switch (e) {
+ case HTML_INPUT:
+ case HTML_TEXTAREA:
+ case HTML_SELECT:
+ case HTML_BUTTON:
+ case HTML_FIELDSET:
+ case HTML_LABEL:
+ case HTML_LEGEND:
+ case HTML_FORM:
+ ok = TRUE;
+ break;
+ default:
+ break;
+ }
+ if (ok) {
+ /*
+ * It is another form-related start tag, so terminate the
+ * current SELECT block and fall through. - FM
+ */
+ CTRACE((tfp,
+ "SGML: ***Faking SELECT end tag before <%s> start tag.\n",
+ new_tag->name));
+ end_element(me, SGMLFindTag(me->dtd, "SELECT"));
+ } else {
+ /*
+ * Ignore the start tag. - FM
+ */
+ CTRACE((tfp,
+ "SGML: ***Ignoring start tag <%s> in SELECT block.\n",
+ new_tag->name));
+ return;
+ }
+ }
+ }
+ /*
+ * Handle the start tag. - FM
+ * The tag number passed to the target is recomputed from new_tag,
+ * which may have been advanced to new_tag + 1 above.
+ */
+ CTRACE((tfp, "SGML: Start <%s>\n", new_tag->name));
+ status = (*me->actions->start_element) (me->target,
+ (int) TAGNUM_OF_TAGP(new_tag),
+ me->present,
+ (STRING2PTR) me->value, /* coerce type for think c */
+ me->current_tag_charset,
+ &me->include);
+ if (status == HT_PARSER_OTHER_CONTENT)
+ new_tag = ALT_TAGP(new_tag); /* this is only returned for OBJECT */
+ if (new_tag->contents != SGML_EMPTY) { /* i.e., tag not empty */
+ HTElement *N = pool_alloc(); /* new top-of-stack entry */
+
+ if (N == NULL)
+ outofmem(__FILE__, "start_element");
+
+ N->next = me->element_stack;
+ N->tag = new_tag;
+ me->element_stack = N;
+ me->no_lynx_specialcodes = (BOOLEAN) (new_tag->flags & Tgf_nolyspcl);
+
+ } else if (e == HTML_META) {
+ /*
+ * Check for result of META tag. - KW & FM
+ */
+ change_chartrans_handling(me);
+ }
+}
+
+/* Find Tag in DTD tag list
+ * ------------------------
+ *
+ * Parameters:
+ *	dtd	DTD structure whose tag table (dtd->tags, with
+ *		dtd->number_of_tags entries) is searched
+ *	s	name of the tag in question, compared without regard
+ *		to case
+ *
+ * Returns:
+ *	the address of the matching tag structure, stepped back by that
+ *		entry's "alias" count;
+ *	&HTTag_unrecognized when nothing matched but the first character
+ *		of s satisfies IsNmStart();
+ *	NULL otherwise.
+ *
+ * A static cache with 64 slots, selected by the first byte of s, holds
+ * the most recent hit per slot and is consulted before the binary
+ * search.  The binary search relies on dtd->tags being ordered
+ * consistently with my_casecomp().
+ */
+HTTag *SGMLFindTag(const SGML_dtd * dtd,
+		   const char *s)
+{
+    static HTTag *recent[64] =
+    {NULL};			/* previous results, one per slot */
+    HTTag **slot = &recent[UCH(*s) % 64];
+    int lo;
+    int hi;
+
+    /* Try the cached entry for this first character. */
+    if (*slot != NULL) {
+	if ((*slot)->name == NULL)
+	    return NULL;
+	if (strcasecomp((*slot)->name, s) == 0)
+	    return *slot;
+    }
+
+    /* Binary search over the half-open range [lo, hi). */
+    lo = 0;
+    hi = dtd->number_of_tags;
+    while (hi > lo) {
+	int mid = (lo + (hi - lo) / 2);
+	/* my_casecomp() - optimized by the first character, NOT_ASCII ok */
+	int order = my_casecomp(dtd->tags[mid].name, s);
+
+	if (order == 0) {
+	    /* Found it: step back by the alias count and cache the hit. */
+	    mid -= dtd->tags[mid].alias;
+	    *slot = &dtd->tags[mid];
+	    return *slot;
+	}
+	if (order < 0) {
+	    lo = mid + 1;
+	} else {
+	    hi = mid;
+	}
+    }
+
+    /* Unrecognized, but may still be a valid name. */
+    if (IsNmStart(*s)) {
+	return &HTTag_unrecognized;
+    }
+    return NULL;
+}
+
+/*________________________________________________________________________
+ * Public Methods
+ */
+
+/*
+ * Release an SGML parser context.
+ *
+ * Frees the auxiliary string buffers, pops every element still on the
+ * element stack (calling the target's end_element action for each one,
+ * except when psrc_view is set in a USE_PRETTYSRC build), calls the
+ * target's _free action, and finally frees the parser's own chunk,
+ * attribute values and context structure.
+ */
+static void SGML_free(HTStream *me)
+{
+    HTElement *top;
+    HTTag *open_tag;
+    int n;
+
+    /* Drop the string buffers owned by the context. */
+    FREE(me->active_include);
+    FREE(me->include);
+    FREE(me->csi);
+    FREE(me->url);
+    FREE(me->recover);
+
+    /* Unwind whatever is still open on the element stack. */
+    for (top = me->element_stack; top != NULL; top = me->element_stack) {
+	open_tag = top->tag;
+	me->element_stack = top->next;	/* unlink before releasing */
+	pool_free(top);
+#ifdef USE_PRETTYSRC
+	if (!psrc_view)		/* the target is not called when viewing psrc */
+#endif
+	    (*me->actions->end_element)
+		(me->target,
+		 (int) NORMAL_TAGNUM(TAGNUM_OF_TAGP(open_tag)),
+		 &me->include);
+	/* Anything the target left in the include buffer is discarded. */
+	FREE(me->include);
+    }
+
+    /* Let the target finish. */
+    (*me->actions->_free) (me->target);
+
+    /* Release the token buffer, the attribute values, and the context. */
+    HTChunkFree(me->string);
+    n = 0;
+    while (n < MAX_ATTRIBUTES) {
+	FREE_extra(me->value[n]);
+	n++;
+    }
+    FREE(me);
+
+#ifdef USE_PRETTYSRC
+    sgml_in_psrc_was_initialized = FALSE;
+#endif
+}
+
+/*
+ * Abort an SGML parser context.
+ *
+ * Calls the target's _abort action with the error e, then releases the
+ * string buffers, any elements left on the element stack (without
+ * calling the target for them), the token chunk, the attribute values
+ * and the context structure itself.
+ */
+static void SGML_abort(HTStream *me, HTError e)
+{
+    HTElement *top;
+    int n;
+
+    /* The target is told first. */
+    (*me->actions->_abort) (me->target, e);
+
+    /* Drop the string buffers owned by the context. */
+    FREE(me->csi);
+    FREE(me->url);
+    FREE(me->active_include);
+    FREE(me->include);
+    FREE(me->recover);
+
+    /* Release stack entries for elements that were left open. */
+    for (top = me->element_stack; top != NULL; top = me->element_stack) {
+	me->element_stack = top->next;	/* unlink before releasing */
+	pool_free(top);
+    }
+
+    /* Release the token buffer, the attribute values, and the context. */
+    HTChunkFree(me->string);
+    n = 0;
+    while (n < MAX_ATTRIBUTES) {
+	FREE_extra(me->value[n]);
+	n++;
+    }
+    FREE(me);
+
+#ifdef USE_PRETTYSRC
+    sgml_in_psrc_was_initialized = FALSE;
+#endif
+}
+
+/* Read and write user callback handle
+ * -----------------------------------
+ *
+ * The callbacks from the SGML parser have an SGML context parameter.
+ * These calls allow the caller to associate his own context with a
+ * particular SGML context.
+ *
+ * SGML_callerData() returns the pointer currently stored in
+ * me->callerData; SGML_setCallerData() overwrites that field with
+ * data.  Neither function interprets the pointer.  Both are compiled
+ * only when CALLERDATA is defined.
+ */
+
+#ifdef CALLERDATA
+void *SGML_callerData(HTStream *me)
+{
+ return me->callerData;
+}
+
+void SGML_setCallerData(HTStream *me, void *data)
+{
+ me->callerData = data;
+}
+#endif /* CALLERDATA */
+
+#ifdef USE_PRETTYSRC
+/*
+ * Apply the configured case transformation to the tag name held in
+ * string->data, in place.  Nothing happens when me->strict_xml is set
+ * or when tagname_transform is 1; a value of 0 selects LYLowerCase(),
+ * any other value LYUpperCase().
+ */
+static void transform_tag(HTStream *me, HTChunk *string)
+{
+    if (me->strict_xml || tagname_transform == 1)
+	return;			/* leave the name as written */
+
+    if (tagname_transform == 0) {
+	LYLowerCase(string->data);
+    } else {
+	LYUpperCase(string->data);
+    }
+}
+#endif /* USE_PRETTYSRC */
+
+/*
+ * Decide whether the given tag is to be ignored when it is empty.
+ *
+ * Returns TRUE only if LYPreparsedSource is off and LYxhtml_parsing is
+ * on, the tag has a name, lacks the Tgf_mafse flag, has a content model
+ * other than SGML_EMPTY, is not of class Tgc_Plike, and either is of
+ * class Tgc_APPLETlike or Tgc_SELECTlike or has both "contains" and
+ * "icontains" set.  The decision is also written to the trace log.
+ */
+static BOOL ignore_when_empty(HTTag * tag)
+{
+    BOOL skip = FALSE;
+
+    if (!LYPreparsedSource && LYxhtml_parsing) {
+	/* Tags that never qualify, whatever their class. */
+	if (tag->name != 0
+	    && !(tag->flags & Tgf_mafse)
+	    && tag->contents != SGML_EMPTY
+	    && tag->tagclass != Tgc_Plike) {
+	    /* Of the remaining tags, only these classes/containers qualify. */
+	    if (tag->tagclass == Tgc_APPLETlike
+		|| tag->tagclass == Tgc_SELECTlike
+		|| (tag->contains && tag->icontains)) {
+		skip = TRUE;
+	    }
+	}
+    }
+    CTRACE((tfp, "SGML Do%s ignore_when_empty:%s\n",
+	    skip ? "" : " not",
+	    NonNull(tag->name)));
+    return skip;
+}
+
+/*
+ * Throw away the tag currently being collected: trace its name, empty
+ * the token chunk, and point me->current_tag at a zero-filled
+ * placeholder tag that is private to this function.
+ */
+static void discard_empty(HTStream *me)
+{
+    static HTTag blank_tag;	/* shared placeholder, re-zeroed on each call */
+
+    CTRACE((tfp, "SGML discarding empty %s\n",
+	    NonNull(me->current_tag->name)));
+    CTRACE_FLUSH(tfp);
+
+    me->string->size = 0;
+
+    memset(&blank_tag, 0, sizeof(blank_tag));
+    me->current_tag = &blank_tag;
+
+    /* do not call end_element() if start_element() was not called */
+}
+
+#ifdef USE_PRETTYSRC
+/*
+ * When psrc_view is set, emit the attribute value held in string->data
+ * to the target: a NAME value gets an anchor that is started and ended
+ * before the text is written, an HREF value is wrapped in href markup
+ * and an anchor around the text.  If end_ch is nonzero it is written
+ * after the value, and the attrval markup is stopped last.
+ *
+ * Returns the value psrc_view had on entry; nothing is emitted when
+ * it is not set.
+ */
+static BOOL end_if_prettysrc(HTStream *me, HTChunk *string, int end_ch)
+{
+    BOOL in_psrc = psrc_view;
+
+    if (!psrc_view)
+	return in_psrc;
+
+    if (attr_is_name) {
+	HTStartAnchor(me->target, string->data, NULL);
+	(*me->actions->end_element) (me->target, HTML_A, &me->include);
+    } else if (attr_is_href) {
+	PSRCSTART(href);
+	HTStartAnchor(me->target, NULL, string->data);
+    }
+
+    /* The value text itself. */
+    PUTS_TR(string->data);
+
+    if (attr_is_href) {
+	(*me->actions->end_element) (me->target, HTML_A, &me->include);
+	PSRCSTOP(href);
+    }
+    if (end_ch) {
+	PUTC(end_ch);
+    }
+    PSRCSTOP(attrval);
+
+    return in_psrc;
+}
+#endif
+
+static void SGML_character(HTStream *me, int c_in)
+{
+ const SGML_dtd *dtd = me->dtd;
+ HTChunk *string = me->string;
+ const char *EntityName;
+ HTTag *testtag = NULL;
+ BOOLEAN chk; /* Helps (?) walk through all the else ifs... */
+ UCode_t clong, uck = 0; /* Enough bits for UCS4 ... */
+ int testlast;
+
+ unsigned char c;
+ unsigned char saved_char_in = '\0';
+
+ ++sgml_offset;
+
+ c = UCH(c_in);
+ clong = UCH(c);
+
+ if (me->T.decode_utf8) {
+ switch (HTDecodeUTF8(&(me->U), &c_in, &clong)) {
+ case dUTF8_ok:
+ if (clong < 256) {
+ c_in = FROMASCII(UCH(clong));
+ }
+ break;
+ case dUTF8_err:
+ clong = UCS_REPL;
+ strcpy(me->U.utf_buf, "\357\277\275");
+ me->U.utf_buf_p = (me->U.utf_buf + 3);
+ break;
+ case dUTF8_more:
+ return;
+ }
+
+ c = UCH(c_in);
+ if ((me->U.utf_buf_p - me->U.utf_buf) > 1) {
+ goto top1;
+ }
+ }
+
+ /*
+ * If we want the raw input converted to Unicode, try that now. - FM
+ */
+#ifdef USE_JAPANESEUTF8_SUPPORT
+ /* Convert ISO-2022-JP to Unicode (charset=iso-2022-jp is unrecognized) */
+#define IS_JIS7_HILO(c) (0x20<(c)&&(c)<0x7F)
+ if (UTF8_TTY_ISO2022JP && (me->state == S_nonascii_text
+ || me->state == S_nonascii_text_sq
+ || me->state == S_nonascii_text_dq)) {
+ /* end of ISO-2022-JP? || not in ISO-2022-JP range */
+ if (TOASCII(c) == '\033' || !IS_JIS7_HILO(c)) {
+ me->kanji_buf = '\0';
+ goto top1;
+ }
+ if (me->kanji_buf == '\t') { /* flag for single byte kana in "ESC(I" */
+ if (conv_jisx0201kana) {
+ JISx0201TO0208_SJIS(c | 0200,
+ (unsigned char *) me->U.utf_buf,
+ (unsigned char *) me->U.utf_buf + 1);
+ clong = UCTransJPToUni(me->U.utf_buf, 2,
+ UCGetLYhndl_byMIME("shift_jis"));
+ } else {
+ clong = UCTransToUni(c | 0200, UCGetLYhndl_byMIME("shift_jis"));
+ }
+ } else if (me->kanji_buf) {
+ me->U.utf_buf[0] = (char) (me->kanji_buf | 0200); /* to EUC-JP */
+ me->U.utf_buf[1] = (char) (c | 0200);
+ clong = UCTransJPToUni(me->U.utf_buf, 2,
+ UCGetLYhndl_byMIME("euc-jp"));
+ me->kanji_buf = '\0';
+ } else {
+ me->kanji_buf = c;
+ clong = ucNeedMore;
+ }
+ goto top1;
+ }
+#endif /* USE_JAPANESEUTF8_SUPPORT */
+#ifdef USE_JAPANESEUTF8_SUPPORT
+ if (me->T.trans_to_uni &&
+ ((strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-jp") == 0) ||
+ (strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "shift_jis") == 0))) {
+ if (strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "shift_jis") == 0) {
+ if (me->U.utf_count == 0) {
+ if (IS_SJIS_HI1(c) ||
+ IS_SJIS_HI2(c)) {
+ me->U.utf_buf[0] = (char) c;
+ me->U.utf_count = 1;
+ clong = ucCannotConvert;
+ } else if (IS_SJIS_X0201KANA(c)) {
+ if (conv_jisx0201kana) {
+ JISx0201TO0208_SJIS(c,
+ (unsigned char *) me->U.utf_buf,
+ (unsigned char *) me->U.utf_buf + 1);
+ clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl);
+ } else {
+ clong = UCTransToUni(c, me->inUCLYhndl);
+ }
+ }
+ } else {
+ if (IS_SJIS_LO(c)) {
+ me->U.utf_buf[1] = (char) c;
+ clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl);
+ }
+ me->U.utf_count = 0;
+ }
+ } else {
+ if (me->U.utf_count == 0) {
+ if (IS_EUC_HI(c) || c == 0x8E) {
+ me->U.utf_buf[0] = (char) c;
+ me->U.utf_count = 1;
+ clong = ucCannotConvert;
+ }
+ } else {
+ if (IS_EUC_LOX(c)) {
+ me->U.utf_buf[1] = (char) c;
+ clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl);
+ }
+ me->U.utf_count = 0;
+ }
+ }
+ goto top1;
+ } else
+#endif /* USE_JAPANESEUTF8_SUPPORT */
+#ifdef EXP_CHINESEUTF8_SUPPORT
+ if (me->T.trans_to_uni &&
+ ((strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-cn") == 0))) {
+ if (me->U.utf_count == 0) {
+ if (IS_GBK_HI(c)) {
+ me->U.utf_buf[0] = (char) c;
+ me->U.utf_count = 1;
+ clong = ucCannotConvert;
+ CTRACE((tfp, "Get EUC-CN: 0x%02X\n", UCH(c)));
+ }
+ } else {
+ if (IS_GBK_LO(c)) {
+ me->U.utf_buf[1] = (char) c;
+ clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl);
+ if (clong > 0) {
+ CTRACE((tfp, "... second: [%02X%02X] U+%04lX\n",
+ UCH(me->U.utf_buf[0]),
+ UCH(me->U.utf_buf[1]),
+ clong));
+ } else {
+ CTRACE((tfp, "... second: [%02X%02X] %ld\n",
+ UCH(me->U.utf_buf[0]),
+ UCH(me->U.utf_buf[1]),
+ clong));
+ }
+ }
+ me->U.utf_count = 0;
+ }
+ goto top1;
+ } else
+#endif /* EXP_CHINESEUTF8_SUPPORT */
+#ifdef EXP_CHINESEUTF8_SUPPORT
+ if (me->T.trans_to_uni &&
+ ((strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-kr") == 0))) {
+ if (me->U.utf_count == 0) {
+ if (IS_EUC_HI(c)) {
+ me->U.utf_buf[0] = (char) c;
+ me->U.utf_count = 1;
+ clong = ucCannotConvert;
+ CTRACE((tfp, "Get EUC-KR: 0x%02X\n", UCH(c)));
+ }
+ } else {
+ if (IS_EUC_LOS(c) ||
+ IS_EUC_LOX(c)) {
+ me->U.utf_buf[1] = (char) c;
+ clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl);
+ if (clong > 0) {
+ CTRACE((tfp, "... second: [%02X%02X] U+%04lX\n",
+ UCH(me->U.utf_buf[0]),
+ UCH(me->U.utf_buf[1]),
+ clong));
+ } else {
+ CTRACE((tfp, "... second: [%02X%02X] %ld\n",
+ UCH(me->U.utf_buf[0]),
+ UCH(me->U.utf_buf[1]),
+ clong));
+ }
+ }
+ me->U.utf_count = 0;
+ }
+ goto top1;
+ } else
+#endif /* EXP_CHINESEUTF8_SUPPORT */
+#ifdef EXP_CHINESEUTF8_SUPPORT
+ if (me->T.trans_to_uni &&
+ ((strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "big5") == 0))) {
+ if (me->U.utf_count == 0) {
+ if (IS_BIG5_HI(c)) {
+ me->U.utf_buf[0] = (char) c;
+ me->U.utf_count = 1;
+ clong = ucCannotConvert;
+ CTRACE((tfp, "Get BIG5: 0x%02X\n", UCH(c)));
+ }
+ } else {
+ if (IS_BIG5_LOS(c) ||
+ IS_BIG5_LOX(c)) {
+ me->U.utf_buf[1] = (char) c;
+ clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl);
+ if (clong > 0) {
+ CTRACE((tfp, "... second: [%02X%02X] U+%04lX\n",
+ UCH(me->U.utf_buf[0]),
+ UCH(me->U.utf_buf[1]),
+ clong));
+ } else {
+ CTRACE((tfp, "... second: [%02X%02X] %ld\n",
+ UCH(me->U.utf_buf[0]),
+ UCH(me->U.utf_buf[1]),
+ clong));
+ }
+ }
+ me->U.utf_count = 0;
+ }
+ goto top1;
+ } else
+#endif /* EXP_CHINESEUTF8_SUPPORT */
+ if (me->T.trans_to_uni &&
+ /* S/390 -- gil -- 0744 */
+ ((TOASCII(clong) >= LYlowest_eightbit[me->inUCLYhndl]) ||
+ (clong < ' ' && clong != 0 &&
+ me->T.trans_C0_to_uni))) {
+ /*
+ * Convert the octet to Unicode. - FM
+ */
+ clong = UCTransToUni((char) c, me->inUCLYhndl);
+ if (clong > 0) {
+ saved_char_in = c;
+ if (clong < 256) {
+ c = FROMASCII(UCH(clong));
+ }
+ }
+ goto top1;
+ } else if (clong < ' ' && clong != 0 && /* S/390 -- gil -- 0768 */
+ me->T.trans_C0_to_uni) {
+ /*
+ * This else if may be too ugly to keep. - KW
+ */
+ if (me->T.trans_from_uni &&
+ (((clong = UCTransToUni((char) c, me->inUCLYhndl)) >= ' ') ||
+ (me->T.transp &&
+ (clong = UCTransToUni((char) c, me->inUCLYhndl)) > 0))) {
+ saved_char_in = c;
+ if (clong < 256) {
+ c = FROMASCII(UCH(clong));
+ }
+ goto top1;
+ } else {
+ uck = -1;
+ if (me->T.transp) {
+ uck = UCTransCharStr(replace_buf, 60, (char) c,
+ me->inUCLYhndl,
+ me->inUCLYhndl, NO);
+ }
+ if (!me->T.transp || uck < 0) {
+ uck = UCTransCharStr(replace_buf, 60, (char) c,
+ me->inUCLYhndl,
+ me->outUCLYhndl, YES);
+ }
+ if (uck == 0) {
+ return;
+ } else if (uck < 0) {
+ goto top0a;
+ }
+ c = UCH(replace_buf[0]);
+ if (c && replace_buf[1]) {
+ if (me->state == S_text) {
+ PUTS(replace_buf);
+ return;
+ }
+ StrAllocCat(me->recover, replace_buf + 1);
+ }
+ goto top0a;
+ } /* Next line end of ugly stuff for C0. - KW */
+ } else { /* end of me->T.trans_to_uni S/390 -- gil -- 0791 */
+ goto top0a;
+ }
+
+ /*
+ * We jump up to here from below if we have
+ * stuff in the recover, insert, or csi buffers
+ * to process. We zero saved_char_in, in effect
+ * as a flag that the octet is not that of the
+ * actual call to this function. This may be OK
+ * for now, for the stuff this function adds to
+ * its recover buffer, but it might not be for
+ * stuff other functions added to the insert or
+ * csi buffer, so bear that in mind. - FM
+ * Stuff from the recover buffer is now handled
+ * as UTF-8 if we can expect that's what it is,
+ * and in that case we don't come back up here. - kw
+ */
+ top:
+ saved_char_in = '\0';
+ /*
+ * We jump to here from above when we don't have
+ * UTF-8 input, haven't converted to Unicode, and
+ * want clong set to the input octet (unsigned)
+ * without zeroing its saved_char_in copy (which
+ * is signed). - FM
+ */
+ top0a:
+ *(me->U.utf_buf) = '\0';
+ clong = UCH(c);
+ /*
+ * We jump to here from above if we have converted
+ * the input, or a multibyte sequence across calls,
+ * to a Unicode value and loaded it into clong (to
+ * which unsign_c has been defined), and from below
+ * when we are recycling a character (e.g., because
+ * it terminated an entity but is not the standard
+ * semi-colon). The character will already have
+ * been put through the Unicode conversions. - FM
+ */
+ top1:
+ /*
+ * Ignore low ISO 646 7-bit control characters if HTCJK is not set. - FM
+ */
+ /*
+ * Works for both ASCII and EBCDIC. -- gil
+ * S/390 -- gil -- 0811
+ */
+ if (TOASCII(clong) < 32 &&
+ c != '\t' && c != '\n' && c != '\r' &&
+ !IS_CJK_TTY &&
+ !(UTF8_TTY_ISO2022JP && (TOASCII(c) == '\033')))
+ goto after_switch;
+
+ /*
+ * Ignore 127 if we don't have HTPassHighCtrlRaw or HTCJK set. - FM
+ */
+#define PASSHICTRL (me->T.transp || \
+ clong >= LYlowest_eightbit[me->inUCLYhndl])
+ if (TOASCII(c) == 127 && /* S/390 -- gil -- 0830 */
+ !(PASSHICTRL || IS_CJK_TTY))
+ goto after_switch;
+
+ /*
+ * Ignore 8-bit control characters 128 - 159 if neither HTPassHighCtrlRaw
+ * nor HTCJK is set. - FM
+ */
+ if (TOASCII(clong) > 127 && TOASCII(clong) < 160 && /* S/390 -- gil -- 0847 */
+ !(PASSHICTRL || IS_CJK_TTY)) {
+ /*
+ * If we happen to be reading from an "ISO-8859-1" or "US-ASCII"
+ * document, allow the cp-1252 codes, to accommodate the HTML5 draft
+ * recommendation for replacement encoding:
+ *
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#character-encodings-0
+ */
+ if (AssumeCP1252(me)) {
+ clong = LYcp1252ToUnicode((UCode_t) c);
+ goto top1;
+ }
+ goto after_switch;
+ }
+
+ /* Almost all CJK characters are double byte but only Japanese
+ * JIS X0201 Kana is single byte. To prevent to fail SGML parsing
+ * we have to take care of them here. -- TH
+ */
+ if ((HTCJK == JAPANESE) && (me->state == S_in_kanji) &&
+ !IS_JAPANESE_2BYTE(me->kanji_buf, UCH(c))
+#ifdef USE_JAPANESEUTF8_SUPPORT
+ && !me->T.decode_utf8
+#endif
+ ) {
+#ifdef CONV_JISX0201KANA_JISX0208KANA
+ if (IS_SJIS_X0201KANA(me->kanji_buf)) {
+ unsigned char sjis_hi, sjis_lo;
+
+ JISx0201TO0208_SJIS(me->kanji_buf, &sjis_hi, &sjis_lo);
+ PUTC(sjis_hi);
+ PUTC(sjis_lo);
+ } else
+#endif
+ PUTC(me->kanji_buf);
+ me->state = S_text;
+ }
+
+ /*
+ * Handle character based on me->state.
+ */
+ CTRACE2(TRACE_SGML, (tfp, "SGML before %s|%.*s|%c|\n",
+ state_name(me->state),
+ string->size,
+ NonNull(string->data),
+ UCH(c)));
+ switch (me->state) {
+
+ case S_in_kanji:
+ /*
+ * Note that if we don't have a CJK input, then this is not the second
+ * byte of a CJK di-byte, and we're trashing the input. That's why
+ * 8-bit characters followed by, for example, '<' can cause the tag to
+ * be treated as text, not markup. We could try to deal with it by
+ * holding each first byte and then checking byte pairs, but that
+ * doesn't seem worth the overhead (see below). - FM
+ */
+ me->state = S_text;
+ PUTC(me->kanji_buf);
+ PUTC(c);
+ break;
+
+ case S_tagname_slash:
+ /*
+ * We had something like "<name/" so far, set state to S_text but keep
+ * me->slashedtag as a flag; except if we get '>' directly
+ * after the "<name/", and really have a tag for that name in
+ * me->slashedtag, in which case keep state as is and let code
+ * below deal with it. - kw
+ */
+ if (!(c == '>' && me->slashedtag && TOASCII(clong) < 127)) {
+ me->state = S_text;
+ }
+ /* FALLTHRU */
+ case S_text:
+#ifdef EXP_CHINESEUTF8_SUPPORT
+ if (IS_CJK_TTY &&
+ (!strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-cn") ||
+ !strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "big5") ||
+ !strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-kr"))) {
+ /*
+ * Leave the case statement if we have not collected both of the
+ * bytes for the EUC-CN character. If we have, then continue on
+ * to convert it to Unicode.
+ */
+ if (clong == ucCannotConvert) {
+ break;
+ }
+ } else
+#endif
+ if (IS_CJK_TTY && ((TOASCII(c) & 0200) != 0)
+#ifdef USE_JAPANESEUTF8_SUPPORT
+ && !me->T.decode_utf8
+#endif
+ ) { /* S/390 -- gil -- 0864 */
+ /*
+ * Setting up for Kanji multibyte handling (based on Takuya ASADA's
+ * (asada@three-a.co.jp) CJK Lynx). Note that if the input is not
+ * in fact CJK, the next byte also will be mishandled, as explained
+ * above. Toggle raw mode off in such cases, or select the "7 bit
+ * approximations" display character set, which is largely
+ * equivalent to having raw mode off with CJK. - FM
+ */
+ me->state = S_in_kanji;
+ me->kanji_buf = c;
+ break;
+ } else if ((IS_CJK_TTY || UTF8_TTY_ISO2022JP) && TOASCII(c) == '\033') {
+ /* S/390 -- gil -- 0881 */
+ /*
+ * Setting up for CJK escape sequence handling (based on Takuya
+ * ASADA's (asada@three-a.co.jp) CJK Lynx). - FM
+ */
+ me->state = S_esc;
+ if (!UTF8_TTY_ISO2022JP)
+ PUTC(c);
+ break;
+ }
+
+ if (c == '&' || c == '<') {
+#ifdef USE_PRETTYSRC
+ if (psrc_view) { /*there is nothing useful in the element_stack */
+ testtag = me->current_tag;
+ } else
+#endif
+ {
+ testtag = me->element_stack ?
+ me->element_stack->tag : NULL;
+ }
+ }
+
+ if (c == '&' && TOASCII(clong) < 127 && /* S/390 -- gil -- 0898 */
+ (!testtag ||
+ (testtag->contents == SGML_MIXED ||
+ testtag->contents == SGML_ELEMENT ||
+ testtag->contents == SGML_PCDATA ||
+#ifdef USE_PRETTYSRC
+ testtag->contents == SGML_EMPTY ||
+#endif
+ testtag->contents == SGML_RCDATA))) {
+ /*
+ * Setting up for possible entity, without the leading '&'. - FM
+ */
+ string->size = 0;
+ me->state = S_ero;
+ } else if (c == '<' && TOASCII(clong) < 127) { /* S/390 -- gil -- 0915 */
+ /*
+ * Setting up for possible tag. - FM
+ */
+ string->size = 0;
+ if (testtag && testtag->contents == SGML_PCDATA) {
+ me->state = S_pcdata;
+ } else if (testtag && (testtag->contents == SGML_LITTERAL
+ || testtag->contents == SGML_CDATA)) {
+ me->state = S_litteral;
+ } else if (testtag && (testtag->contents == SGML_SCRIPT)) {
+ me->state = S_script;
+ } else {
+ me->state = S_tag;
+ }
+ me->slashedtag = NULL;
+ } else if (me->slashedtag &&
+ me->slashedtag->name &&
+ (c == '/' ||
+ (c == '>' && me->state == S_tagname_slash)) &&
+ TOASCII(clong) < 127) {
+ /*
+ * We got either the second slash of a pending "<NAME/blah blah/"
+ * shortref construct, or the '>' of a mere "<NAME/>". In both
+ * cases generate a "</NAME>" end tag in the recover buffer for
+ * reparsing unless NAME is really an empty element. - kw
+ */
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(abracket);
+ PUTC(c);
+ PSRCSTOP(abracket);
+ } else
+#endif
+ if (me->slashedtag != me->unknown_tag &&
+ !ReallyEmptyTag(me->slashedtag)) {
+ if (me->recover == NULL) {
+ StrAllocCopy(me->recover, "</");
+ me->recover_index = 0;
+ } else {
+ StrAllocCat(me->recover, "</");
+ }
+ StrAllocCat(me->recover, me->slashedtag->name);
+ StrAllocCat(me->recover, ">");
+ }
+ me->slashedtag = NULL;
+
+ } else if (me->element_stack &&
+ (me->element_stack->tag->flags & Tgf_frecyc)) {
+ /*
+ * The element stack says we are within the contents of an element
+ * that the next stage (HTML.c) may want to feed us back again (via
+ * the *include string). So try to output text in UTF-8 if
+ * possible, using the same logic as for attribute values (which
+ * should be in line with what me->current_tag_charset
+ * indicates). - kw
+ */
+ if (me->T.decode_utf8 &&
+ *me->U.utf_buf) {
+ PUTS(me->U.utf_buf);
+ me->U.utf_buf_p = me->U.utf_buf;
+ *(me->U.utf_buf_p) = '\0';
+ } else if (!IS_CJK_TTY &&
+ (me->T.output_utf8 ||
+ me->T.trans_from_uni)) {
+ if (LYIsASCII(clong)) {
+ PUTC(c);
+ } else if (clong == UCS_REPL && saved_char_in &&
+ HTPassEightBitRaw &&
+ saved_char_in >=
+ LYlowest_eightbit[me->outUCLYhndl]) {
+ PUTUTF8((UCode_t) (0xf000 | saved_char_in));
+ } else {
+ PUTUTF8(clong);
+ }
+ } else if (saved_char_in && me->T.use_raw_char_in) {
+ PUTC(saved_char_in);
+ } else {
+ PUTC(c);
+ }
+
+#define PASS8859SPECL me->T.pass_160_173_raw
+ /*
+ * Convert 160 (nbsp) to Lynx special character if neither
+ * HTPassHighCtrlRaw nor HTCJK is set. - FM
+ */
+ } else if (clong == CH_NBSP && /* S/390 -- gil -- 0932 */
+ !me->no_lynx_specialcodes &&
+ !(PASS8859SPECL || IS_CJK_TTY)) {
+ PUTC(HT_NON_BREAK_SPACE);
+ /*
+ * Convert 173 (shy) to Lynx special character if neither
+ * HTPassHighCtrlRaw nor HTCJK is set. - FM
+ */
+ } else if (clong == CH_SHY && /* S/390 -- gil -- 0949 */
+ !me->no_lynx_specialcodes &&
+ !(PASS8859SPECL || IS_CJK_TTY)) {
+ PUTC(LY_SOFT_HYPHEN);
+ /*
+ * Handle the case in which we think we have a character which
+ * doesn't need further processing (e.g., a koi8-r input for a
+ * koi8-r output). - FM
+ */
+ } else if (me->T.use_raw_char_in && saved_char_in) {
+ /*
+ * Only if the original character is still in saved_char_in,
+ * otherwise we may be iterating from a goto top. - KW
+ */
+ PUTC(saved_char_in);
+ } else if ((chk = (BOOL) (me->T.trans_from_uni &&
+ TOASCII(clong) >= 160)) && /* S/390 -- gil -- 0968 */
+ (uck = UCTransUniChar(clong,
+ me->outUCLYhndl)) >= ' ' &&
+ uck < 256) {
+ CTRACE((tfp, "UCTransUniChar returned 0x%.2" PRI_UCode_t
+ ":'%c'.\n",
+ uck, FROMASCII((char)uck)));
+ /*
+ * We got one octet from the conversions, so use it. - FM
+ */
+ PUTC(FROMASCII((char) uck));
+ } else if ((chk &&
+ (uck == -4 ||
+ (me->T.repl_translated_C0 &&
+ uck > 0 && uck < 32))) &&
+ /*
+ * Not found; look for replacement string. - KW
+ */
+ (uck = UCTransUniCharStr(replace_buf, 60, clong,
+ me->outUCLYhndl,
+ 0) >= 0)) {
+ /*
+ * Got a replacement string. No further tests for validity -
+ * assume that whoever defined replacement strings knew what she
+ * was doing. - KW
+ */
+ PUTS(replace_buf);
+ /*
+ * If we're displaying UTF-8, try that now. - FM
+ */
+ } else if (me->T.output_utf8 && PUTUTF8(clong)) {
+ ; /* do nothing more */
+ /*
+ * If it's any other (> 160) 8-bit character, and we have not set
+ * HTPassEightBitRaw nor HTCJK, nor have the "ISO Latin 1"
+ * character set selected, back translate for our character set. -
+ * FM
+ */
+#define IncludesLatin1Enc \
+ (me->outUCLYhndl == LATIN1 || \
+ (me->outUCI && \
+ (me->outUCI->enc & (UCT_CP_SUPERSETOF_LAT1))))
+
+#define PASSHI8BIT (HTPassEightBitRaw || \
+ (me->T.do_8bitraw && !me->T.trans_from_uni))
+
+ } else if (clong > 160 && clong < 256 &&
+ !(PASSHI8BIT || IS_CJK_TTY) &&
+ !IncludesLatin1Enc) {
+#ifdef USE_PRETTYSRC
+ int psrc_view_backup = 0;
+#endif
+
+ string->size = 0;
+ EntityName = HTMLGetEntityName((UCode_t) (clong - 160));
+ HTChunkPuts(string, EntityName);
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ /* we need to disable it temporarily */
+ if (psrc_view) {
+ psrc_view_backup = 1;
+ psrc_view = 0;
+ }
+#endif
+ handle_entity(me, '\0');
+#ifdef USE_PRETTYSRC
+ /* we need to disable it temporarily */
+ if (psrc_view_backup)
+ psrc_view = TRUE;
+#endif
+
+ string->size = 0;
+ if (!FoundEntity)
+ PUTC(';');
+ /*
+ * If we get to here and have an ASCII char, pass the character. -
+ * KW
+ */
+ } else if (TOASCII(clong) < 127 && clong > 0) { /* S/390 -- gil -- 0987 */
+ PUTC(c);
+ /*
+ * If we get to here, and should have translated, translation has
+ * failed so far. - KW
+ *
+ * We should have sent UTF-8 output to the parser already, but what
+ * the heck, try again. - FM
+ */
+ } else if (me->T.output_utf8 && *me->U.utf_buf) {
+ PUTS(me->U.utf_buf);
+ me->U.utf_buf_p = me->U.utf_buf;
+ *(me->U.utf_buf_p) = '\0';
+ /*
+ * If we don't actually want the character, make it safe and output
+ * that now. - FM
+ */
+ } else if (TOASCII(UCH(c)) < /* S/390 -- gil -- 0997 */
+ LYlowest_eightbit[me->outUCLYhndl] ||
+ (me->T.trans_from_uni && !HTPassEightBitRaw)) {
+ /*
+ * If we get to here, pass the character. - FM
+ */
+ } else {
+ PUTC(c);
+ }
+ break;
+
+ /*
+ * Found '<' in SGML_PCDATA content; treat this mode nearly like
+ * S_litteral, but recognize '<!' and '<?' to filter out comments and
+ * processing instructions. - kw
+ */
+ case S_pcdata:
+ if (!string->size && TOASCII(clong) < 127) { /* first after '<' */
+ if (c == '!') { /* <! */
+ /*
+ * Terminate and set up for possible comment, identifier,
+ * declaration, or marked section as under S_tag. - kw
+ */
+ me->state = S_exclamation;
+ me->lead_exclamation = TRUE;
+ me->doctype_bracket = FALSE;
+ me->first_bracket = FALSE;
+ HTChunkPutc(string, c);
+ break;
+ } else if (c == '?') { /* <? - ignore as a PI until '>' - kw */
+ CTRACE((tfp,
+ "SGML: Found PI in PCDATA, junking it until '>'\n"));
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(abracket);
+ PUTS("<?");
+ PSRCSTOP(abracket);
+ }
+#endif
+ me->state = S_pi;
+ break;
+ }
+ }
+ goto case_S_litteral;
+
+ /*
+ * Found '<' in SGML_SCRIPT content; treat this mode nearly like
+ * S_litteral, but recognize '<!' to allow the content to be treated as
+ * a comment by lynx.
+ */
+ case S_script:
+ if (!string->size && TOASCII(clong) < 127) { /* first after '<' */
+ if (c == '!') { /* <! */
+ /*
+ * Terminate and set up for possible comment, identifier,
+ * declaration, or marked section as under S_tag. - kw
+ */
+ me->state = S_exclamation;
+ me->lead_exclamation = TRUE;
+ me->doctype_bracket = FALSE;
+ me->first_bracket = FALSE;
+ HTChunkPutc(string, c);
+ break;
+ }
+ }
+ goto case_S_litteral;
+
+ /*
+ * In literal mode, waits only for specific end tag (for compatibility
+ * with old servers, and for Lynx). - FM
+ */
+ case_S_litteral:
+ case S_litteral:
+ /*PSRC:this case not understood completely by HV, not done */
+ HTChunkPutc(string, c);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ /* there is nothing useful in the element_stack */
+ testtag = me->current_tag;
+ } else
+#endif
+ testtag = (me->element_stack
+ ? me->element_stack->tag
+ : NULL);
+
+ if (testtag == NULL || testtag->name == NULL) {
+ string->size--;
+ me->state = S_text;
+ goto top1;
+ }
+
+ /*
+ * Normally when we get the closing ">",
+ * testtag contains something like "TITLE"
+ * string contains something like "/title>"
+ * so we decrement by 2 to compare the final character of each.
+ */
+ testlast = string->size - 2 - me->trailing_spaces - me->leading_spaces;
+
+#ifdef USE_COLOR_STYLE
+#define TagSize(p) ((p)->name_len)
+#else
+#define TagSize(p) (strlen((p)->name))
+#endif
+
+ if (TOUPPER(c) != ((testlast < 0)
+ ? '/'
+ : ((testlast < (int) TagSize(testtag))
+ ? testtag->name[testlast]
+ : 0))) {
+ int i;
+
+ /*
+ * If complete match, end literal.
+ */
+ if ((c == '>') &&
+ testlast >= 0 && !testtag->name[testlast]) {
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ char *trailing = NULL;
+
+ if (me->trailing_spaces) {
+ StrAllocCopy(trailing,
+ string->data
+ + string->size
+ - 1
+ - me->trailing_spaces);
+ trailing[me->trailing_spaces] = '\0';
+ }
+
+ PSRCSTART(abracket);
+ PUTS("</");
+ PSRCSTOP(abracket);
+ PSRCSTART(tag);
+
+ strcpy(string->data, me->current_tag->name);
+ transform_tag(me, string);
+ PUTS(string->data);
+
+ if (trailing) {
+ PUTS(trailing);
+ FREE(trailing);
+ }
+
+ PSRCSTOP(tag);
+ PSRCSTART(abracket);
+ PUTC('>');
+ PSRCSTOP(abracket);
+
+ me->current_tag = NULL;
+ } else
+#endif
+ end_element(me, me->element_stack->tag);
+
+ string->size = 0;
+ me->current_attribute_number = INVALID;
+ me->state = S_text;
+ me->leading_spaces = 0;
+ me->trailing_spaces = 0;
+ break;
+ }
+
+ /*
+ * Allow whitespace between the "<" or ">" and the keyword, for
+ * error-recovery.
+ */
+ if (isspace(UCH(c))) {
+ if (testlast == -1) {
+ me->leading_spaces += 1;
+ CTRACE2(TRACE_SGML, (tfp, "leading spaces: %d\n", me->leading_spaces));
+ break;
+ } else if (testlast > 0) {
+ me->trailing_spaces += 1;
+ CTRACE2(TRACE_SGML, (tfp, "trailing spaces: %d\n", me->trailing_spaces));
+ break;
+ }
+ }
+
+ /*
+ * Mismatch - recover.
+ */
+ me->leading_spaces = 0;
+ me->trailing_spaces = 0;
+ if (((testtag->contents != SGML_LITTERAL &&
+ (testtag->flags & Tgf_strict)) ||
+ (me->state == S_pcdata &&
+ (testtag->flags & (Tgf_strict | Tgf_endO)))) &&
+ (testlast > -1 &&
+ (c == '>' || testlast > 0 || IsNmStart(c)))) {
+ me->state = S_end;
+ string->size--;
+ for (i = 0; i < string->size; i++) /* remove '/' */
+ string->data[i] = string->data[i + 1];
+ if ((string->size == 1) ? IsNmStart(c) : IsNmChar(c))
+ break;
+ string->size--;
+ goto top1;
+ }
+ if (me->state == S_pcdata &&
+ (testtag->flags & (Tgf_strict | Tgf_endO)) &&
+ (testlast < 0 && IsNmStart(c))) {
+ me->state = S_tag;
+ break;
+ }
+ /*
+ * If Mismatch: recover string literally.
+ */
+ PUTC('<');
+ for (i = 0; i < string->size - 1; i++) /* recover, except last c */
+ PUTC(string->data[i]);
+ string->size = 0;
+ me->state = S_text;
+ goto top1; /* to recover last c */
+ }
+ break;
+
+ /*
+ * Character reference (numeric entity) or named entity.
+ */
+ case S_ero:
+ if (c == '#') {
+ /*
+ * Setting up for possible numeric entity.
+ */
+ me->state = S_cro; /* &# is Char Ref Open */
+ break;
+ }
+ me->state = S_entity; /* Fall through! */
+
+ /*
+ * Handle possible named entity.
+ */
+ case S_entity:
+ if (TOASCII(clong) < 127 && (string->size ? /* S/390 -- gil -- 1029 */
+ isalnum(UCH(c)) : isalpha(UCH(c)))) {
+ /* Should probably use IsNmStart/IsNmChar above (is that right?),
+ but the world is not ready for that - there's &nbsp: (note
+ colon!) and stuff around. */
+ /*
+ * Accept valid ASCII character. - FM
+ */
+ HTChunkPutc(string, c);
+ } else if (string->size == 0) {
+ /*
+ * It was an ampersand that's just text, so output the ampersand
+ * and recycle this character. - FM
+ */
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTART(badseq);
+#endif
+ PUTC('&');
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTOP(badseq);
+#endif
+ me->state = S_text;
+ goto top1;
+ } else {
+ /*
+ * Terminate entity name and try to handle it. - FM
+ */
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ entity_string = string->data;
+#endif
+ if (!strcmp(string->data, "zwnj") &&
+ (!me->element_stack ||
+ (me->element_stack->tag &&
+ me->element_stack->tag->contents == SGML_MIXED))) {
+ /*
+ * Handle zwnj (8204) as <WBR>. - FM
+ */
+ char temp[8];
+
+ CTRACE((tfp,
+ "SGML_character: Handling 'zwnj' entity as 'WBR' element.\n"));
+
+ if (c != ';') {
+ sprintf(temp, "<WBR>%c", c);
+ } else {
+ sprintf(temp, "<WBR>");
+ }
+ if (me->recover == NULL) {
+ StrAllocCopy(me->recover, temp);
+ me->recover_index = 0;
+ } else {
+ StrAllocCat(me->recover, temp);
+ }
+ string->size = 0;
+ me->state = S_text;
+ break;
+ } else {
+ handle_entity(me, '\0');
+ }
+ string->size = 0;
+ me->state = S_text;
+ /*
+ * Don't eat the terminator if we didn't find the entity name and
+ * therefore sent the raw string via handle_entity(), or if the
+ * terminator is not the "standard" semi-colon for HTML. - FM
+ */
+#ifdef USE_PRETTYSRC
+ if (psrc_view && FoundEntity && c == ';') {
+ PSRCSTART(entity);
+ PUTC(c);
+ PSRCSTOP(entity);
+ }
+#endif
+ if (!FoundEntity || c != ';')
+ goto top1;
+ }
+ break;
+
+ /*
+ * Check for a numeric entity.
+ */
+ case S_cro:
+ if (TOASCII(clong) < 127 && TOLOWER(UCH(c)) == 'x') { /* S/390 -- gil -- 1060 */
+ me->isHex = TRUE;
+ me->state = S_incro;
+ } else if (TOASCII(clong) < 127 && isdigit(UCH(c))) {
+ /*
+ * Accept only valid ASCII digits. - FM
+ */
+ HTChunkPutc(string, c); /* accumulate a character NUMBER */
+ me->isHex = FALSE;
+ me->state = S_incro;
+ } else if (string->size == 0) {
+ /*
+ * No 'x' or digit following the "&#" so recover them and recycle
+ * the character. - FM
+ */
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTART(badseq);
+#endif
+ PUTC('&');
+ PUTC('#');
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTOP(badseq);
+#endif
+ me->state = S_text;
+ goto top1;
+ }
+ break;
+
+ /*
+ * Handle a numeric entity.
+ */
+ case S_incro:
+ /* S/390 -- gil -- 1075 */
+ if ((TOASCII(clong) < 127) &&
+ (me->isHex
+ ? isxdigit(UCH(c))
+ : isdigit(UCH(c)))) {
+ /*
+ * Accept only valid hex or ASCII digits. - FM
+ */
+ HTChunkPutc(string, c); /* accumulate a character NUMBER */
+ } else if (string->size == 0) {
+ /*
+ * No hex digit following the "&#x" so recover them and recycle the
+ * character. - FM
+ */
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTART(badseq);
+#endif
+ PUTS("&#x");
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTOP(badseq);
+#endif
+ me->isHex = FALSE;
+ me->state = S_text;
+ goto top1;
+ } else {
+ /*
+ * Terminate the numeric entity and try to handle it. - FM
+ */
+ UCode_t code;
+ int i;
+
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ entity_string = string->data;
+#endif
+ if (UCScanCode(&code, string->data, me->isHex)) {
+
+/* =============== work in ASCII below here =============== S/390 -- gil -- 1092 */
+ if (AssumeCP1252(me)) {
+ code = LYcp1252ToUnicode(code);
+ }
+ /*
+ * Check for special values. - FM
+ */
+ if ((code == 8204) &&
+ (!me->element_stack ||
+ (me->element_stack->tag &&
+ me->element_stack->tag->contents == SGML_MIXED))) {
+ /*
+ * Handle zwnj (8204) as <WBR>. - FM
+ */
+ char temp[8];
+
+ CTRACE((tfp,
+ "SGML_character: Handling '8204' (zwnj) reference as 'WBR' element.\n"));
+
+ /*
+ * Include the terminator if it is not the standard
+ * semi-colon. - FM
+ */
+ if (c != ';') {
+ sprintf(temp, "<WBR>%c", c);
+ } else {
+ sprintf(temp, "<WBR>");
+ }
+ /*
+ * Add the replacement string to the recover buffer for
+ * processing. - FM
+ */
+ if (me->recover == NULL) {
+ StrAllocCopy(me->recover, temp);
+ me->recover_index = 0;
+ } else {
+ StrAllocCat(me->recover, temp);
+ }
+ string->size = 0;
+ me->isHex = FALSE;
+ me->state = S_text;
+ break;
+ } else if (put_special_unicodes(me, code)) {
+ /*
+ * We handled the value as a special character, so recycle
+ * the terminator or break. - FM
+ */
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(entity);
+ PUTS((me->isHex ? "&#x" : "&#"));
+ PUTS(entity_string);
+ if (c == ';')
+ PUTC(';');
+ PSRCSTOP(entity);
+ }
+#endif
+ string->size = 0;
+ me->isHex = FALSE;
+ me->state = S_text;
+ if (c != ';')
+ goto top1;
+ break;
+ }
+ /*
+ * Seek a translation from the chartrans tables.
+ */
+ if ((uck = UCTransUniChar(code,
+ me->outUCLYhndl)) >= 32 &&
+ uck < 256 &&
+ (uck < 127 ||
+ uck >= LYlowest_eightbit[me->outUCLYhndl])) {
+#ifdef USE_PRETTYSRC
+ if (!psrc_view) {
+#endif
+ PUTC(FROMASCII((char) uck));
+#ifdef USE_PRETTYSRC
+ } else {
+ put_pretty_number(me);
+ }
+#endif
+ } else if ((uck == -4 ||
+ (me->T.repl_translated_C0 &&
+ uck > 0 && uck < 32)) &&
+ /*
+ * Not found; look for replacement string.
+ */
+ (uck = UCTransUniCharStr(replace_buf, 60, code,
+ me->outUCLYhndl,
+ 0) >= 0)) {
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ put_pretty_number(me);
+ } else
+#endif
+ PUTS(replace_buf);
+ /*
+ * If we're displaying UTF-8, try that now. - FM
+ */
+ } else if (me->T.output_utf8 && PUTUTF8(code)) {
+ ; /* do nothing more */
+ /*
+ * Ignore 8205 (zwj), 8206 (lrm), and 8207 (rlm), if we get
+ * to here. - FM
+ */
+ } else if (code == 8205 ||
+ code == 8206 ||
+ code == 8207) {
+ if (TRACE) {
+ string->size--;
+ LYStrNCpy(replace_buf,
+ string->data,
+ (string->size < 64 ? string->size : 63));
+ fprintf(tfp,
+ "SGML_character: Ignoring '%s%s'.\n",
+ (me->isHex ? "&#x" : "&#"),
+ replace_buf);
+ }
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(badseq);
+ PUTS((me->isHex ? "&#x" : "&#"));
+ PUTS(entity_string);
+ if (c == ';')
+ PUTC(';');
+ PSRCSTOP(badseq);
+ }
+#endif
+ string->size = 0;
+ me->isHex = FALSE;
+ me->state = S_text;
+ if (c != ';')
+ goto top1;
+ break;
+ /*
+ * Show the numeric entity if we get to here and the value:
+ * (1) Is greater than 255 (but use ASCII characters for
+ * spaces or dashes).
+ * (2) Is less than 32, and not valid or we don't have
+ * HTCJK set.
+ * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK
+ * set.
+ * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum
+ * set.
+ * - FM
+ */
+ } else if ((code > 255) ||
+ (code < ' ' && /* S/390 -- gil -- 1140 */
+ code != '\t' && code != '\n' && code != '\r' &&
+ !IS_CJK_TTY) ||
+ (TOASCII(code) == 127 &&
+ !(HTPassHighCtrlRaw || IS_CJK_TTY)) ||
+ (TOASCII(code) > 127 && code < 160 &&
+ !HTPassHighCtrlNum)) {
+ /*
+ * Unhandled or illegal value. Recover the "&#" or "&#x"
+ * and digit(s), and recycle the terminator. - FM
+ */
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(badseq);
+ }
+#endif
+ if (me->isHex) {
+ PUTS("&#x");
+ me->isHex = FALSE;
+ } else {
+ PUTS("&#");
+ }
+ string->size--;
+ for (i = 0; i < string->size; i++) /* recover */
+ PUTC(string->data[i]);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTOP(badseq);
+ }
+#endif
+ string->size = 0;
+ me->isHex = FALSE;
+ me->state = S_text;
+ goto top1;
+ } else if (TOASCII(code) < 161 || /* S/390 -- gil -- 1162 */
+ HTPassEightBitNum ||
+ IncludesLatin1Enc) {
+ /*
+ * No conversion needed. - FM
+ */
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ put_pretty_number(me);
+ } else
+#endif
+ PUTC(FROMASCII((char) code));
+ } else {
+ /*
+ * Handle as named entity. - FM
+ */
+ code -= 160;
+ EntityName = HTMLGetEntityName(code);
+ if (EntityName && EntityName[0] != '\0') {
+ string->size = 0;
+ HTChunkPuts(string, EntityName);
+ HTChunkTerminate(string);
+ handle_entity(me, '\0');
+ /*
+ * Add a semi-colon if something went wrong and
+ * handle_entity() sent the string. - FM
+ */
+ if (!FoundEntity) {
+ PUTC(';');
+ }
+ } else {
+ /*
+ * Our conversion failed, so recover the "&#" or "&#x"
+ * and digit(s), and recycle the terminator. - FM
+ */
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTART(badseq);
+#endif
+ if (me->isHex) {
+ PUTS("&#x");
+ me->isHex = FALSE;
+ } else {
+ PUTS("&#");
+ }
+ string->size--;
+ for (i = 0; i < string->size; i++) /* recover */
+ PUTC(string->data[i]);
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTOP(badseq);
+#endif
+ string->size = 0;
+ me->isHex = FALSE;
+ me->state = S_text;
+ goto top1;
+ }
+ }
+ /*
+ * If we get to here, we succeeded. Hoorah!!! - FM
+ */
+ string->size = 0;
+ me->isHex = FALSE;
+ me->state = S_text;
+ /*
+ * Don't eat the terminator if it's not the "standard"
+ * semi-colon for HTML. - FM
+ */
+ if (c != ';') {
+ goto top1;
+ }
+ } else {
+ /*
+ * Not an entity, and don't know why not, so add the terminator
+ * to the string, output the "&#" or "&#x", and process the
+ * string via the recover element. - FM
+ */
+ string->size--;
+ HTChunkPutc(string, c);
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTART(badseq);
+#endif
+ if (me->isHex) {
+ PUTS("&#x");
+ me->isHex = FALSE;
+ } else {
+ PUTS("&#");
+ }
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTOP(badseq);
+#endif
+ if (me->recover == NULL) {
+ StrAllocCopy(me->recover, string->data);
+ me->recover_index = 0;
+ } else {
+ StrAllocCat(me->recover, string->data);
+ }
+ string->size = 0;
+ me->isHex = FALSE;
+ me->state = S_text;
+ break;
+ }
+ }
+ break;
+
+ /*
+ * Tag
+ */
+ case S_tag: /* new tag */
+ if (TOASCII(clong) < 127 && (string->size ? /* S/390 -- gil -- 1179 */
+ IsNmChar(c) : IsNmStart(c))) {
+ /*
+ * Add valid ASCII character. - FM
+ */
+ HTChunkPutc(string, c);
+ } else if (c == '!' && !string->size) { /* <! */
+ /*
+ * Terminate and set up for possible comment, identifier,
+ * declaration, or marked section. - FM
+ */
+ me->state = S_exclamation;
+ me->lead_exclamation = TRUE;
+ me->doctype_bracket = FALSE;
+ me->first_bracket = FALSE;
+ HTChunkPutc(string, c);
+ break;
+ } else if (!string->size &&
+ (TOASCII(clong) <= 160 && /* S/390 -- gil -- 1196 */
+ (c != '/' && c != '?' && c != '_' && c != ':'))) {
+ /*
+ * '<' must be followed by an ASCII letter to be a valid start tag.
+ * Here it isn't, nor do we have a '/' for an end tag, nor one of
+ * some other characters with a special meaning for SGML or which
+ * are likely to be legal Name Start characters in XML or some
+ * other extension. So recover the '<' and following character as
+ * data. - FM & KW
+ */
+ me->state = S_text;
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTART(badseq);
+#endif
+ PUTC('<');
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTOP(badseq);
+#endif
+ goto top1;
+ } else { /* End of tag name */
+ /*
+ * Try to handle tag. - FM
+ */
+ HTTag *t;
+
+ if (c == '/') {
+ if (string->size == 0) {
+ me->state = S_end;
+ break;
+ }
+ CTRACE((tfp, "SGML: `<%.*s/' found!\n", string->size, string->data));
+ }
+ HTChunkTerminate(string);
+
+ t = SGMLFindTag(dtd, string->data);
+ if (t == me->unknown_tag &&
+ ((c == ':' &&
+ string->size == 4 && 0 == strcasecomp(string->data, "URL")) ||
+ (string->size > 4 && 0 == strncasecomp(string->data, "URL:", 4)))) {
+ /*
+ * Treat <URL: as text rather than a junk tag, so we display
+ * it and the URL (Lynxism 8-). - FM
+ */
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTART(badseq);
+#endif
+ PUTC('<');
+ PUTS(string->data); /* recover */
+ PUTC(c);
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTOP(badseq);
+#endif
+ CTRACE((tfp, "SGML: Treating <%s%c as text\n",
+ string->data, c));
+ string->size = 0;
+ me->state = S_text;
+ break;
+ }
+ if (c == '/' && t) {
+ /*
+ * Element name was ended by '/'. Remember the tag that ended
+ * thusly, we'll interpret this as either an indication of an
+ * empty element (if '>' follows directly) or do some
+ * SGMLshortref-ish treatment. - kw
+ */
+ me->slashedtag = t;
+ }
+ if (!t) {
+ if (c == '?' && string->size <= 1) {
+ CTRACE((tfp, "SGML: Found PI, looking for '>'\n"));
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(abracket);
+ PUTS("<?");
+ PSRCSTOP(abracket);
+ }
+#endif
+ string->size = 0;
+ me->state = S_pi;
+ HTChunkPutc(string, c);
+ break;
+ }
+ CTRACE((tfp, "SGML: *** Invalid element %s\n",
+ string->data));
+
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(abracket);
+ PUTC('<');
+ PSRCSTOP(abracket);
+ PSRCSTART(badtag);
+ transform_tag(me, string);
+ PUTS(string->data);
+ if (c == '>') {
+ PSRCSTOP(badtag);
+ PSRCSTART(abracket);
+ PUTC('>');
+ PSRCSTOP(abracket);
+ } else {
+ PUTC(c);
+ }
+ }
+#endif
+ me->state = (c == '>') ? S_text : S_junk_tag;
+ break;
+ } else if (t == me->unknown_tag) {
+ CTRACE((tfp, "SGML: *** Unknown element \"%s\"\n",
+ string->data));
+ /*
+ * Fall through and treat like valid tag for attribute parsing.
+ * - KW
+ */
+
+ }
+ me->current_tag = t;
+
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(abracket);
+ PUTC('<');
+ PSRCSTOP(abracket);
+ if (t != me->unknown_tag)
+ PSRCSTART(tag);
+ else
+ PSRCSTART(badtag);
+ transform_tag(me, string);
+ PUTS(string->data);
+ if (t != me->unknown_tag)
+ PSRCSTOP(tag);
+ else
+ PSRCSTOP(badtag);
+ }
+ if (!psrc_view) /*don't waste time */
+#endif
+ {
+ /*
+ * Clear out attributes.
+ */
+ memset((void *) me->present, 0, sizeof(BOOL) *
+ (unsigned) (me->current_tag->number_of_attributes));
+ }
+
+ string->size = 0;
+ me->current_attribute_number = INVALID;
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ if (c == '>' || c == '<' || (c == '/' && me->slashedtag)) {
+ if (c != '<') {
+ PSRCSTART(abracket);
+ PUTC(c);
+ PSRCSTOP(abracket);
+ me->state = (c == '>') ? S_text : S_tagname_slash;
+ } else {
+ me->state = S_tag;
+ }
+ } else {
+ if (!WHITE(c))
+ PUTC(c);
+ me->state = S_tag_gap;
+ }
+ } else
+#endif
+ if (c == '>' || c == '<' || (c == '/' && me->slashedtag)) {
+ if (me->current_tag->name)
+ start_element(me);
+ me->state = (c == '>') ? S_text :
+ (c == '<') ? S_tag : S_tagname_slash;
+ } else {
+ me->state = S_tag_gap;
+ }
+ }
+ break;
+
+ case S_exclamation:
+ if (me->lead_exclamation && c == '-') {
+ /*
+ * Set up for possible comment. - FM
+ */
+ me->lead_exclamation = FALSE;
+ me->first_dash = TRUE;
+ HTChunkPutc(string, c);
+ break;
+ }
+ if (me->lead_exclamation && c == '[') {
+ /*
+ * Set up for possible marked section. - FM
+ */
+ me->lead_exclamation = FALSE;
+ me->first_bracket = TRUE;
+ me->second_bracket = FALSE;
+ HTChunkPutc(string, c);
+ me->state = S_marked;
+ break;
+ }
+ if (me->first_dash && c == '-') {
+ /*
+ * Set up to handle comment. - FM
+ */
+ me->lead_exclamation = FALSE;
+ me->first_dash = FALSE;
+ me->end_comment = FALSE;
+ HTChunkPutc(string, c);
+ me->state = S_comment;
+ break;
+ }
+ me->lead_exclamation = FALSE;
+ me->first_dash = FALSE;
+ if (c == '>') {
+ /*
+ * Try to handle identifier. - FM
+ */
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(sgmlspecial);
+ PUTC('<');
+ PUTS(string->data);
+ PUTC('>');
+ PSRCSTOP(sgmlspecial);
+ } else
+#endif
+ handle_identifier(me);
+ string->size = 0;
+ me->state = S_text;
+ break;
+ }
+ if (WHITE(c)) {
+ if (string->size == 8 &&
+ !strncasecomp(string->data, "!DOCTYPE", 8)) {
+ /*
+ * Set up for DOCTYPE declaration. - FM
+ */
+ HTChunkPutc(string, c);
+ me->doctype_bracket = FALSE;
+ me->state = S_doctype;
+ break;
+ }
+ if (string->size == 7 &&
+ !strncasecomp(string->data, "!ENTITY", 7)) {
+ /*
+ * Set up for ENTITY declaration. - FM
+ */
+ HTChunkPutc(string, c);
+ me->first_dash = FALSE;
+ me->end_comment = TRUE;
+ me->state = S_sgmlent;
+ break;
+ }
+ if (string->size == 8 &&
+ !strncasecomp(string->data, "!ELEMENT", 8)) {
+ /*
+ * Set up for ELEMENT declaration. - FM
+ */
+ HTChunkPutc(string, c);
+ me->first_dash = FALSE;
+ me->end_comment = TRUE;
+ me->state = S_sgmlele;
+ break;
+ }
+ if (string->size == 8 &&
+ !strncasecomp(string->data, "!ATTLIST", 8)) {
+ /*
+ * Set up for ATTLIST declaration. - FM
+ */
+ HTChunkPutc(string, c);
+ me->first_dash = FALSE;
+ me->end_comment = TRUE;
+ me->state = S_sgmlatt;
+ break;
+ }
+ }
+ HTChunkPutc(string, c);
+ break;
+
+ case S_comment: /* Expecting comment. - FM */
+ if (historical_comments) {
+ /*
+ * Any '>' terminates. - FM
+ */
+ if (c == '>') {
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(comm);
+ PUTC('<');
+ PUTS_TR(string->data);
+ PUTC('>');
+ PSRCSTOP(comm);
+ } else
+#endif
+ handle_comment(me);
+ string->size = 0;
+ me->end_comment = FALSE;
+ me->first_dash = FALSE;
+ me->state = S_text;
+ break;
+ }
+ goto S_comment_put_c;
+ }
+ if (!me->first_dash && c == '-') {
+ HTChunkPutc(string, c);
+ me->first_dash = TRUE;
+ break;
+ }
+ if (me->first_dash && c == '-') {
+ HTChunkPutc(string, c);
+ me->first_dash = FALSE;
+ if (!me->end_comment)
+ me->end_comment = TRUE;
+ else if (!minimal_comments)
+ /*
+ * Validly treat '--' pairs as successive comments (for
+ * minimal, any "--WHITE>" terminates). - FM
+ */
+ me->end_comment = FALSE;
+ break;
+ }
+ if (me->end_comment && c == '>') {
+ /*
+ * Terminate and handle the comment. - FM
+ */
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(comm);
+ PUTC('<');
+ PUTS_TR(string->data);
+ PUTC('>');
+ PSRCSTOP(comm);
+ } else
+#endif
+ handle_comment(me);
+ string->size = 0;
+ me->end_comment = FALSE;
+ me->first_dash = FALSE;
+ me->state = S_text;
+ break;
+ }
+ me->first_dash = FALSE;
+ if (me->end_comment && !isspace(UCH(c)))
+ me->end_comment = FALSE;
+
+ S_comment_put_c:
+ if (me->T.decode_utf8 &&
+ *me->U.utf_buf) {
+ HTChunkPuts(string, me->U.utf_buf);
+ me->U.utf_buf_p = me->U.utf_buf;
+ *(me->U.utf_buf_p) = '\0';
+ } else if (!IS_CJK_TTY &&
+ (me->T.output_utf8 ||
+ me->T.trans_from_uni)) {
+ if (clong == UCS_REPL && saved_char_in &&
+ HTPassEightBitRaw &&
+ saved_char_in >=
+ LYlowest_eightbit[me->outUCLYhndl]) {
+ (HTChunkPutUtf8Char) (string,
+ (UCode_t) (0xf000 | saved_char_in));
+ } else {
+ HTChunkPutUtf8Char(string, clong);
+ }
+ } else if (saved_char_in && me->T.use_raw_char_in) {
+ HTChunkPutc(string, saved_char_in);
+ } else {
+ HTChunkPutc(string, c);
+ }
+ break;
+
+ case S_doctype: /* Expecting DOCTYPE. - FM */
+ if (me->doctype_bracket) {
+ HTChunkPutc(string, c);
+ if (c == ']')
+ me->doctype_bracket = FALSE;
+ break;
+ }
+ if (c == '[' && WHITE(string->data[string->size - 1])) {
+ HTChunkPutc(string, c);
+ me->doctype_bracket = TRUE;
+ break;
+ }
+ if (c == '>') {
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(sgmlspecial);
+ PUTC('<');
+ PUTS(string->data);
+ PUTC('>');
+ PSRCSTOP(sgmlspecial);
+ } else
+#endif
+ handle_doctype(me);
+ string->size = 0;
+ me->state = S_text;
+ break;
+ }
+ HTChunkPutc(string, c);
+ break;
+
+ case S_marked: /* Expecting marked section. - FM */
+ if (me->first_bracket && c == '[') {
+ HTChunkPutc(string, c);
+ me->first_bracket = FALSE;
+ me->second_bracket = TRUE;
+ break;
+ }
+ if (me->second_bracket && c == ']' &&
+ string->data[string->size - 1] == ']') {
+ HTChunkPutc(string, c);
+ me->second_bracket = FALSE;
+ break;
+ }
+ if (!me->second_bracket && c == '>') {
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(sgmlspecial);
+ PUTC('<');
+ PUTS(string->data);
+ PUTC('>');
+ PSRCSTOP(sgmlspecial);
+ } else
+#endif
+ handle_marked(me);
+ string->size = 0;
+ me->state = S_text;
+ break;
+ }
+
+ if (me->T.decode_utf8) {
+ HTChunkPutUtf8Char(string, clong);
+ } else {
+ HTChunkPutc(string, c);
+ }
+ break;
+ case S_sgmlent: /* Expecting ENTITY. - FM */
+ if (!me->first_dash && c == '-') {
+ HTChunkPutc(string, c);
+ me->first_dash = TRUE;
+ break;
+ }
+ if (me->first_dash && c == '-') {
+ HTChunkPutc(string, c);
+ me->first_dash = FALSE;
+ if (!me->end_comment)
+ me->end_comment = TRUE;
+ else
+ me->end_comment = FALSE;
+ break;
+ }
+ if (me->end_comment && c == '>') {
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(sgmlspecial);
+ PUTC('<');
+ PUTS(string->data);
+ PUTC('>');
+ PSRCSTOP(sgmlspecial);
+ } else
+#endif
+ handle_sgmlent(me);
+ string->size = 0;
+ me->end_comment = FALSE;
+ me->first_dash = FALSE;
+ me->state = S_text;
+ break;
+ }
+ me->first_dash = FALSE;
+ HTChunkPutc(string, c);
+ break;
+
+ case S_sgmlele: /* Expecting ELEMENT. - FM */
+ if (!me->first_dash && c == '-') {
+ HTChunkPutc(string, c);
+ me->first_dash = TRUE;
+ break;
+ }
+ if (me->first_dash && c == '-') {
+ HTChunkPutc(string, c);
+ me->first_dash = FALSE;
+ if (!me->end_comment)
+ me->end_comment = TRUE;
+ else
+ me->end_comment = FALSE;
+ break;
+ }
+ if (me->end_comment && c == '>') {
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(sgmlspecial);
+ PUTC('<');
+ PUTS(string->data);
+ PUTC('>');
+ PSRCSTOP(sgmlspecial);
+ } else
+#endif
+ handle_sgmlele(me);
+ string->size = 0;
+ me->end_comment = FALSE;
+ me->first_dash = FALSE;
+ me->state = S_text;
+ break;
+ }
+ me->first_dash = FALSE;
+ HTChunkPutc(string, c);
+ break;
+
+ case S_sgmlatt: /* Expecting ATTLIST. - FM */
+ if (!me->first_dash && c == '-') {
+ HTChunkPutc(string, c);
+ me->first_dash = TRUE;
+ break;
+ }
+ if (me->first_dash && c == '-') {
+ HTChunkPutc(string, c);
+ me->first_dash = FALSE;
+ if (!me->end_comment)
+ me->end_comment = TRUE;
+ else
+ me->end_comment = FALSE;
+ break;
+ }
+ if (me->end_comment && c == '>') {
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(sgmlspecial);
+ PUTC('<');
+ PUTS(string->data);
+ PUTC('>');
+ PSRCSTOP(sgmlspecial);
+ } else
+#endif
+ handle_sgmlatt(me);
+ string->size = 0;
+ me->end_comment = FALSE;
+ me->first_dash = FALSE;
+ me->state = S_text;
+ break;
+ }
+ me->first_dash = FALSE;
+ HTChunkPutc(string, c);
+ break;
+
+ case S_tag_gap: /* Expecting attribute or '>' */
+ if (WHITE(c)) {
+ /* PUTC(c); - no, done as special case */
+ break; /* Gap between attributes */
+ }
+ if (c == '>') { /* End of tag */
+#ifdef USE_PRETTYSRC
+ if (!psrc_view)
+#endif
+ if (me->current_tag->name)
+ start_element(me);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(abracket);
+ PUTC('>');
+ PSRCSTOP(abracket);
+ }
+#endif
+ me->state = S_text;
+ break;
+ }
+ HTChunkPutc(string, c);
+ me->state = S_attr; /* Get attribute */
+ break;
+
+ /* accumulating value */
+ case S_attr:
+ if (WHITE(c) || (c == '>') || (c == '=')) { /* End of word */
+ if ((c == '>')
+ && (string->size >= 1)
+ && (string->data[string->size - 1] == '/')) {
+ if ((LYxhtml_parsing || me->extended_html)
+ && ignore_when_empty(me->current_tag)) {
+ discard_empty(me);
+ } else {
+ HTChunkTerminate(string);
+ }
+ } else {
+ HTChunkTerminate(string);
+ handle_attribute_name(me, string->data);
+ }
+#ifdef USE_PRETTYSRC
+ if (!psrc_view) {
+#endif
+ string->size = 0;
+ if (c == '>') { /* End of tag */
+ if (me->current_tag->name)
+ start_element(me);
+ me->state = S_text;
+ break;
+ }
+#ifdef USE_PRETTYSRC
+ } else {
+ PUTC(' ');
+ if (me->current_attribute_number == INVALID)
+ PSRCSTART(badattr);
+ else
+ PSRCSTART(attrib);
+ if (attrname_transform != 1) {
+ if (attrname_transform == 0)
+ LYLowerCase(string->data);
+ else
+ LYUpperCase(string->data);
+ }
+ PUTS(string->data);
+ if (c == '=' || WHITE(c))
+ PUTC(c);
+ if (c == '=' || c == '>') {
+ if (me->current_attribute_number == INVALID) {
+ PSRCSTOP(badattr);
+ } else {
+ PSRCSTOP(attrib);
+ }
+ }
+ if (c == '>') {
+ PSRCSTART(abracket);
+ PUTC('>');
+ PSRCSTOP(abracket);
+ me->state = S_text;
+ break;
+ }
+ string->size = 0;
+ }
+#endif
+ me->state = (c == '=' ? S_equals : S_attr_gap);
+ } else {
+ HTChunkPutc(string, c);
+ }
+ break;
+
+ case S_attr_gap: /* Expecting attribute or '=' or '>' */
+ if (WHITE(c)) {
+ PRETTYSRC_PUTC(c);
+ break; /* Gap after attribute */
+ }
+ if (c == '>') { /* End of tag */
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ if (me->current_attribute_number == INVALID) {
+ PSRCSTOP(badattr);
+ } else {
+ PSRCSTOP(attrib);
+ }
+ PSRCSTART(abracket);
+ PUTC('>');
+ PSRCSTOP(abracket);
+ } else
+#endif
+ if (me->current_tag->name)
+ start_element(me);
+ me->state = S_text;
+ break;
+ } else if (c == '=') {
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PUTC('=');
+ if (me->current_attribute_number == INVALID) {
+ PSRCSTOP(badattr);
+ } else {
+ PSRCSTOP(attrib);
+ }
+ }
+#endif
+ me->state = S_equals;
+ break;
+ }
+ HTChunkPutc(string, c);
+ me->state = S_attr; /* Get next attribute */
+ break;
+
+ case S_equals: /* After attr = */
+ if (WHITE(c)) {
+ PRETTYSRC_PUTC(c);
+ break; /* Before attribute value */
+ }
+ if (c == '>') { /* End of tag */
+ CTRACE((tfp, "SGML: found = but no value\n"));
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(abracket);
+ PUTC('>');
+ PSRCSTOP(abracket);
+ } else
+#endif
+ if (me->current_tag->name)
+ start_element(me);
+ me->state = S_text;
+ break;
+
+ } else if (c == '\'') {
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(attrval);
+ PUTC(c);
+ }
+#endif
+ me->state = S_squoted;
+ break;
+
+ } else if (c == '"') {
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(attrval);
+ PUTC(c);
+ }
+#endif
+ me->state = S_dquoted;
+ break;
+ }
+#ifdef USE_PRETTYSRC
+ if (psrc_view)
+ PSRCSTART(attrval);
+#endif
+ me->state = S_value;
+ /* FALLTHRU */
+
+ case S_value:
+ if (WHITE(c) || (c == '>')) { /* End of word */
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (!end_if_prettysrc(me, string, 0))
+#endif
+ {
+#ifdef CJK_EX /* Quick hack. - JH7AYN */
+ if (IS_CJK_TTY) {
+ if (string->data[0] == '$') {
+ if (string->data[1] == 'B' || string->data[1] == '@') {
+ char *jis_buf = 0;
+
+ HTSprintf0(&jis_buf, "\033%s", string->data);
+ TO_EUC((const unsigned char *) jis_buf,
+ (unsigned char *) string->data);
+ FREE(jis_buf);
+ }
+ }
+ }
+#endif
+ handle_attribute_value(me, string->data);
+ }
+ string->size = 0;
+ if (c == '>') { /* End of tag */
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(abracket);
+ PUTC('>');
+ PSRCSTOP(abracket);
+ } else
+#endif
+ if (me->current_tag->name)
+ start_element(me);
+ me->state = S_text;
+ break;
+ } else
+ me->state = S_tag_gap;
+ } else if (me->T.decode_utf8 &&
+ *me->U.utf_buf) {
+ HTChunkPuts(string, me->U.utf_buf);
+ me->U.utf_buf_p = me->U.utf_buf;
+ *(me->U.utf_buf_p) = '\0';
+ } else if (!IS_CJK_TTY &&
+ (me->T.output_utf8 ||
+ me->T.trans_from_uni)) {
+ if (clong == UCS_REPL && saved_char_in &&
+ HTPassEightBitRaw &&
+ saved_char_in >=
+ LYlowest_eightbit[me->outUCLYhndl]) {
+ (HTChunkPutUtf8Char) (string,
+ (UCode_t) (0xf000 | saved_char_in));
+ } else {
+ HTChunkPutUtf8Char(string, clong);
+ }
+ } else if (saved_char_in && me->T.use_raw_char_in) {
+ HTChunkPutc(string, saved_char_in);
+ } else {
+ HTChunkPutc(string, c);
+ }
+ break;
+
+ case S_squoted: /* Quoted attribute value */
+ if (c == '\'') { /* End of attribute value */
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (!end_if_prettysrc(me, string, '\''))
+#endif
+ handle_attribute_value(me, string->data);
+ string->size = 0;
+ me->state = S_tag_gap;
+ } else if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1213 */
+ /*
+ * Setting up for possible single quotes in CJK escape sequences.
+ * - Takuya ASADA (asada@three-a.co.jp)
+ */
+ me->state = S_esc_sq;
+ if (!UTF8_TTY_ISO2022JP)
+ HTChunkPutc(string, c);
+ } else if (me->T.decode_utf8 &&
+ *me->U.utf_buf) {
+ HTChunkPuts(string, me->U.utf_buf);
+ me->U.utf_buf_p = me->U.utf_buf;
+ *(me->U.utf_buf_p) = '\0';
+ } else if (!IS_CJK_TTY &&
+ (me->T.output_utf8 ||
+ me->T.trans_from_uni)) {
+ if (clong == UCS_REPL && saved_char_in &&
+ HTPassEightBitRaw &&
+ saved_char_in >=
+ LYlowest_eightbit[me->outUCLYhndl]) {
+ (HTChunkPutUtf8Char) (string,
+ (UCode_t) (0xf000 | saved_char_in));
+ } else {
+ HTChunkPutUtf8Char(string, clong);
+ }
+ } else if (saved_char_in && me->T.use_raw_char_in) {
+ HTChunkPutc(string, saved_char_in);
+ } else {
+ HTChunkPutc(string, c);
+ }
+ break;
+
+ case S_dquoted: /* Quoted attribute value */
+ if (c == '"' || /* Valid end of attribute value */
+ (soft_dquotes && /* If emulating old Netscape bug, treat '>' */
+ c == '>')) { /* as a co-terminator of dquoted and tag */
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (!end_if_prettysrc(me, string, (char) c))
+#endif
+ handle_attribute_value(me, string->data);
+ string->size = 0;
+ me->state = S_tag_gap;
+ if (c == '>') /* We emulated the Netscape bug, so we go */
+ goto top1; /* back and treat it as the tag terminator */
+ } else if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1230 */
+ /*
+ * Setting up for possible double quotes in CJK escape sequences.
+ * - Takuya ASADA (asada@three-a.co.jp)
+ */
+ me->state = S_esc_dq;
+ if (!UTF8_TTY_ISO2022JP)
+ HTChunkPutc(string, c);
+ } else if (me->T.decode_utf8 &&
+ *me->U.utf_buf) {
+ HTChunkPuts(string, me->U.utf_buf);
+ me->U.utf_buf_p = me->U.utf_buf;
+ *(me->U.utf_buf_p) = '\0';
+ } else if (!IS_CJK_TTY &&
+ (me->T.output_utf8 ||
+ me->T.trans_from_uni)) {
+ if (clong == UCS_REPL && saved_char_in &&
+ HTPassEightBitRaw &&
+ saved_char_in >=
+ LYlowest_eightbit[me->outUCLYhndl]) {
+ (HTChunkPutUtf8Char) (string,
+ (UCode_t) (0xf000 | saved_char_in));
+ } else {
+ HTChunkPutUtf8Char(string, clong);
+ }
+ } else if (saved_char_in && me->T.use_raw_char_in) {
+ HTChunkPutc(string, saved_char_in);
+ } else {
+ HTChunkPutc(string, c);
+ }
+ break;
+
+ case S_end: /* </ */
+ if (TOASCII(clong) < 127 && (string->size ? /* S/390 -- gil -- 1247 */
+ IsNmChar(c) : IsNmStart(c))) {
+ HTChunkPutc(string, c);
+ } else { /* End of end tag name */
+ HTTag *t = 0;
+
+#ifdef USE_PRETTYSRC
+ BOOL psrc_tagname_processed = FALSE;
+#endif
+
+ HTChunkTerminate(string);
+ if (!*string->data) { /* Empty end tag */
+ if (me->element_stack)
+ t = me->element_stack->tag;
+ } else {
+ t = SGMLFindTag(dtd, string->data);
+ }
+ if (!t || t == me->unknown_tag) {
+ CTRACE((tfp, "Unknown end tag </%s>\n", string->data));
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PSRCSTART(abracket);
+ PUTS("</");
+ PSRCSTOP(abracket);
+ PSRCSTART(badtag);
+ transform_tag(me, string);
+ PUTS(string->data);
+ if (c != '>') {
+ PUTC(c);
+ } else {
+ PSRCSTOP(badtag);
+ PSRCSTART(abracket);
+ PUTC('>');
+ PSRCSTOP(abracket);
+ }
+ psrc_tagname_processed = TRUE;
+ }
+ } else if (psrc_view) {
+#endif
+ } else {
+ BOOL tag_OK = (BOOL) (c == '>' || WHITE(c));
+ HTMLElement e = TAGNUM_OF_TAGP(t);
+ int branch = 2; /* it can be 0,1,2 */
+
+ me->current_tag = t;
+ if (HAS_ALT_TAGNUM(TAGNUM_OF_TAGP(t)) &&
+ me->element_stack &&
+ ALT_TAGP(t) == me->element_stack->tag)
+ me->element_stack->tag = NORMAL_TAGP(me->element_stack->tag);
+
+ if (tag_OK && Old_DTD) {
+ switch (e) {
+ case HTML_DD:
+ case HTML_DT:
+ case HTML_LI:
+ case HTML_LH:
+ case HTML_TD:
+ case HTML_TH:
+ case HTML_TR:
+ case HTML_THEAD:
+ case HTML_TFOOT:
+ case HTML_TBODY:
+ case HTML_COLGROUP:
+ branch = 0;
+ break;
+
+ case HTML_A:
+ case HTML_B:
+ case HTML_BLINK:
+ case HTML_CITE:
+ case HTML_EM:
+ case HTML_FONT:
+ case HTML_FORM:
+ case HTML_I:
+ case HTML_P:
+ case HTML_STRONG:
+ case HTML_TT:
+ case HTML_U:
+ branch = 1;
+ break;
+ default:
+ break;
+ }
+ }
+
+ /*
+ * Just handle ALL end tags normally :-) - kw
+ */
+ if (!Old_DTD) {
+ end_element(me, me->current_tag);
+ } else if (tag_OK && (branch == 0)) {
+ /*
+ * Don't treat these end tags as invalid, nor act on them.
+ * - FM
+ */
+ CTRACE((tfp, "SGML: `</%s%c' found! Ignoring it.\n",
+ string->data, c));
+ string->size = 0;
+ me->current_attribute_number = INVALID;
+ if (c != '>') {
+ me->state = S_junk_tag;
+ } else {
+ me->current_tag = NULL;
+ me->state = S_text;
+ }
+ break;
+ } else if (tag_OK && (branch == 1)) {
+ /*
+ * Handle end tags for container elements declared as
+ * SGML_EMPTY to prevent "expected tag substitution" but
+ * still processed via HTML_end_element() in HTML.c with
+ * checks there to avoid throwing the HTML.c stack out of
+ * whack (Ugh, what a hack! 8-). - FM
+ */
+ if (me->inSELECT) {
+ /*
+ * We are in a SELECT block. - FM
+ */
+ if (strcasecomp(string->data, "FORM")) {
+ /*
+ * It is not a FORM end tag, so ignore it. - FM
+ */
+ CTRACE((tfp,
+ "SGML: ***Ignoring end tag </%s> in SELECT block.\n",
+ string->data));
+ } else {
+ /*
+ * End the SELECT block and then handle the FORM
+ * end tag. - FM
+ */
+ CTRACE((tfp,
+ "SGML: ***Faking SELECT end tag before </%s> end tag.\n",
+ string->data));
+ end_element(me,
+ SGMLFindTag(me->dtd, "SELECT"));
+ CTRACE((tfp, "SGML: End </%s>\n", string->data));
+
+#ifdef USE_PRETTYSRC
+ if (!psrc_view) /* Don't actually call if viewing psrc - kw */
+#endif
+ (*me->actions->end_element)
+ (me->target,
+ (int) TAGNUM_OF_TAGP(me->current_tag),
+ &me->include);
+ }
+ } else if (!strcasecomp(string->data, "P")) {
+ /*
+ * Treat a P end tag like a P start tag (Ugh, what a
+ * hack! 8-). - FM
+ */
+ CTRACE((tfp,
+ "SGML: `</%s%c' found! Treating as '<%s%c'.\n",
+ string->data, c, string->data, c));
+ {
+ int i;
+
+ for (i = 0;
+ i < me->current_tag->number_of_attributes;
+ i++) {
+ me->present[i] = NO;
+ }
+ }
+ if (me->current_tag->name)
+ start_element(me);
+ } else {
+ CTRACE((tfp, "SGML: End </%s>\n", string->data));
+
+#ifdef USE_PRETTYSRC
+ if (!psrc_view) /* Don't actually call if viewing psrc - kw */
+#endif
+ (*me->actions->end_element)
+ (me->target,
+ (int) TAGNUM_OF_TAGP(me->current_tag),
+ &me->include);
+ }
+ string->size = 0;
+ me->current_attribute_number = INVALID;
+ if (c != '>') {
+ me->state = S_junk_tag;
+ } else {
+ me->current_tag = NULL;
+ me->state = S_text;
+ }
+ break;
+ } else {
+ /*
+ * Handle all other end tags normally. - FM
+ */
+ end_element(me, me->current_tag);
+ }
+ }
+
+#ifdef USE_PRETTYSRC
+ if (psrc_view && !psrc_tagname_processed) {
+ PSRCSTART(abracket);
+ PUTS("</");
+ PSRCSTOP(abracket);
+ PSRCSTART(tag);
+ if (tagname_transform != 1) {
+ if (tagname_transform == 0)
+ LYLowerCase(string->data);
+ else
+ LYUpperCase(string->data);
+ }
+ PUTS(string->data);
+ PSRCSTOP(tag);
+ if (c != '>') {
+ PSRCSTART(badtag);
+ PUTC(c);
+ } else {
+ PSRCSTART(abracket);
+ PUTC('>');
+ PSRCSTOP(abracket);
+ }
+ }
+#endif
+
+ string->size = 0;
+ me->current_attribute_number = INVALID;
+ if (c != '>') {
+ if (!WHITE(c))
+ CTRACE((tfp, "SGML: `</%s%c' found!\n", string->data, c));
+ me->state = S_junk_tag;
+ } else {
+ me->current_tag = NULL;
+ me->state = S_text;
+ }
+ }
+ break;
+
+ case S_esc: /* Expecting '$'or '(' following CJK ESC. */
+ if (c == '$') {
+ me->state = S_dollar;
+ } else if (c == '(') {
+ me->state = S_paren;
+ } else {
+ me->state = S_text;
+ if (UTF8_TTY_ISO2022JP)
+ goto top1;
+ }
+ if (!UTF8_TTY_ISO2022JP)
+ PUTC(c);
+ break;
+
+ case S_dollar: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
+ if (c == '@' || c == 'B' || c == 'A') {
+ me->state = S_nonascii_text;
+ } else if (c == '(') {
+ me->state = S_dollar_paren;
+ }
+ if (!UTF8_TTY_ISO2022JP)
+ PUTC(c);
+ break;
+
+ case S_dollar_paren: /* Expecting 'C' after CJK "ESC$(". */
+ if (c == 'C') {
+ me->state = S_nonascii_text;
+ } else {
+ me->state = S_text;
+ if (UTF8_TTY_ISO2022JP) {
+ PUTS("$(");
+ goto top1;
+ }
+ }
+ if (!UTF8_TTY_ISO2022JP)
+ PUTC(c);
+ break;
+
+ case S_paren: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
+ if (c == 'B' || c == 'J' || c == 'T') {
+ me->state = S_text;
+ } else if (c == 'I') {
+ me->state = S_nonascii_text;
+ if (UTF8_TTY_ISO2022JP)
+ me->kanji_buf = '\t'; /* flag for single byte katakana */
+ } else {
+ me->state = S_text;
+ if (UTF8_TTY_ISO2022JP) {
+ PUTC('(');
+ goto top1;
+ }
+ }
+ if (!UTF8_TTY_ISO2022JP)
+ PUTC(c);
+ break;
+
+ case S_nonascii_text: /* Expecting CJK ESC after non-ASCII text. */
+ if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1264 */
+ me->state = S_esc;
+ } else if (c < 32) {
+ me->state = S_text;
+ }
+ if (UTF8_TTY_ISO2022JP) {
+ if (TOASCII(c) != '\033')
+ PUTUTF8(clong);
+ } else
+ PUTC(c);
+ break;
+
+ case S_esc_sq: /* Expecting '$'or '(' following CJK ESC. */
+ if (c == '$') {
+ me->state = S_dollar_sq;
+ } else if (c == '(') {
+ me->state = S_paren_sq;
+ } else {
+ me->state = S_squoted;
+ if (UTF8_TTY_ISO2022JP)
+ goto top1;
+ }
+ if (!UTF8_TTY_ISO2022JP)
+ HTChunkPutc(string, c);
+ break;
+
+ case S_dollar_sq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
+ if (c == '@' || c == 'B' || c == 'A') {
+ me->state = S_nonascii_text_sq;
+ } else if (c == '(') {
+ me->state = S_dollar_paren_sq;
+ }
+ if (!UTF8_TTY_ISO2022JP)
+ HTChunkPutc(string, c);
+ break;
+
+ case S_dollar_paren_sq: /* Expecting 'C' after CJK "ESC$(". */
+ if (c == 'C') {
+ me->state = S_nonascii_text_sq;
+ } else {
+ me->state = S_squoted;
+ if (UTF8_TTY_ISO2022JP) {
+ HTChunkPuts(string, "$(");
+ goto top1;
+ }
+ }
+ if (!UTF8_TTY_ISO2022JP)
+ HTChunkPutc(string, c);
+ break;
+
+ case S_paren_sq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
+ if (c == 'B' || c == 'J' || c == 'T') {
+ me->state = S_squoted;
+ } else if (c == 'I') {
+ me->state = S_nonascii_text_sq;
+ if (UTF8_TTY_ISO2022JP)
+ me->kanji_buf = '\t'; /* flag for single byte katakana */
+ } else {
+ me->state = S_squoted;
+ if (UTF8_TTY_ISO2022JP) {
+ HTChunkPutc(string, '(');
+ goto top1;
+ }
+ }
+ if (!UTF8_TTY_ISO2022JP)
+ HTChunkPutc(string, c);
+ break;
+
+ case S_nonascii_text_sq: /* Expecting CJK ESC after non-ASCII text. */
+ if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1281 */
+ me->state = S_esc_sq;
+ }
+ if (UTF8_TTY_ISO2022JP) {
+ if (TOASCII(c) != '\033')
+ HTChunkPutUtf8Char(string, clong);
+ } else
+ HTChunkPutc(string, c);
+ break;
+
+ case S_esc_dq: /* Expecting '$'or '(' following CJK ESC. */
+ if (c == '$') {
+ me->state = S_dollar_dq;
+ } else if (c == '(') {
+ me->state = S_paren_dq;
+ } else {
+ me->state = S_dquoted;
+ if (UTF8_TTY_ISO2022JP)
+ goto top1;
+ }
+ if (!UTF8_TTY_ISO2022JP)
+ HTChunkPutc(string, c);
+ break;
+
+ case S_dollar_dq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
+ if (c == '@' || c == 'B' || c == 'A') {
+ me->state = S_nonascii_text_dq;
+ } else if (c == '(') {
+ me->state = S_dollar_paren_dq;
+ }
+ if (!UTF8_TTY_ISO2022JP)
+ HTChunkPutc(string, c);
+ break;
+
+ case S_dollar_paren_dq: /* Expecting 'C' after CJK "ESC$(". */
+ if (c == 'C') {
+ me->state = S_nonascii_text_dq;
+ } else {
+ me->state = S_dquoted;
+ if (UTF8_TTY_ISO2022JP) {
+ HTChunkPuts(string, "$(");
+ goto top1;
+ }
+ }
+ if (!UTF8_TTY_ISO2022JP)
+ HTChunkPutc(string, c);
+ break;
+
+ case S_paren_dq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
+ if (c == 'B' || c == 'J' || c == 'T') {
+ me->state = S_dquoted;
+ } else if (c == 'I') {
+ me->state = S_nonascii_text_dq;
+ if (UTF8_TTY_ISO2022JP)
+ me->kanji_buf = '\t'; /* flag for single byte katakana */
+ } else {
+ me->state = S_dquoted;
+ if (UTF8_TTY_ISO2022JP) {
+ HTChunkPutc(string, '(');
+ goto top1;
+ }
+ }
+ if (!UTF8_TTY_ISO2022JP)
+ HTChunkPutc(string, c);
+ break;
+
+ case S_nonascii_text_dq: /* Expecting CJK ESC after non-ASCII text. */
+ if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1298 */
+ me->state = S_esc_dq;
+ }
+ if (UTF8_TTY_ISO2022JP) {
+ if (TOASCII(c) != '\033')
+ HTChunkPutUtf8Char(string, clong);
+ } else
+ HTChunkPutc(string, c);
+ break;
+
+ case S_junk_tag:
+ case S_pi:
+ if (c == '>') {
+ HTChunkTerminate(string);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ if (me->state == S_junk_tag) {
+ PSRCSTOP(badtag);
+ }
+ PSRCSTART(abracket);
+ PUTC('>');
+ PSRCSTOP(abracket);
+ }
+#endif
+ if (me->state == S_pi)
+ handle_processing_instruction(me);
+ string->size = 0;
+ me->current_tag = NULL;
+ me->state = S_text;
+ } else {
+ HTChunkPutc(string, c);
+#ifdef USE_PRETTYSRC
+ if (psrc_view) {
+ PUTC(c);
+ }
+#endif
+ }
+
+ } /* switch on me->state */
+ CTRACE2(TRACE_SGML, (tfp, "SGML after %s|%.*s|%c|\n",
+ state_name(me->state),
+ string->size,
+ NonNull(string->data),
+ UCH(c)));
+
+ after_switch:
+ /*
+ * Check whether an external function has added anything to the include
+ * buffer. If so, move the new stuff to the beginning of active_include.
+ * - kw
+ */
+ if (me->include != NULL) {
+ if (me->include[0] == '\0') {
+ FREE(me->include);
+ } else {
+ if (me->active_include &&
+ me->active_include[me->include_index] != '\0')
+ StrAllocCat(me->include,
+ me->active_include + me->include_index);
+ FREE(me->active_include);
+ me->active_include = me->include;
+ me->include_index = 0;
+ me->include = NULL;
+ }
+ }
+
+ /*
+ * Check whether we've added anything to the recover buffer. - FM
+ */
+ if (me->recover != NULL) {
+ if (me->recover[me->recover_index] == '\0') {
+ FREE(me->recover);
+ me->recover_index = 0;
+ } else {
+ c = UCH(me->recover[me->recover_index]);
+ me->recover_index++;
+ goto top;
+ }
+ }
+
+ /*
+ * Check whether an external function had added anything to the include
+ * buffer; it should now be in active_include. - FM / kw
+ */
+ if (me->active_include != NULL) {
+ if (me->active_include[me->include_index] == '\0') {
+ FREE(me->active_include);
+ me->include_index = 0;
+ } else {
+ if (me->current_tag_charset == UTF8_handle ||
+ me->T.trans_from_uni) {
+ /*
+ * If it looks like we would have fed UTF-8 to the next
+ * processing stage, assume that whatever we were fed back is
+ * in UTF-8 form, too. This won't be always true for all uses
+ * of the include buffer, but it's a start. - kw
+ */
+ const char *puni = me->active_include + me->include_index;
+
+ c = UCH(*puni);
+ clong = UCGetUniFromUtf8String(&puni);
+ if (clong < 256 && clong >= 0) {
+ c = UCH((clong & 0xff));
+ }
+ saved_char_in = '\0';
+ me->include_index = (int) (puni
+ - me->active_include
+ + 1);
+ goto top1;
+ } else {
+ /*
+ * Otherwise assume no UTF-8 - do charset-naive processing and
+ * hope for the best. - kw
+ */
+ c = UCH(me->active_include[me->include_index]);
+ me->include_index++;
+ goto top;
+ }
+ }
+ }
+
+ /*
+ * Check whether an external function has added anything to the csi buffer.
+ * - FM
+ */
+ if (me->csi != NULL) {
+ if (me->csi[me->csi_index] == '\0') {
+ FREE(me->csi);
+ me->csi_index = 0;
+ } else {
+ c = UCH(me->csi[me->csi_index]);
+ me->csi_index++;
+ goto top;
+ }
+ }
+} /* SGML_character */
+
+/*
+ * Store 'chndl' as the parser-stage charset of this stream's anchor,
+ * marked as having been set by the parser, and then call
+ * change_chartrans_handling() so the stream can react to the new setting.
+ */
+static void InferUtfFromBom(HTStream *me, int chndl)
+{
+    HTAnchor_setUCInfoStage(me->node_anchor,
+                            chndl,
+                            UCT_STAGE_PARSER,
+                            UCT_SETBY_PARSER);
+
+    change_chartrans_handling(me);
+}
+
+/*
+ * Feed one wide character to the parser.  Instead of rewriting
+ * SGML_character() for the hypothetical case of UTF-16 webpages, the value
+ * is handed to UCPutUtf8_charstring() with SGML_character() as the sink, so
+ * the parser sees the data as if it were UTF-8.  When that call returns
+ * zero, the value is passed to SGML_character() directly instead.
+ */
+static void SGML_widechar(HTStream *me, int ch)
+{
+    if (UCPutUtf8_charstring(me, SGML_character, (UCode_t) ch))
+        return;
+
+    SGML_character(me, ch);
+}
+
+/*
+ * Feed a block of bytes to the parser.
+ *
+ *	me	the parser stream
+ *	str	first byte of the block (it need not be NUL-terminated)
+ *	l	number of bytes in the block
+ *
+ * While sgml_offset is still 0, a leading byte-order mark is recognised
+ * and skipped.  A UTF-8 BOM only records UTF-8 as the parser charset; a
+ * UCS-2 BOM additionally sets me->T.ucs_mode to -1 (little-endian) or
+ * 1 (big-endian).
+ *
+ * In either UCS-2 mode the bytes are combined in pairs and passed to
+ * SGML_widechar(); otherwise every byte goes to SGML_character().
+ *
+ * A block that holds nothing but a BOM is recognised as well, and an
+ * unpaired trailing byte in UCS-2 mode is dropped: no state is kept here
+ * to carry it over to the next block, and pairing it with the byte beyond
+ * the end of the block would read outside the caller's buffer.
+ */
+static void SGML_write(HTStream *me, const char *str, int l)
+{
+    const char *p;
+    const char *e = str + l;
+
+    if (sgml_offset == 0) {
+        if (l >= 3
+            && !MemCmp(str, "\357\273\277", 3)) {
+            CTRACE((tfp, "SGML_write found UTF-8 BOM\n"));
+            InferUtfFromBom(me, UTF8_handle);
+            str += 3;
+        } else if (l >= 2) {
+            if (!MemCmp(str, "\377\376", 2)) {
+                CTRACE((tfp, "SGML_write found UCS-2 LE BOM\n"));
+                InferUtfFromBom(me, UTF8_handle);
+                str += 2;
+                me->T.ucs_mode = -1;
+            } else if (!MemCmp(str, "\376\377", 2)) {
+                CTRACE((tfp, "SGML_write found UCS-2 BE BOM\n"));
+                InferUtfFromBom(me, UTF8_handle);
+                str += 2;
+                me->T.ucs_mode = 1;
+            }
+        }
+    }
+    switch (me->T.ucs_mode) {
+    case -1:
+        /* little-endian: low byte first; stop when fewer than 2 bytes remain */
+        for (p = str; (e - p) >= 2; p += 2)
+            SGML_widechar(me, (UCH(p[1]) << 8) | UCH(p[0]));
+        break;
+    case 1:
+        /* big-endian: high byte first; stop when fewer than 2 bytes remain */
+        for (p = str; (e - p) >= 2; p += 2)
+            SGML_widechar(me, (UCH(p[0]) << 8) | UCH(p[1]));
+        break;
+    default:
+        for (p = str; p < e; p++)
+            SGML_character(me, *p);
+        break;
+    }
+}
+
+/*
+ * Parse a NUL-terminated string: measure it with strlen() and hand the
+ * bytes (without the terminator) to SGML_write().
+ */
+static void SGML_string(HTStream *me, const char *str)
+{
+    const int length = (int) strlen(str);
+
+    SGML_write(me, str, length);
+}
+
+/*_______________________________________________________________________
+*/
+
+/* Structured Object Class
+ * -----------------------
+ *
+ * Class table for the SGML parser stream; SGML_new() stores its address in
+ * the isa member of every stream it creates.
+ *
+ * NOTE(review): the meaning of each slot is fixed by the declaration of
+ * HTStreamClass, which is not visible here.  Judging by the function names,
+ * the entries are: class name, free, abort, put one character, put a
+ * NUL-terminated string, put a counted block -- confirm against the
+ * HTStreamClass declaration.
+ */
+const HTStreamClass SGMLParser =
+{
+ "SGMLParser",
+ SGML_free,
+ SGML_abort,
+ SGML_character,
+ SGML_string,
+ SGML_write,
+};
+
+/* Create SGML Engine
+ * ------------------
+ *
+ * Allocate a zero-filled parser stream and initialise it.
+ *
+ * On entry,
+ *	dtd		the DTD; stored in the stream for tag lookup.
+ *	anchor		the document's parent anchor; stored as node_anchor
+ *			and consulted for the parser-stage charset.
+ *	target		the structured sink; its class table (isa) supplies
+ *			the action routines the parser calls.
+ *	extended_html	when non-zero, the stream's extended_html flag is
+ *			set.
+ *
+ * On exit,
+ *	returns the new stream in state S_text.  The file-wide sgml_offset
+ *	counter has been reset to 0.  If allocation fails, outofmem() is
+ *	called with this function's name.
+ */
+
+HTStream *SGML_new(const SGML_dtd * dtd,
+                   HTParentAnchor *anchor,
+                   HTStructured * target,
+                   int extended_html)
+{
+    HTStream *me = typecalloc(struct _HTStream);
+
+    if (!me)
+        outofmem(__FILE__, "SGML_new");
+
+    me->isa = &SGMLParser;
+    me->string = HTChunkCreate(128);	/* Grow by this much */
+    me->dtd = dtd;
+    me->target = target;
+    me->actions = (const HTStructuredClass *) (((HTStream *) target)->isa);
+    /* Ugh: no OO */
+    me->unknown_tag = &HTTag_unrecognized;
+    me->current_tag = me->slashedtag = NULL;
+    me->state = S_text;
+#ifdef CALLERDATA
+    /*
+     * NOTE(review): callerData is not a parameter of this function, so this
+     * branch only compiles if the name is visible from elsewhere - confirm
+     * before defining CALLERDATA.
+     */
+    me->callerData = (void *) callerData;
+#endif /* CALLERDATA */
+
+    me->node_anchor = anchor;	/* Could be NULL? */
+    me->U.utf_buf_p = me->U.utf_buf;
+    UCTransParams_clear(&me->T);
+    me->inUCLYhndl = HTAnchor_getUCLYhndl(anchor,
+                                          UCT_STAGE_PARSER);
+    if (me->inUCLYhndl < 0) {
+        /*
+         * No charset recorded for the parser stage yet: copy the MIME
+         * stage's information and read the handle again.
+         */
+        HTAnchor_copyUCInfoStage(anchor,
+                                 UCT_STAGE_PARSER,
+                                 UCT_STAGE_MIME,
+                                 -1);
+        me->inUCLYhndl = HTAnchor_getUCLYhndl(anchor,
+                                              UCT_STAGE_PARSER);
+    }
+#ifdef CAN_SWITCH_DISPLAY_CHARSET	/* Allow a switch to a more suitable display charset */
+    else if (anchor->UCStages
+             && anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl >= 0
+             && anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl != current_char_set) {
+        int o = anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl;
+
+        anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl = -1;	/* Force reset */
+        HTAnchor_resetUCInfoStage(anchor, o, UCT_STAGE_PARSER,
+                                  /* Pass the stage's current lock value back in: */
+                                  anchor->UCStages->s[UCT_STAGE_PARSER].lock);
+    }
+#endif
+
+    me->inUCI = HTAnchor_getUCInfoStage(anchor,
+                                        UCT_STAGE_PARSER);
+    set_chartrans_handling(me, anchor, -1);
+
+    me->recover = NULL;
+    me->recover_index = 0;
+    me->include = NULL;
+    me->active_include = NULL;
+    me->include_index = 0;
+    me->url = NULL;
+    me->csi = NULL;
+    me->csi_index = 0;
+
+#ifdef USE_PRETTYSRC
+    if (psrc_view) {
+        /*
+         * The document prologue below is parsed with psrc_view switched
+         * off, and the flag is switched back on afterwards.
+         */
+        psrc_view = FALSE;
+        mark_htext_as_source = TRUE;
+        SGML_string(me,
+                    "<HTML><HEAD><TITLE>source</TITLE></HEAD><BODY><PRE>");
+        psrc_view = TRUE;
+        psrc_convert_string = FALSE;
+        sgml_in_psrc_was_initialized = TRUE;
+    }
+#endif
+    if (extended_html) {
+        me->extended_html = TRUE;
+    }
+
+    sgml_offset = 0;
+    return me;
+}
+
+/*
+ * Report the offset within the document at which parsing currently stands.
+ * Callers use it to help identify anchors which shift around while
+ * reparsing.  When built with USE_PRETTYSRC, the value of psrc_view is
+ * added to the offset.
+ */
+int SGML_offset(void)
+{
+#ifdef USE_PRETTYSRC
+    return sgml_offset + psrc_view;
+#else
+    return sgml_offset;
+#endif
+}
+
+/* Asian character conversion functions
+ * ====================================
+ *
+ * Added 24-Mar-96 by FM, based on:
+ *
+ ////////////////////////////////////////////////////////////////////////
+Copyright (c) 1993 Electrotechnical Laboratory (ETL)
+
+Permission to use, copy, modify, and distribute this material
+for any purpose and without fee is hereby granted, provided
+that the above copyright notice and this permission notice
+appear in all copies, and that the name of ETL not be
+used in advertising or publicity pertaining to this
+material without the specific, prior written permission
+of an authorized representative of ETL.
+ETL MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY
+OF THIS MATERIAL FOR ANY PURPOSE. IT IS PROVIDED "AS IS",
+WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
+/////////////////////////////////////////////////////////////////////////
+Content-Type: program/C; charset=US-ASCII
+Program: SJIS.c
+Author: Yutaka Sato <ysato@etl.go.jp>
+Description:
+History:
+ 930923 extracted from codeconv.c of cosmos
+///////////////////////////////////////////////////////////////////////
+*/
+
+/*
+ * Non-zero lets SJIS_TO_EUC() convert Shift-JIS byte pairs; when zero every
+ * byte would be copied through unchanged.  It is initialised to 1 and not
+ * assigned anywhere in the code visible here.
+ */
+static int TREAT_SJIS = 1;
+
+/*
+ * Map a two-byte input (IHI, ILO) to a two-byte output (*OHI, *OLO).
+ *
+ * When IHI is 0x8E and ILO lies in 0xA1..0xDF, the output is the pair found
+ * in the 63-entry table below, indexed by ILO - 0xA1 (going by the function
+ * name, a JIS X 0201 character replaced by its JIS X 0208 counterpart in
+ * EUC form).  Any other input is copied to the output unchanged.
+ */
+void JISx0201TO0208_EUC(unsigned IHI,
+                        unsigned ILO,
+                        unsigned char *OHI,
+                        unsigned char *OLO)
+{
+    static const char *table[] =
+    {
+        "\241\243",		/* A1,A3 */
+        "\241\326",		/* A1,D6 */
+        "\241\327",		/* A1,D7 */
+        "\241\242",		/* A1,A2 */
+        "\241\246",		/* A1,A6 */
+        "\245\362",		/* A5,F2 */
+        "\245\241",		/* A5,A1 */
+        "\245\243",		/* A5,A3 */
+        "\245\245",		/* A5,A5 */
+        "\245\247",		/* A5,A7 */
+        "\245\251",		/* A5,A9 */
+        "\245\343",		/* A5,E3 */
+        "\245\345",		/* A5,E5 */
+        "\245\347",		/* A5,E7 */
+        "\245\303",		/* A5,C3 */
+        "\241\274",		/* A1,BC */
+        "\245\242",		/* A5,A2 */
+        "\245\244",		/* A5,A4 */
+        "\245\246",		/* A5,A6 */
+        "\245\250",		/* A5,A8 */
+        "\245\252",		/* A5,AA */
+        "\245\253",		/* A5,AB */
+        "\245\255",		/* A5,AD */
+        "\245\257",		/* A5,AF */
+        "\245\261",		/* A5,B1 */
+        "\245\263",		/* A5,B3 */
+        "\245\265",		/* A5,B5 */
+        "\245\267",		/* A5,B7 */
+        "\245\271",		/* A5,B9 */
+        "\245\273",		/* A5,BB */
+        "\245\275",		/* A5,BD */
+        "\245\277",		/* A5,BF */
+        "\245\301",		/* A5,C1 */
+        "\245\304",		/* A5,C4 */
+        "\245\306",		/* A5,C6 */
+        "\245\310",		/* A5,C8 */
+        "\245\312",		/* A5,CA */
+        "\245\313",		/* A5,CB */
+        "\245\314",		/* A5,CC */
+        "\245\315",		/* A5,CD */
+        "\245\316",		/* A5,CE */
+        "\245\317",		/* A5,CF */
+        "\245\322",		/* A5,D2 */
+        "\245\325",		/* A5,D5 */
+        "\245\330",		/* A5,D8 */
+        "\245\333",		/* A5,DB */
+        "\245\336",		/* A5,DE */
+        "\245\337",		/* A5,DF */
+        "\245\340",		/* A5,E0 */
+        "\245\341",		/* A5,E1 */
+        "\245\342",		/* A5,E2 */
+        "\245\344",		/* A5,E4 */
+        "\245\346",		/* A5,E6 */
+        "\245\350",		/* A5,E8 */
+        "\245\351",		/* A5,E9 */
+        "\245\352",		/* A5,EA */
+        "\245\353",		/* A5,EB */
+        "\245\354",		/* A5,EC */
+        "\245\355",		/* A5,ED */
+        "\245\357",		/* A5,EF */
+        "\245\363",		/* A5,F3 */
+        "\241\253",		/* A1,AB */
+        "\241\254"		/* A1,AC */
+    };
+    const char *pair;
+
+    /* Anything outside the table's range passes through untouched. */
+    if (IHI != 0x8E || ILO < 0xA1 || ILO > 0xDF) {
+        *OHI = UCH(IHI);
+        *OLO = UCH(ILO);
+        return;
+    }
+
+    pair = table[ILO - 0xA1];
+    *OHI = UCH(pair[0]);
+    *OLO = UCH(pair[1]);
+}
+
+/*
+ * Scan a NUL-terminated byte string and return 1 as soon as IS_SJIS()
+ * accepts some byte with its high bit set together with the byte that
+ * follows it; return 0 if the end of the string is reached first.
+ */
+static int IS_SJIS_STR(const unsigned char *str)
+{
+    const unsigned char *p;
+    unsigned char lead;
+    int seen = 0;		/* the lvalue IS_SJIS() takes as its third argument */
+
+    for (p = str; *p != '\0'; p++) {
+        lead = *p;
+        if ((lead & 0x80) == 0)
+            continue;
+        if (IS_SJIS(lead, p[1], seen))
+            return 1;
+    }
+    return 0;
+}
+
+/*
+ * Convert one Shift-JIS byte pair (HI, LO) into a two-byte JIS code, which
+ * is stored in JCODE[0] and JCODE[1].  Returns JCODE.
+ */
+unsigned char *SJIS_TO_JIS1(unsigned HI,
+                            unsigned LO,
+                            unsigned char *JCODE)
+{
+    const unsigned bias = (HI <= 0x9F) ? 0x71 : 0xB1;
+    unsigned row = UCH(HI - (unsigned) UCH(bias));
+    unsigned col = LO;
+
+    row = UCH((row << 1) + 1);
+
+    if (col > 0x7F)
+        col--;
+
+    if (col >= 0x9E) {
+        /* second half of the lead byte's range: next row */
+        col = UCH(col - UCH(0x7D));
+        row++;
+    } else {
+        col = UCH(col - UCH(0x1F));
+    }
+
+    JCODE[0] = UCH(row);
+    JCODE[1] = UCH(col);
+    return JCODE;
+}
+
+/*
+ * Convert one two-byte JIS code (HI, LO) into a Shift-JIS byte pair, which
+ * is stored in SJCODE[0] and SJCODE[1].  Returns SJCODE.
+ */
+unsigned char *JIS_TO_SJIS1(unsigned HI,
+                            unsigned LO,
+                            unsigned char *SJCODE)
+{
+    unsigned row = HI;
+    unsigned col = LO;
+
+    /* odd and even rows use different offsets for the second byte */
+    col = UCH(col + UCH((row & 1) ? 0x1F : 0x7D));
+    if (col >= 0x7F)
+        col++;
+
+    row = UCH(((row - 0x21) >> 1) + 0x81);
+    if (row > 0x9F)
+        row = UCH(row + UCH(0x40));
+
+    SJCODE[0] = UCH(row);
+    SJCODE[1] = UCH(col);
+    return SJCODE;
+}
+
+/*
+ * Convert one EUC byte pair (HI, LO) into Shift-JIS, stored in SJCODE[0]
+ * and SJCODE[1].  A pair whose first byte is 0x8E is first passed through
+ * JISx0201TO0208_EUC().  Returns SJCODE.
+ */
+unsigned char *EUC_TO_SJIS1(unsigned HI,
+                            unsigned LO,
+                            unsigned char *SJCODE)
+{
+    unsigned char first = UCH(HI);
+    unsigned char second = UCH(LO);
+
+    if (HI == 0x8E)
+        JISx0201TO0208_EUC(HI, LO, &first, &second);
+
+    /* drop the high bits to obtain the JIS code, then convert that */
+    return JIS_TO_SJIS1(UCH(first & 0x7F), UCH(second & 0x7F), SJCODE);
+}
+
+/*
+ * Convert the single byte I to a Shift-JIS pair in *OHI / *OLO: the byte is
+ * first expanded by JISx0201TO0208_EUC() as if it followed 0x8E, and the
+ * result, with the high bits cleared, is then converted by JIS_TO_SJIS1().
+ */
+void JISx0201TO0208_SJIS(unsigned I,
+                         unsigned char *OHI,
+                         unsigned char *OLO)
+{
+    unsigned char pair[2];
+    unsigned jis_hi;
+    unsigned jis_lo;
+
+    JISx0201TO0208_EUC(0x8E, I, OHI, OLO);
+
+    jis_hi = UCH(*OHI & 0x7F);
+    jis_lo = UCH(*OLO & 0x7F);
+    JIS_TO_SJIS1(jis_hi, jis_lo, pair);
+
+    *OHI = pair[0];
+    *OLO = pair[1];
+}
+
+/*
+ * Convert one Shift-JIS byte pair (HI, LO) into EUC: the JIS code produced
+ * by SJIS_TO_JIS1() with the high bit set on both bytes.  The result is
+ * stored in data[0] and data[1]; returns data.
+ */
+unsigned char *SJIS_TO_EUC1(unsigned HI,
+                            unsigned LO,
+                            unsigned char *data)
+{
+    int i;
+
+    SJIS_TO_JIS1(HI, LO, data);
+    for (i = 0; i < 2; i++)
+        data[i] |= 0x80;
+
+    return data;
+}
+
+/*
+ * Convert the NUL-terminated string 'src' to EUC in 'dst' and return dst.
+ * Byte pairs accepted by IS_SJIS() (while TREAT_SJIS is set) are converted
+ * as two-byte characters; every other byte is copied unchanged.  The output
+ * is NUL-terminated and is never longer than the input.
+ */
+unsigned char *SJIS_TO_EUC(unsigned char *src,
+                           unsigned char *dst)
+{
+    unsigned char *sp = src;
+    unsigned char *dp = dst;
+    unsigned char hi, lo;
+    int in_sjis;
+
+    in_sjis = IS_SJIS_STR(src);
+
+    while ((hi = sp[0]) != '\0') {
+        lo = sp[1];
+        if (TREAT_SJIS && IS_SJIS(hi, lo, in_sjis)) {
+            /* same steps as SJIS_TO_EUC1(): JIS code plus high bits */
+            SJIS_TO_JIS1(hi, lo, dp);
+            dp[0] |= 0x80;
+            dp[1] |= 0x80;
+            sp += 2;
+            dp += 2;
+        } else {
+            *dp++ = *sp++;
+        }
+    }
+
+    *dp = 0;
+    return dst;
+}
+
+/*
+ * Convert the NUL-terminated EUC string 'src' to Shift-JIS in 'dst' and
+ * return dst.  Two consecutive bytes with the high bit set are converted as
+ * one character; a high-bit byte that is not followed by another one is
+ * skipped; all other bytes are copied unchanged.  The output is
+ * NUL-terminated.
+ */
+unsigned char *EUC_TO_SJIS(unsigned char *src,
+                           unsigned char *dst)
+{
+    unsigned char *in = src;
+    unsigned char *out = dst;
+
+    while (*in) {
+        if ((*in & 0x80) == 0) {
+            *out++ = *in++;
+            continue;
+        }
+
+        if (in[1] && (in[1] & 0x80)) {
+            JIS_TO_SJIS1(UCH(in[0] & 0x7F), UCH(in[1] & 0x7F), out);
+            out += 2;
+            in += 2;
+        } else {
+            in++;		/* lone high-bit byte: dropped */
+        }
+    }
+
+    *out = 0;
+    return dst;
+}
+
+/*
+ * Copy string b to a, and evaluate to a pointer to the terminating NUL of
+ * the copy.  The arguments are not parenthesised and 'a' is evaluated more
+ * than once, so pass plain variables only.
+ */
+#define Strcpy(a,b) (strcpy((char*)a,(const char*)b),&a[strlen((const char*)a)])
+
+/*
+ * Convert the NUL-terminated EUC string 'src' into 'dst', returning dst.
+ *
+ *	toK	string written to the output before a run of two-byte
+ *		characters starts
+ *	toA	string written to the output when such a run ends (also at
+ *		the end of the input if a run is still open)
+ *
+ * Two-byte characters are written with the high bit cleared on both bytes.
+ * Bytes without the high bit are copied unchanged.  A high-bit byte that
+ * IS_EUC() rejects is copied unchanged too, except that 0xA0 is dropped
+ * once at least one such byte has been seen.
+ *
+ * NOTE(review): the size of 'dst' is not checked here; the caller has to
+ * allow for the inserted toK/toA strings.
+ */
+unsigned char *EUC_TO_JIS(unsigned char *src,
+ unsigned char *dst,
+ const char *toK,
+ const char *toA)
+{
+ /* 0 outside a two-byte run, 0xFF (the complement of 0) inside one */
+ unsigned char kana_mode = 0;
+ unsigned char cch;
+ unsigned char *sp = src;
+ unsigned char *dp = dst;
+ /* counts the high-bit bytes that IS_EUC() rejected */
+ int is_JIS = 0;
+
+ /* sp already points at the byte after cch inside the loop body */
+ while ((cch = *sp++) != '\0') {
+ if (cch & 0x80) {
+ if (!IS_EUC(cch, *sp)) {
+ if (cch == 0xA0 && is_JIS) /* ignore NBSP */
+ continue;
+ is_JIS++;
+ *dp++ = cch;
+ continue;
+ }
+ if (!kana_mode) {
+ kana_mode = UCH(~kana_mode);
+ dp = Strcpy(dp, toK);
+ }
+ /* if the second byte lacks the high bit, nothing is written for cch */
+ if (*sp & 0x80) {
+ *dp++ = UCH(cch & ~0x80);
+ *dp++ = UCH(*sp++ & ~0x80);
+ }
+ } else {
+ if (kana_mode) {
+ kana_mode = UCH(~kana_mode);
+ dp = Strcpy(dp, toA);
+ }
+ *dp++ = cch;
+ }
+ }
+ /* close a run that is still open at the end of the input */
+ if (kana_mode)
+ dp = Strcpy(dp, toA);
+
+ if (dp)
+ *dp = 0;
+ return dst;
+}
+
+/* True when both c1 and c2 lie strictly between 0x20 and 0x7F. */
+#define IS_JIS7(c1,c2) (0x20<(c1)&&(c1)<0x7F && 0x20<(c2)&&(c2)<0x7F)
+/* Control-N and Control-O: the letter's code minus 0x40 (0x0E and 0x0F in ASCII). */
+#define SO ('N'-0x40)
+#define SI ('O'-0x40)
+
+/*
+ * When non-zero, TO_EUC() calls repairJIStoEUC() on input that looks like a
+ * two-byte designation without its leading ESC.  It is initialised to 0 and
+ * not assigned anywhere in the code visible here.
+ */
+static int repair_JIS = 0;
+
+/*
+ * Try to convert a run of 7-bit two-byte characters starting at 'src',
+ * writing each pair with the high bit set on both bytes at *dstp onwards.
+ *
+ * The run is accepted when the pair '(' 'B' or '(' 'J' is reached: *dstp is
+ * then advanced past the bytes written, and the address following that
+ * pair is returned.  If the input ends first, or a pair fails IS_JIS7(),
+ * 0 is returned and *dstp is left as it was (bytes may already have been
+ * written beyond it).
+ */
+static const unsigned char *repairJIStoEUC(const unsigned char *src,
+                                           unsigned char **dstp)
+{
+    const unsigned char *in = src;
+    unsigned char *out = *dstp;
+    unsigned char lead, trail;
+
+    while (in[0] != '\0' && in[1] != '\0') {
+        lead = in[0];
+        trail = in[1];
+        in += 2;
+
+        if (lead == '(' && (trail == 'B' || trail == 'J')) {
+            *dstp = out;
+            return in;
+        }
+        if (!IS_JIS7(lead, trail))
+            return 0;
+
+        *out++ = UCH(0x80 | lead);
+        *out++ = UCH(0x80 | trail);
+    }
+    return 0;
+}
+
+/*
+ * Convert the NUL-terminated string 'jis' to EUC in 'euc' and return euc.
+ * The output is NUL-terminated.
+ *
+ * Handling of the input, byte by byte:
+ *  - 0x80 is dropped; 0xA0 is dropped once a two-byte designation or a
+ *    Shift-JIS pair has been seen.
+ *  - ESC, TO_2BCODE, then 'B' or '@' switches two-byte mode on; the three
+ *    bytes are consumed.
+ *  - ESC, TO_1BCODE, then 'B', 'J' or 'H' switches two-byte mode off; the
+ *    three bytes are consumed.  Other ESC sequences starting with
+ *    TO_2BCODE, TO_1BCODE or ',' also switch the mode off, but the bytes
+ *    are then processed like ordinary input.
+ *  - A pair accepted by IS_SJIS() is converted with SJIS_TO_EUC1().
+ *  - In two-byte mode, a pair accepted by IS_JIS7() is written with the
+ *    high bit set on both bytes; a newline switches the mode off.
+ *  - Outside two-byte mode, SI and SO are dropped as long as no byte with
+ *    the high bit has been seen; everything else is copied.
+ *
+ * NOTE(review): the size of 'euc' is not checked here; the callers visible
+ * in this file allocate strlen(jis) + 1 bytes.
+ */
+unsigned char *TO_EUC(const unsigned char *jis,
+ unsigned char *euc)
+{
+ const unsigned char *s;
+ unsigned char c, jis_stat;
+ unsigned char *d;
+ int to1B, to2B;
+ int in_sjis = 0;
+ /* call counter; incremented below but not read in the code visible here */
+ static int nje;
+ int n8bits;
+ int is_JIS;
+
+ nje++;
+ n8bits = 0;
+ s = jis;
+ d = euc;
+ /* 0 = single-byte mode, 0x80 = two-byte mode (also the bit OR'ed in) */
+ jis_stat = 0;
+ to2B = TO_2BCODE;
+ to1B = TO_1BCODE;
+ in_sjis = IS_SJIS_STR(jis);
+ is_JIS = 0;
+
+ /* s already points at the byte after c inside the loop body */
+ while ((c = *s++) != '\0') {
+ if (c == 0x80)
+ continue; /* ignore it */
+ if (c == 0xA0 && is_JIS)
+ continue; /* ignore Non-breaking space */
+
+ /* designation without its ESC: only attempted when repair_JIS is set */
+ if (c == to2B && jis_stat == 0 && repair_JIS) {
+ if (*s == 'B' || *s == '@') {
+ const unsigned char *ts;
+
+ if ((ts = repairJIStoEUC(s + 1, &d)) != NULL) {
+ s = ts;
+ continue;
+ }
+ }
+ }
+ if (c == CH_ESC) {
+ if (*s == to2B) {
+ if ((s[1] == 'B') || (s[1] == '@')) {
+ jis_stat = 0x80;
+ s += 2;
+ is_JIS++;
+ continue;
+ }
+ jis_stat = 0;
+ } else if (*s == to1B) {
+ jis_stat = 0;
+ if ((s[1] == 'B') || (s[1] == 'J') || (s[1] == 'H')) {
+ s += 2;
+ continue;
+ }
+ } else if (*s == ',') { /* MULE */
+ jis_stat = 0;
+ }
+ }
+ if (c & 0x80)
+ n8bits++;
+
+ /* in_sjis is handed to the macro as an lvalue */
+ if (IS_SJIS(c, *s, in_sjis)) {
+ SJIS_TO_EUC1(c, *s, d);
+ d += 2;
+ s++;
+ is_JIS++;
+ } else if (jis_stat) {
+ if (c <= 0x20 || 0x7F <= c) {
+ *d++ = c;
+ if (c == '\n')
+ jis_stat = 0;
+ } else {
+ if (IS_JIS7(c, *s)) {
+ *d++ = jis_stat | c;
+ *d++ = jis_stat | *s++;
+ } else
+ *d++ = c;
+ }
+ } else {
+ /* SI/SO are swallowed until the first high-bit byte has been seen */
+ if (n8bits == 0 && (c == SI || c == SO)) {
+ } else {
+ *d++ = c;
+ }
+ }
+ }
+ *d = 0;
+ return euc;
+}
+
+/* True for values at or below 0x20 and for 0x7F. */
+#define non94(ch) ((ch) <= 0x20 || (ch) == 0x7F)
+
+/*
+ * Return 1 if every byte with the high bit set in the NUL-terminated
+ * string 'euc' is the first of a pair whose second byte also has the high
+ * bit set and whose low seven bits both pass the non94() test as "not
+ * excluded"; return 0 at the first pair that fails.  Bytes without the
+ * high bit are skipped.
+ */
+static int is_EUC_JP(unsigned char *euc)
+{
+    unsigned char *cp = euc;
+    int lead, trail;
+
+    while ((lead = *cp) != '\0') {
+        if ((lead & 0x80) == 0) {
+            cp++;
+            continue;
+        }
+
+        trail = cp[1] & 0xFF;
+        if (!(trail & 0x80)) {
+            /* sv1log("NOT_EUC1[%x][%x]\n",lead,trail); */
+            return 0;
+        }
+        if (non94(lead & 0x7F) || non94(trail & 0x7F)) {
+            /* sv1log("NOT_EUC2[%x][%x]\n",lead,trail); */
+            return 0;
+        }
+        cp += 2;
+    }
+    return 1;
+}
+
+/*
+ * Convert the NUL-terminated string 'arg' to Shift-JIS in 'sjis'.
+ *
+ * The input is first converted to EUC in a temporary buffer of
+ * strlen(arg) + 1 bytes.  If that result passes is_EUC_JP() it is converted
+ * to Shift-JIS; otherwise 'arg' is copied to 'sjis' unchanged.
+ *
+ * The allocation is checked in every build configuration (it used to be
+ * checked only with CJK_EX defined), so a failed allocation ends in
+ * outofmem() rather than in a write through a null pointer in TO_EUC().
+ *
+ * NOTE(review): the size of 'sjis' is not checked here; the caller must
+ * provide at least strlen(arg) + 1 bytes.
+ */
+void TO_SJIS(const unsigned char *arg,
+             unsigned char *sjis)
+{
+    unsigned char *euc;
+
+    euc = typeMallocn(unsigned char, strlen((const char *) arg) + 1);
+
+    if (!euc)
+        outofmem(__FILE__, "TO_SJIS");
+
+    TO_EUC(arg, euc);
+    if (is_EUC_JP(euc))
+        EUC_TO_SJIS(euc, sjis);
+    else
+        strcpy((char *) sjis, (const char *) arg);
+    free(euc);
+}
+
+/*
+ * Convert the NUL-terminated string 'arg' to JIS in 'jis'.
+ *
+ * An empty input yields an empty output without allocating.  Otherwise the
+ * input is converted to EUC in a temporary buffer of strlen(arg) + 1 bytes,
+ * and that result is converted by EUC_TO_JIS() with TO_KANJI and TO_ASCII
+ * as the strings that open and close two-byte runs.
+ *
+ * The allocation is checked in every build configuration (it used to be
+ * checked only with CJK_EX defined), so a failed allocation ends in
+ * outofmem() rather than in a write through a null pointer in TO_EUC().
+ *
+ * NOTE(review): the size of 'jis' is not checked here; the caller must
+ * allow for the inserted TO_KANJI/TO_ASCII strings.
+ */
+void TO_JIS(const unsigned char *arg,
+            unsigned char *jis)
+{
+    unsigned char *euc;
+
+    if (arg[0] == 0) {
+        jis[0] = 0;
+        return;
+    }
+    euc = typeMallocn(unsigned char, strlen((const char *) arg) + 1);
+
+    if (!euc)
+        outofmem(__FILE__, "TO_JIS");
+
+    TO_EUC(arg, euc);
+    EUC_TO_JIS(euc, jis, TO_KANJI, TO_ASCII);
+
+    free(euc);
+}