diff options
Diffstat (limited to 'src/HTML.h')
-rw-r--r-- | src/HTML.h | 283 |
1 files changed, 283 insertions, 0 deletions
diff --git a/src/HTML.h b/src/HTML.h new file mode 100644 index 0000000..6e5ebc3 --- /dev/null +++ b/src/HTML.h @@ -0,0 +1,283 @@ +/* + * $LynxId: HTML.h,v 1.36 2022/07/22 20:22:13 tom Exp $ + * + * HTML to rich text converter for libwww + * + * THE HTML TO RTF OBJECT CONVERTER + * + * This interprets the HTML semantics. + */ +#ifndef HTML_H +#define HTML_H + +#ifndef HTUTILS_H +#include <HTUtils.h> +#endif /* HTUTILS_H */ + +#include <UCDefs.h> +#include <UCAux.h> +#include <HTAnchor.h> +#include <HTMLDTD.h> + +#ifdef __cplusplus +extern "C" { +#endif +/* #define ATTR_CS_IN (me->T.output_utf8 ? me->UCLYhndl : 0) */ +#define ATTR_CS_IN me->tag_charset +#define TRANSLATE_AND_UNESCAPE_ENTITIES(s, p, h) \ + LYUCTranslateHTMLString(s, ATTR_CS_IN, current_char_set, YES, p, h, st_HTML) +#define TRANSLATE_AND_UNESCAPE_ENTITIES5(s,cs_from,cs_to,p,h) \ + LYUCTranslateHTMLString(s, cs_from, cs_to, YES, p, h, st_HTML) +#define TRANSLATE_AND_UNESCAPE_ENTITIES6(s,cs_from,cs_to,spcls,p,h) \ + LYUCTranslateHTMLString(s, cs_from, cs_to, spcls, p, h, st_HTML) +#define TRANSLATE_HTML(s,p,h) \ + LYUCFullyTranslateString(s, me->UCLYhndl, current_char_set, NO, YES, p, h, NO, st_HTML) +#define TRANSLATE_HTML5(s,cs_from,cs_to,p,h) \ + LYUCFullyTranslateString(s, cs_from, cs_to, NO, YES, p, h, NO, st_HTML) +#define TRANSLATE_HTML7(s,cs_from,cs_to,spcls,p,h,Back) \ + LYUCFullyTranslateString(s, cs_from, cs_to, NO, spcls, p, h, Back, st_HTML) +/* + * Strings from attributes which should be converted to some kind of "standard" + * representation (character encoding), was Latin-1, esp. URLs (incl. + * #fragments) and HTML NAME and ID stuff. + */ +#define TRANSLATE_AND_UNESCAPE_TO_STD(s) \ + LYUCTranslateHTMLString(s, ATTR_CS_IN, ATTR_CS_IN, NO, NO, YES, st_URL) +#define UNESCAPE_FIELDNAME_TO_STD(s) \ + LYUCTranslateHTMLString(s, ATTR_CS_IN, ATTR_CS_IN, NO, NO, YES, st_HTML) + extern const HTStructuredClass HTMLPresentation; + +#ifdef Lynx_HTML_Handler +/* + * This section is semi-private to HTML.c and it's helper modules. - FM + * -------------------------------------------------------------------- + */ + + typedef struct _stack_element { + HTStyle *style; + int tag_number; + } stack_element; + +/* HTML Object + * ----------- + */ +#define MAX_NESTING 800 /* Should be checked by parser */ + + struct _HTStructured { + const HTStructuredClass *isa; + HTParentAnchor *node_anchor; + HText *text; + + HTStream *target; /* Output stream */ + HTStreamClass targetClass; /* Output routines */ + + HTChildAnchor *CurrentA; /* current HTML_A anchor */ + int CurrentANum; /* current HTML_A number */ + char *base_href; /* current HTML_BASE href */ + char *map_address; /* current HTML_MAP address */ + + HTChunk title; /* Grow by 128 */ + HTChunk object; /* Grow by 128 */ + BOOL object_started; + BOOL object_declare; + BOOL object_shapes; + BOOL object_ismap; + char *object_usemap; + char *object_id; + char *object_title; + char *object_data; + char *object_type; + char *object_classid; + char *object_codebase; + char *object_codetype; + char *object_name; + int objects_mixed_open, objects_figged_open; + HTChunk option; /* Grow by 128 */ + BOOL first_option; /* First OPTION in SELECT? */ + char *LastOptionValue; + BOOL LastOptionChecked; + BOOL select_disabled; + HTChunk textarea; /* Grow by 128 */ + char *textarea_name; + int textarea_name_cs; + char *textarea_accept_cs; + int textarea_cols; + int textarea_rows; + int textarea_disabled; + int textarea_readonly; + char *textarea_id; + HTChunk math; /* Grow by 128 */ + HTChunk style_block; /* Grow by 128 */ + HTChunk script; /* Grow by 128 */ + + /* + * Used for nested lists. - FM + */ + int List_Nesting_Level; /* counter for list nesting level */ + int OL_Counter[12]; /* counter for ordered lists */ + char OL_Type[12]; /* types for ordered lists */ + int Last_OL_Count; /* last count in ordered lists */ + char Last_OL_Type; /* last type in ordered lists */ + + int Division_Level; + short DivisionAlignments[MAX_NESTING]; + int Underline_Level; + int Quote_Level; + + BOOL UsePlainSpace; + BOOL HiddenValue; + int lastraw; + + const char *comment_start; /* for literate programming */ + const char *comment_end; + + HTTag *current_tag; + BOOL style_change; + HTStyle *new_style; + HTStyle *old_style; + int current_default_alignment; + BOOL in_word; /* Have just had a non-white char */ + stack_element stack[MAX_NESTING]; + stack_element *sp; /* Style stack pointer */ + BOOL stack_overrun; /* Was MAX_NESTING exceeded? */ + int skip_stack; /* flag to skip next style stack operation */ + + /* + * Track if we are in an anchor, paragraph, address, base, etc. + */ + BOOL inA; + BOOL inAPPLET; + BOOL inAPPLETwithP; + BOOL inBadHREF; + BOOL inBadHTML; + BOOL inBASE; + BOOL inBoldA; + BOOL inBoldH; + BOOL inCAPTION; + BOOL inCREDIT; + BOOL inFIG; + BOOL inFIGwithP; + BOOL inFONT; + BOOL inFORM; + BOOL inLABEL; + BOOL inP; + BOOL inPRE; + BOOL inSELECT; + BOOL inTABLE; + BOOL inTEXTAREA; + BOOL inUnderline; + + BOOL needBoldH; + + char *xinclude; /* if no include strin address passed */ + /* + * UCI and UCLYhndl give the UCInfo and charset registered for the HTML + * parser in the node_anchor's UCStages structure. It indicates what is + * fed to the HTML parser as the stream of character data (not necessarily + * tags and attributes). It should currently always be set to be the same + * as UCI and UCLhndl for the HTEXT stage in the node_anchor's UCStages + * structure, since the HTML parser sends its input character data to the + * output without further charset translation. + */ + LYUCcharset *UCI; + int UCLYhndl; + /* + * inUCI and inUCLYhndl indicate the UCInfo and charset which the HTML + * parser treats at the input charset. It is normally set to the UCI and + * UCLhndl for the SGML parser in the node_anchor's UCStages structure + * (which may be a dummy, based on the MIME parser's UCI and UCLhndl in + * that structure, when we are handling a local file or non-http(s) + * gateway). It could be changed temporarily by the HTML parser, for + * conversions of attribute strings, but should be reset once done. - FM + */ + LYUCcharset *inUCI; + int inUCLYhndl; + /* + * outUCI and outUCLYhndl indicate the UCInfo and charset which the HTML + * parser treats as the output charset. It is normally set to its own UCI + * and UCLhndl. It could be changed for conversions of attribute strings, + * but should be reset once done. - FM + */ + LYUCcharset *outUCI; + int outUCLYhndl; + /* + * T holds the transformation rules for conversions of strings between the + * input and output charsets by the HTML parser. - FM + */ + UCTransParams T; + + int tag_charset; /* charset for attribute values etc. */ + }; + + extern HTStyle *LYstyles(int style_number); + extern BOOL LYBadHTML(HTStructured * me); + extern void LYShowBadHTML(const char *s); + +/* + * Semi-Private functions. - FM + */ + extern void HTML_put_character(HTStructured * me, int c); + extern void HTML_put_string(HTStructured * me, const char *s); + extern void HTML_write(HTStructured * me, const char *s, int l); + extern int HTML_put_entity(HTStructured * me, int entity_number); + extern void actually_set_style(HTStructured * me); + +/* Style buffering avoids dummy paragraph begin/ends. +*/ +#define UPDATE_STYLE if (me->style_change) { actually_set_style(me); } +#endif /* Lynx_HTML_Handler */ + + extern void strtolower(char *i); + +/* P U B L I C +*/ + +/* + * HTConverter to present HTML + */ + extern HTStream *HTMLToPlain(HTPresentation *pres, + HTParentAnchor *anchor, + HTStream *sink); + + extern HTStream *HTMLParsedPresent(HTPresentation *pres, + HTParentAnchor *anchor, + HTStream *sink); + + extern HTStream *HTMLToC(HTPresentation *pres, + HTParentAnchor *anchor, + HTStream *sink); + + extern HTStream *HTMLPresent(HTPresentation *pres, + HTParentAnchor *anchor, + HTStream *sink); + + extern HTStream *XHTMLPresent(HTPresentation *pres, + HTParentAnchor *anchor, + HTStream *sink); + + extern HTStructured *HTML_new(HTParentAnchor *anchor, + HTFormat format_out, + HTStream *target); + +/* + * Record error message as a hypertext object. + * + * The error message should be marked as an error so that it can be reloaded + * later. This implementation just throws up an error message and leaves the + * document unloaded. + * + * On entry, + * sink is a stream to the output device if any + * number is the HTTP error number + * message is the human readable message. + * On exit, + * a return code like HT_LOADED if object exists else 60; 0 + */ + extern int HTLoadError(HTStream *sink, + int number, + const char *message); + +#ifdef __cplusplus +} +#endif +#endif /* HTML_H */ |