diff options
Diffstat (limited to '')
-rw-r--r-- | WWW/Library/Implementation/SGML.h | 286 |
1 files changed, 286 insertions, 0 deletions
diff --git a/WWW/Library/Implementation/SGML.h b/WWW/Library/Implementation/SGML.h new file mode 100644 index 0000000..9fccdda --- /dev/null +++ b/WWW/Library/Implementation/SGML.h @@ -0,0 +1,286 @@ +/* + * $LynxId: SGML.h,v 1.46 2012/02/10 18:32:26 tom Exp $ + * SGML parse and stream definition for libwww + * SGML AND STRUCTURED STREAMS + * + * The SGML parser is a state machine. It is called for every character + * of the input stream. The DTD data structure contains pointers + * to functions which are called to implement the actual effect of the + * text read. When these functions are called, the attribute structures pointed to by the + * DTD are valid, and the function is passed a pointer to the current tag structure, and an + * "element stack" which represents the state of nesting within SGML elements. + * + * The following aspects are from Dan Connolly's suggestions: Binary search, + * Structured object scheme basically, SGML content enum type. + * + * (c) Copyright CERN 1991 - See Copyright.html + * + */ +#ifndef SGML_H +#define SGML_H + +#include <HTStream.h> +#include <HTAnchor.h> +#include <LYJustify.h> + +#ifdef __cplusplus +extern "C" { +#endif +/* + * + * SGML content types + * + */ typedef enum { + SGML_EMPTY, /* No content. */ + SGML_LITTERAL, /* Literal character data. Recognize exact close tag only. + Old www server compatibility only! Not SGML */ + SGML_CDATA, /* Character data. Recognize </ only. + (But we treat it just as SGML_LITTERAL.) */ + SGML_SCRIPT, /* Like CDATA, but allow it to be a comment */ + SGML_RCDATA, /* Replaceable character data. Should recognize </ and &ref; + (but we treat it like SGML_MIXED for old times' sake). */ + SGML_MIXED, /* Elements and parsed character data. + Recognize all markup. */ + SGML_ELEMENT, /* Any data found should be regarded as an error. + (But we treat it just like SGML_MIXED.) */ + SGML_PCDATA /* Should contain no elements but &ref; is parsed. + (We treat it like SGML_CDATA wrt. contained tags + i.e. pass them on literally, i.e. like we should + treat SGML_RCDATA) (added by KW). */ + } SGMLContent; + + typedef struct { + const char *name; /* The name of the attribute */ +#ifdef USE_PRETTYSRC + char type; /* code of the type of the attribute. Code + values are in HTMLDTD.h */ +#endif + } attr; + + typedef const attr *AttrList; + + typedef struct { + const char *name; + AttrList list; + } AttrType; + + typedef int TagClass; + + /* textflow */ +#define Tgc_FONTlike 0x00001 /* S,STRIKE,I,B,TT,U,BIG,SMALL,STYLE,BLINK;BR,TAB */ +#define Tgc_EMlike 0x00002 /* EM,STRONG,DFN,CODE,SAMP,KBD,VAR,CITE,Q,INS,DEL,SPAN,.. */ +#define Tgc_MATHlike 0x00004 /* SUB,SUP,MATH,COMMENT */ +#define Tgc_Alike 0x00008 /* A */ +#define Tgc_formula 0x00010 /* not used until math is supported better... */ + /* used for special structures: forms, tables,... */ +#define Tgc_TRlike 0x00020 /* TR and similar */ +#define Tgc_SELECTlike 0x00040 /* SELECT,INPUT,TEXTAREA(,...) */ + /* structure */ +#define Tgc_FORMlike 0x00080 /* FORM itself */ +#define Tgc_Plike 0x00100 /* P,H1..H6,... structures containing text or + insertion but not other structures */ +#define Tgc_DIVlike 0x00200 /* ADDRESS,FIG,BDO,NOTE,FN,DIV,CENTER;FIG + structures which can contain other structures */ +#define Tgc_LIlike 0x00400 /* LH,LI,DT,DD;TH,TD structure-like, only valid + within certain other structures */ +#define Tgc_ULlike 0x00800 /* UL,OL,DL,DIR,MENU;TABLE;XMP,LISTING + special in some way, cannot contain (parsed) + text directly */ + /* insertions */ +#define Tgc_BRlike 0x01000 /* BR,IMG,TAB allowed in any text */ +#define Tgc_APPLETlike 0x02000 /* APPLET,OBJECT,EMBED,SCRIPT;BUTTON */ +#define Tgc_HRlike 0x04000 /* HR,MARQUEE can contain all kinds of things + and/or are not allowed (?) in running text */ +#define Tgc_MAPlike 0x08000 /* MAP,AREA some specials that never contain + (directly or indirectly) other things than + special insertions */ +#define Tgc_outer 0x10000 /* HTML,FRAMESET,FRAME,PLAINTEXT; */ +#define Tgc_BODYlike 0x20000 /* BODY,BODYTEXT,NOFRAMES,TEXTFLOW; */ +#define Tgc_HEADstuff 0x40000 /* HEAD,BASE,STYLE,TITLE; */ + /* special relations */ +#define Tgc_same 0x80000 + +/* + * Groups for contains-data. + */ +#define Tgc_INLINElike (Tgc_Alike | Tgc_APPLETlike | Tgc_BRlike | Tgc_EMlike | Tgc_FONTlike | Tgc_SELECTlike) +#define Tgc_LISTlike (Tgc_LIlike | Tgc_ULlike) +#define Tgc_BLOCKlike (Tgc_DIVlike | Tgc_LISTlike) + +/* Some more properties of tags (or rather, elements) and rules how + to deal with them. - kw */ + typedef int TagFlags; + +#define Tgf_endO 0x00001 /* end tag can be Omitted */ +#define Tgf_startO 0x00002 /* start tag can be Omitted */ +#define Tgf_mafse 0x00004 /* Make Attribute-Free Start-tag End instead + (if found invalid) */ +#define Tgf_strict 0x00008 /* Ignore contained invalid elements, + don't pass them on; or other variant + handling for some content types */ +#define Tgf_nreie 0x00010 /* Not Really Empty If Empty, + used by color style code */ +#define Tgf_frecyc 0x00020 /* Pass element content on in a form that + allows recycling, i.e. don't translate to + output (display) character set yet (treat + content similar to attribute values) */ +#define Tgf_nolyspcl 0x00040 /* Don't generate lynx special characters + for soft hyphen and various spaces (nbsp, + ensp,..) */ + +/* A tag structure describes an SGML element. + * ----------------------------------------- + * + * + * name is the string which comes after the tag opener "<". + * + * attributes points to a zero-terminated array + * of attribute names. + * + * litteral determines how the SGML engine parses the characters + * within the element. If set, tag openers are ignored + * except for that which opens a matching closing tag. + * + */ + typedef struct _tag HTTag; + struct _tag { + const char *name; /* The name of the tag */ +#ifdef USE_COLOR_STYLE + unsigned name_len; /* The length of the name */ +#endif +#ifdef USE_JUSTIFY_ELTS + BOOL can_justify; /* justification allowed? */ +#endif + AttrList attributes; /* The list of acceptable attributes */ + int number_of_attributes; /* Number of possible attributes */ + const AttrType *attr_types; + SGMLContent contents; /* End only on end tag @@ */ + TagClass tagclass; + TagClass contains; /* which classes of elements this one can contain directly */ + TagClass icontains; /* which classes of elements this one can contain indirectly */ + TagClass contained; /* in which classes can this tag be contained ? */ + TagClass icontained; /* in which classes can this tag be indirectly contained ? */ + TagClass canclose; /* which classes of elements can this one close + if something looks wrong ? */ + TagFlags flags; + }; + +/* DTD Information + * --------------- + * + * Not the whole DTD, but all this parser uses of it. + */ + typedef struct { + HTTag *tags; /* Must be in strcmp order by name */ + int number_of_tags; + STRING2PTR entity_names; /* Must be in strcmp order by name */ + size_t number_of_entities; + /* "entity_names" table probably unused, + * see comments in HTMLDTD.c near the top + */ + } SGML_dtd; + +/* SGML context passed to parsers +*/ + typedef struct _HTSGMLContext *HTSGMLContext; /* Hidden */ + +/*__________________________________________________________________________ +*/ + +/* + +Structured Object definition + + A structured object is something which can reasonably be represented + in SGML. I'll rephrase that. A structured object is an ordered + tree-structured arrangement of data which is representable as text. + The SGML parser outputs to a Structured object. A Structured object + can output its contents to another Structured Object. It's a kind of + typed stream. The architecture is largely Dan Conolly's. Elements and + entities are passed to the sob by number, implying a knowledge of the + DTD. Knowledge of the SGML syntax is not here, though. + + Superclass: HTStream + + The creation methods will vary on the type of Structured Object. + Maybe the callerData is enough info to pass along. + + */ + typedef struct _HTStructured HTStructured; + + typedef struct _HTStructuredClass { + + const char *name; /* Just for diagnostics */ + + void (*_free) (HTStructured * me); + + void (*_abort) (HTStructured * me, HTError e); + + void (*put_character) (HTStructured * me, int ch); + + void (*put_string) (HTStructured * me, const char *str); + + void (*put_block) (HTStructured * me, const char *str, int len); + + /* HTStreamClass ends here */ + + int (*start_element) (HTStructured * me, int element_number, + const BOOL *attribute_present, + STRING2PTR attribute_value, + int charset, + char **include); + + int (*end_element) (HTStructured * me, int element_number, + char **include); + + int (*put_entity) (HTStructured * me, int entity_number); + + } HTStructuredClass; + +/* + Equivalents to the following functions possibly could be generalised + into additional HTStructuredClass members. For now they don't do + anything target-specific. - kw + */ + extern BOOLEAN LYCheckForCSI(HTParentAnchor *anchor, char **url); + extern void LYDoCSI(char *url, const char *comment, char **csi); + extern BOOLEAN LYCommentHacks(HTParentAnchor *anchor, const char *comment); + +/* + +Find a Tag by Name + + Returns a pointer to the tag within the DTD. + + */ + extern HTTag *SGMLFindTag(const SGML_dtd * dtd, + const char *string); + +/* + * Return the current offset within the file that SGML is parsing + */ + extern int SGML_offset(void); + +/* + +Create an SGML parser + + */ +/* + * On entry, + * dtd must point to a DTD structure as defined above + * callbacks must point to user routines. + * callData is returned in callbacks transparently. + * On exit, + * The default tag starter has been processed. + */ + extern HTStream *SGML_new(const SGML_dtd * dtd, + HTParentAnchor *anchor, + HTStructured * target); + + extern const HTStreamClass SGMLParser; + +#ifdef __cplusplus +} +#endif +#endif /* SGML_H */ |