diff options
Diffstat (limited to 'strings/xml.c')
-rw-r--r-- | strings/xml.c | 574 |
1 files changed, 574 insertions, 0 deletions
diff --git a/strings/xml.c b/strings/xml.c new file mode 100644 index 00000000..d16df34b --- /dev/null +++ b/strings/xml.c @@ -0,0 +1,574 @@ +/* Copyright (c) 2003, 2011, Oracle and/or its affiliates. + Copyright (c) 2011 Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "strings_def.h" +#include "m_string.h" +#include "my_xml.h" +#include "my_sys.h" + + +#define MY_XML_UNKNOWN 'U' +#define MY_XML_EOF 'E' +#define MY_XML_STRING 'S' +#define MY_XML_IDENT 'I' +#define MY_XML_EQ '=' +#define MY_XML_LT '<' +#define MY_XML_GT '>' +#define MY_XML_SLASH '/' +#define MY_XML_COMMENT 'C' +#define MY_XML_TEXT 'T' +#define MY_XML_QUESTION '?' +#define MY_XML_EXCLAM '!' +#define MY_XML_CDATA 'D' + +typedef struct xml_attr_st +{ + const char *beg; + const char *end; +} MY_XML_ATTR; + + +/* + XML ctype: +*/ +#define MY_XML_ID0 0x01 /* Identifier initial character */ +#define MY_XML_ID1 0x02 /* Identifier medial character */ +#define MY_XML_SPC 0x08 /* Spacing character */ + + +/* + http://www.w3.org/TR/REC-xml/ + [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | + CombiningChar | Extender + [5] Name ::= (Letter | '_' | ':') (NameChar)* +*/ + +static char my_xml_ctype[256]= +{ +/*00*/ 0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0, +/*10*/ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +/*20*/ 8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0, /* !"#$%&'()*+,-./ */ +/*30*/ 2,2,2,2,2,2,2,2,2,2,3,0,0,0,0,0, /* 0123456789:;<=>? */ +/*40*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* @ABCDEFGHIJKLMNO */ +/*50*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,3, /* PQRSTUVWXYZ[\]^_ */ +/*60*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* `abcdefghijklmno */ +/*70*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0, /* pqrstuvwxyz{|}~ */ +/*80*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*90*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*A0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*B0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*C0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*D0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*E0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +/*F0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 +}; + +#define my_xml_is_space(c) (my_xml_ctype[(uchar) (c)] & MY_XML_SPC) +#define my_xml_is_id0(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID0) +#define my_xml_is_id1(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID1) + + +static const char *lex2str(int lex) +{ + switch(lex) + { + case MY_XML_EOF: return "END-OF-INPUT"; + case MY_XML_STRING: return "STRING"; + case MY_XML_IDENT: return "IDENT"; + case MY_XML_CDATA: return "CDATA"; + case MY_XML_EQ: return "'='"; + case MY_XML_LT: return "'<'"; + case MY_XML_GT: return "'>'"; + case MY_XML_SLASH: return "'/'"; + case MY_XML_COMMENT: return "COMMENT"; + case MY_XML_TEXT: return "TEXT"; + case MY_XML_QUESTION: return "'?'"; + case MY_XML_EXCLAM: return "'!'"; + } + return "unknown token"; +} + +static void my_xml_norm_text(MY_XML_ATTR *a) +{ + for ( ; (a->beg < a->end) && my_xml_is_space(a->beg[0]) ; a->beg++ ); + for ( ; (a->beg < a->end) && my_xml_is_space(a->end[-1]) ; a->end-- ); +} + + +static inline my_bool +my_xml_parser_prefix_cmp(MY_XML_PARSER *p, const char *s, size_t slen) +{ + return (p->cur + slen > p->end) || memcmp(p->cur, s, slen); +} + + +static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a) +{ + int lex; + + for (; ( p->cur < p->end) && my_xml_is_space(p->cur[0]) ; p->cur++); + + if (p->cur >= p->end) + { + a->beg=p->end; + a->end=p->end; + lex=MY_XML_EOF; + goto ret; + } + + a->beg=p->cur; + a->end=p->cur; + + if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<!--"))) + { + for (; p->cur < p->end; p->cur++) + { + if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("-->"))) + { + p->cur+= 3; + break; + } + } + a->end=p->cur; + lex=MY_XML_COMMENT; + } + else if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<![CDATA["))) + { + p->cur+= 9; + for (; p->cur < p->end - 2 ; p->cur++) + { + if (p->cur[0] == ']' && p->cur[1] == ']' && p->cur[2] == '>') + { + p->cur+= 3; + a->end= p->cur; + break; + } + } + lex= MY_XML_CDATA; + } + else if (strchr("?=/<>!",p->cur[0])) + { + p->cur++; + a->end=p->cur; + lex=a->beg[0]; + } + else if ( (p->cur[0] == '"') || (p->cur[0] == '\'') ) + { + /* + "string" or 'string' found. + Scan until the closing quote/doublequote, or until the END-OF-INPUT. + */ + p->cur++; + for (; ( p->cur < p->end ) && (p->cur[0] != a->beg[0]); p->cur++) + {} + a->end=p->cur; + if (p->cur < p->end) /* Closing quote or doublequote has been found */ + p->cur++; + a->beg++; + if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION)) + my_xml_norm_text(a); + lex=MY_XML_STRING; + } + else if (my_xml_is_id0(p->cur[0])) + { + p->cur++; + while (p->cur < p->end && my_xml_is_id1(p->cur[0])) + p->cur++; + a->end=p->cur; + my_xml_norm_text(a); + lex=MY_XML_IDENT; + } + else + lex= MY_XML_UNKNOWN; + +#if 0 + printf("LEX=%s[%d]\n",lex2str(lex),a->end-a->beg); +#endif + +ret: + return lex; +} + + +static int my_xml_value(MY_XML_PARSER *st, const char *str, size_t len) +{ + return (st->value) ? (st->value)(st,str,len) : MY_XML_OK; +} + + +/** + Ensure the attr buffer is wide enough to hold the new value + + Expand and/or allocate dynamic buffer as needed to hold the concatenated + path and the terminating zero. + + @attr st the parser instance + @attr len the length of the attribute to be added + @return state + @retval 1 failed + @retval 0 success +*/ +static int my_xml_attr_ensure_space(MY_XML_PARSER *st, size_t len) +{ + size_t ofs= st->attr.end - st->attr.start; + len++; // Add terminating zero. + if (ofs + len > st->attr.buffer_size) + { + st->attr.buffer_size= (SIZE_T_MAX - len) / 2 > st->attr.buffer_size ? + st->attr.buffer_size * 2 + len : SIZE_T_MAX; + + if (!st->attr.buffer) + { + st->attr.buffer= (char *) my_malloc(PSI_INSTRUMENT_ME, st->attr.buffer_size, MYF(0)); + if (st->attr.buffer) + memcpy(st->attr.buffer, st->attr.static_buffer, ofs + 1 /*term. zero */); + } + else + st->attr.buffer= (char *) my_realloc(PSI_INSTRUMENT_ME, st->attr.buffer, + st->attr.buffer_size, MYF(0)); + st->attr.start= st->attr.buffer; + st->attr.end= st->attr.start + ofs; + + return st->attr.buffer ? MY_XML_OK : MY_XML_ERROR; + } + return MY_XML_OK; +} + + +/** rewind the attr buffer to initial state */ +static void my_xml_attr_rewind(MY_XML_PARSER *p) +{ + /* keep the buffer already allocated */ + p->attr.end= p->attr.start; +} + + +static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len) +{ + if (my_xml_attr_ensure_space(st, len + 1 /* the separator char */)) + return MY_XML_ERROR; + + if (st->attr.end > st->attr.start) + { + st->attr.end[0]= '/'; + st->attr.end++; + } + memcpy(st->attr.end, str, len); + st->attr.end+= len; + st->attr.end[0]= '\0'; + if (st->flags & MY_XML_FLAG_RELATIVE_NAMES) + return st->enter ? st->enter(st, str, len) : MY_XML_OK; + else + return st->enter ? + st->enter(st, st->attr.start, st->attr.end - st->attr.start) : MY_XML_OK; +} + + +static void mstr(char *s,const char *src,size_t l1, size_t l2) +{ + l1 = l1<l2 ? l1 : l2; + memcpy(s,src,l1); + s[l1]='\0'; +} + + +static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen) +{ + char *e, *tag; + size_t glen; + char s[32]; + char g[32]; + int rc; + + /* Find previous '/' or beginning */ + for (e= p->attr.end; (e > p->attr.start) && (e[0] != '/') ; e--); + glen= (size_t) ((e[0] == '/') ? (p->attr.end - e - 1) : p->attr.end - e); + tag= e[0] == '/' ? e + 1 : e; + + if (str && (slen != glen || memcmp(str, tag, slen))) + { + mstr(s,str,sizeof(s)-1,slen); + if (glen) + { + mstr(g, tag, sizeof(g)-1, glen); + sprintf(p->errstr,"'</%s>' unexpected ('</%s>' wanted)",s,g); + } + else + sprintf(p->errstr,"'</%s>' unexpected (END-OF-INPUT wanted)", s); + return MY_XML_ERROR; + } + + if (p->flags & MY_XML_FLAG_RELATIVE_NAMES) + rc= p->leave_xml ? p->leave_xml(p, str, slen) : MY_XML_OK; + else + rc= (p->leave_xml ? + p->leave_xml(p, p->attr.start, p->attr.end - p->attr.start) : + MY_XML_OK); + + *e='\0'; + p->attr.end= e; + + return rc; +} + + +int my_xml_parse(MY_XML_PARSER *p,const char *str, size_t len) +{ + + my_xml_attr_rewind(p); + + p->beg=str; + p->cur=str; + p->end=str+len; + + while ( p->cur < p->end ) + { + MY_XML_ATTR a; + if (p->cur[0] == '<') + { + int lex; + int question=0; + int exclam=0; + + lex=my_xml_scan(p,&a); + + if (MY_XML_COMMENT == lex) + continue; + + if (lex == MY_XML_CDATA) + { + a.beg+= 9; + a.end-= 3; + my_xml_value(p, a.beg, (size_t) (a.end-a.beg)); + continue; + } + + lex=my_xml_scan(p,&a); + + if (MY_XML_SLASH == lex) + { + if (MY_XML_IDENT != (lex=my_xml_scan(p,&a))) + { + sprintf(p->errstr,"%s unexpected (ident wanted)",lex2str(lex)); + return MY_XML_ERROR; + } + if (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))) + return MY_XML_ERROR; + lex=my_xml_scan(p,&a); + goto gt; + } + + if (MY_XML_EXCLAM == lex) + { + lex=my_xml_scan(p,&a); + exclam=1; + } + else if (MY_XML_QUESTION == lex) + { + lex=my_xml_scan(p,&a); + question=1; + } + + if (MY_XML_IDENT == lex) + { + p->current_node_type= MY_XML_NODE_TAG; + if (MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) + return MY_XML_ERROR; + } + else + { + sprintf(p->errstr,"%s unexpected (ident or '/' wanted)", + lex2str(lex)); + return MY_XML_ERROR; + } + + while ((MY_XML_IDENT == (lex=my_xml_scan(p,&a))) || + ((MY_XML_STRING == lex && exclam))) + { + MY_XML_ATTR b; + if (MY_XML_EQ == (lex=my_xml_scan(p,&b))) + { + lex=my_xml_scan(p,&b); + if ( (lex == MY_XML_IDENT) || (lex == MY_XML_STRING) ) + { + p->current_node_type= MY_XML_NODE_ATTR; + if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) || + (MY_XML_OK != my_xml_value(p,b.beg,(size_t) (b.end-b.beg))) || + (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))) + return MY_XML_ERROR; + } + else + { + sprintf(p->errstr,"%s unexpected (ident or string wanted)", + lex2str(lex)); + return MY_XML_ERROR; + } + } + else if (MY_XML_IDENT == lex) + { + p->current_node_type= MY_XML_NODE_ATTR; + if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) || + (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))) + return MY_XML_ERROR; + } + else if ((MY_XML_STRING == lex) && exclam) + { + /* + We are in <!DOCTYPE>, e.g. + <!DOCTYPE name SYSTEM "SystemLiteral"> + <!DOCTYPE name PUBLIC "PublidLiteral" "SystemLiteral"> + Just skip "SystemLiteral" and "PublicidLiteral" + */ + } + else + break; + } + + if (lex == MY_XML_SLASH) + { + if (MY_XML_OK != my_xml_leave(p,NULL,0)) + return MY_XML_ERROR; + lex=my_xml_scan(p,&a); + } + +gt: + if (question) + { + if (lex != MY_XML_QUESTION) + { + sprintf(p->errstr,"%s unexpected ('?' wanted)",lex2str(lex)); + return MY_XML_ERROR; + } + if (MY_XML_OK != my_xml_leave(p,NULL,0)) + return MY_XML_ERROR; + lex=my_xml_scan(p,&a); + } + + if (exclam) + { + if (MY_XML_OK != my_xml_leave(p,NULL,0)) + return MY_XML_ERROR; + } + + if (lex != MY_XML_GT) + { + sprintf(p->errstr,"%s unexpected ('>' wanted)",lex2str(lex)); + return MY_XML_ERROR; + } + } + else + { + a.beg=p->cur; + for ( ; (p->cur < p->end) && (p->cur[0] != '<') ; p->cur++); + a.end=p->cur; + + if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION)) + my_xml_norm_text(&a); + if (a.beg != a.end) + { + my_xml_value(p,a.beg,(size_t) (a.end-a.beg)); + } + } + } + + if (p->attr.start[0]) + { + sprintf(p->errstr,"unexpected END-OF-INPUT"); + return MY_XML_ERROR; + } + return MY_XML_OK; +} + + +void my_xml_parser_create(MY_XML_PARSER *p) +{ + memset(p, 0, sizeof(p[0])); + /* + Use static buffer while it's sufficient. + */ + p->attr.start= p->attr.end= p->attr.static_buffer; + p->attr.buffer_size= sizeof(p->attr.static_buffer); +} + + +void my_xml_parser_free(MY_XML_PARSER *p) +{ + if (p->attr.buffer) + { + my_free(p->attr.buffer); + p->attr.buffer= NULL; + } +} + + +void my_xml_set_value_handler(MY_XML_PARSER *p, + int (*action)(MY_XML_PARSER *p, const char *s, + size_t l)) +{ + p->value=action; +} + +void my_xml_set_enter_handler(MY_XML_PARSER *p, + int (*action)(MY_XML_PARSER *p, const char *s, + size_t l)) +{ + p->enter=action; +} + + +void my_xml_set_leave_handler(MY_XML_PARSER *p, + int (*action)(MY_XML_PARSER *p, const char *s, + size_t l)) +{ + p->leave_xml=action; +} + + +void my_xml_set_user_data(MY_XML_PARSER *p, void *user_data) +{ + p->user_data=user_data; +} + + +const char *my_xml_error_string(MY_XML_PARSER *p) +{ + return p->errstr; +} + + +size_t my_xml_error_pos(MY_XML_PARSER *p) +{ + const char *beg=p->beg; + const char *s; + for ( s=p->beg ; s<p->cur; s++) + { + if (s[0] == '\n') + beg=s; + } + return (size_t) (p->cur-beg); +} + +uint my_xml_error_lineno(MY_XML_PARSER *p) +{ + uint res=0; + const char *s; + for (s=p->beg ; s<p->cur; s++) + { + if (s[0] == '\n') + res++; + } + return res; +} |