summaryrefslogtreecommitdiffstats
path: root/strings/xml.c
diff options
context:
space:
mode:
Diffstat (limited to 'strings/xml.c')
-rw-r--r--strings/xml.c574
1 files changed, 574 insertions, 0 deletions
diff --git a/strings/xml.c b/strings/xml.c
new file mode 100644
index 00000000..7260ecad
--- /dev/null
+++ b/strings/xml.c
@@ -0,0 +1,574 @@
+/* Copyright (c) 2003, 2011, Oracle and/or its affiliates.
+ Copyright (c) 2011 Monty Program Ab
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+
+#include "strings_def.h"
+#include "m_string.h"
+#include "my_xml.h"
+#include "my_sys.h"
+
+
+/*
+  Token codes produced by my_xml_scan() and named by lex2str().
+  Single-character punctuation tokens ('=', '<', '>', '/', '?', '!') use
+  the character itself as their code, because the scanner returns the
+  first byte of such a token directly. The remaining codes are arbitrary
+  mnemonic letters.
+*/
+#define MY_XML_UNKNOWN 'U'
+#define MY_XML_EOF 'E'
+#define MY_XML_STRING 'S'
+#define MY_XML_IDENT 'I'
+#define MY_XML_EQ '='
+#define MY_XML_LT '<'
+#define MY_XML_GT '>'
+#define MY_XML_SLASH '/'
+#define MY_XML_COMMENT 'C'
+#define MY_XML_TEXT 'T'
+#define MY_XML_QUESTION '?'
+#define MY_XML_EXCLAM '!'
+#define MY_XML_CDATA 'D'
+
+/*
+  A token or text fragment expressed as a byte range inside the text
+  being parsed. The bytes are not copied and not zero-terminated; the
+  length is always computed as end - beg.
+*/
+typedef struct xml_attr_st
+{
+ const char *beg; /* first byte of the fragment */
+ const char *end; /* one past the last byte of the fragment */
+} MY_XML_ATTR;
+
+
+/*
+ XML ctype:
+ Bit flags stored in my_xml_ctype[]. A table value of 3 means
+ MY_XML_ID0 | MY_XML_ID1, i.e. the byte may both start and continue
+ an identifier.
+*/
+#define MY_XML_ID0 0x01 /* Identifier initial character */
+#define MY_XML_ID1 0x02 /* Identifier medial character */
+#define MY_XML_SPC 0x08 /* Spacing character */
+
+
+/*
+ http://www.w3.org/TR/REC-xml/
+ [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
+ CombiningChar | Extender
+ [5] Name ::= (Letter | '_' | ':') (NameChar)*
+*/
+
+/*
+  Per-byte classification table, indexed by the unsigned byte value.
+  Each entry is a combination of the MY_XML_ID0 (0x01), MY_XML_ID1 (0x02)
+  and MY_XML_SPC (0x08) flags:
+    8 - tab, line feed, carriage return and space
+    2 - '-', '.' and the digits (identifier continuation only)
+    3 - ':', '_', ASCII letters and every byte >= 0x80
+        (identifier start or continuation)
+  The table is only ever read, hence const.
+*/
+static const char my_xml_ctype[256]=
+{
+/*00*/ 0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0,
+/*10*/ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+/*20*/ 8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0, /* !"#$%&'()*+,-./ */
+/*30*/ 2,2,2,2,2,2,2,2,2,2,3,0,0,0,0,0, /* 0123456789:;<=>? */
+/*40*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* @ABCDEFGHIJKLMNO */
+/*50*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,3, /* PQRSTUVWXYZ[\]^_ */
+/*60*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* `abcdefghijklmno */
+/*70*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0, /* pqrstuvwxyz{|}~ */
+/*80*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*90*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*A0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*B0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*C0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*D0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*E0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/*F0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
+};
+
+/*
+  Character class predicates over my_xml_ctype[]. Each yields a non-zero
+  value (the flag bit, not necessarily 1) when the byte belongs to the
+  class. The (uchar) cast - presumably unsigned char, TODO confirm against
+  the project headers - keeps the table index in range for bytes >= 0x80
+  on platforms where plain char is signed.
+*/
+#define my_xml_is_space(c) (my_xml_ctype[(uchar) (c)] & MY_XML_SPC)
+#define my_xml_is_id0(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID0)
+#define my_xml_is_id1(c) (my_xml_ctype[(uchar) (c)] & MY_XML_ID1)
+
+
+/*
+  Translate a token code into the human readable name used in error
+  messages.
+
+  @param lex  token code (one of the MY_XML_xxx token constants)
+  @return     a static string; "unknown token" for any code that has no
+              name, which includes MY_XML_UNKNOWN
+*/
+static const char *lex2str(int lex)
+{
+  static const struct
+  {
+    int code;
+    const char *name;
+  } token_names[]=
+  {
+    { MY_XML_EOF,      "END-OF-INPUT" },
+    { MY_XML_STRING,   "STRING" },
+    { MY_XML_IDENT,    "IDENT" },
+    { MY_XML_CDATA,    "CDATA" },
+    { MY_XML_EQ,       "'='" },
+    { MY_XML_LT,       "'<'" },
+    { MY_XML_GT,       "'>'" },
+    { MY_XML_SLASH,    "'/'" },
+    { MY_XML_COMMENT,  "COMMENT" },
+    { MY_XML_TEXT,     "TEXT" },
+    { MY_XML_QUESTION, "'?'" },
+    { MY_XML_EXCLAM,   "'!'" }
+  };
+  size_t i;
+
+  for (i= 0; i < sizeof(token_names) / sizeof(token_names[0]); i++)
+  {
+    if (token_names[i].code == lex)
+      return token_names[i].name;
+  }
+  return "unknown token";
+}
+
+/*
+  Trim leading and trailing white space from a fragment in place by
+  moving its beg/end pointers towards each other. The underlying bytes
+  are not modified. A fragment made only of white space collapses to an
+  empty range at its original end.
+*/
+static void my_xml_norm_text(MY_XML_ATTR *a)
+{
+  /* Drop spacing characters from the front ... */
+  while (a->beg < a->end && my_xml_is_space(a->beg[0]))
+    a->beg++;
+
+  /* ... and then from the back. */
+  while (a->beg < a->end && my_xml_is_space(a->end[-1]))
+    a->end--;
+}
+
+
+/*
+  Check whether the unparsed input starts with the given byte sequence.
+
+  Note the memcmp-like polarity: the result is zero (false) when the
+  input at p->cur DOES start with s, and non-zero when it does not or
+  when fewer than slen bytes remain.
+
+  The remaining length is compared instead of computing p->cur + slen,
+  because forming a pointer beyond one-past-the-end of the input is
+  undefined behaviour.
+
+  @param p     parser instance; only p->cur and p->end are read
+  @param s     sequence to look for
+  @param slen  length of s in bytes
+*/
+static inline my_bool
+my_xml_parser_prefix_cmp(MY_XML_PARSER *p, const char *s, size_t slen)
+{
+  return ((size_t) (p->end - p->cur) < slen) || memcmp(p->cur, s, slen);
+}
+
+
+/*
+  Read the next token from the input.
+
+  Leading white space is skipped, then one token is recognised at p->cur
+  and p->cur is advanced past it. For MY_XML_UNKNOWN the position is NOT
+  advanced.
+
+  @param p  parser instance (p->cur is updated)
+  @param a  receives the byte range of the token. For MY_XML_STRING the
+            range excludes the quotes and, unless
+            MY_XML_FLAG_SKIP_TEXT_NORMALIZATION is set, surrounding
+            white space. For MY_XML_CDATA the range includes the
+            "<![CDATA[" and "]]>" delimiters, and stays empty
+            (end == beg) when no terminator is found.
+  @return   the token code
+*/
+static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
+{
+  int lex;
+
+  while (p->cur < p->end && my_xml_is_space(p->cur[0]))
+    p->cur++;
+
+  if (p->cur >= p->end)
+  {
+    a->beg= a->end= p->end;
+    return MY_XML_EOF;
+  }
+
+  a->beg= a->end= p->cur;
+
+  if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<!--")))
+  {
+    /* Comment: runs up to and including "-->", or to the end of input. */
+    while (p->cur < p->end)
+    {
+      if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("-->")))
+      {
+        p->cur+= 3;
+        break;
+      }
+      p->cur++;
+    }
+    a->end= p->cur;
+    lex= MY_XML_COMMENT;
+  }
+  else if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<![CDATA[")))
+  {
+    /* CDATA section: a->end is set only if the "]]>" terminator exists. */
+    p->cur+= 9;
+    while (p->cur < p->end - 2)
+    {
+      if (p->cur[0] == ']' && p->cur[1] == ']' && p->cur[2] == '>')
+      {
+        p->cur+= 3;
+        a->end= p->cur;
+        break;
+      }
+      p->cur++;
+    }
+    lex= MY_XML_CDATA;
+  }
+  else if (strchr("?=/<>!", p->cur[0]))
+  {
+    /* One-character punctuation token: the character is the token code. */
+    lex= p->cur[0];
+    p->cur++;
+    a->end= p->cur;
+  }
+  else if (p->cur[0] == '"' || p->cur[0] == '\'')
+  {
+    /*
+      Quoted string: scan to the matching quote character, or to the end
+      of input when the string is not closed.
+    */
+    const char quote= p->cur[0];
+
+    p->cur++;
+    while (p->cur < p->end && p->cur[0] != quote)
+      p->cur++;
+    a->end= p->cur;
+    if (p->cur < p->end)
+      p->cur++;                      /* step over the closing quote */
+    a->beg++;                        /* exclude the opening quote */
+    if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
+      my_xml_norm_text(a);
+    lex= MY_XML_STRING;
+  }
+  else if (my_xml_is_id0(p->cur[0]))
+  {
+    /* Identifier: one initial character followed by medial characters. */
+    p->cur++;
+    while (p->cur < p->end && my_xml_is_id1(p->cur[0]))
+      p->cur++;
+    a->end= p->cur;
+    my_xml_norm_text(a);
+    lex= MY_XML_IDENT;
+  }
+  else
+    lex= MY_XML_UNKNOWN;
+
+  return lex;
+}
+
+
+/*
+  Report a value (text, CDATA content or attribute value) to the value
+  handler, if one is registered.
+
+  @param st   parser instance
+  @param str  start of the value; not zero-terminated
+  @param len  length of the value in bytes
+  @return     the handler's return code, or MY_XML_OK without a handler
+*/
+static int my_xml_value(MY_XML_PARSER *st, const char *str, size_t len)
+{
+  if (!st->value)
+    return MY_XML_OK;
+  return st->value(st, str, len);
+}
+
+
+/**
+  Ensure the attr buffer is wide enough to hold the new value
+
+  Expand and/or allocate dynamic buffer as needed to hold the concatenated
+  path and the terminating zero. The first expansion moves the path from
+  the static buffer into a heap buffer; later expansions reallocate it.
+
+  The parser state (buffer, buffer_size, start, end) is only updated after
+  the allocation succeeded. On failure everything is left as it was, so
+  the previously allocated buffer is neither leaked nor replaced by NULL,
+  and the parser can still be freed or reused safely.
+
+  @param st    the parser instance
+  @param len   the length of the attribute to be added
+  @return state
+  @retval MY_XML_ERROR failed (out of memory)
+  @retval MY_XML_OK    success
+*/
+static int my_xml_attr_ensure_space(MY_XML_PARSER *st, size_t len)
+{
+  size_t ofs= st->attr.end - st->attr.start;
+  len++; /* Add terminating zero. */
+  if (ofs + len > st->attr.buffer_size)
+  {
+    char *new_buffer;
+    /* Grow geometrically, saturating instead of overflowing size_t. */
+    size_t new_size= (SIZE_T_MAX - len) / 2 > st->attr.buffer_size ?
+                     st->attr.buffer_size * 2 + len : SIZE_T_MAX;
+
+    if (!st->attr.buffer)
+    {
+      new_buffer= (char *) my_malloc(PSI_INSTRUMENT_ME, new_size, MYF(0));
+      if (new_buffer)
+        memcpy(new_buffer, st->attr.static_buffer, ofs + 1 /*term. zero */);
+    }
+    else
+    {
+      /*
+        Keep the result in a temporary: MYF(0) does not request
+        MY_FREE_ON_ERROR, so on failure the old block is presumably still
+        valid and owned by st->attr.buffer (verify against mysys).
+      */
+      new_buffer= (char *) my_realloc(PSI_INSTRUMENT_ME, st->attr.buffer,
+                                      new_size, MYF(0));
+    }
+
+    if (!new_buffer)
+      return MY_XML_ERROR;
+
+    st->attr.buffer= new_buffer;
+    st->attr.buffer_size= new_size;
+    st->attr.start= new_buffer;
+    st->attr.end= new_buffer + ofs;
+  }
+  return MY_XML_OK;
+}
+
+
+/**
+  Rewind the attr buffer to its initial (empty path) state.
+
+  Only the end pointer is moved back to the start; the byte at start is
+  not cleared here and an already allocated buffer is kept for reuse.
+
+  @param p  the parser instance
+*/
+static void my_xml_attr_rewind(MY_XML_PARSER *p)
+{
+ /* keep the buffer already allocated */
+ p->attr.end= p->attr.start;
+}
+
+
+/*
+  Enter a node: append its name to the '/'-separated path kept in the
+  attr buffer and notify the enter handler.
+
+  With MY_XML_FLAG_RELATIVE_NAMES the handler receives just the node
+  name, otherwise it receives the whole accumulated path.
+
+  @param st   parser instance
+  @param str  node name; not zero-terminated
+  @param len  length of the name in bytes
+  @return     MY_XML_ERROR if the path buffer cannot be grown, otherwise
+              the handler's return code (MY_XML_OK without a handler)
+*/
+static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len)
+{
+  if (my_xml_attr_ensure_space(st, len + 1 /* the separator char */))
+    return MY_XML_ERROR;
+
+  /* A separator is needed only when the path is not empty. */
+  if (st->attr.end > st->attr.start)
+    *st->attr.end++= '/';
+
+  memcpy(st->attr.end, str, len);
+  st->attr.end+= len;
+  *st->attr.end= '\0';
+
+  if (!st->enter)
+    return MY_XML_OK;
+
+  if (st->flags & MY_XML_FLAG_RELATIVE_NAMES)
+    return st->enter(st, str, len);
+
+  return st->enter(st, st->attr.start, st->attr.end - st->attr.start);
+}
+
+
+/*
+  Copy min(l1, l2) bytes from src to s and zero-terminate the result.
+
+  @param s    destination; must have room for min(l1, l2) + 1 bytes
+  @param src  source bytes; need not be zero-terminated
+  @param l1   first length limit (callers pass the destination capacity
+              minus one)
+  @param l2   second length limit (callers pass the source length)
+*/
+static void mstr(char *s,const char *src,size_t l1, size_t l2)
+{
+  size_t n= l2;
+
+  if (l1 < l2)
+    n= l1;
+  memcpy(s, src, n);
+  s[n]= '\0';
+}
+
+
+/*
+  Leave the current node: verify the closing name (if one is given),
+  notify the leave handler and strip the last component from the path
+  kept in the attr buffer.
+
+  @param p     parser instance
+  @param str   name found in the closing tag, or NULL when no name has
+               to be checked
+  @param slen  length of str in bytes
+  @return      MY_XML_ERROR with a message in p->errstr when str does not
+               match the innermost open node; otherwise the handler's
+               return code (MY_XML_OK without a handler)
+*/
+static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen)
+{
+  char *sep= p->attr.end;
+  char *tag;
+  size_t glen;
+  int rc;
+
+  /* Walk back to the last '/' or to the beginning of the path. */
+  while (sep > p->attr.start && sep[0] != '/')
+    sep--;
+
+  if (sep[0] == '/')
+  {
+    tag= sep + 1;
+    glen= (size_t) (p->attr.end - sep - 1);
+  }
+  else
+  {
+    tag= sep;
+    glen= (size_t) (p->attr.end - sep);
+  }
+
+  if (str && (slen != glen || memcmp(str, tag, slen)))
+  {
+    /* Names are truncated to 31 bytes for the error message. */
+    char s[32];
+    char g[32];
+
+    mstr(s,str,sizeof(s)-1,slen);
+    if (glen)
+    {
+      mstr(g, tag, sizeof(g)-1, glen);
+      snprintf(p->errstr,sizeof(p->errstr),"'</%s>' unexpected ('</%s>' wanted)",s,g);
+    }
+    else
+      snprintf(p->errstr,sizeof(p->errstr),"'</%s>' unexpected (END-OF-INPUT wanted)", s);
+    return MY_XML_ERROR;
+  }
+
+  if (!p->leave_xml)
+    rc= MY_XML_OK;
+  else if (p->flags & MY_XML_FLAG_RELATIVE_NAMES)
+    rc= p->leave_xml(p, str, slen);
+  else
+    rc= p->leave_xml(p, p->attr.start, p->attr.end - p->attr.start);
+
+  /* Cut the path at the separator (or empty it entirely). */
+  *sep= '\0';
+  p->attr.end= sep;
+
+  return rc;
+}
+
+
+/**
+  Parse an XML text held in memory and report its content through the
+  registered handlers.
+
+  The path buffer is rewound first. Tags and attributes are reported via
+  the enter/leave handlers, text, CDATA content and attribute values via
+  the value handler; comments are skipped. Return codes of the value
+  handler are honoured for attribute values but ignored for text and
+  CDATA content.
+
+  A "<![CDATA[" section without the closing "]]>" is rejected with
+  "unexpected END-OF-INPUT" instead of passing a wrapped-around length
+  to the value handler.
+
+  @param p    parser instance
+  @param str  start of the text to parse
+  @param len  length of the text in bytes
+  @return     MY_XML_OK on success, MY_XML_ERROR otherwise. On a syntax
+              error a message is stored in p->errstr and p->cur is left
+              near the offending position.
+*/
+int my_xml_parse(MY_XML_PARSER *p,const char *str, size_t len)
+{
+
+  my_xml_attr_rewind(p);
+
+  p->beg=str;
+  p->cur=str;
+  p->end=str+len;
+
+  while ( p->cur < p->end )
+  {
+    MY_XML_ATTR a;
+    if (p->cur[0] == '<')
+    {
+      int lex;
+      int question=0;
+      int exclam=0;
+
+      /* Yields '<', or a whole comment / CDATA section. */
+      lex=my_xml_scan(p,&a);
+
+      if (MY_XML_COMMENT == lex)
+        continue;
+
+      if (lex == MY_XML_CDATA)
+      {
+        /*
+          The range covers "<![CDATA[" (9 bytes) and "]]>" (3 bytes). It is
+          shorter than that only when the terminator is missing; stripping
+          the delimiters would then produce a negative length that wraps
+          to a huge size_t.
+        */
+        if ((size_t) (a.end - a.beg) < 12)
+        {
+          snprintf(p->errstr,sizeof(p->errstr),"unexpected END-OF-INPUT");
+          return MY_XML_ERROR;
+        }
+        a.beg+= 9;
+        a.end-= 3;
+        my_xml_value(p, a.beg, (size_t) (a.end-a.beg));
+        continue;
+      }
+
+      lex=my_xml_scan(p,&a);
+
+      if (MY_XML_SLASH == lex)
+      {
+        /* Closing tag: </ident> */
+        if (MY_XML_IDENT != (lex=my_xml_scan(p,&a)))
+        {
+          snprintf(p->errstr,sizeof(p->errstr),"%s unexpected (ident wanted)",lex2str(lex));
+          return MY_XML_ERROR;
+        }
+        if (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))
+          return MY_XML_ERROR;
+        lex=my_xml_scan(p,&a);
+        goto gt;
+      }
+
+      if (MY_XML_EXCLAM == lex)
+      {
+        lex=my_xml_scan(p,&a);
+        exclam=1;
+      }
+      else if (MY_XML_QUESTION == lex)
+      {
+        lex=my_xml_scan(p,&a);
+        question=1;
+      }
+
+      if (MY_XML_IDENT == lex)
+      {
+        p->current_node_type= MY_XML_NODE_TAG;
+        if (MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg)))
+          return MY_XML_ERROR;
+      }
+      else
+      {
+        snprintf(p->errstr,sizeof(p->errstr),"%s unexpected (ident or '/' wanted)",
+                 lex2str(lex));
+        return MY_XML_ERROR;
+      }
+
+      /* Attributes: name=value, bare names, and literals inside <!...> */
+      while ((MY_XML_IDENT == (lex=my_xml_scan(p,&a))) ||
+             ((MY_XML_STRING == lex && exclam)))
+      {
+        MY_XML_ATTR b;
+        if (MY_XML_EQ == (lex=my_xml_scan(p,&b)))
+        {
+          lex=my_xml_scan(p,&b);
+          if ( (lex == MY_XML_IDENT) || (lex == MY_XML_STRING) )
+          {
+            p->current_node_type= MY_XML_NODE_ATTR;
+            if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) ||
+                (MY_XML_OK != my_xml_value(p,b.beg,(size_t) (b.end-b.beg))) ||
+                (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
+              return MY_XML_ERROR;
+          }
+          else
+          {
+            snprintf(p->errstr,sizeof(p->errstr),"%s unexpected (ident or string wanted)",
+                     lex2str(lex));
+            return MY_XML_ERROR;
+          }
+        }
+        else if (MY_XML_IDENT == lex)
+        {
+          /* Attribute without a value, directly followed by another name. */
+          p->current_node_type= MY_XML_NODE_ATTR;
+          if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) ||
+              (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
+            return MY_XML_ERROR;
+        }
+        else if ((MY_XML_STRING == lex) && exclam)
+        {
+          /*
+            We are in <!DOCTYPE>, e.g.
+            <!DOCTYPE name SYSTEM "SystemLiteral">
+            <!DOCTYPE name PUBLIC "PubidLiteral" "SystemLiteral">
+            Just skip "SystemLiteral" and "PubidLiteral"
+          */
+        }
+        else
+          break;
+      }
+
+      if (lex == MY_XML_SLASH)
+      {
+        /* Self-closing tag: <ident ... /> */
+        if (MY_XML_OK != my_xml_leave(p,NULL,0))
+          return MY_XML_ERROR;
+        lex=my_xml_scan(p,&a);
+      }
+
+gt:
+      if (question)
+      {
+        /* <?ident ... ?> is closed implicitly. */
+        if (lex != MY_XML_QUESTION)
+        {
+          snprintf(p->errstr,sizeof(p->errstr),"%s unexpected ('?' wanted)",lex2str(lex));
+          return MY_XML_ERROR;
+        }
+        if (MY_XML_OK != my_xml_leave(p,NULL,0))
+          return MY_XML_ERROR;
+        lex=my_xml_scan(p,&a);
+      }
+
+      if (exclam)
+      {
+        /* <!ident ...> is closed implicitly as well. */
+        if (MY_XML_OK != my_xml_leave(p,NULL,0))
+          return MY_XML_ERROR;
+      }
+
+      if (lex != MY_XML_GT)
+      {
+        snprintf(p->errstr,sizeof(p->errstr),"%s unexpected ('>' wanted)",lex2str(lex));
+        return MY_XML_ERROR;
+      }
+    }
+    else
+    {
+      /* Text node: everything up to the next '<' or the end of input. */
+      a.beg=p->cur;
+      for ( ; (p->cur < p->end) && (p->cur[0] != '<') ; p->cur++);
+      a.end=p->cur;
+
+      if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
+        my_xml_norm_text(&a);
+      if (a.beg != a.end)
+      {
+        my_xml_value(p,a.beg,(size_t) (a.end-a.beg));
+      }
+    }
+  }
+
+  /* A non-empty path means that some node has not been closed. */
+  if (p->attr.start[0])
+  {
+    snprintf(p->errstr,sizeof(p->errstr),"unexpected END-OF-INPUT");
+    return MY_XML_ERROR;
+  }
+  return MY_XML_OK;
+}
+
+
+/*
+  Initialise a parser object: zero every field and make the path buffer
+  point at the static buffer embedded in the object.
+
+  @param p  parser object to initialise
+*/
+void my_xml_parser_create(MY_XML_PARSER *p)
+{
+  memset(p, 0, sizeof(*p));
+
+  /* Use static buffer while it's sufficient. */
+  p->attr.buffer_size= sizeof(p->attr.static_buffer);
+  p->attr.start= p->attr.static_buffer;
+  p->attr.end= p->attr.static_buffer;
+}
+
+
+/*
+  Release the heap buffer owned by the parser, if any.
+
+  After the buffer is freed the path pointers and the recorded capacity
+  are pointed back at the embedded static buffer. Otherwise attr.start
+  and attr.end would keep referring to freed memory and a later
+  my_xml_parse() on the same object would write through them.
+
+  @param p  parser object; remains usable after the call
+*/
+void my_xml_parser_free(MY_XML_PARSER *p)
+{
+  if (p->attr.buffer)
+  {
+    my_free(p->attr.buffer);
+    p->attr.buffer= NULL;
+
+    /* Same state as set up by my_xml_parser_create(): empty path. */
+    p->attr.static_buffer[0]= '\0';
+    p->attr.start= p->attr.end= p->attr.static_buffer;
+    p->attr.buffer_size= sizeof(p->attr.static_buffer);
+  }
+}
+
+
+/*
+  Register the value handler. The callback is stored in p->value and is
+  invoked through my_xml_value() with a pointer/length pair (the data is
+  not zero-terminated). Passing NULL removes the handler.
+*/
+void my_xml_set_value_handler(MY_XML_PARSER *p,
+ int (*action)(MY_XML_PARSER *p, const char *s,
+ size_t l))
+{
+ p->value=action;
+}
+
+/*
+  Register the enter handler. The callback is stored in p->enter and is
+  invoked by my_xml_enter() with either the node name or the full path,
+  depending on MY_XML_FLAG_RELATIVE_NAMES. Passing NULL removes the
+  handler.
+*/
+void my_xml_set_enter_handler(MY_XML_PARSER *p,
+ int (*action)(MY_XML_PARSER *p, const char *s,
+ size_t l))
+{
+ p->enter=action;
+}
+
+
+/*
+  Register the leave handler. The callback is stored in p->leave_xml and
+  is invoked by my_xml_leave() before the node is removed from the path.
+  Passing NULL removes the handler.
+*/
+void my_xml_set_leave_handler(MY_XML_PARSER *p,
+ int (*action)(MY_XML_PARSER *p, const char *s,
+ size_t l))
+{
+ p->leave_xml=action;
+}
+
+
+/*
+  Attach an opaque pointer to the parser. It is only stored in
+  p->user_data; the parsing code in this file never dereferences it.
+*/
+void my_xml_set_user_data(MY_XML_PARSER *p, void *user_data)
+{
+ p->user_data=user_data;
+}
+
+
+/*
+  Return the error message buffer of the parser (p->errstr). The buffer
+  is written when a syntax error is detected; the returned pointer refers
+  to storage inside *p and must not be freed by the caller.
+*/
+const char *my_xml_error_string(MY_XML_PARSER *p)
+{
+ return p->errstr;
+}
+
+
+/*
+  Return the position of the current parse point within its line.
+
+  The result is the distance in bytes from p->cur back to the last '\n'
+  before it (measured from the newline byte itself), or back to the
+  beginning of the input when there is no preceding newline.
+*/
+size_t my_xml_error_pos(MY_XML_PARSER *p)
+{
+  const char *line_start= p->beg;
+  const char *s= p->beg;
+
+  while (s < p->cur)
+  {
+    if (*s == '\n')
+      line_start= s;
+    s++;
+  }
+  return (size_t) (p->cur - line_start);
+}
+
+/*
+  Return the zero-based line number of the current parse point, i.e. the
+  number of '\n' bytes between the beginning of the input and p->cur.
+*/
+uint my_xml_error_lineno(MY_XML_PARSER *p)
+{
+  uint newlines= 0;
+  const char *s= p->beg;
+
+  while (s < p->cur)
+  {
+    if (*s == '\n')
+      newlines++;
+    s++;
+  }
+  return newlines;
+}