diff options
Diffstat (limited to 'testing/web-platform/tests/tools/manifest/XMLParser.py')
-rw-r--r-- | testing/web-platform/tests/tools/manifest/XMLParser.py | 131 |
1 files changed, 131 insertions, 0 deletions
# diff --git a/testing/web-platform/tests/tools/manifest/XMLParser.py
#            b/testing/web-platform/tests/tools/manifest/XMLParser.py
# new file mode 100644, index 0000000000..8dcdb45007
from collections import OrderedDict
from typing import Dict, List, NoReturn, Optional, Text, Union
from os.path import dirname, join
from xml.parsers import expat
import xml.etree.ElementTree as etree  # noqa: N813


# Directory (next to this module) that holds the DTD handed to expat for the
# XHTML-family doctypes listed in _xhtml_public_ids.
_catalog = join(dirname(__file__), "catalog")

# Public identifiers for which XMLParser._external() loads "xhtml.dtd" from
# the catalog directory.
_xhtml_public_ids = frozenset({
    "-//W3C//DTD XHTML 1.0 Transitional//EN",
    "-//W3C//DTD XHTML 1.1//EN",
    "-//W3C//DTD XHTML 1.0 Strict//EN",
    "-//W3C//DTD XHTML 1.0 Frameset//EN",
    "-//W3C//DTD XHTML Basic 1.0//EN",
    "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN",
    "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN",
    "-//W3C//DTD MathML 2.0//EN",
    "-//WAPFORUM//DTD XHTML Mobile 1.0//EN",
})

# Message of the ValueError that XMLParser.feed() recovers from by
# re-encoding the input as UTF-8.
_multibyte_error = 'multi-byte encodings are not supported'


def _wrap_error(e: expat.error) -> NoReturn:
    """Raise an ``etree.ParseError`` built from the expat error *e*.

    The raised exception carries ``code`` and ``position`` (a
    ``(lineno, offset)`` tuple) copied from *e*.  This function never
    returns normally.
    """
    err = etree.ParseError(e)
    err.code = e.code
    err.position = e.lineno, e.offset
    raise err from e


# Cache of already-converted names, keyed by the raw name received from expat.
_names: Dict[Text, Text] = {}


def _fixname(key: Text) -> Text:
    """Return *key* in ElementTree's ``{namespace}local`` form.

    The parser is created with "}" as the namespace separator, so a
    namespaced name arrives as ``namespace}local``; prefixing "{" completes
    it.  Names without "}" are returned unchanged.  Results are memoised in
    ``_names``.
    """
    try:
        return _names[key]
    except KeyError:
        pass
    name = "{" + key if "}" in key else key
    _names[key] = name
    return name


_undefined_entity_code: int = expat.errors.codes[expat.errors.XML_ERROR_UNDEFINED_ENTITY]


class XMLParser:
    """
    An XML parser with support for XHTML DTDs and all Python-supported encodings

    This implements the API defined by
    xml.etree.ElementTree.XMLParser, but supports XHTML DTDs
    (therefore allowing XHTML entities) and supports all encodings
    Python does, rather than just those supported by expat.
    """
    def __init__(self, encoding: Optional[Text] = None) -> None:
        """Create the expat parser and tree builder.

        :param encoding: encoding passed straight to ``expat.ParserCreate``;
            ``None`` leaves the choice to expat.
        """
        self._parser = expat.ParserCreate(encoding, "}")
        self._target = etree.TreeBuilder()
        # parser settings
        self._parser.buffer_text = True
        self._parser.ordered_attributes = True
        self._parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
        # parser callbacks
        self._parser.XmlDeclHandler = self._xml_decl
        self._parser.StartElementHandler = self._start
        self._parser.EndElementHandler = self._end
        self._parser.CharacterDataHandler = self._data
        self._parser.ExternalEntityRefHandler = self._external
        self._parser.SkippedEntityHandler = self._skipped
        # used for our horrible re-encoding hack: every chunk passed to
        # feed() is kept here until the first start tag is seen (then None)
        self._fed_data: Optional[List[bytes]] = []
        # encoding named in the XML declaration, if one has been seen
        self._read_encoding: Optional[Text] = None

    def _xml_decl(self, version: Text, encoding: Optional[Text], standalone: int) -> None:
        """Record the encoding named in the XML declaration."""
        self._read_encoding = encoding

    def _start(self, tag: Text, attrib_in: List[str]) -> etree.Element:
        """Handle a start tag: forward it to the tree builder.

        *attrib_in* is expat's flat ``[name, value, name, value, ...]`` list
        (``ordered_attributes`` is enabled); it is turned into an ordered
        mapping with fixed-up names.
        """
        assert isinstance(tag, str)
        # Once an element has started there is no more need to keep the raw
        # input around for re-encoding.
        self._fed_data = None
        tag = _fixname(tag)
        attrib: Dict[Union[bytes, Text], Union[bytes, Text]] = OrderedDict()
        if attrib_in:
            for attr_name, attr_value in zip(attrib_in[::2], attrib_in[1::2]):
                attrib[_fixname(attr_name)] = attr_value
        return self._target.start(tag, attrib)

    def _data(self, text: Text) -> None:
        """Forward character data to the tree builder."""
        self._target.data(text)

    def _end(self, tag: Text) -> etree.Element:
        """Handle an end tag: forward the fixed-up name to the tree builder."""
        return self._target.end(_fixname(tag))

    def _external(self, context: Text, base: Optional[Text], system_id: Optional[Text], public_id: Optional[Text]) -> bool:
        """Handle an external entity reference (e.g. the doctype's DTD).

        For the public identifiers in ``_xhtml_public_ids`` the bundled
        ``xhtml.dtd`` is parsed with a sub-parser; any other reference is
        not loaded.

        :returns: ``False`` if parsing the bundled DTD raised an expat
            error, ``True`` otherwise.
        """
        if public_id in _xhtml_public_ids:
            parser = self._parser.ExternalEntityParserCreate(context)
            with open(join(_catalog, "xhtml.dtd"), "rb") as fp:
                try:
                    parser.ParseFile(fp)
                except expat.error:
                    return False

        return True

    def _skipped(self, name: Text, is_parameter_entity: bool) -> None:
        """Turn a skipped entity into an "undefined entity" expat error.

        :raises expat.error: always, with ``code``, ``lineno`` and
            ``offset`` filled in from the parser's current error position.
        """
        line = self._parser.ErrorLineNumber
        column = self._parser.ErrorColumnNumber
        err = expat.error(f"undefined entity {name}: line {line}, column {column}")
        err.code = _undefined_entity_code
        err.lineno = line
        err.offset = column
        raise err

    def feed(self, data: bytes) -> None:
        """Feed a chunk of encoded document data to the parser.

        If ``Parse`` fails with the "multi-byte encodings are not supported"
        ValueError, everything fed so far is decoded with the encoding from
        the XML declaration, re-encoded as UTF-8 and parsed again with a
        fresh UTF-8 parser.

        :raises etree.ParseError: if expat reports a parse error.
        :raises ValueError: any ValueError other than the multi-byte
            encoding one is propagated unchanged.
        """
        if self._fed_data is not None:
            self._fed_data.append(data)
        try:
            self._parser.Parse(data, False)
        except expat.error as v:
            _wrap_error(v)
        except ValueError as e:
            if not e.args or e.args[0] != _multibyte_error:
                # Not the failure we know how to recover from (for example
                # an error raised inside a handler): do not swallow it.
                raise
            assert self._read_encoding is not None
            assert self._fed_data is not None
            xml = b"".join(self._fed_data).decode(self._read_encoding).encode("utf-8")
            # Adopt the internals of a new parser that is forced to UTF-8;
            # its callbacks feed the tree builder we take over here.
            new_parser = XMLParser("utf-8")
            self._parser = new_parser._parser
            self._target = new_parser._target
            self._fed_data = None
            self.feed(xml)

    def close(self) -> etree.Element:
        """Finish parsing and return what the tree builder's ``close()`` returns.

        :raises etree.ParseError: if expat reports a parse error while
            finalising the document.
        """
        try:
            self._parser.Parse("", True)
        except expat.error as v:
            _wrap_error(v)
        return self._target.close()