summaryrefslogtreecommitdiffstats
path: root/testing/web-platform/tests/tools/manifest/XMLParser.py
blob: 8dcdb4500721984ae004e9b4a0dd8ac5f6504f0a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from collections import OrderedDict
from typing import Dict, List, Optional, Text, Union
from os.path import dirname, join
from xml.parsers import expat
import xml.etree.ElementTree as etree  # noqa: N813


_catalog = join(dirname(__file__), "catalog")

def _wrap_error(e: expat.error) -> etree.ParseError:
    err = etree.ParseError(e)
    err.code = e.code
    err.position = e.lineno, e.offset
    raise err

_names: Dict[Text, Text] = {}
def _fixname(key: Text) -> Text:
    try:
        name = _names[key]
    except KeyError:
        name = key
        if "}" in name:
            name = "{" + name
        _names[key] = name
    return name


_undefined_entity_code: int = expat.errors.codes[expat.errors.XML_ERROR_UNDEFINED_ENTITY]


class XMLParser:
    """
    An XML parser with support for XHTML DTDs and all Python-supported encodings

    This implements the API defined by
    xml.etree.ElementTree.XMLParser, but supports XHTML DTDs
    (therefore allowing XHTML entities) and supports all encodings
    Python does, rather than just those supported by expat.
    """
    def __init__(self, encoding: Optional[Text] = None) -> None:
        self._parser = expat.ParserCreate(encoding, "}")
        self._target = etree.TreeBuilder()
        # parser settings
        self._parser.buffer_text = True
        self._parser.ordered_attributes = True
        self._parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
        # parser callbacks
        self._parser.XmlDeclHandler = self._xml_decl
        self._parser.StartElementHandler = self._start
        self._parser.EndElementHandler = self._end
        self._parser.CharacterDataHandler = self._data
        self._parser.ExternalEntityRefHandler = self._external
        self._parser.SkippedEntityHandler = self._skipped
        # used for our horrible re-encoding hack
        self._fed_data: Optional[List[bytes]] = []
        self._read_encoding: Optional[Text] = None

    def _xml_decl(self, version: Text, encoding: Optional[Text], standalone: int) -> None:
        self._read_encoding = encoding

    def _start(self, tag: Text, attrib_in: List[str]) -> etree.Element:
        assert isinstance(tag, str)
        self._fed_data = None
        tag = _fixname(tag)
        attrib: Dict[Union[bytes, Text], Union[bytes, Text]] = OrderedDict()
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[_fixname(attrib_in[i])] = attrib_in[i+1]
        return self._target.start(tag, attrib)

    def _data(self, text: Text) -> None:
        self._target.data(text)

    def _end(self, tag: Text) -> etree.Element:
        return self._target.end(_fixname(tag))

    def _external(self, context: Text, base: Optional[Text], system_id: Optional[Text], public_id: Optional[Text]) -> bool:
        if public_id in {
                "-//W3C//DTD XHTML 1.0 Transitional//EN",
                "-//W3C//DTD XHTML 1.1//EN",
                "-//W3C//DTD XHTML 1.0 Strict//EN",
                "-//W3C//DTD XHTML 1.0 Frameset//EN",
                "-//W3C//DTD XHTML Basic 1.0//EN",
                "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN",
                "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN",
                "-//W3C//DTD MathML 2.0//EN",
                "-//WAPFORUM//DTD XHTML Mobile 1.0//EN"
        }:
            parser = self._parser.ExternalEntityParserCreate(context)
            with open(join(_catalog, "xhtml.dtd"), "rb") as fp:
                try:
                    parser.ParseFile(fp)
                except expat.error:
                    return False

        return True

    def _skipped(self, name: Text, is_parameter_entity: bool) -> None:
        err = expat.error("undefined entity %s: line %d, column %d" %
                          (name, self._parser.ErrorLineNumber,
                           self._parser.ErrorColumnNumber))
        err.code = _undefined_entity_code
        err.lineno = self._parser.ErrorLineNumber
        err.offset = self._parser.ErrorColumnNumber
        raise err

    def feed(self, data: bytes) -> None:
        if self._fed_data is not None:
            self._fed_data.append(data)
        try:
            self._parser.Parse(data, False)
        except expat.error as v:
            _wrap_error(v)
        except ValueError as e:
            if e.args[0] == 'multi-byte encodings are not supported':
                assert self._read_encoding is not None
                assert self._fed_data is not None
                xml = b"".join(self._fed_data).decode(self._read_encoding).encode("utf-8")
                new_parser = XMLParser("utf-8")
                self._parser = new_parser._parser
                self._target = new_parser._target
                self._fed_data = None
                self.feed(xml)

    def close(self) -> etree.Element:
        try:
            self._parser.Parse("", True)
        except expat.error as v:
            _wrap_error(v)
        tree = self._target.close()
        return tree