summaryrefslogtreecommitdiffstats
path: root/testing/web-platform/tests/tools/manifest/XMLParser.py
blob: 689533421d77e723882e6a8a12ab6f5e4e46fd65 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import codecs
from os.path import dirname, join

from collections import OrderedDict

from xml.parsers import expat
import xml.etree.ElementTree as etree  # noqa: N813

# Guard for typing-only imports: the value is False at runtime, so the
# imports in the ``if`` body are not executed by the interpreter.
MYPY = False
if MYPY:
    # MYPY is set to True when run under Mypy.
    from typing import Dict
    from typing import List
    from typing import Optional
    from typing import Text
    from typing import Union

# Path of the "catalog" directory located next to this module.
_catalog = join(dirname(__file__), "catalog")

def _wrap_error(e):
    # type: (expat.error) -> etree.ParseError
    """Raise an ``etree.ParseError`` built from the expat error *e*.

    The raised exception wraps *e* and carries its ``code`` together with a
    ``position`` tuple of ``(lineno, offset)``.  This function always
    raises; it never returns normally.
    """
    wrapped = etree.ParseError(e)
    wrapped.position = (e.lineno, e.offset)
    wrapped.code = e.code
    raise wrapped

# Module-level cache mapping raw expat names to their converted form.
_names = {}  # type: Dict[Text, Text]
def _fixname(key):
    # type: (Text) -> Text
    """Return *key* with a leading "{" added when it contains "}".

    Names without a "}" are returned unchanged.  Results are memoised in
    the module-level ``_names`` dictionary.
    """
    fixed = _names.get(key)
    if fixed is None:
        fixed = "{" + key if "}" in key else key
        _names[key] = fixed
    return fixed


# Numeric expat error code for the "undefined entity" error, looked up in
# ``expat.errors.codes`` by its error message constant.
_undefined_entity_code = expat.errors.codes[expat.errors.XML_ERROR_UNDEFINED_ENTITY]  # type: int


class XMLParser:
    """
    An XML parser with support for XHTML DTDs and all Python-supported encodings

    This implements the API defined by
    xml.etree.ElementTree.XMLParser, but supports XHTML DTDs
    (therefore allowing XHTML entities) and supports all encodings
    Python does, rather than just those supported by expat.

    When expat rejects the document's encoding as multi-byte, the input is
    transcoded to UTF-8 in Python and parsed by a fresh UTF-8 parser; every
    later chunk passed to ``feed`` is transcoded the same way.
    """
    def __init__(self, encoding=None):
        # type: (Optional[Text]) -> None
        """Create the underlying expat parser and tree builder.

        :param encoding: optional encoding passed to ``expat.ParserCreate``.
        """
        # "}" is the namespace separator; _fixname later prepends the "{".
        self._parser = expat.ParserCreate(encoding, "}")
        self._target = etree.TreeBuilder()
        # parser settings
        self._parser.buffer_text = True
        self._parser.ordered_attributes = True
        self._parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
        # parser callbacks
        self._parser.XmlDeclHandler = self._xml_decl
        self._parser.StartElementHandler = self._start
        self._parser.EndElementHandler = self._end
        self._parser.CharacterDataHandler = self._data
        self._parser.ExternalEntityRefHandler = self._external
        self._parser.SkippedEntityHandler = self._skipped  # type: ignore
        # used for our horrible re-encoding hack
        # Raw chunks seen so far; set to None once they are no longer needed.
        self._fed_data = []  # type: Optional[List[bytes]]
        # Encoding named in the XML declaration, if one has been seen.
        self._read_encoding = None  # type: Optional[Text]
        # Set once we have fallen back to transcoding the input to UTF-8.
        self._decoder = None  # type: Optional[codecs.IncrementalDecoder]

    def _xml_decl(self, version, encoding, standalone):
        # type: (Text, Optional[Text], int) -> None
        """Record the encoding named in the XML declaration."""
        self._read_encoding = encoding

    def _start(self, tag, attrib_in):
        # type: (Text, List[str]) -> etree.Element
        """Forward a start tag and its attributes to the tree builder.

        *attrib_in* is a flat ``[name, value, name, value, ...]`` list
        because ``ordered_attributes`` is enabled on the parser.
        """
        assert isinstance(tag, str)
        # Once an element has started, the raw input is no longer kept.
        self._fed_data = None
        tag = _fixname(tag)
        attrib = OrderedDict()  # type: Dict[Union[bytes, Text], Union[bytes, Text]]
        if attrib_in:
            for name, value in zip(attrib_in[::2], attrib_in[1::2]):
                attrib[_fixname(name)] = value
        return self._target.start(tag, attrib)

    def _data(self, text):
        # type: (Text) -> None
        """Forward character data to the tree builder."""
        self._target.data(text)

    def _end(self, tag):
        # type: (Text) -> etree.Element
        """Forward an end tag to the tree builder."""
        return self._target.end(_fixname(tag))

    def _external(self, context, base, system_id, public_id):
        # type: (Text, Optional[Text], Optional[Text], Optional[Text]) -> bool
        """Handle an external entity reference.

        For the XHTML/MathML public identifiers listed below, the bundled
        ``xhtml.dtd`` from the catalog directory is parsed in place of the
        referenced entity.  Returns False if parsing that DTD raises an
        expat error, True otherwise (including for any other public
        identifier, for which nothing is loaded).
        """
        if public_id in {
                "-//W3C//DTD XHTML 1.0 Transitional//EN",
                "-//W3C//DTD XHTML 1.1//EN",
                "-//W3C//DTD XHTML 1.0 Strict//EN",
                "-//W3C//DTD XHTML 1.0 Frameset//EN",
                "-//W3C//DTD XHTML Basic 1.0//EN",
                "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN",
                "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN",
                "-//W3C//DTD MathML 2.0//EN",
                "-//WAPFORUM//DTD XHTML Mobile 1.0//EN"
        }:
            parser = self._parser.ExternalEntityParserCreate(context)
            with open(join(_catalog, "xhtml.dtd"), "rb") as fp:
                try:
                    parser.ParseFile(fp)
                except expat.error:
                    return False

        return True

    def _skipped(self, name, is_parameter_entity):
        # type: (Text, bool) -> None
        """Raise an "undefined entity" expat error for a skipped entity.

        The error carries the parser's current error line and column.
        """
        err = expat.error("undefined entity %s: line %d, column %d" %
                          (name, self._parser.ErrorLineNumber,
                           self._parser.ErrorColumnNumber))
        err.code = _undefined_entity_code
        err.lineno = self._parser.ErrorLineNumber
        err.offset = self._parser.ErrorColumnNumber
        raise err

    def feed(self, data):
        # type: (bytes) -> None
        """Feed a chunk of encoded document data to the parser.

        expat errors are re-raised as ``etree.ParseError``.  If expat
        rejects the declared encoding as multi-byte, everything fed so far
        is decoded in Python, re-encoded as UTF-8 and parsed by a fresh
        UTF-8 parser; later chunks are transcoded incrementally, so a
        character split across two chunks is handled correctly.  Any other
        ``ValueError`` is propagated to the caller.
        """
        if self._decoder is not None:
            # The active parser expects UTF-8, so transcode this chunk.
            data = self._decoder.decode(data).encode("utf-8")
        elif self._fed_data is not None:
            self._fed_data.append(data)
        try:
            self._parser.Parse(data, False)
        except expat.error as v:
            _wrap_error(v)
        except ValueError as e:
            if not e.args or e.args[0] != 'multi-byte encodings are not supported':
                # Not the case we know how to recover from.
                raise
            if self._read_encoding is None or self._fed_data is None:
                # Without the declared encoding or the raw input there is
                # nothing to transcode.
                raise
            decoder = codecs.getincrementaldecoder(self._read_encoding)()
            xml = decoder.decode(b"".join(self._fed_data)).encode("utf-8")
            new_parser = XMLParser("utf-8")
            self._parser = new_parser._parser
            self._target = new_parser._target
            self._fed_data = None
            self.feed(xml)
            # Installed only after the re-feed above: ``xml`` is already
            # UTF-8 and must not be transcoded a second time.
            self._decoder = decoder

    def close(self):
        # type: () -> etree.Element
        """Finish parsing and return the tree builder's result.

        expat errors are re-raised as ``etree.ParseError``.  When input is
        being transcoded, the decoder is flushed first, so input that ends
        in the middle of a character raises ``UnicodeDecodeError``.
        """
        tail = b""
        if self._decoder is not None:
            tail = self._decoder.decode(b"", final=True).encode("utf-8")
        try:
            self._parser.Parse(tail, True)
        except expat.error as v:
            _wrap_error(v)
        tree = self._target.close()
        return tree