1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
|
from os.path import dirname, join
from collections import OrderedDict
from xml.parsers import expat
import xml.etree.ElementTree as etree # noqa: N813
# MYPY is set to True when run under Mypy; at runtime it stays False, so the
# typing names below are only ever imported for the benefit of the
# ``# type:`` comments used throughout this module.
MYPY = False
if MYPY:
    from typing import Dict, List, Optional, Text, Union

# Directory named "catalog" that sits next to this module; the parser reads
# its "xhtml.dtd" file from here.
_catalog = join(dirname(__file__), "catalog")
def _wrap_error(e):
    # type: (expat.error) -> etree.ParseError
    """Raise an ElementTree ``ParseError`` built from an expat error.

    The expat error's ``code`` is copied over and its ``lineno``/``offset``
    pair becomes the ``position`` tuple of the new exception.

    This function never returns normally: it always raises the wrapped
    ``etree.ParseError``.
    """
    wrapped = etree.ParseError(e)
    wrapped.code = e.code
    wrapped.position = (e.lineno, e.offset)
    raise wrapped
# Memo of raw expat names -> ElementTree-style names, shared by all parsers.
_names = {}  # type: Dict[Text, Text]


def _fixname(key):
    # type: (Text) -> Text
    """Return the ElementTree-style spelling of an expat name.

    A name containing ``"}"`` gets a ``"{"`` prepended, so ``"uri}local"``
    becomes ``"{uri}local"``; any other name is returned unchanged.
    Results are memoised in the module-level ``_names`` dict.
    """
    cached = _names.get(key)
    if cached is not None:
        return cached

    fixed = "{" + key if "}" in key else key
    _names[key] = fixed
    return fixed
# Integer expat error code looked up from the "undefined entity" error
# message; it is attached to the error raised for skipped entities so that
# they are reported the same way expat reports an undefined entity.
_undefined_entity_code = expat.errors.codes[expat.errors.XML_ERROR_UNDEFINED_ENTITY] # type: int
class XMLParser:
    """
    An XML parser with support for XHTML DTDs and all Python-supported encodings

    This implements the API defined by
    xml.etree.ElementTree.XMLParser, but supports XHTML DTDs
    (therefore allowing XHTML entities) and supports all encodings
    Python does, rather than just those supported by expat.
    """

    # Public identifiers for which the external DTD subset is read from the
    # local catalog's "xhtml.dtd". Built once here rather than on every
    # external-entity callback.
    _XHTML_PUBLIC_IDS = frozenset({
        "-//W3C//DTD XHTML 1.0 Transitional//EN",
        "-//W3C//DTD XHTML 1.1//EN",
        "-//W3C//DTD XHTML 1.0 Strict//EN",
        "-//W3C//DTD XHTML 1.0 Frameset//EN",
        "-//W3C//DTD XHTML Basic 1.0//EN",
        "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN",
        "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN",
        "-//W3C//DTD MathML 2.0//EN",
        "-//WAPFORUM//DTD XHTML Mobile 1.0//EN",
    })

    def __init__(self, encoding=None):
        # type: (Optional[Text]) -> None
        """Create the underlying expat parser and the tree builder.

        :param encoding: passed straight to ``expat.ParserCreate`` as its
            encoding argument; ``None`` leaves the choice to expat.
        """
        # "}" is the namespace separator, so namespaced names arrive as
        # "uri}local" and are turned into "{uri}local" by _fixname.
        self._parser = expat.ParserCreate(encoding, "}")
        self._target = etree.TreeBuilder()

        # parser settings
        self._parser.buffer_text = True
        # attributes are delivered as a flat [name, value, name, value, ...]
        # list, which _start relies on
        self._parser.ordered_attributes = True
        self._parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        # parser callbacks
        self._parser.XmlDeclHandler = self._xml_decl
        self._parser.StartElementHandler = self._start
        self._parser.EndElementHandler = self._end
        self._parser.CharacterDataHandler = self._data
        self._parser.ExternalEntityRefHandler = self._external
        self._parser.SkippedEntityHandler = self._skipped  # type: ignore

        # used for our horrible re-encoding hack: every chunk fed so far is
        # kept until the first element starts, so that the whole input can be
        # re-decoded if expat rejects the declared encoding (see feed)
        self._fed_data = []  # type: Optional[List[bytes]]
        self._read_encoding = None  # type: Optional[Text]

    def _xml_decl(self, version, encoding, standalone):
        # type: (Text, Optional[Text], int) -> None
        """Record the encoding named in the XML declaration."""
        self._read_encoding = encoding

    def _start(self, tag, attrib_in):
        # type: (Text, List[str]) -> etree.Element
        """Forward a start tag to the tree builder.

        :param tag: element name as reported by expat.
        :param attrib_in: flat ``[name, value, ...]`` attribute list.
        :returns: whatever the tree builder's ``start`` returns.
        """
        assert isinstance(tag, str)
        # Once an element has started, the re-encoding fallback in feed is no
        # longer used, so stop buffering the raw input.
        self._fed_data = None
        tag = _fixname(tag)
        attrib = OrderedDict()  # type: Dict[Union[bytes, Text], Union[bytes, Text]]
        if attrib_in:
            # Even positions are names, odd positions are their values.
            for name, value in zip(attrib_in[::2], attrib_in[1::2]):
                attrib[_fixname(name)] = value
        return self._target.start(tag, attrib)

    def _data(self, text):
        # type: (Text) -> None
        """Forward character data to the tree builder."""
        self._target.data(text)

    def _end(self, tag):
        # type: (Text) -> etree.Element
        """Forward an end tag to the tree builder and return its result."""
        return self._target.end(_fixname(tag))

    def _external(self, context, base, system_id, public_id):
        # type: (Text, Optional[Text], Optional[Text], Optional[Text]) -> bool
        """Handle an external entity reference.

        For the known XHTML/MathML public identifiers, the catalog's
        ``xhtml.dtd`` is parsed with a sub-parser created for ``context``.

        :returns: ``False`` if parsing that DTD raised ``expat.error``;
            ``True`` otherwise, including when ``public_id`` is not one of
            the known identifiers and nothing is loaded.
        """
        if public_id in self._XHTML_PUBLIC_IDS:
            parser = self._parser.ExternalEntityParserCreate(context)
            with open(join(_catalog, "xhtml.dtd"), "rb") as fp:
                try:
                    parser.ParseFile(fp)
                except expat.error:
                    return False
        return True

    def _skipped(self, name, is_parameter_entity):
        # type: (Text, bool) -> None
        """Turn a skipped entity into an "undefined entity" expat error.

        :raises expat.error: always, carrying ``code``, ``lineno`` and
            ``offset`` so that it can be wrapped like any other expat error.
        """
        err = expat.error("undefined entity %s: line %d, column %d" %
                          (name, self._parser.ErrorLineNumber,
                           self._parser.ErrorColumnNumber))
        err.code = _undefined_entity_code
        err.lineno = self._parser.ErrorLineNumber
        err.offset = self._parser.ErrorColumnNumber
        raise err

    def feed(self, data):
        # type: (bytes) -> None
        """Feed a chunk of encoded data to the parser.

        If expat refuses the document's encoding with the
        'multi-byte encodings are not supported' ``ValueError``, everything
        fed so far is decoded with the encoding recorded from the XML
        declaration, re-encoded as UTF-8, and parsed by a fresh parser
        whose state replaces this one's.

        :raises etree.ParseError: for any ``expat.error`` (via _wrap_error).
        :raises ValueError: for any other ``ValueError`` raised while
            parsing; these are propagated rather than silently dropped.
        """
        if self._fed_data is not None:
            self._fed_data.append(data)
        try:
            self._parser.Parse(data, False)
        except expat.error as v:
            _wrap_error(v)
        except ValueError as e:
            if not e.args or e.args[0] != 'multi-byte encodings are not supported':
                # Not the case the fallback exists for: do not swallow it.
                raise
            assert self._read_encoding is not None
            assert self._fed_data is not None
            xml = b"".join(self._fed_data).decode(self._read_encoding).encode("utf-8")
            new_parser = XMLParser("utf-8")
            self._parser = new_parser._parser
            self._target = new_parser._target
            # The input is now UTF-8, so no further buffering is needed.
            self._fed_data = None
            self.feed(xml)

    def close(self):
        # type: () -> etree.Element
        """Finish parsing and return the tree builder's result.

        :raises etree.ParseError: if expat reports an error while
            finalising the document (via _wrap_error).
        """
        try:
            self._parser.Parse("", True)
        except expat.error as v:
            _wrap_error(v)
        return self._target.close()
|