131 lines
4.9 KiB
Python
131 lines
4.9 KiB
Python
from collections import OrderedDict
|
|
from typing import Dict, List, Optional, Text, Union
|
|
from os.path import dirname, join
|
|
from xml.parsers import expat
|
|
import xml.etree.ElementTree as etree # noqa: N813
|
|
|
|
|
|
_catalog = join(dirname(__file__), "catalog")
|
|
|
|
def _wrap_error(e: expat.error) -> etree.ParseError:
|
|
err = etree.ParseError(e)
|
|
err.code = e.code
|
|
err.position = e.lineno, e.offset
|
|
raise err
|
|
|
|
_names: Dict[Text, Text] = {}
|
|
def _fixname(key: Text) -> Text:
|
|
try:
|
|
name = _names[key]
|
|
except KeyError:
|
|
name = key
|
|
if "}" in name:
|
|
name = "{" + name
|
|
_names[key] = name
|
|
return name
|
|
|
|
|
|
_undefined_entity_code: int = expat.errors.codes[expat.errors.XML_ERROR_UNDEFINED_ENTITY]
|
|
|
|
|
|
class XMLParser:
|
|
"""
|
|
An XML parser with support for XHTML DTDs and all Python-supported encodings
|
|
|
|
This implements the API defined by
|
|
xml.etree.ElementTree.XMLParser, but supports XHTML DTDs
|
|
(therefore allowing XHTML entities) and supports all encodings
|
|
Python does, rather than just those supported by expat.
|
|
"""
|
|
def __init__(self, encoding: Optional[Text] = None) -> None:
|
|
self._parser = expat.ParserCreate(encoding, "}")
|
|
self._target = etree.TreeBuilder()
|
|
# parser settings
|
|
self._parser.buffer_text = True
|
|
self._parser.ordered_attributes = True
|
|
self._parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
|
|
# parser callbacks
|
|
self._parser.XmlDeclHandler = self._xml_decl
|
|
self._parser.StartElementHandler = self._start
|
|
self._parser.EndElementHandler = self._end
|
|
self._parser.CharacterDataHandler = self._data
|
|
self._parser.ExternalEntityRefHandler = self._external
|
|
self._parser.SkippedEntityHandler = self._skipped
|
|
# used for our horrible re-encoding hack
|
|
self._fed_data: Optional[List[bytes]] = []
|
|
self._read_encoding: Optional[Text] = None
|
|
|
|
def _xml_decl(self, version: Text, encoding: Optional[Text], standalone: int) -> None:
|
|
self._read_encoding = encoding
|
|
|
|
def _start(self, tag: Text, attrib_in: List[str]) -> etree.Element:
|
|
assert isinstance(tag, str)
|
|
self._fed_data = None
|
|
tag = _fixname(tag)
|
|
attrib: Dict[Union[bytes, Text], Union[bytes, Text]] = OrderedDict()
|
|
if attrib_in:
|
|
for i in range(0, len(attrib_in), 2):
|
|
attrib[_fixname(attrib_in[i])] = attrib_in[i+1]
|
|
return self._target.start(tag, attrib)
|
|
|
|
def _data(self, text: Text) -> None:
|
|
self._target.data(text)
|
|
|
|
def _end(self, tag: Text) -> etree.Element:
|
|
return self._target.end(_fixname(tag))
|
|
|
|
def _external(self, context: Text, base: Optional[Text], system_id: Optional[Text], public_id: Optional[Text]) -> bool:
|
|
if public_id in {
|
|
"-//W3C//DTD XHTML 1.0 Transitional//EN",
|
|
"-//W3C//DTD XHTML 1.1//EN",
|
|
"-//W3C//DTD XHTML 1.0 Strict//EN",
|
|
"-//W3C//DTD XHTML 1.0 Frameset//EN",
|
|
"-//W3C//DTD XHTML Basic 1.0//EN",
|
|
"-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN",
|
|
"-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN",
|
|
"-//W3C//DTD MathML 2.0//EN",
|
|
"-//WAPFORUM//DTD XHTML Mobile 1.0//EN"
|
|
}:
|
|
parser = self._parser.ExternalEntityParserCreate(context)
|
|
with open(join(_catalog, "xhtml.dtd"), "rb") as fp:
|
|
try:
|
|
parser.ParseFile(fp)
|
|
except expat.error:
|
|
return False
|
|
|
|
return True
|
|
|
|
def _skipped(self, name: Text, is_parameter_entity: bool) -> None:
|
|
err = expat.error("undefined entity %s: line %d, column %d" %
|
|
(name, self._parser.ErrorLineNumber,
|
|
self._parser.ErrorColumnNumber))
|
|
err.code = _undefined_entity_code
|
|
err.lineno = self._parser.ErrorLineNumber
|
|
err.offset = self._parser.ErrorColumnNumber
|
|
raise err
|
|
|
|
def feed(self, data: bytes) -> None:
|
|
if self._fed_data is not None:
|
|
self._fed_data.append(data)
|
|
try:
|
|
self._parser.Parse(data, False)
|
|
except expat.error as v:
|
|
_wrap_error(v)
|
|
except ValueError as e:
|
|
if e.args[0] == 'multi-byte encodings are not supported':
|
|
assert self._read_encoding is not None
|
|
assert self._fed_data is not None
|
|
xml = b"".join(self._fed_data).decode(self._read_encoding).encode("utf-8")
|
|
new_parser = XMLParser("utf-8")
|
|
self._parser = new_parser._parser
|
|
self._target = new_parser._target
|
|
self._fed_data = None
|
|
self.feed(xml)
|
|
|
|
def close(self) -> etree.Element:
|
|
try:
|
|
self._parser.Parse("", True)
|
|
except expat.error as v:
|
|
_wrap_error(v)
|
|
tree = self._target.close()
|
|
return tree
|