# This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. from __future__ import absolute_import from __future__ import unicode_literals import re try: from html import unescape as html_unescape except ImportError: from HTMLParser import HTMLParser html_parser = HTMLParser() html_unescape = html_parser.unescape from .base import ( Entity, Comment, Junk, Parser ) class DTDEntityMixin(object): @property def val(self): '''Unescape HTML entities into corresponding Unicode characters. Named (&), decimal (&), and hex (& and &) formats are supported. Unknown entities are left intact. As of Python 2.7 and Python 3.6 the following 252 named entities are recognized and unescaped: https://github.com/python/cpython/blob/2.7/Lib/htmlentitydefs.py https://github.com/python/cpython/blob/3.6/Lib/html/entities.py ''' return html_unescape(self.raw_val) def value_position(self, offset=0): # DTDChecker already returns tuples of (line, col) positions if isinstance(offset, tuple): line_pos, col_pos = offset line, col = super(DTDEntityMixin, self).value_position() if line_pos == 1: col = col + col_pos else: col = col_pos line += line_pos - 1 return line, col else: return super(DTDEntityMixin, self).value_position(offset) class DTDEntity(DTDEntityMixin, Entity): pass class DTDParser(Parser): # http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar # ":" | [A-Z] | "_" | [a-z] | # [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] # | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | # [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | # [#x10000-#xEFFFF] CharMinusDash = '\x09\x0A\x0D\u0020-\u002C\u002E-\uD7FF\uE000-\uFFFD' XmlComment = '' % CharMinusDash NameStartChar = ':A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF' + \ '\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \ '\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD' # + \U00010000-\U000EFFFF seems to be unsupported in python # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | # [#x0300-#x036F] | [#x203F-#x2040] NameChar = NameStartChar + r'\-\.0-9' + '\xB7\u0300-\u036F\u203F-\u2040' Name = '[' + NameStartChar + '][' + NameChar + ']*' reKey = re.compile('' + Name + ')[ \t\r\n]+' '(?P\"[^\"]*\"|\'[^\']*\'?)[ \t\r\n]*>', re.DOTALL | re.M) # add BOM to DTDs, details in bug 435002 reHeader = re.compile('^\ufeff') reComment = re.compile('' % CharMinusDash, re.S) rePE = re.compile('' + Name + ')' '[ \t\r\n]+SYSTEM[ \t\r\n]+' '(?P\"[^\"]*\"|\'[^\']*\')[ \t\r\n]*>[ \t\r\n]*' '%' + Name + ';' '(?:[ \t]*(?:' + XmlComment + u'[ \t\r\n]*)*\n?)?') class Comment(Comment): @property def val(self): if self._val_cache is None: # Strip "" to comment contents self._val_cache = self.all[4:-3] return self._val_cache def getNext(self, ctx, offset): ''' Overload Parser.getNext to special-case ParsedEntities. Just check for a parsed entity if that method claims junk. %foo; ''' if offset == 0 and self.reHeader.match(ctx.contents): offset += 1 entity = Parser.getNext(self, ctx, offset) if (entity and isinstance(entity, Junk)) or entity is None: m = self.rePE.match(ctx.contents, offset) if m: entity = DTDEntity( ctx, None, None, m.span(), m.span('key'), m.span('val')) return entity def createEntity(self, ctx, m, current_comment, white_space): valspan = m.span('val') valspan = (valspan[0]+1, valspan[1]-1) return DTDEntity(ctx, current_comment, white_space, m.span(), m.span('key'), valspan)