summaryrefslogtreecommitdiffstats
path: root/third_party/python/compare-locales/compare_locales/parser/dtd.py
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/python/compare-locales/compare_locales/parser/dtd.py')
-rw-r--r--third_party/python/compare-locales/compare_locales/parser/dtd.py118
1 files changed, 118 insertions, 0 deletions
diff --git a/third_party/python/compare-locales/compare_locales/parser/dtd.py b/third_party/python/compare-locales/compare_locales/parser/dtd.py
new file mode 100644
index 0000000000..5f0574f488
--- /dev/null
+++ b/third_party/python/compare-locales/compare_locales/parser/dtd.py
@@ -0,0 +1,118 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import absolute_import
+from __future__ import unicode_literals
+import re
+
+try:
+ from html import unescape as html_unescape
+except ImportError:
+ from HTMLParser import HTMLParser
+ html_parser = HTMLParser()
+ html_unescape = html_parser.unescape
+
+from .base import (
+ Entity, Comment, Junk,
+ Parser
+)
+
+
+class DTDEntityMixin(object):
+ @property
+ def val(self):
+ '''Unescape HTML entities into corresponding Unicode characters.
+
+ Named (&), decimal (&), and hex (& and &) formats
+ are supported. Unknown entities are left intact.
+
+ As of Python 2.7 and Python 3.6 the following 252 named entities are
+ recognized and unescaped:
+
+ https://github.com/python/cpython/blob/2.7/Lib/htmlentitydefs.py
+ https://github.com/python/cpython/blob/3.6/Lib/html/entities.py
+ '''
+ return html_unescape(self.raw_val)
+
+ def value_position(self, offset=0):
+ # DTDChecker already returns tuples of (line, col) positions
+ if isinstance(offset, tuple):
+ line_pos, col_pos = offset
+ line, col = super(DTDEntityMixin, self).value_position()
+ if line_pos == 1:
+ col = col + col_pos
+ else:
+ col = col_pos
+ line += line_pos - 1
+ return line, col
+ else:
+ return super(DTDEntityMixin, self).value_position(offset)
+
+
+class DTDEntity(DTDEntityMixin, Entity):
+ pass
+
+
+class DTDParser(Parser):
+ # http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
+ # ":" | [A-Z] | "_" | [a-z] |
+ # [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
+ # | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
+ # [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
+ # [#x10000-#xEFFFF]
+ CharMinusDash = '\x09\x0A\x0D\u0020-\u002C\u002E-\uD7FF\uE000-\uFFFD'
+ XmlComment = '<!--(?:-?[%s])*?-->' % CharMinusDash
+ NameStartChar = ':A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF' + \
+ '\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \
+ '\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
+ # + \U00010000-\U000EFFFF seems to be unsupported in python
+
+ # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
+ # [#x0300-#x036F] | [#x203F-#x2040]
+ NameChar = NameStartChar + r'\-\.0-9' + '\xB7\u0300-\u036F\u203F-\u2040'
+ Name = '[' + NameStartChar + '][' + NameChar + ']*'
+ reKey = re.compile('<!ENTITY[ \t\r\n]+(?P<key>' + Name + ')[ \t\r\n]+'
+ '(?P<val>\"[^\"]*\"|\'[^\']*\'?)[ \t\r\n]*>',
+ re.DOTALL | re.M)
+ # add BOM to DTDs, details in bug 435002
+ reHeader = re.compile('^\ufeff')
+ reComment = re.compile('<!--(?P<val>-?[%s])*?-->' % CharMinusDash,
+ re.S)
+ rePE = re.compile('<!ENTITY[ \t\r\n]+%[ \t\r\n]+(?P<key>' + Name + ')'
+ '[ \t\r\n]+SYSTEM[ \t\r\n]+'
+ '(?P<val>\"[^\"]*\"|\'[^\']*\')[ \t\r\n]*>[ \t\r\n]*'
+ '%' + Name + ';'
+ '(?:[ \t]*(?:' + XmlComment + u'[ \t\r\n]*)*\n?)?')
+
+ class Comment(Comment):
+ @property
+ def val(self):
+ if self._val_cache is None:
+ # Strip "<!--" and "-->" to comment contents
+ self._val_cache = self.all[4:-3]
+ return self._val_cache
+
+ def getNext(self, ctx, offset):
+ '''
+ Overload Parser.getNext to special-case ParsedEntities.
+ Just check for a parsed entity if that method claims junk.
+
+ <!ENTITY % foo SYSTEM "url">
+ %foo;
+ '''
+ if offset == 0 and self.reHeader.match(ctx.contents):
+ offset += 1
+ entity = Parser.getNext(self, ctx, offset)
+ if (entity and isinstance(entity, Junk)) or entity is None:
+ m = self.rePE.match(ctx.contents, offset)
+ if m:
+ entity = DTDEntity(
+ ctx, None, None, m.span(), m.span('key'), m.span('val'))
+ return entity
+
+ def createEntity(self, ctx, m, current_comment, white_space):
+ valspan = m.span('val')
+ valspan = (valspan[0]+1, valspan[1]-1)
+ return DTDEntity(ctx, current_comment, white_space,
+ m.span(), m.span('key'), valspan)