# mako/filters.py # Copyright 2006-2020 the Mako authors and contributors # # This module is part of Mako and is released under # the MIT License: http://www.opensource.org/licenses/mit-license.php import codecs import re from mako import compat from mako.compat import codepoint2name from mako.compat import name2codepoint from mako.compat import quote_plus from mako.compat import unquote_plus xml_escapes = { "&": "&", ">": ">", "<": "<", '"': """, # also " in html-only "'": "'", # also ' in html-only } # XXX: " is valid in HTML and XML # ' is not valid HTML, but is valid XML def legacy_html_escape(s): """legacy HTML escape for non-unicode mode.""" s = s.replace("&", "&") s = s.replace(">", ">") s = s.replace("<", "<") s = s.replace('"', """) s = s.replace("'", "'") return s try: import markupsafe html_escape = markupsafe.escape except ImportError: html_escape = legacy_html_escape def xml_escape(string): return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string) def url_escape(string): # convert into a list of octets string = string.encode("utf8") return quote_plus(string) def legacy_url_escape(string): # convert into a list of octets return quote_plus(string) def url_unescape(string): text = unquote_plus(string) if not is_ascii_str(text): text = text.decode("utf8") return text def trim(string): return string.strip() class Decode(object): def __getattr__(self, key): def decode(x): if isinstance(x, compat.text_type): return x elif not isinstance(x, compat.binary_type): return decode(str(x)) else: return compat.text_type(x, encoding=key) return decode decode = Decode() _ASCII_re = re.compile(r"\A[\x00-\x7f]*\Z") def is_ascii_str(text): return isinstance(text, str) and _ASCII_re.match(text) ################################################################ class XMLEntityEscaper(object): def __init__(self, codepoint2name, name2codepoint): self.codepoint2entity = dict( [ (c, compat.text_type("&%s;" % n)) for c, n in codepoint2name.items() ] ) self.name2codepoint = name2codepoint def escape_entities(self, text): """Replace characters with their character entity references. Only characters corresponding to a named entity are replaced. """ return compat.text_type(text).translate(self.codepoint2entity) def __escape(self, m): codepoint = ord(m.group()) try: return self.codepoint2entity[codepoint] except (KeyError, IndexError): return "&#x%X;" % codepoint __escapable = re.compile(r'["&<>]|[^\x00-\x7f]') def escape(self, text): """Replace characters with their character references. Replace characters by their named entity references. Non-ASCII characters, if they do not have a named entity reference, are replaced by numerical character references. The return value is guaranteed to be ASCII. """ return self.__escapable.sub( self.__escape, compat.text_type(text) ).encode("ascii") # XXX: This regexp will not match all valid XML entity names__. # (It punts on details involving involving CombiningChars and Extenders.) # # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef __characterrefs = re.compile( r"""& (?: \#(\d+) | \#x([\da-f]+) | ( (?!\d) [:\w] [-.:\w]+ ) ) ;""", re.X | re.UNICODE, ) def __unescape(self, m): dval, hval, name = m.groups() if dval: codepoint = int(dval) elif hval: codepoint = int(hval, 16) else: codepoint = self.name2codepoint.get(name, 0xFFFD) # U+FFFD = "REPLACEMENT CHARACTER" if codepoint < 128: return chr(codepoint) return chr(codepoint) def unescape(self, text): """Unescape character references. All character references (both entity references and numerical character references) are unescaped. """ return self.__characterrefs.sub(self.__unescape, text) _html_entities_escaper = XMLEntityEscaper(codepoint2name, name2codepoint) html_entities_escape = _html_entities_escaper.escape_entities html_entities_unescape = _html_entities_escaper.unescape def htmlentityreplace_errors(ex): """An encoding error handler. This python codecs error handler replaces unencodable characters with HTML entities, or, if no HTML entity exists for the character, XML character references:: >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace') 'The cost was €12.' """ if isinstance(ex, UnicodeEncodeError): # Handle encoding errors bad_text = ex.object[ex.start : ex.end] text = _html_entities_escaper.escape(bad_text) return (compat.text_type(text), ex.end) raise ex codecs.register_error("htmlentityreplace", htmlentityreplace_errors) # TODO: options to make this dynamic per-compilation will be added in a later # release DEFAULT_ESCAPES = { "x": "filters.xml_escape", "h": "filters.html_escape", "u": "filters.url_escape", "trim": "filters.trim", "entity": "filters.html_entities_escape", "unicode": "unicode", "decode": "decode", "str": "str", "n": "n", } if compat.py3k: DEFAULT_ESCAPES.update({"unicode": "str"}) NON_UNICODE_ESCAPES = DEFAULT_ESCAPES.copy() NON_UNICODE_ESCAPES["h"] = "filters.legacy_html_escape" NON_UNICODE_ESCAPES["u"] = "filters.legacy_url_escape"