summaryrefslogtreecommitdiffstats
path: root/testing/web-platform/tests/css/tools/w3ctestlib/HTMLSerializer.py
diff options
context:
space:
mode:
Diffstat (limited to 'testing/web-platform/tests/css/tools/w3ctestlib/HTMLSerializer.py')
-rw-r--r--testing/web-platform/tests/css/tools/w3ctestlib/HTMLSerializer.py277
1 files changed, 277 insertions, 0 deletions
diff --git a/testing/web-platform/tests/css/tools/w3ctestlib/HTMLSerializer.py b/testing/web-platform/tests/css/tools/w3ctestlib/HTMLSerializer.py
new file mode 100644
index 0000000000..7f73bc17ec
--- /dev/null
+++ b/testing/web-platform/tests/css/tools/w3ctestlib/HTMLSerializer.py
@@ -0,0 +1,277 @@
+#!/usr/bin/python
+# CSS Test Source Manipulation Library
+# Initial code by fantasai, joint copyright 2010 W3C and Microsoft
+# additions by peter.linss@hp.com copyright 2013 Hewlett-Packard
+# Licensed under BSD 3-Clause: <http://www.w3.org/Consortium/Legal/2008/03-bsd-license>
+
+import lxml
+from lxml import etree
+import htmlentitydefs
+import copy
+
+
class HTMLSerializer(object):
    """Serialize an lxml tree as HTML or XHTML markup.

    The public entry points are serializeHTML() and serializeXHTML(); both
    walk the tree by hand (rather than using lxml's own serializer) so that
    void elements, raw-text elements, attribute quoting, invisible
    characters and the doctype are written in a controlled way.

    Instance state:
      mOutput -- the unicode text accumulated by the current serialization
      mXHTML  -- True while producing XHTML, False while producing HTML
    """

    gXMLns = 'http://www.w3.org/XML/1998/namespace'
    gXMLNSns = 'http://www.w3.org/2000/xmlns/'
    gHTMLns = 'http://www.w3.org/1999/xhtml'

    # Namespace URI -> prefix, used when the caller supplies no mapping.
    # The XML namespace is bound to the reserved prefix 'xml' (it used to be
    # listed as 'xmlns', which turned xml:space into xmlns:space).
    gDefaultNamespaces = {gXMLns: 'xml',
                          gXMLNSns: 'xmlns',
                          'http://www.w3.org/1999/xlink': 'xlink'}

    # HTML elements written without content or end tag.
    gVoidElements = frozenset((
        'base',
        'command',
        'event-source',
        'link',
        'meta',
        'hr',
        'br',
        'img',
        'embed',
        'param',
        'area',
        'col',
        'input',
        'source',
        'track',
        'wbr',
        'keygen'
    ))

    # HTML elements whose text is written unescaped in HTML mode.
    gCDataElements = frozenset((
        'style',
        'script'
    ))

    # Code points written as character references so they stay visible in
    # the output.  0x9, 0xA and 0xD are not part of the set.
    # list(range(...)) keeps the concatenation valid when range() is lazy.
    gInvisibleChars = frozenset(
        # ASCII control chars
        list(range(0x0, 0x9)) + list(range(0xB, 0xD)) + list(range(0xE, 0x20)) +
        # Other control chars
        # fixed-width spaces, zero-width marks, bidi marks
        list(range(0x2000, 0x2010)) +
        # LS, PS, bidi control codes
        list(range(0x2028, 0x2030)) +
        # nbsp, mathsp, ideosp, WJ, interlinear
        [0x00A0, 0x205F, 0x3000, 0x2060, 0xFFF9, 0xFFFA, 0xFFFB]
    )

    # Code points escaped in text content: the invisible ones plus & < >.
    gXMLEscapes = gInvisibleChars | frozenset((ord('&'), ord('<'), ord('>')))

    # The only named entities usable in XHTML output.
    gXMLEntityNames = {'"': 'quot', '&': 'amp', "'": 'apos', '<': 'lt', '>': 'gt'}

    # Doctype name -> declaration text.
    # NOTE(review): the 'svg11' key carries the SVG 1.1 *Basic* doctype;
    # confirm that is intended before relying on it.
    gDocTypes = {
        'html': '<!DOCTYPE html>',
        'html4':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">',
        'html4-transitional':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">',
        'html4-frameset':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd">',
        'svg11':
            '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1 Basic//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd">',
        'svg11-tiny':
            '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1 Tiny//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-tiny.dtd">',
        'xhtml10':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">',
        'xhtml10-transitional':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
        'xhtml10-frameset':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">',
        'xhtml11':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">',
        'xhtml-basic11':
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML Basic 1.1//EN" "http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd">'
    }

    # Doctype substitutions applied when the document's own doctype is
    # recognised but belongs to the other output flavour.
    _gXHTMLEquivalents = {
        'html': 'xhtml10',
        'html4': 'xhtml10',
        'html4-transitional': 'xhtml10-transitional',
        'html4-frameset': 'xhtml10-frameset'
    }
    _gHTMLEquivalents = {
        'xhtml10': 'html4',
        'xhtml10-transitional': 'html4-transitional',
        'xhtml10-frameset': 'html4-frameset',
        'xhtml11': 'html4'
    }

    def __init__(self):
        self._reset()

    def _reset(self, xhtml=False):
        """Clear the output buffer and select HTML (default) or XHTML mode."""
        self.mOutput = u''
        self.mXHTML = xhtml

    def _output(self, *args):
        """Append every argument, converted to unicode, to the output."""
        self.mOutput += u''.join([unicode(arg) for arg in args])

    def _charReference(self, char):
        """Return the entity or numeric character reference for char.

        XHTML mode only uses the five predefined XML entities; HTML mode
        uses any name known to htmlentitydefs.  Characters without a name
        become hexadecimal references.
        """
        if self.mXHTML:
            name = self.gXMLEntityNames.get(char)
        else:
            name = htmlentitydefs.codepoint2name.get(ord(char))
        if name:
            return u'&%s;' % name
        return u'&#x%X;' % ord(char)

    def _escape(self, text, escapeChars):
        """Return text with every character whose code point is in
        escapeChars replaced by a character reference.

        The text is returned untouched when nothing needs escaping (the
        common case).  Replacement happens in a single pass over the
        original text, so the '&' of a reference that was just inserted can
        never be escaped a second time.
        """
        replacements = {}
        for char in text:
            if (char not in replacements) and (ord(char) in escapeChars):
                replacements[char] = self._charReference(char)
        if not replacements:
            return text
        return u''.join([replacements.get(char, char) for char in text])

    def _escapeXML(self, text):
        """Escape markup-significant and invisible characters in text."""
        return self._escape(text, self.gXMLEscapes)

    def _escapeInvisible(self, text):
        """Escape only the invisible characters in text."""
        return self._escape(text, self.gInvisibleChars)

    def _scopedNamespaces(self, attrs, namespacePrefixes):
        """Return a copy of namespacePrefixes extended with the namespace
        declarations found among attrs (XHTML mode only).

        A copy is made so that declarations only affect the element being
        serialized and the children it passes the mapping to.
        """
        scoped = copy.copy(namespacePrefixes)
        for attr, value in attrs:
            attrQName = etree.QName(attr)
            if 'xmlns' == attrQName.localname:
                # default namespace declaration: no prefix
                scoped[value] = ''
            elif self.gXMLNSns == attrQName.namespace:
                # xmlns:prefix="uri"; the local name is the prefix
                scoped[value] = attrQName.localname
        return scoped

    def _attributeName(self, attrQName, namespacePrefixes):
        """Return the attribute name to write for attrQName."""
        if (attrQName.namespace == self.gXMLns) and ('lang' == attrQName.localname):
            return 'xml:lang' if self.mXHTML else 'lang'
        prefix = None
        if attrQName.namespace:
            # .get(): a namespace without a known prefix falls back to the
            # bare local name instead of raising KeyError
            prefix = namespacePrefixes.get(attrQName.namespace)
        if prefix:
            return prefix + ':' + attrQName.localname
        return attrQName.localname

    def _quoteAttribute(self, value):
        """Return value escaped and wrapped in quotes.

        Single quotes are used when the value contains double quotes but no
        single quotes; otherwise double quotes are used and any embedded
        double quote becomes &quot;.  '<' is only escaped in XHTML mode.
        """
        value = value.replace('&', '&amp;')
        if self.mXHTML:
            value = value.replace('<', '&lt;')
        if ('"' in value) and ("'" not in value):
            quote = "'"
        else:
            quote = '"'
            value = value.replace('"', '&quot;')
        return quote + self._escapeInvisible(value) + quote

    def _serializeElement(self, element, namespacePrefixes):
        """Write element, its attributes, content, end tag and tail text.

        namespacePrefixes maps namespace URIs to prefixes; when empty or
        None the class defaults are used.  Void HTML elements are written
        without content or end tag.
        """
        qName = etree.QName(element)
        attrs = element.attrib.items()  # in tree order

        if not namespacePrefixes:
            namespacePrefixes = self.gDefaultNamespaces
        if self.mXHTML:
            namespacePrefixes = self._scopedNamespaces(attrs, namespacePrefixes)

        # The same qualified name is used for the start and the end tag so
        # that prefixed elements stay well formed.
        prefix = None
        if self.mXHTML and qName.namespace:
            prefix = namespacePrefixes.get(qName.namespace)
        if prefix:
            tagName = prefix + ':' + qName.localname
        else:
            tagName = qName.localname

        self._output('<', tagName)
        for attr, value in attrs:
            name = self._attributeName(etree.QName(attr), namespacePrefixes)
            self._output(' ', name, '=', self._quoteAttribute(value))

        isHTML = (qName.namespace == self.gHTMLns)
        if isHTML and (qName.localname in self.gVoidElements):
            self._output(' />' if self.mXHTML else '>')
        else:
            self._output('>')

            if element.text is not None:
                if isHTML and (qName.localname in self.gCDataElements) and not self.mXHTML:
                    # style/script content is raw text in HTML
                    self._output(element.text)
                else:
                    # in XHTML this could also be a <![CDATA[...]]> section
                    self._output(self._escapeXML(element.text))

            for child in element:
                self._serializeNode(child, namespacePrefixes)

            self._output('</', tagName, '>')

        if element.tail is not None:
            self._output(self._escapeXML(element.tail))

    def _serializeEntity(self, entity):
        """Write an entity reference node verbatim, then its tail text."""
        self._output(entity.text)
        if entity.tail is not None:
            self._output(self._escapeXML(entity.tail))

    def _serializePI(self, pi):
        """Write a processing instruction (XHTML only), then its tail text.

        Raises Exception in HTML mode.
        """
        if not self.mXHTML:
            raise Exception("Processing Instructions can't be converted to HTML")
        self._output('<?', pi.target, ' ', pi.text, '?>')
        if pi.tail is not None:
            self._output(self._escapeXML(pi.tail))

    def _serializeComment(self, comment):
        """Write a comment node, then its tail text."""
        self._output('<!--', comment.text, '-->')  # XXX escape comment?
        if comment.tail is not None:
            self._output(self._escapeXML(comment.tail))

    def _serializeNode(self, node, namespacePrefixes=None):
        """Dispatch node to the serializer matching its lxml node type.

        Anything that is not an entity, processing instruction or comment
        is treated as an element.
        """
        if isinstance(node, etree._Entity):
            self._serializeEntity(node)
        elif isinstance(node, etree._ProcessingInstruction):
            self._serializePI(node)
        elif isinstance(node, etree._Comment):
            self._serializeComment(node)
        else:
            self._serializeElement(node, namespacePrefixes)

    def _serializeTree(self, tree):
        """Write the root element together with its preceding and following
        sibling nodes."""
        root = tree.getroot()
        # The preceding siblings are collected and reversed before being
        # written, then the root, then the following siblings.
        for node in reversed(list(root.itersiblings(preceding=True))):
            self._serializeNode(node)
        self._serializeNode(root)
        for node in root.itersiblings():
            self._serializeNode(node)

    def _serializeDoctype(self, tree, doctype, default):
        """Write the doctype declaration followed by a newline.

        doctype -- key into gDocTypes; when given it is used as is
                   (KeyError for an unknown key)
        default -- key used when the tree carries no doctype of its own

        Without an explicit doctype, the tree's own doctype is matched
        case-insensitively against gDocTypes.  A recognised doctype of the
        other flavour is swapped for its counterpart; an unrecognised one
        is written unchanged.
        """
        if doctype:
            self._output(self.gDocTypes[doctype], '\n')
            return

        docinfo = getattr(tree, 'docinfo', None)
        declared = docinfo.doctype if docinfo else None
        if not declared:
            self._output(self.gDocTypes[default], '\n')
            return

        wanted = declared.lower()
        known = None
        for name, text in self.gDocTypes.items():
            if text.lower() == wanted:
                known = name
                break

        if self.mXHTML:
            known = self._gXHTMLEquivalents.get(known, known)
        else:
            known = self._gHTMLEquivalents.get(known, known)

        if known:
            self._output(self.gDocTypes[known], '\n')
        else:
            self._output(declared, '\n')

    def serializeHTML(self, tree, doctype=None):
        """Return tree serialized as HTML text.

        doctype, when given, is a key of gDocTypes; otherwise the tree's
        doctype is used, falling back to 'html'.
        """
        self._reset()
        self._serializeDoctype(tree, doctype, 'html')
        self._serializeTree(tree)
        return self.mOutput

    def serializeXHTML(self, tree, doctype=None):
        """Return tree serialized as XHTML text.

        doctype, when given, is a key of gDocTypes; otherwise the tree's
        doctype is used, falling back to 'xhtml11'.
        """
        self._reset(True)
        # XXX '<!xml ...' ??
        self._serializeDoctype(tree, doctype, 'xhtml11')
        self._serializeTree(tree)
        return self.mOutput
+
+