"""Regenerate the HTML parser tests from the html5lib tree-construction data.

The script installs a pinned html5lib-python checkout into the current
environment, reads its ``tree-construction`` ``.dat`` files and renders one
``.html`` test file per data file (plus one per data file for fragment /
innerHTML cases) into ``TESTS_PATH`` inside the repository.
"""

import glob
import hashlib
import itertools
import json
import os
import re
import shutil
import site
import subprocess
import sys
import tempfile
import urllib
# ``import urllib`` alone does not guarantee the ``parse`` submodule is loaded.
import urllib.parse

from importlib import reload

import genshi
from genshi.template import MarkupTemplate

TESTS_PATH = "html/syntax/parsing/"


def get_paths():
    """Return ``(script_dir, tests_dir)``.

    ``script_dir`` is the directory containing this file and ``tests_dir`` is
    ``TESTS_PATH`` resolved against the enclosing git checkout.

    Raises:
        RuntimeError: if no directory containing ``.git`` is found above
            this file.
    """
    script_path = os.path.dirname(os.path.abspath(__file__))
    repo_base = get_repo_base(script_path)
    if repo_base is None:
        raise RuntimeError(
            "Could not find a git checkout containing %s" % script_path)
    tests_path = os.path.join(repo_base, TESTS_PATH)
    return script_path, tests_path


def get_repo_base(path):
    """Walk up from *path* and return the first directory containing ``.git``.

    Returns ``None`` when the filesystem root (or an empty path) is reached
    without finding one.
    """
    while path:
        if os.path.exists(os.path.join(path, ".git")):
            return path
        parent = os.path.dirname(path)
        if parent == path:
            # dirname() of the filesystem root is the root itself; without
            # this check the loop would never terminate.
            break
        path = parent
    return None


def get_expected(data):
    """Return the expected tree dump for a test: *data* under ``#document``."""
    return "#document\n" + data


def get_hash(data, container=None):
    """Return a SHA-1 hex digest identifying a (container, data) test case.

    A missing *container* is hashed as the empty string.
    """
    if container is None:
        container = ""
    return hashlib.sha1(
        b"#container%s#data%s" % (container.encode("utf8"),
                                  data.encode("utf8"))).hexdigest()


class Html5libInstall:
    """Context manager that installs html5lib-python for the enclosed block.

    On entry it clones html5lib-python into a temporary directory, checks out
    *rev* (default ``origin/master``), initialises submodules, ``pip``-installs
    the checkout in editable mode and then checks out *tests_rev* (default
    ``origin/master``) in the bundled test-data directory.  On exit the
    package is uninstalled and the temporary directory removed.
    """

    def __init__(self, rev=None, tests_rev=None):
        # TemporaryDirectory object while the context is active, else None.
        self.html5lib_dir = None
        self.rev = rev
        self.tests_rev = tests_rev

    def __enter__(self):
        self.html5lib_dir = tempfile.TemporaryDirectory()
        html5lib_path = self.html5lib_dir.__enter__()
        html5lib_python_path = os.path.join(html5lib_path, "html5lib")
        html5lib_tests_path = os.path.join(
            html5lib_python_path, "html5lib", "tests", "testdata"
        )

        subprocess.check_call(
            [
                "git",
                "clone",
                "--no-checkout",
                "https://github.com/html5lib/html5lib-python.git",
                "html5lib",
            ],
            cwd=html5lib_path,
        )

        rev = self.rev if self.rev is not None else "origin/master"
        subprocess.check_call(
            ["git", "checkout", rev], cwd=html5lib_python_path
        )

        subprocess.check_call(
            [
                "git",
                "submodule",
                "update",
                "--init",
                "--recursive",
            ],
            cwd=html5lib_python_path,
        )

        subprocess.check_call(
            ["pip", "install", "-e", "html5lib"], cwd=html5lib_path)
        # Re-run site initialisation so the freshly installed editable
        # package becomes importable in this process.
        reload(site)

        tests_rev = (self.tests_rev if self.tests_rev is not None
                     else "origin/master")
        subprocess.check_call(
            ["git", "checkout", tests_rev], cwd=html5lib_tests_path)

    def __exit__(self, *args, **kwargs):
        try:
            # Best effort: the exit status of the uninstall is ignored.
            subprocess.call(["pip", "uninstall", "-y", "html5lib"],
                            cwd=self.html5lib_dir.name)
        finally:
            # Always remove the temporary checkout, even if pip could not be
            # launched at all.
            self.html5lib_dir.__exit__(*args, **kwargs)
            self.html5lib_dir = None


def make_tests(script_dir, out_dir, input_file_name, test_data):
    """Render the test files for one html5lib data file.

    Tests containing a ``script-off`` key are skipped.  Tests with a
    ``document-fragment`` key go to the innerHTML file, all others to the
    normal file.  A test whose (container, data) hash was already seen in
    this data file is skipped with a warning.

    Returns:
        ``(path_normal, path_innerHTML)``; each is the written file path or
        ``None`` when there were no tests of that kind.
    """
    tests = []
    innerHTML_tests = []
    ids_seen = {}
    print(input_file_name)
    for test in test_data:
        if "script-off" in test:
            continue
        is_innerHTML = "document-fragment" in test
        data = test["data"]
        container = test["document-fragment"] if is_innerHTML else None
        assert test["document"], test
        expected = get_expected(test["document"])
        test_list = innerHTML_tests if is_innerHTML else tests
        test_id = get_hash(data, container)
        if test_id in ids_seen:
            print("WARNING: id %s seen multiple times in file %s this time for test (%s, %s) before for test %s, skipping"%(test_id, input_file_name, container, data, ids_seen[test_id]))
            continue
        ids_seen[test_id] = (container, data)
        test_list.append({
            'string_uri_encoded_input': "\"%s\"" % urllib.parse.quote(data.encode("utf8")),
            'input': data,
            'expected': expected,
            'string_escaped_expected': json.dumps(urllib.parse.quote(expected.encode("utf8"))),
            'id': test_id,
            'container': container,
        })

    path_normal = None
    if tests:
        path_normal = write_test_file(
            script_dir, out_dir, tests,
            "html5lib_%s" % input_file_name, "html5lib_test.xml")

    path_innerHTML = None
    if innerHTML_tests:
        path_innerHTML = write_test_file(
            script_dir, out_dir, innerHTML_tests,
            "html5lib_innerHTML_%s" % input_file_name,
            "html5lib_test_fragment.xml")

    return path_normal, path_innerHTML


def write_test_file(script_dir, out_dir, tests, file_name, template_file_name):
    """Render *tests* through a Genshi template and write ``<file_name>.html``.

    The template is read from *script_dir* and the result is written into
    *out_dir*.  Returns the path of the written file.
    """
    file_name = os.path.join(out_dir, file_name + ".html")
    short_name = os.path.basename(file_name)

    with open(os.path.join(script_dir, template_file_name), "r",
              encoding="utf-8") as f:
        template = MarkupTemplate(f)

    stream = template.generate(file_name=short_name, tests=tests)

    # The rendered bytes are UTF-8, so the output file must be written as
    # UTF-8 as well instead of relying on the locale's default encoding.
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(str(stream.render('html', doctype='html5',
                                  encoding="utf8"), "utf-8"))
    return file_name


def escape_js_string(in_data):
    """Return *in_data* UTF-8 encoded and backslash-escaped, as ASCII text.

    Reproduces the Python 2 ``string-escape`` codec, which does not exist on
    Python 3: backslash, single quote, tab, newline and carriage return get
    their short escapes, and every other byte outside printable ASCII becomes
    ``\\xNN``.
    """
    # Map each UTF-8 byte to the code point of the same value so that
    # ``unicode_escape`` emits one ``\xNN`` escape per byte.
    as_bytes_text = in_data.encode("utf8").decode("latin-1")
    escaped = as_bytes_text.encode("unicode_escape").decode("ascii")
    # ``unicode_escape`` leaves quotes alone; ``string-escape`` escaped ``'``.
    return escaped.replace("'", "\\'")


def serialize_filenames(test_filenames):
    """Format *test_filenames* as a bracketed list of double-quoted strings."""
    return "[" + ",\n".join("\"%s\"" % item for item in test_filenames) + "]"


def main():
    """Install the pinned html5lib revisions and regenerate all test files.

    The revisions are read from the ``html5lib_python_revision`` and
    ``html5lib_tests_revision`` files next to this script.  When more than two
    command-line entries are present, ``sys.argv[2]`` is used as a directory
    of ``.dat`` files instead of the html5lib tree-construction data.
    """
    script_dir, out_dir = get_paths()

    test_files = []
    inner_html_files = []

    with open(os.path.join(script_dir, "html5lib_python_revision"), "r") as f:
        html5lib_rev = f.read().strip()

    with open(os.path.join(script_dir, "html5lib_tests_revision"), "r") as f:
        html5lib_tests_rev = f.read().strip()

    with Html5libInstall(html5lib_rev, html5lib_tests_rev):
        # Only importable once the editable install above has happened.
        from html5lib.tests import support

        if len(sys.argv) > 2:
            test_iterator = zip(
                itertools.repeat(False),
                sorted(os.path.abspath(item) for item in
                       glob.glob(os.path.join(sys.argv[2], "*.dat"))))
        else:
            test_iterator = itertools.chain(
                zip(itertools.repeat(False),
                    sorted(support.get_data_files("tree-construction"))),
                zip(itertools.repeat(True),
                    sorted(support.get_data_files(
                        os.path.join("tree-construction", "scripted")))))

        for (scripted, test_file) in test_iterator:
            input_file_name = os.path.splitext(os.path.basename(test_file))[0]
            if scripted:
                input_file_name = "scripted_" + input_file_name
            test_data = support.TestData(test_file)
            test_filename, inner_html_file_name = make_tests(
                script_dir, out_dir, input_file_name, test_data)
            if test_filename is not None:
                test_files.append(test_filename)
            if inner_html_file_name is not None:
                inner_html_files.append(inner_html_file_name)


if __name__ == "__main__":
    main()