diff options
Diffstat (limited to 'testing/web-platform/tests/url/tools/IdnaTestV2-parser.py')
-rw-r--r-- | testing/web-platform/tests/url/tools/IdnaTestV2-parser.py | 174 |
1 files changed, 174 insertions, 0 deletions
diff --git a/testing/web-platform/tests/url/tools/IdnaTestV2-parser.py b/testing/web-platform/tests/url/tools/IdnaTestV2-parser.py new file mode 100644 index 0000000000..e4154f0ee8 --- /dev/null +++ b/testing/web-platform/tests/url/tools/IdnaTestV2-parser.py @@ -0,0 +1,174 @@ +# This script can convert IdnaTestV2.txt to JSON, accounting for the requirements in the +# URL Standard. +# +# The goal is to eventually remove --exclude-std3 and --exclude-bidi. For that we need solutions to +# these issues: +# +# * https://github.com/whatwg/url/issues/341 +# * https://github.com/whatwg/url/issues/543 +# * https://github.com/whatwg/url/issues/733 +# * https://github.com/whatwg/url/issues/744 +# +# Removal of --exclude-ipv4-like is a stretch goal also dependent upon those issues. + +import argparse +import json +import os +import re +import requests + +def get_IdnaTestV2_lines(): + IdnaTestV2 = os.path.join(os.path.dirname(__file__), "IdnaTestV2.txt") + if not os.path.exists(IdnaTestV2): + # Download IdnaTestV2.txt if it doesn't exist yet + open(IdnaTestV2, "w").write(requests.get("https://unicode.org/Public/idna/latest/IdnaTestV2.txt").text) + return open(IdnaTestV2, "r").readlines() + +def remove_escapes(input): + return json.loads("\"" + input + "\"") + +def ends_in_a_number(input): + # This method is not robust. It uses https://www.unicode.org/reports/tr46/#Notation but there + # are likely other ways to end up with a dot, e.g., through decomposition or percent-decoding. + # It also does not entirely match https://url.spec.whatwg.org/#ends-in-a-number-checker. It + # appears to suffice for the tests in question though. + parts = re.split(r"\u002E|\uFF0E|\u3002|\uFF61", input) + if not parts: + return False + if parts[-1] == "": + if len(parts) == 1: + return False + parts.pop() + return parts[-1].isascii() and parts[-1].isdigit() + +def contains_bidi_status(statuses): + for status in statuses: + if status in ["B1", "B2", "B3", "B4", "B5", "B6"]: + return True + return False + +def parse(lines, exclude_ipv4_like, exclude_std3, exclude_bidi): + # Main quest. + output = ["THIS IS A GENERATED FILE. PLEASE DO NOT MODIFY DIRECTLY. See ../tools/IdnaTestV2-parser.py instead."] + output.append(f"--exclude-ipv4-like: {exclude_ipv4_like}; --exclude-std3: {exclude_std3}; --exclude_bidi: {exclude_bidi}") + + # Side quest. + unique_statuses = [] + + for line in lines: + # Remove newlines + line = line.rstrip() + + # Remove lines that are comments or empty + if line.startswith("#") or line == "": + continue + + # Remove escapes (doesn't handle \x{XXXX} but those do not appear in the source) + line = remove_escapes(line) + + # Normalize columns + # + # Since we are only interested in ToASCII and enforce Transitional_Processing=false we care + # about the following columns: + # + # * Column 1 (source) + # * Column 4 (toAsciiN) + # * Column 5 (toAsciiNStatus) + # + # We also store Column 2 (toUnicode) to help with UseSTD3ASCIIRules exclusion. + columns = [column.strip() for column in line.split(";")] + + # Column 1 (source) and Column 2 (toUnicode; if empty, Column 1 (source)) + source = columns[0] + to_unicode = columns[1] + if to_unicode == "": + to_unicode = source + + # Immediately exclude IPv4-like tests when desired. While we could force all their + # expectations to be failure instead, it's not clear we need that many additional tests that + # were actually trying to test something else. + if exclude_ipv4_like: + if ends_in_a_number(source): + continue + + if exclude_std3: + if re.search(r"\u2260|\u226E|\u226F|\<|\>|\$|,", to_unicode): + continue + + # Column 4 (toAsciiN; if empty, use Column 2 (toUnicode)) + to_ascii = columns[3] + if to_ascii == "": + to_ascii = to_unicode + + # Column 5 (toAsciiNStatus; if empty, use Column 3 (toUnicodeStatus)) + temp_statuses = columns[4] + if temp_statuses == "": + temp_statuses = columns[2] + + statuses = [] + if temp_statuses != "": + assert temp_statuses.startswith("[") + statuses = [status.strip() for status in temp_statuses[1:-1].split(",")] + + # Side quest time. + for status in statuses: + if status not in unique_statuses: + unique_statuses.append(status) + + # The URL Standard has + # + # * UseSTD3ASCIIRules=false; however there are no tests marked U1 (some should be though) + # * CheckHyphens=false; thus ignore V2, V3? + # * VerifyDnsLength=false; thus ignore A4_1 and A4_2 + ignored_statuses = [] + for status in statuses: + if status in ["A4_1", "A4_2", "U1", "V2", "V3"]: + ignored_statuses.append(status) + for status in ignored_statuses: + statuses.remove(status) + + if exclude_bidi and contains_bidi_status(statuses): + continue + + if len(statuses) > 0: + to_ascii = None + + test = { "input": source, "output": to_ascii } + comment = "" + for status in statuses: + comment += status + "; " + for status in ignored_statuses: + comment += status + " (ignored); " + if comment != "": + test["comment"] = comment.strip()[:-1] + output.append(test) + + unique_statuses.sort() + return { "tests": output, "unique_statuses": unique_statuses } + +def to_json(data): + handle = open(os.path.join(os.path.dirname(__file__), "../resources/IdnaTestV2.json"), "w") + handle.write(json.dumps(data, sort_keys=True, allow_nan=False, indent=2, separators=(',', ': '))) + handle.write("\n") + handle.close() + +def main(): + parser = argparse.ArgumentParser(epilog="Thanks for caring about IDNA!") + parser.add_argument("--generate", action="store_true", help="Generate the JSON resource.") + parser.add_argument("--exclude-ipv4-like", action="store_true", help="Exclude inputs that end with an ASCII digit label. (Not robust, but works for current input.)") + parser.add_argument("--exclude-std3", action="store_true", help="Exclude tests impacted by UseSTD3ASCIIRules. (Not robust, but works for current input.)") + parser.add_argument("--exclude-bidi", action="store_true", help="Exclude tests impacted by CheckBidi.") + parser.add_argument("--statuses", action="store_true", help="Print the unique statuses in IdnaTestV2.txt.") + args = parser.parse_args() + + if args.generate or args.statuses: + output = parse(get_IdnaTestV2_lines(), args.exclude_ipv4_like, args.exclude_std3, args.exclude_bidi) + if args.statuses: + print(output["unique_statuses"]) + else: + assert args.generate + to_json(output["tests"]) + else: + parser.print_usage() + +main() |