summary | refs | log | tree | commit | diff | stats
path: root/testing/web-platform/tests/url/tools
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 17:32:43 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 17:32:43 +0000
commit: 6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch)
tree: a68f146d7fa01f0134297619fbe7e33db084e0aa /testing/web-platform/tests/url/tools
parent: Initial commit. (diff)
downloadthunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz
thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip
Adding upstream version 1:115.7.0.upstream/1%115.7.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'testing/web-platform/tests/url/tools')
-rw-r--r--testing/web-platform/tests/url/tools/IdnaTestV2-parser.py174
1 files changed, 174 insertions, 0 deletions
diff --git a/testing/web-platform/tests/url/tools/IdnaTestV2-parser.py b/testing/web-platform/tests/url/tools/IdnaTestV2-parser.py
new file mode 100644
index 0000000000..e4154f0ee8
--- /dev/null
+++ b/testing/web-platform/tests/url/tools/IdnaTestV2-parser.py
@@ -0,0 +1,174 @@
+# This script can convert IdnaTestV2.txt to JSON, accounting for the requirements in the
+# URL Standard.
+#
+# The goal is to eventually remove --exclude-std3 and --exclude-bidi. For that we need solutions to
+# these issues:
+#
+# * https://github.com/whatwg/url/issues/341
+# * https://github.com/whatwg/url/issues/543
+# * https://github.com/whatwg/url/issues/733
+# * https://github.com/whatwg/url/issues/744
+#
+# Removal of --exclude-ipv4-like is a stretch goal also dependent upon those issues.
+
+import argparse
+import json
+import os
+import re
+import requests
+
def get_IdnaTestV2_lines():
    """Return the lines of IdnaTestV2.txt, downloading the file first if needed.

    The file is looked up next to this script. When it is missing it is fetched
    from unicode.org and cached at that location for subsequent runs.

    Returns:
        A list of strings, one per line, each still carrying its line ending.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the cached file cannot be written or read.
    """
    path = os.path.join(os.path.dirname(__file__), "IdnaTestV2.txt")
    if not os.path.exists(path):
        # Download IdnaTestV2.txt if it doesn't exist yet
        response = requests.get("https://unicode.org/Public/idna/latest/IdnaTestV2.txt", timeout=60)
        # Do not cache an HTTP error page as if it were the test data.
        response.raise_for_status()
        # Store the raw bytes so the file is not re-encoded through a guessed charset.
        with open(path, "wb") as handle:
            handle.write(response.content)
    # Read explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(path, "r", encoding="utf-8") as handle:
        return handle.readlines()
+
def remove_escapes(input):
    r"""Decode the backslash escapes found in an IdnaTestV2.txt line.

    ``\uXXXX`` escapes (including surrogate pairs) are decoded by treating the
    text as the body of a JSON string. ``\x{XXXX}`` escapes are not valid JSON,
    so they are first rewritten into the equivalent ``\uXXXX`` form.

    Args:
        input: The text to decode. It must not contain an unescaped double
            quote or a backslash sequence that JSON does not understand.

    Returns:
        The decoded string.

    Raises:
        ValueError: If a ``\x{...}`` escape is above U+10FFFF, or if the text
            is not a valid JSON string body (json.JSONDecodeError is a
            ValueError subclass).
    """
    def to_json_escape(match):
        digits = match.group(1)
        if digits is None:
            # Some other backslash sequence (e.g. \uXXXX or \\): leave it for JSON.
            return match.group(0)
        code_point = int(digits, 16)
        if code_point > 0x10FFFF:
            raise ValueError(f"Escape is not a Unicode code point: {match.group(0)}")
        if code_point > 0xFFFF:
            # JSON can only express astral code points as a surrogate pair.
            offset = code_point - 0x10000
            return "\\u%04X\\u%04X" % (0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF))
        return "\\u%04X" % code_point

    # Consume every backslash together with the character after it in a single
    # left-to-right pass, so that an escaped backslash followed by "x{...}" is
    # not mistaken for a \x{...} escape.
    normalized = re.sub(r"\\(?:x\{([0-9A-Fa-f]+)\}|.)", to_json_escape, input)
    return json.loads("\"" + normalized + "\"")
+
def ends_in_a_number(input):
    """Report whether the last non-empty label of *input* is all ASCII digits.

    This method is not robust. It uses https://www.unicode.org/reports/tr46/#Notation
    to find label separators, but there are likely other ways to end up with a
    dot, e.g., through decomposition or percent-decoding. It also does not
    entirely match https://url.spec.whatwg.org/#ends-in-a-number-checker. It
    appears to suffice for the tests in question though.

    Args:
        input: The domain string to inspect.

    Returns:
        True if the final label (ignoring a single trailing separator) is
        non-empty and consists only of ASCII digits, otherwise False.
    """
    # Separators: U+002E, U+FF0E, U+3002 and U+FF61.
    labels = re.split(r"[\u002E\uFF0E\u3002\uFF61]", input)
    # re.split() always yields at least one element, so labels[-1] is safe.
    if labels[-1] == "":
        if len(labels) == 1:
            # The whole input was empty.
            return False
        # Ignore a single trailing separator.
        labels.pop()
    last = labels[-1]
    # isascii() keeps non-ASCII digits (which isdigit() accepts) from counting.
    return last.isascii() and last.isdigit()
+
def contains_bidi_status(statuses):
    """Report whether any status in *statuses* is one of the codes B1 to B6.

    Args:
        statuses: An iterable of status code strings.

    Returns:
        True if at least one of "B1" through "B6" is present, otherwise False.
    """
    bidi_statuses = ("B1", "B2", "B3", "B4", "B5", "B6")
    return any(status in bidi_statuses for status in statuses)
+
def parse(lines, exclude_ipv4_like, exclude_std3, exclude_bidi):
    """Convert the lines of IdnaTestV2.txt into ToASCII test data.

    Args:
        lines: Iterable of lines from IdnaTestV2.txt (line endings allowed).
        exclude_ipv4_like: Skip tests whose source ends in a number according
            to ends_in_a_number().
        exclude_std3: Skip tests whose toUnicode value contains one of the
            characters U+2260, U+226E, U+226F, "<", ">", "$" or ",".
        exclude_bidi: Skip tests that still carry a B1-B6 status after the
            ignored statuses have been removed.

    Returns:
        A dict with two keys:
          * "tests": a list starting with two header strings (a "generated
            file" notice and a record of the exclusion flags) followed by one
            dict per test with "input", "output" (None when a failure is
            expected) and an optional "comment" listing the statuses.
          * "unique_statuses": a sorted list of every status seen on the
            lines that were not skipped by the IPv4-like or STD3 exclusions.

    Raises:
        ValueError: If a data line has fewer than five columns or a status
            column that is not wrapped in square brackets.
    """
    # Main quest.
    output = [
        "THIS IS A GENERATED FILE. PLEASE DO NOT MODIFY DIRECTLY. See ../tools/IdnaTestV2-parser.py instead.",
        # The label matches the real command-line flag name, --exclude-bidi.
        f"--exclude-ipv4-like: {exclude_ipv4_like}; --exclude-std3: {exclude_std3}; --exclude-bidi: {exclude_bidi}",
    ]

    # Side quest.
    unique_statuses = set()

    # The URL Standard has
    #
    # * UseSTD3ASCIIRules=false; however there are no tests marked U1 (some should be though)
    # * CheckHyphens=false; thus ignore V2, V3?
    # * VerifyDnsLength=false; thus ignore A4_1 and A4_2
    ignorable_statuses = ("A4_1", "A4_2", "U1", "V2", "V3")

    # Characters that mark a test as impacted by UseSTD3ASCIIRules.
    std3_pattern = re.compile(r"[\u2260\u226E\u226F<>$,]")

    for line in lines:
        # Remove newlines
        line = line.rstrip()

        # Remove lines that are comments or empty
        if line.startswith("#") or line == "":
            continue

        # Remove escapes
        line = remove_escapes(line)

        # Normalize columns
        #
        # Since we are only interested in ToASCII and enforce Transitional_Processing=false we care
        # about the following columns:
        #
        # * Column 1 (source)
        # * Column 4 (toAsciiN)
        # * Column 5 (toAsciiNStatus)
        #
        # We also store Column 2 (toUnicode) to help with UseSTD3ASCIIRules exclusion.
        columns = [column.strip() for column in line.split(";")]
        if len(columns) < 5:
            raise ValueError(f"Expected at least 5 columns, got {len(columns)}: {line!r}")

        # Column 1 (source) and Column 2 (toUnicode; if empty, Column 1 (source))
        source = columns[0]
        to_unicode = columns[1] or source

        # Immediately exclude IPv4-like tests when desired. While we could force all their
        # expectations to be failure instead, it's not clear we need that many additional tests that
        # were actually trying to test something else.
        if exclude_ipv4_like and ends_in_a_number(source):
            continue

        if exclude_std3 and std3_pattern.search(to_unicode):
            continue

        # Column 4 (toAsciiN; if empty, use Column 2 (toUnicode))
        to_ascii = columns[3] or to_unicode

        # Column 5 (toAsciiNStatus; if empty, use Column 3 (toUnicodeStatus))
        raw_statuses = columns[4] or columns[2]

        statuses = []
        if raw_statuses:
            if not (raw_statuses.startswith("[") and raw_statuses.endswith("]")):
                raise ValueError(f"Status column is not a bracketed list: {raw_statuses!r}")
            statuses = [status.strip() for status in raw_statuses[1:-1].split(",")]

        # Side quest time.
        unique_statuses.update(statuses)

        # Split off the statuses that do not apply under the URL Standard.
        ignored_statuses = [status for status in statuses if status in ignorable_statuses]
        statuses = [status for status in statuses if status not in ignorable_statuses]

        if exclude_bidi and contains_bidi_status(statuses):
            continue

        # Any remaining status means ToASCII is expected to fail.
        if statuses:
            to_ascii = None

        test = {"input": source, "output": to_ascii}
        comment_parts = statuses + [status + " (ignored)" for status in ignored_statuses]
        if comment_parts:
            test["comment"] = "; ".join(comment_parts)
        output.append(test)

    return {"tests": output, "unique_statuses": sorted(unique_statuses)}
+
def to_json(data):
    """Write *data* as pretty-printed JSON to ../resources/IdnaTestV2.json.

    The path is resolved relative to the directory containing this script.
    Object keys are sorted and the file ends with a newline.

    Args:
        data: A JSON-serializable value.

    Raises:
        ValueError: If *data* contains NaN or infinity (allow_nan=False).
        TypeError: If *data* contains a value that json cannot serialize.
        OSError: If the output file cannot be opened or written.
    """
    path = os.path.join(os.path.dirname(__file__), "../resources/IdnaTestV2.json")
    # Serialize before opening so a serialization error cannot leave a truncated file behind.
    serialized = json.dumps(data, sort_keys=True, allow_nan=False, indent=2, separators=(',', ': '))
    # The context manager closes the handle even if a write fails.
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(serialized)
        handle.write("\n")
+
def main():
    """Command-line entry point.

    With --statuses, print the unique statuses found in IdnaTestV2.txt.
    Otherwise, with --generate, write the JSON resource. If both are given,
    --statuses takes precedence and nothing is written. With neither, print
    the usage line.
    """
    parser = argparse.ArgumentParser(epilog="Thanks for caring about IDNA!")
    parser.add_argument("--generate", action="store_true", help="Generate the JSON resource.")
    parser.add_argument("--exclude-ipv4-like", action="store_true", help="Exclude inputs that end with an ASCII digit label. (Not robust, but works for current input.)")
    parser.add_argument("--exclude-std3", action="store_true", help="Exclude tests impacted by UseSTD3ASCIIRules. (Not robust, but works for current input.)")
    parser.add_argument("--exclude-bidi", action="store_true", help="Exclude tests impacted by CheckBidi.")
    parser.add_argument("--statuses", action="store_true", help="Print the unique statuses in IdnaTestV2.txt.")
    args = parser.parse_args()

    if not (args.generate or args.statuses):
        parser.print_usage()
        return

    output = parse(get_IdnaTestV2_lines(), args.exclude_ipv4_like, args.exclude_std3, args.exclude_bidi)
    if args.statuses:
        print(output["unique_statuses"])
    else:
        # Only reachable when --generate was given.
        to_json(output["tests"])


# Only run when executed as a script, so importing this module has no side effects.
if __name__ == "__main__":
    main()