author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-28 14:29:10 +0000
commit    2aa4a82499d4becd2284cdb482213d541b8804dd
tree      b80bf8bf13c3766139fbacc530efd0dd9d54394c /js/src/util/make_unicode.py
parent    Initial commit.
Adding upstream version 86.0.1. (upstream/86.0.1)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>

Diffstat (limited to 'js/src/util/make_unicode.py')
 -rwxr-xr-x  js/src/util/make_unicode.py  1572
 1 file changed, 1572 insertions(+), 0 deletions(-)
diff --git a/js/src/util/make_unicode.py b/js/src/util/make_unicode.py
new file mode 100755
index 0000000000..13fc354a9e
--- /dev/null
+++ b/js/src/util/make_unicode.py
@@ -0,0 +1,1572 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Based upon makeunicodedata.py
+# (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py)
+# written by Fredrik Lundh (fredrik@pythonware.com)
+#
+# Copyright (C) 2011 Tom Schuster <evilpies@gmail.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import print_function, unicode_literals
+
+import csv
+import io
+import re
+import os
+import sys
+from contextlib import closing
+from functools import partial
+from itertools import chain, tee
+from operator import is_not, itemgetter
+from zipfile import ZipFile
+
+if sys.version_info.major == 2:
+ from itertools import ifilter as filter, imap as map, izip_longest as zip_longest
+ from urllib2 import urlopen
+
+ range = xrange
+else:
+ from itertools import zip_longest
+ from urllib.request import urlopen
+
+
+class codepoint_dict(dict):
+ def name(self, code_point):
+ (_, _, name, alias) = self[code_point]
+ return "{}{}".format(name, (" (" + alias + ")" if alias else ""))
+
+ def full_name(self, code_point):
+ (_, _, name, alias) = self[code_point]
+ return "U+{:04X} {}{}".format(
+ code_point, name, (" (" + alias + ")" if alias else "")
+ )
+
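+# For example (illustrative): once UnicodeData.txt has been processed,
+# codepoint_table.full_name(0x00DF) == "U+00DF LATIN SMALL LETTER SHARP S".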
+
+# ECMAScript 2016
+# §11.2 White Space
+whitespace = [
+ # python doesn't support using control character names :(
+ 0x9, # CHARACTER TABULATION
+ 0xB, # LINE TABULATION
+ 0xC, # FORM FEED
+ ord("\N{SPACE}"),
+ ord("\N{NO-BREAK SPACE}"),
+ ord("\N{ZERO WIDTH NO-BREAK SPACE}"), # also BOM
+]
+
+# §11.3 Line Terminators
+line_terminator = [
+ 0xA, # LINE FEED
+ 0xD, # CARRIAGE RETURN
+ ord("\N{LINE SEPARATOR}"),
+ ord("\N{PARAGRAPH SEPARATOR}"),
+]
+
+# These are also part of IdentifierPart §11.6 Names and Keywords
+compatibility_identifier_part = [
+ ord("\N{ZERO WIDTH NON-JOINER}"),
+ ord("\N{ZERO WIDTH JOINER}"),
+]
+
+FLAG_SPACE = 1 << 0
+FLAG_UNICODE_ID_START = 1 << 1
+FLAG_UNICODE_ID_CONTINUE_ONLY = 1 << 2
+
+MAX_BMP = 0xFFFF
+
+public_domain = """
+/*
+ * Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/licenses/publicdomain/
+ */
+"""
+
+mpl_license = """\
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+"""
+
+warning_message = """\
+/* Generated by make_unicode.py DO NOT MODIFY */
+"""
+
+unicode_version_message = """\
+/* Unicode version: {0} */
+"""
+
+
+def read_unicode_data(unicode_data):
+ """
+    If you want to understand how this wonderful file format works, check out
+ Unicode Standard Annex #44 - Unicode Character Database
+ http://www.unicode.org/reports/tr44/
+ """
+
+ reader = csv.reader(unicode_data, delimiter=str(";"))
+
+ while True:
+ row = next(reader, None)
+ if row is None:
+ return
+ name = row[1]
+
+ # We need to expand the UAX #44 4.2.3 Code Point Range
+ if name.startswith("<") and name.endswith("First>"):
+ next_row = next(reader)
+
+ for i in range(int(row[0], 16), int(next_row[0], 16) + 1):
+ row[0] = i
+ row[1] = name[1:-8]
+
+ yield row
+ else:
+ row[0] = int(row[0], 16)
+ yield row
+
+
+def read_case_folding(case_folding):
+ """
+ File format is:
+ <code>; <status>; <mapping>; # <name>
+ """
+
+ for line in case_folding:
+ if line == "\n" or line.startswith("#"):
+ continue
+ row = line.split("; ")
+ if row[1] in ["F", "T"]:
+ continue
+ assert row[1] in ["C", "S"], "expect either (C)ommon or (S)imple case foldings"
+ code = int(row[0], 16)
+ mapping = int(row[2], 16)
+ yield (code, mapping)
+
+
+def read_derived_core_properties(derived_core_properties):
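+    # Rows look like (illustrative):
+    #   0041..005A    ; ID_Start # ...
+    #   00AA          ; ID_Start # ...
+    # Ranges are expanded so each (code point, property) pair is yielded
+    # individually.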
+ for line in derived_core_properties:
+ if line == "\n" or line.startswith("#"):
+ continue
+ row = line.split("#")[0].split(";")
+ char_range = row[0].strip()
+ char_property = row[1].strip()
+ if ".." not in char_range:
+ yield (int(char_range, 16), char_property)
+ else:
+ [start, end] = char_range.split("..")
+ for char in range(int(start, 16), int(end, 16) + 1):
+ yield (char, char_property)
+
+
+def read_special_casing(special_casing):
+ # Format:
+ # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
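+    # Illustrative rows:
+    #   00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
+    #   03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA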
+ for line in special_casing:
+ if line == "\n" or line.startswith("#"):
+ continue
+ row = line.split("#")[0].split(";")
+ code = int(row[0].strip(), 16)
+ lower = row[1].strip()
+ lower = [int(c, 16) for c in lower.split(" ")] if lower else []
+ upper = row[3].strip()
+ upper = [int(c, 16) for c in upper.split(" ")] if upper else []
+ languages = []
+ contexts = []
+ condition = row[4].strip()
+ if condition:
+ for cond in condition.split(" "):
+ if cond[0].islower():
+ languages.append(cond)
+ else:
+ contexts.append(cond)
+ yield (code, lower, upper, languages, contexts)
+
+
+def int_ranges(ints):
+ """ Yields consecutive ranges (inclusive) from integer values. """
+ (a, b) = tee(sorted(ints))
+ start = next(b)
+ for (curr, succ) in zip_longest(a, b):
+ if curr + 1 != succ:
+ yield (start, curr)
+ start = succ
+
+
+def utf16_encode(code):
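+    # Illustrative: utf16_encode(0x1F600) == (0xD83D, 0xDE00), since
+    # 0x1F600 - 0x10000 == 0xF600, 0xF600 // 1024 == 0x3D (lead offset)
+    # and 0xF600 % 1024 == 0x200 (trail offset).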
+ NonBMPMin = 0x10000
+ LeadSurrogateMin = 0xD800
+ TrailSurrogateMin = 0xDC00
+
+ lead = (code - NonBMPMin) // 1024 + LeadSurrogateMin
+ trail = ((code - NonBMPMin) % 1024) + TrailSurrogateMin
+
+ return lead, trail
+
+
+def make_non_bmp_convert_macro(out_file, name, convert_map, codepoint_table):
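+    # Emits one MACRO(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) call per
+    # contiguous run, e.g. (illustrative; exact values depend on the
+    # Unicode version):
+    #   MACRO(0x10428, 0x1044f, 0xd801, 0xdc28, 0xdc4f, -40)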
+ # Find continuous range in convert_map.
+ convert_list = []
+ entry = None
+ for code in sorted(convert_map.keys()):
+ lead, trail = utf16_encode(code)
+ converted = convert_map[code]
+ diff = converted - code
+
+ if (
+ entry
+ and code == entry["code"] + entry["length"]
+ and diff == entry["diff"]
+ and lead == entry["lead"]
+ ):
+ entry["length"] += 1
+ continue
+
+ entry = {
+ "code": code,
+ "diff": diff,
+ "length": 1,
+ "lead": lead,
+ "trail": trail,
+ }
+ convert_list.append(entry)
+
+ # Generate macro call for each range.
+ lines = []
+ comment = []
+ for entry in convert_list:
+ from_code = entry["code"]
+ to_code = entry["code"] + entry["length"] - 1
+ diff = entry["diff"]
+
+ lead = entry["lead"]
+ from_trail = entry["trail"]
+ to_trail = entry["trail"] + entry["length"] - 1
+
+ lines.append(
+ " MACRO(0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, {:d})".format(
+ from_code, to_code, lead, from_trail, to_trail, diff
+ )
+ )
+ comment.append(
+ "// {} .. {}".format(
+ codepoint_table.full_name(from_code), codepoint_table.full_name(to_code)
+ )
+ )
+
+ out_file.write("\n".join(comment))
+ out_file.write("\n")
+ out_file.write("#define FOR_EACH_NON_BMP_{}(MACRO) \\\n".format(name))
+ out_file.write(" \\\n".join(lines))
+ out_file.write("\n")
+
+
+def process_derived_core_properties(derived_core_properties):
+ id_start = set()
+ id_continue = set()
+
+ for (char, prop) in read_derived_core_properties(derived_core_properties):
+ if prop == "ID_Start":
+ id_start.add(char)
+ if prop == "ID_Continue":
+ id_continue.add(char)
+
+ return (id_start, id_continue)
+
+
+def process_unicode_data(unicode_data, derived_core_properties):
+ dummy = (0, 0, 0)
+ table = [dummy]
+ cache = {dummy: 0}
+ index = [0] * (MAX_BMP + 1)
+
+ codepoint_table = codepoint_dict()
+ test_space_table = []
+
+ non_bmp_lower_map = {}
+ non_bmp_upper_map = {}
+ non_bmp_id_start_set = {}
+ non_bmp_id_cont_set = {}
+ non_bmp_space_set = {}
+
+ (id_start, id_continue) = process_derived_core_properties(derived_core_properties)
+
+ for row in read_unicode_data(unicode_data):
+ code = row[0]
+ name = row[1]
+ category = row[2]
+ alias = row[-5]
+ uppercase = row[-3]
+ lowercase = row[-2]
+
+ if uppercase:
+ upper = int(uppercase, 16)
+ else:
+ upper = code
+
+ if lowercase:
+ lower = int(lowercase, 16)
+ else:
+ lower = code
+
+ codepoint_table[code] = (upper, lower, name, alias)
+
+ if code > MAX_BMP:
+ if code != lower:
+ non_bmp_lower_map[code] = lower
+ if code != upper:
+ non_bmp_upper_map[code] = upper
+ if category == "Zs":
+ non_bmp_space_set[code] = 1
+ test_space_table.append(code)
+ if code in id_start:
+ non_bmp_id_start_set[code] = 1
+ if code in id_continue:
+ non_bmp_id_cont_set[code] = 1
+ continue
+
+ assert lower <= MAX_BMP and upper <= MAX_BMP
+
+ flags = 0
+
+        # We combine whitespace and line terminators because in practice
+        # we don't need them separated.
+ if category == "Zs" or code in whitespace or code in line_terminator:
+ flags |= FLAG_SPACE
+ test_space_table.append(code)
+
+ # §11.6 (IdentifierStart)
+ if code in id_start:
+ flags |= FLAG_UNICODE_ID_START
+
+ # §11.6 (IdentifierPart)
+ elif code in id_continue or code in compatibility_identifier_part:
+ flags |= FLAG_UNICODE_ID_CONTINUE_ONLY
+
+ up_d = upper - code
+ low_d = lower - code
+
+ assert up_d > -65535 and up_d < 65535
+ assert low_d > -65535 and low_d < 65535
+
+ upper = up_d & 0xFFFF
+ lower = low_d & 0xFFFF
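+    # Illustrative: for U+0061 LATIN SMALL LETTER A, up_d == -32 and is
+    # stored as 0xFFE0; readers recover the mapping via
+    # (code + delta) & 0xFFFF.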
+
+ item = (upper, lower, flags)
+
+ i = cache.get(item)
+ if i is None:
+ assert item not in table
+ cache[item] = i = len(table)
+ table.append(item)
+ index[code] = i
+
+ return (
+ table,
+ index,
+ non_bmp_lower_map,
+ non_bmp_upper_map,
+ non_bmp_space_set,
+ non_bmp_id_start_set,
+ non_bmp_id_cont_set,
+ codepoint_table,
+ test_space_table,
+ )
+
+
+def process_case_folding(case_folding):
+ folding_map = {}
+ rev_folding_map = {}
+ folding_dummy = (0,)
+ folding_table = [folding_dummy]
+ folding_cache = {folding_dummy: 0}
+ folding_index = [0] * (MAX_BMP + 1)
+
+ folding_tests = []
+ folding_codes = set()
+
+ for (code, mapping) in read_case_folding(case_folding):
+ folding_map[code] = mapping
+
+ if mapping not in rev_folding_map:
+ rev_folding_map[mapping] = [code]
+ else:
+ rev_folding_map[mapping].append(code)
+
+ folding_codes.add(code)
+ folding_codes.add(mapping)
+
+ for code in sorted(folding_codes):
+ if code in folding_map:
+ folding = folding_map[code]
+ else:
+ folding = code
+
+ if code in rev_folding_map:
+ rev_folding = rev_folding_map[code]
+ elif folding in rev_folding_map:
+ rev_folding = [c for c in rev_folding_map[folding] if c != code]
+ else:
+ rev_folding = []
+
+ if folding != code or len(rev_folding):
+ item = [code]
+ if folding != code:
+ item.append(folding)
+ folding_tests.append(item + rev_folding)
+
+ if code > MAX_BMP:
+ continue
+
+ folding_d = folding - code
+
+ assert folding_d > -65535 and folding_d < 65535
+
+ folding = folding_d & 0xFFFF
+
+ item = (folding,)
+
+ i = folding_cache.get(item)
+ if i is None:
+ assert item not in folding_table
+ folding_cache[item] = i = len(folding_table)
+ folding_table.append(item)
+ folding_index[code] = i
+ return (folding_table, folding_index, folding_tests)
+
+
+def process_special_casing(special_casing, table, index):
+ # Unconditional special casing.
+ unconditional_tolower = {}
+ unconditional_toupper = {}
+
+ # Conditional special casing, language independent.
+ conditional_tolower = {}
+ conditional_toupper = {}
+
+ # Conditional special casing, language dependent.
+ lang_conditional_tolower = {}
+ lang_conditional_toupper = {}
+
+ def caseInfo(code):
+ (upper, lower, flags) = table[index[code]]
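+        # The table stores 16-bit deltas (see process_unicode_data), so
+        # adding the code point and masking with 0xFFFF recovers the
+        # mapped code unit.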
+ return ((code + lower) & 0xFFFF, (code + upper) & 0xFFFF)
+
+ for (code, lower, upper, languages, contexts) in read_special_casing(
+ special_casing
+ ):
+ assert code <= MAX_BMP, "Unexpected character outside of BMP: %s" % code
+ assert len(languages) <= 1, "Expected zero or one language ids: %s" % languages
+        assert len(contexts) <= 1, (
+            "Expected zero or one casing contexts: %s" % contexts
+        )
+
+ (default_lower, default_upper) = caseInfo(code)
+ special_lower = len(lower) != 1 or lower[0] != default_lower
+ special_upper = len(upper) != 1 or upper[0] != default_upper
+
+ # Invariant: If |code| has casing per UnicodeData.txt, then it also has
+ # casing rules in SpecialCasing.txt.
+ assert code == default_lower or len(lower) != 1 or code != lower[0]
+ assert code == default_upper or len(upper) != 1 or code != upper[0]
+
+ language = languages[0] if languages else None
+ context = contexts[0] if contexts else None
+
+ if not language and not context:
+ if special_lower:
+ unconditional_tolower[code] = lower
+ if special_upper:
+ unconditional_toupper[code] = upper
+ elif not language and context:
+ if special_lower:
+ conditional_tolower[code] = (lower, context)
+ if special_upper:
+ conditional_toupper[code] = (upper, context)
+ else:
+ if language not in lang_conditional_tolower:
+ lang_conditional_tolower[language] = {}
+ lang_conditional_toupper[language] = {}
+ if special_lower:
+ lang_conditional_tolower[language][code] = (lower, context)
+ if special_upper:
+ lang_conditional_toupper[language][code] = (upper, context)
+
+    # Certain special casing rules are inlined in jsstr.cpp; ensure these
+    # cases still match the current SpecialCasing.txt file.
+ def lowerCase(code):
+ (lower, _) = caseInfo(code)
+ return lower
+
+ def upperCase(code):
+ (_, upper) = caseInfo(code)
+ return upper
+
+ def ascii(char_dict):
+ return (ch for ch in char_dict.keys() if ch <= 0x7F)
+
+ def latin1(char_dict):
+ return (ch for ch in char_dict.keys() if ch <= 0xFF)
+
+ def is_empty(iterable):
+ return not any(True for _ in iterable)
+
+ def is_equals(iter1, iter2):
+ return all(x == y for (x, y) in zip_longest(iter1, iter2))
+
+ # Ensure no ASCII characters have special case mappings.
+ assert is_empty(ascii(unconditional_tolower))
+ assert is_empty(ascii(unconditional_toupper))
+ assert is_empty(ascii(conditional_tolower))
+ assert is_empty(ascii(conditional_toupper))
+
+ # Ensure no Latin1 characters have special lower case mappings.
+ assert is_empty(latin1(unconditional_tolower))
+ assert is_empty(latin1(conditional_tolower))
+
+ # Ensure no Latin1 characters have conditional special upper case mappings.
+ assert is_empty(latin1(conditional_toupper))
+
+ # Ensure U+00DF is the only Latin1 character with a special upper case mapping.
+ assert is_equals([0x00DF], latin1(unconditional_toupper))
+
+ # Ensure U+0130 is the only character with a special lower case mapping.
+ assert is_equals([0x0130], unconditional_tolower)
+
+ # Ensure no characters have language independent conditional upper case mappings.
+ assert is_empty(conditional_toupper)
+
+ # Ensure U+03A3 is the only character with language independent conditional lower case mapping.
+ assert is_equals([0x03A3], conditional_tolower)
+
+ # Verify U+0130 and U+03A3 have simple lower case mappings.
+ assert all(ch != lowerCase(ch) for ch in [0x0130, 0x03A3])
+
+ # Ensure Azeri, Lithuanian, and Turkish are the only languages with conditional case mappings.
+ assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_tolower.keys()))
+ assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_toupper.keys()))
+
+ # Maximum case mapping length is three characters.
+ assert (
+ max(
+ map(
+ len,
+ chain(
+ unconditional_tolower.values(),
+ unconditional_toupper.values(),
+ map(itemgetter(0), conditional_tolower.values()),
+ map(itemgetter(0), conditional_toupper.values()),
+ map(
+ itemgetter(0),
+ chain.from_iterable(
+ d.values() for d in lang_conditional_tolower.values()
+ ),
+ ),
+ map(
+ itemgetter(0),
+ chain.from_iterable(
+ d.values() for d in lang_conditional_toupper.values()
+ ),
+ ),
+ ),
+ )
+ )
+ <= 3
+ )
+
+ # Ensure all case mapping contexts are known (see Unicode 9.0, §3.13 Default Case Algorithms).
+ assert set(
+ [
+ "After_I",
+ "After_Soft_Dotted",
+ "Final_Sigma",
+ "More_Above",
+ "Not_Before_Dot",
+ ]
+ ).issuperset(
+ set(
+ filter(
+ partial(is_not, None),
+ chain(
+ map(itemgetter(1), conditional_tolower.values()),
+ map(itemgetter(1), conditional_toupper.values()),
+ map(
+ itemgetter(1),
+ chain.from_iterable(
+ d.values() for d in lang_conditional_tolower.values()
+ ),
+ ),
+ map(
+ itemgetter(1),
+ chain.from_iterable(
+ d.values() for d in lang_conditional_toupper.values()
+ ),
+ ),
+ ),
+ )
+ )
+ )
+
+ # Special casing for U+00DF (LATIN SMALL LETTER SHARP S).
+ assert upperCase(0x00DF) == 0x00DF and unconditional_toupper[0x00DF] == [
+ 0x0053,
+ 0x0053,
+ ]
+
+ # Special casing for U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE).
+ assert unconditional_tolower[0x0130] == [0x0069, 0x0307]
+
+ # Special casing for U+03A3 (GREEK CAPITAL LETTER SIGMA).
+ assert lowerCase(0x03A3) == 0x03C3 and conditional_tolower[0x03A3] == (
+ [0x03C2],
+ "Final_Sigma",
+ )
+
+ return (unconditional_tolower, unconditional_toupper)
+
+
+def make_non_bmp_file(version, non_bmp_lower_map, non_bmp_upper_map, codepoint_table):
+ file_name = "UnicodeNonBMP.h"
+ with io.open(file_name, mode="w", encoding="utf-8") as non_bmp_file:
+ non_bmp_file.write(mpl_license)
+ non_bmp_file.write("\n")
+ non_bmp_file.write(warning_message)
+ non_bmp_file.write(unicode_version_message.format(version))
+ non_bmp_file.write(
+ """
+#ifndef util_UnicodeNonBMP_h
+#define util_UnicodeNonBMP_h
+
+// |MACRO| receives the following arguments
+// MACRO(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF)
+// FROM: code point where the range starts
+// TO: code point where the range ends
+// LEAD: common lead surrogate of FROM and TO
+// TRAIL_FROM: trail surrogate of FROM
+// TRAIL_TO: trail surrogate of TO
+// DIFF: the difference between a code point in the range and its
+// converted code point
+
+"""
+ )
+
+ make_non_bmp_convert_macro(
+ non_bmp_file, "LOWERCASE", non_bmp_lower_map, codepoint_table
+ )
+ non_bmp_file.write("\n")
+ make_non_bmp_convert_macro(
+ non_bmp_file, "UPPERCASE", non_bmp_upper_map, codepoint_table
+ )
+
+ non_bmp_file.write(
+ """
+#endif /* util_UnicodeNonBMP_h */
+"""
+ )
+
+
+def write_special_casing_methods(unconditional_toupper, codepoint_table, println):
+ def hexlit(n):
+ """ Returns C++ hex-literal for |n|. """
+ return "0x{:04X}".format(n)
+
+ def describe_range(ranges, depth):
+ indent = depth * " "
+ for (start, end) in ranges:
+ if start == end:
+ println(indent, "// {}".format(codepoint_table.full_name(start)))
+ else:
+ println(
+ indent,
+ "// {} .. {}".format(
+ codepoint_table.full_name(start), codepoint_table.full_name(end)
+ ),
+ )
+
+ def out_range(start, end):
+ """ Tests if the input character isn't a member of the set {x | start <= x <= end}. """
+ if start == end:
+ return "ch != {}".format(hexlit(start))
+ return "ch < {} || ch > {}".format(hexlit(start), hexlit(end))
+
+ def in_range(start, end, parenthesize=False):
+ """ Tests if the input character is in the set {x | start <= x <= end}. """
+ if start == end:
+ return "ch == {}".format(hexlit(start))
+ (left, right) = ("(", ")") if parenthesize else ("", "")
+ return "{}ch >= {} && ch <= {}{}".format(
+ left, hexlit(start), hexlit(end), right
+ )
+
+ def in_any_range(ranges, spaces):
+ """ Tests if the input character is included in any of the given ranges. """
+ lines = [[]]
+ for (start, end) in ranges:
+ expr = in_range(start, end, parenthesize=True)
+ line = " || ".join(lines[-1] + [expr])
+ if len(line) < (100 - len(spaces) - len(" ||")):
+ lines[-1].append(expr)
+ else:
+ lines.append([expr])
+ return " ||\n{}".format(spaces).join(" || ".join(t) for t in lines)
+
+ def write_range_accept(parent_list, child_list, depth):
+ """ Accepts the input character if it matches any code unit in |child_list|. """
+ (min_parent, max_parent) = (parent_list[0], parent_list[-1])
+ (min_child, max_child) = (child_list[0], child_list[-1])
+ assert min_child >= min_parent
+ assert max_child <= max_parent
+ indent = depth * " "
+
+ child_ranges = list(int_ranges(child_list))
+ has_successor = max_child != max_parent
+
+ # If |child_list| is a contiguous list of code units, emit a simple
+ # range check: |min_child <= input <= max_child|.
+ if len(child_ranges) == 1:
+ describe_range(child_ranges, depth)
+ if has_successor:
+ println(indent, "if (ch <= {}) {{".format(hexlit(max_child)))
+ println(indent, " return ch >= {};".format(hexlit(min_child)))
+ println(indent, "}")
+ else:
+ println(indent, "return {};".format(in_range(min_child, max_child)))
+ return
+
+ # Otherwise create a disjunction over the subranges in |child_ranges|.
+ if not has_successor:
+ spaces = indent + len("return ") * " "
+ else:
+ spaces = indent + len(" return ") * " "
+ range_test_expr = in_any_range(child_ranges, spaces)
+
+ if min_child != min_parent:
+ println(indent, "if (ch < {}) {{".format(hexlit(min_child)))
+ println(indent, " return false;")
+ println(indent, "}")
+
+ # If there's no successor block, we can omit the |input <= max_child| check,
+ # because it was already checked when we emitted the parent range test.
+ if not has_successor:
+ describe_range(child_ranges, depth)
+ println(indent, "return {};".format(range_test_expr))
+ else:
+ println(indent, "if (ch <= {}) {{".format(hexlit(max_child)))
+ describe_range(child_ranges, depth + 1)
+ println(indent, " return {};".format(range_test_expr))
+ println(indent, "}")
+
+ def write_ChangesWhenUpperCasedSpecialCasing():
+ """ Checks if the input has a special upper case mapping. """
+ println("bool")
+ println("js::unicode::ChangesWhenUpperCasedSpecialCasing(char16_t ch)")
+ println("{")
+
+ assert unconditional_toupper, "|unconditional_toupper| is not empty"
+
+ # Sorted list of code units with special upper case mappings.
+ code_list = sorted(unconditional_toupper.keys())
+
+ # Fail-fast if the input character isn't a special casing character.
+ println(" if ({}) {{".format(out_range(code_list[0], code_list[-1])))
+ println(" return false;")
+ println(" }")
+
+ for i in range(0, 16):
+            # Check if the input character is in the range:
+ # |start_point <= input < end_point|.
+ start_point = i << 12
+ end_point = (i + 1) << 12
+ matches = [cu for cu in code_list if start_point <= cu < end_point]
+
+ # Skip empty ranges.
+ if not matches:
+ continue
+
+ # If |matches| consists of only a few characters, directly check
+ # the input against the characters in |matches|.
+ if len(matches) <= 8:
+ write_range_accept(code_list, matches, depth=1)
+ continue
+
+ # Otherwise split into further subranges.
+
+            # Only enter the if-block if the input is less than or equal to
+            # the largest value in the current range.
+ is_last_block = matches[-1] == code_list[-1]
+ if not is_last_block:
+ println(" if (ch <= {}) {{".format(hexlit(matches[-1])))
+ else:
+ println(" if (ch < {}) {{".format(hexlit(matches[0])))
+ println(" return false;")
+ println(" }")
+
+ for j in range(0, 16):
+ inner_start = start_point + (j << 8)
+ inner_end = start_point + ((j + 1) << 8)
+ inner_matches = [cu for cu in matches if inner_start <= cu < inner_end]
+
+ if inner_matches:
+ d = 1 if is_last_block else 2
+ write_range_accept(matches, inner_matches, depth=d)
+
+ if not is_last_block:
+ println(" }")
+
+ println("}")
+
+ def write_LengthUpperCaseSpecialCasing():
+ """ Slow case: Special casing character was found, returns its mapping length. """
+ println("size_t")
+ println("js::unicode::LengthUpperCaseSpecialCasing(char16_t ch)")
+ println("{")
+
+ println(" switch(ch) {")
+ for (code, converted) in sorted(
+ unconditional_toupper.items(), key=itemgetter(0)
+ ):
+ println(
+ " case {}: return {}; // {}".format(
+ hexlit(code), len(converted), codepoint_table.name(code)
+ )
+ )
+ println(" }")
+ println("")
+ println(' MOZ_ASSERT_UNREACHABLE("Bad character input.");')
+ println(" return 0;")
+
+ println("}")
+
+ def write_AppendUpperCaseSpecialCasing():
+ """ Slow case: Special casing character was found, append its mapping characters. """
+ println("void")
+ println(
+ "js::unicode::AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index)" # NOQA: E501
+ )
+ println("{")
+
+ println(" switch(ch) {")
+ for (code, converted) in sorted(
+ unconditional_toupper.items(), key=itemgetter(0)
+ ):
+ println(
+ " case {}: // {}".format(hexlit(code), codepoint_table.name(code))
+ )
+ for ch in converted:
+ println(
+ " elements[(*index)++] = {}; // {}".format(
+ hexlit(ch), codepoint_table.name(ch)
+ )
+ )
+ println(" return;")
+ println(" }")
+ println("")
+ println(' MOZ_ASSERT_UNREACHABLE("Bad character input.");')
+
+ println("}")
+
+ write_ChangesWhenUpperCasedSpecialCasing()
+ println("")
+ write_LengthUpperCaseSpecialCasing()
+ println("")
+ write_AppendUpperCaseSpecialCasing()
+
+
+def write_ascii_lookup_tables(table, index, write, println):
+ def is_id_compat(code):
+ return code == ord("\N{DOLLAR SIGN}") or code == ord("\N{LOW LINE}")
+
+ def is_id_start(code):
+ (upper, lower, flags) = table[index[code]]
+ return (flags & FLAG_UNICODE_ID_START) or is_id_compat(code)
+
+ def is_id_continue(code):
+ (upper, lower, flags) = table[index[code]]
+ return (flags & FLAG_UNICODE_ID_CONTINUE_ONLY) or is_id_start(code)
+
+ def is_space(code):
+ (upper, lower, flags) = table[index[code]]
+ return flags & FLAG_SPACE
+
+ def write_entries(name, predicate):
+ println("const bool unicode::{}[] = {{".format(name))
+ header = "".join("{0: <6}".format(x) for x in range(0, 10)).rstrip()
+ println("/* {} */".format(header))
+ for i in range(0, 13):
+ write("/* {0: >2} */".format(i))
+ for j in range(0, 10):
+ code = i * 10 + j
+ if code <= 0x7F:
+ write(" {},".format("true" if predicate(code) else "____"))
+ println("")
+ println("};")
+
+ println("")
+ println("#define ____ false")
+
+ println(
+ """
+/*
+ * Identifier start chars:
+ * - 36: $
+ * - 65..90: A..Z
+ * - 95: _
+ * - 97..122: a..z
+ */"""
+ )
+ write_entries("js_isidstart", is_id_start)
+
+ println(
+ """
+/*
+ * Identifier chars:
+ * - 36: $
+ * - 48..57: 0..9
+ * - 65..90: A..Z
+ * - 95: _
+ * - 97..122: a..z
+ */"""
+ )
+ write_entries("js_isident", is_id_continue)
+
+ println(
+ """
+/* Whitespace chars: '\\t', '\\n', '\\v', '\\f', '\\r', ' '. */"""
+ )
+ write_entries("js_isspace", is_space)
+
+ println("")
+ println("#undef ____")
+
+
+def write_latin1_lookup_tables(table, index, write, println):
+ def case_info(code):
+ assert 0 <= code and code <= MAX_BMP
+ (upper, lower, flags) = table[index[code]]
+ return ((code + upper) & 0xFFFF, (code + lower) & 0xFFFF, flags)
+
+ def toLowerCase(code):
+ (_, lower, _) = case_info(code)
+ assert lower <= 0xFF, "lower-case of Latin-1 is always Latin-1"
+ return lower
+
+ def write_entries(name, mapper):
+ println("const JS::Latin1Char unicode::{}[] = {{".format(name))
+ header = "".join("{0: <6}".format(x) for x in range(0, 16)).rstrip()
+ println("/* {} */".format(header))
+ for i in range(0, 16):
+ write("/* {0: >2} */".format(i))
+ for j in range(0, 16):
+ code = i * 16 + j
+ if code <= 0xFF:
+ write(" 0x{:02X},".format(mapper(code)))
+ println("")
+ println("};")
+
+ println("")
+ write_entries("latin1ToLowerCaseTable", toLowerCase)
+
+
+def make_bmp_mapping_test(
+ version, codepoint_table, unconditional_tolower, unconditional_toupper
+):
+ def unicodeEsc(n):
+ return "\\u{:04X}".format(n)
+
+ file_name = "../tests/non262/String/string-upper-lower-mapping.js"
+ with io.open(file_name, mode="w", encoding="utf-8") as output:
+ write = partial(print, file=output, sep="", end="")
+ println = partial(print, file=output, sep="", end="\n")
+
+ write(warning_message)
+ write(unicode_version_message.format(version))
+ write(public_domain)
+ println("var mapping = [")
+ for code in range(0, MAX_BMP + 1):
+ entry = codepoint_table.get(code)
+
+ if entry:
+ (upper, lower, _, _) = entry
+ upper = (
+ unconditional_toupper[code]
+ if code in unconditional_toupper
+ else [upper]
+ )
+ lower = (
+ unconditional_tolower[code]
+ if code in unconditional_tolower
+ else [lower]
+ )
+ println(
+ ' ["{}", "{}"], /* {} */'.format(
+ "".join(map(unicodeEsc, upper)),
+ "".join(map(unicodeEsc, lower)),
+ codepoint_table.name(code),
+ )
+ )
+ else:
+ println(' ["{0}", "{0}"],'.format(unicodeEsc(code)))
+ println("];")
+ write(
+ """
+assertEq(mapping.length, 0x10000);
+for (var i = 0; i <= 0xffff; i++) {
+ var char = String.fromCharCode(i);
+ var info = mapping[i];
+
+ assertEq(char.toUpperCase(), info[0]);
+ assertEq(char.toLowerCase(), info[1]);
+}
+
+if (typeof reportCompare === "function")
+ reportCompare(true, true);
+"""
+ )
+
+
+def make_non_bmp_mapping_test(
+ version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table
+):
+ file_name = "../tests/non262/String/string-code-point-upper-lower-mapping.js"
+ with io.open(file_name, mode="w", encoding="utf-8") as test_non_bmp_mapping:
+ test_non_bmp_mapping.write(warning_message)
+ test_non_bmp_mapping.write(unicode_version_message.format(version))
+ test_non_bmp_mapping.write(public_domain)
+
+ for code in sorted(non_bmp_upper_map.keys()):
+ test_non_bmp_mapping.write(
+ """\
+assertEq(String.fromCodePoint(0x{:04X}).toUpperCase().codePointAt(0), 0x{:04X}); // {}, {}
+""".format(
+ code,
+ non_bmp_upper_map[code],
+ codepoint_table.name(code),
+ codepoint_table.name(non_bmp_upper_map[code]),
+ )
+ )
+
+ for code in sorted(non_bmp_lower_map.keys()):
+ test_non_bmp_mapping.write(
+ """\
+assertEq(String.fromCodePoint(0x{:04X}).toLowerCase().codePointAt(0), 0x{:04X}); // {}, {}
+""".format(
+ code,
+ non_bmp_lower_map[code],
+ codepoint_table.name(code),
+ codepoint_table.name(non_bmp_lower_map[code]),
+ )
+ )
+
+ test_non_bmp_mapping.write(
+ """
+if (typeof reportCompare === "function")
+ reportCompare(true, true);
+"""
+ )
+
+
+def make_space_test(version, test_space_table, codepoint_table):
+ def hex_and_name(c):
+ return " 0x{:04X} /* {} */".format(c, codepoint_table.name(c))
+
+ file_name = "../tests/non262/String/string-space-trim.js"
+ with io.open(file_name, mode="w", encoding="utf-8") as test_space:
+ test_space.write(warning_message)
+ test_space.write(unicode_version_message.format(version))
+ test_space.write(public_domain)
+ test_space.write("var onlySpace = String.fromCharCode(\n")
+ test_space.write(",\n".join(map(hex_and_name, test_space_table)))
+ test_space.write("\n);\n")
+ test_space.write(
+ """
+assertEq(onlySpace.trim(), "");
+assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
+assertEq(('aaaa' + onlySpace).trim(), 'aaaa');
+assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa');
+
+if (typeof reportCompare === "function")
+ reportCompare(true, true);
+"""
+ )
+
+
+def make_regexp_space_test(version, test_space_table, codepoint_table):
+ def hex_and_name(c):
+ return " 0x{:04X} /* {} */".format(c, codepoint_table.name(c))
+
+ file_name = "../tests/non262/RegExp/character-class-escape-s.js"
+ with io.open(file_name, mode="w", encoding="utf-8") as test_space:
+ test_space.write(warning_message)
+ test_space.write(unicode_version_message.format(version))
+ test_space.write(public_domain)
+ test_space.write("var onlySpace = String.fromCodePoint(\n")
+ test_space.write(",\n".join(map(hex_and_name, test_space_table)))
+ test_space.write("\n);\n")
+ test_space.write(
+ """
+assertEq(/^\s+$/.exec(onlySpace) !== null, true);
+assertEq(/^[\s]+$/.exec(onlySpace) !== null, true);
+assertEq(/^[^\s]+$/.exec(onlySpace) === null, true);
+
+assertEq(/^\S+$/.exec(onlySpace) === null, true);
+assertEq(/^[\S]+$/.exec(onlySpace) === null, true);
+assertEq(/^[^\S]+$/.exec(onlySpace) !== null, true);
+
+// Also test with Unicode RegExps.
+assertEq(/^\s+$/u.exec(onlySpace) !== null, true);
+assertEq(/^[\s]+$/u.exec(onlySpace) !== null, true);
+assertEq(/^[^\s]+$/u.exec(onlySpace) === null, true);
+
+assertEq(/^\S+$/u.exec(onlySpace) === null, true);
+assertEq(/^[\S]+$/u.exec(onlySpace) === null, true);
+assertEq(/^[^\S]+$/u.exec(onlySpace) !== null, true);
+
+if (typeof reportCompare === "function")
+ reportCompare(true, true);
+"""
+ )
+
+
+def make_icase_test(version, folding_tests, codepoint_table):
+ def char_hex(c):
+ return "0x{:04X}".format(c)
+
+ file_name = "../tests/non262/RegExp/unicode-ignoreCase.js"
+ with io.open(file_name, mode="w", encoding="utf-8") as test_icase:
+ test_icase.write(warning_message)
+ test_icase.write(unicode_version_message.format(version))
+ test_icase.write(public_domain)
+ test_icase.write(
+ """
+var BUGNUMBER = 1135377;
+var summary = "Implement RegExp unicode flag -- ignoreCase flag.";
+
+print(BUGNUMBER + ": " + summary);
+
+function test(code, ...equivs) {
+ var codeRe = new RegExp(String.fromCodePoint(code) + "+", "iu");
+ var ans = String.fromCodePoint(code) + equivs.map(c => String.fromCodePoint(c)).join("");
+ assertEqArray(codeRe.exec("<" + ans + ">"), [ans]);
+ codeRe = new RegExp("[" + String.fromCodePoint(code) + "]+", "iu");
+ assertEqArray(codeRe.exec("<" + ans + ">"), [ans]);
+}
+"""
+ )
+ for args in folding_tests:
+ test_icase.write(
+ "test({}); // {}\n".format(
+ ", ".join(map(char_hex, args)),
+ ", ".join(map(codepoint_table.name, args)),
+ )
+ )
+ test_icase.write(
+ """
+if (typeof reportCompare === "function")
+ reportCompare(true, true);
+"""
+ )
+
+
+def make_unicode_file(
+ version,
+ table,
+ index,
+ folding_table,
+ folding_index,
+ non_bmp_space_set,
+ non_bmp_id_start_set,
+ non_bmp_id_cont_set,
+ unconditional_toupper,
+ codepoint_table,
+):
+ index1, index2, shift = splitbins(index)
+
+ # Don't forget to update CharInfo in Unicode.h if you need to change this
+ assert shift == 6
+
+ folding_index1, folding_index2, folding_shift = splitbins(folding_index)
+
+ # Don't forget to update CaseFoldInfo in Unicode.h if you need to change this
+ assert folding_shift == 5
+
+ # verify correctness
+ for char in index:
+ test = table[index[char]]
+
+ idx = index1[char >> shift]
+ idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]
+
+ assert test == table[idx]
+
+ # verify correctness
+ for char in folding_index:
+ test = folding_table[folding_index[char]]
+
+ idx = folding_index1[char >> folding_shift]
+ idx = folding_index2[
+ (idx << folding_shift) + (char & ((1 << folding_shift) - 1))
+ ]
+
+ assert test == folding_table[idx]
+
+ comment = """
+/*
+ * So how does indexing work?
+ * First let's have a look at a char16_t, 16-bits:
+ * [................]
+ * Step 1:
+ * Extracting the upper 11 bits from the char16_t.
+ * upper = char >> 5 ([***********.....])
+ * Step 2:
+ * Using these bits to get a reduced index from index1.
+ * index = index1[upper]
+ * Step 3:
+ * Combining the index and the bottom 5 bits of the original char16_t.
+ * real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++])
+ *
+ * The advantage here is that the biggest number in index1 doesn't need 10 bits,
+ * but only 7, so we save some memory.
+ *
+ * Step 4:
+ * Get the character information by looking up real_index in js_charinfo.
+ *
+ * Pseudocode of generation:
+ *
+ * let table be the mapping of char16_t => js_charinfo_index
+ * let index1 be an empty array
+ * let index2 be an empty array
+ * let cache be a hash map
+ *
+ * while shift is less than the maximal amount you can shift 0xffff before it's 0
+ * let chunks be table split in chunks of size 2**shift
+ *
+ * for every chunk in chunks
+ * if chunk is in cache
+ * let index be cache[chunk]
+ * else
+ * let index be the max key of index2 + 1
+ * for element in chunk
+ * push element to index2
+ * put index as chunk in cache
+ *
+ * push index >> shift to index1
+ *
+ * increase shift
+ * stop if you found the best shift
+ */
+"""
+
+ def dump(data, name, println):
+ println("const uint8_t unicode::{}[] = {{".format(name))
+
+ line = pad = " " * 4
+ lines = []
+ for entry in data:
+ assert entry < 256
+ s = str(entry)
+ s = s.rjust(3)
+
+ if len(line + s) + 5 > 99:
+ lines.append(line.rstrip())
+ line = pad + s + ", "
+ else:
+ line = line + s + ", "
+ lines.append(line.rstrip())
+
+ println("\n".join(lines))
+ println("};")
+
+ def write_table(data_type, name, tbl, idx1_name, idx1, idx2_name, idx2, println):
+ println("const {} unicode::{}[] = {{".format(data_type, name))
+ for d in tbl:
+ println(" {{ {} }},".format(", ".join(str(e) for e in d)))
+ println("};")
+ println("")
+
+ dump(idx1, idx1_name, println)
+ println("")
+ dump(idx2, idx2_name, println)
+ println("")
+
+ def write_supplemental_identifier_method(name, group_set, println):
+ println("bool")
+ println("js::unicode::{}(uint32_t codePoint)".format(name))
+ println("{")
+ for (from_code, to_code) in int_ranges(group_set.keys()):
+ println(
+ " if (codePoint >= 0x{:X} && codePoint <= 0x{:X}) {{ // {} .. {}".format(
+ from_code,
+ to_code,
+ codepoint_table.name(from_code),
+ codepoint_table.name(to_code),
+ )
+ )
+ println(" return true;")
+ println(" }")
+ println(" return false;")
+ println("}")
+ println("")
+
+ file_name = "Unicode.cpp"
+ with io.open(file_name, "w", encoding="utf-8") as data_file:
+ write = partial(print, file=data_file, sep="", end="")
+ println = partial(print, file=data_file, sep="", end="\n")
+
+ write(warning_message)
+ write(unicode_version_message.format(version))
+ write(public_domain)
+ println('#include "util/Unicode.h"')
+ println("")
+ println("using namespace js;")
+ println("using namespace js::unicode;")
+ write(comment)
+
+ write_table(
+ "CharacterInfo",
+ "js_charinfo",
+ table,
+ "index1",
+ index1,
+ "index2",
+ index2,
+ println,
+ )
+
+ write_table(
+ "FoldingInfo",
+ "js_foldinfo",
+ folding_table,
+ "folding_index1",
+ folding_index1,
+ "folding_index2",
+ folding_index2,
+ println,
+ )
+
+        # If the following assert fails, it means a space character was added
+        # to the non-BMP area. In that case a non-BMP space set must be
+        # emitted here and the corresponding support added to the frontend.
+        # (At least unicode::IsSpace will require updating to handle this.)
+ assert len(non_bmp_space_set.keys()) == 0
+
+ write_supplemental_identifier_method(
+ "IsIdentifierStartNonBMP", non_bmp_id_start_set, println
+ )
+
+ write_supplemental_identifier_method(
+ "IsIdentifierPartNonBMP", non_bmp_id_cont_set, println
+ )
+
+ write_special_casing_methods(unconditional_toupper, codepoint_table, println)
+
+ write_ascii_lookup_tables(table, index, write, println)
+
+ write_latin1_lookup_tables(table, index, write, println)
+
+
+def getsize(data):
+ """ return smallest possible integer size for the given array """
+ maxdata = max(data)
+ assert maxdata < 2 ** 32
+
+ if maxdata < 256:
+ return 1
+ elif maxdata < 65536:
+ return 2
+ else:
+ return 4
+
+
+def splitbins(t):
+ """t -> (t1, t2, shift). Split a table to save space.
+
+ t is a sequence of ints. This function can be useful to save space if
+ many of the ints are the same. t1 and t2 are lists of ints, and shift
+ is an int, chosen to minimize the combined size of t1 and t2 (in C
+ code), and where for each i in range(len(t)),
+ t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
+ where mask is a bitmask isolating the last "shift" bits.
+ """
+
+ def dump(t1, t2, shift, bytes):
+ print(
+ "%d+%d bins at shift %d; %d bytes" % (len(t1), len(t2), shift, bytes),
+ file=sys.stderr,
+ )
+ print("Size of original table:", len(t) * getsize(t), "bytes", file=sys.stderr)
+
+ n = len(t) - 1 # last valid index
+ maxshift = 0 # the most we can shift n and still have something left
+ if n > 0:
+ while n >> 1:
+ n >>= 1
+ maxshift += 1
+ del n
+ bytes = sys.maxsize # smallest total size so far
+ t = tuple(t) # so slices can be dict keys
+ for shift in range(maxshift + 1):
+ t1 = []
+ t2 = []
+ size = 2 ** shift
+ bincache = {}
+
+ for i in range(0, len(t), size):
+ bin = t[i : i + size]
+
+ index = bincache.get(bin)
+ if index is None:
+ index = len(t2)
+ bincache[bin] = index
+ t2.extend(bin)
+ t1.append(index >> shift)
+
+ # determine memory size
+ b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
+ if b < bytes:
+ best = t1, t2, shift
+ bytes = b
+ t1, t2, shift = best
+
+ print("Best:", end=" ", file=sys.stderr)
+ dump(t1, t2, shift, bytes)
+
+ # exhaustively verify that the decomposition is correct
+ mask = 2 ** shift - 1
+ for i in range(len(t)):
+ assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
+ return best
+
+
+def update_unicode(args):
+ base_path = os.getcwd()
+
+ version = args.version
+ if version is not None:
+ baseurl = "https://unicode.org/Public"
+ if version == "UNIDATA":
+ url = "%s/%s" % (baseurl, version)
+ else:
+ url = "%s/%s/ucd" % (baseurl, version)
+
+ print("Arguments:")
+ if version is not None:
+ print("\tVersion: %s" % version)
+ print("\tDownload url: %s" % url)
+
+ request_url = "{}/UCD.zip".format(url)
+ with closing(urlopen(request_url)) as downloaded_file:
+ downloaded_data = io.BytesIO(downloaded_file.read())
+
+ with ZipFile(downloaded_data) as zip_file:
+ for fname in [
+ "UnicodeData.txt",
+ "CaseFolding.txt",
+ "DerivedCoreProperties.txt",
+ "SpecialCasing.txt",
+ ]:
+ zip_file.extract(fname, path=base_path)
+ else:
+ print("\tUsing local files.")
+ print("\tAlways make sure you have the newest Unicode files!")
+ print("")
+
+ def version_from_file(f, fname):
+        pat_version = re.compile(r"# %s-(?P<version>\d+\.\d+\.\d+)\.txt" % fname)
+ return pat_version.match(f.readline()).group("version")
+
+ with io.open(
+ os.path.join(base_path, "UnicodeData.txt"), "r", encoding="utf-8"
+ ) as unicode_data, io.open(
+ os.path.join(base_path, "CaseFolding.txt"), "r", encoding="utf-8"
+ ) as case_folding, io.open(
+ os.path.join(base_path, "DerivedCoreProperties.txt"), "r", encoding="utf-8"
+ ) as derived_core_properties, io.open(
+ os.path.join(base_path, "SpecialCasing.txt"), "r", encoding="utf-8"
+ ) as special_casing:
+ unicode_version = version_from_file(
+ derived_core_properties, "DerivedCoreProperties"
+ )
+
+ print("Processing...")
+ (
+ table,
+ index,
+ non_bmp_lower_map,
+ non_bmp_upper_map,
+ non_bmp_space_set,
+ non_bmp_id_start_set,
+ non_bmp_id_cont_set,
+ codepoint_table,
+ test_space_table,
+ ) = process_unicode_data(unicode_data, derived_core_properties)
+ (folding_table, folding_index, folding_tests) = process_case_folding(
+ case_folding
+ )
+ (unconditional_tolower, unconditional_toupper) = process_special_casing(
+ special_casing, table, index
+ )
+
+ print("Generating...")
+ make_unicode_file(
+ unicode_version,
+ table,
+ index,
+ folding_table,
+ folding_index,
+ non_bmp_space_set,
+ non_bmp_id_start_set,
+ non_bmp_id_cont_set,
+ unconditional_toupper,
+ codepoint_table,
+ )
+ make_non_bmp_file(
+ unicode_version, non_bmp_lower_map, non_bmp_upper_map, codepoint_table
+ )
+
+ make_bmp_mapping_test(
+ unicode_version, codepoint_table, unconditional_tolower, unconditional_toupper
+ )
+ make_non_bmp_mapping_test(
+ unicode_version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table
+ )
+ make_space_test(unicode_version, test_space_table, codepoint_table)
+ make_regexp_space_test(unicode_version, test_space_table, codepoint_table)
+ make_icase_test(unicode_version, folding_tests, codepoint_table)
+
+
+if __name__ == "__main__":
+ import argparse
+
+ # This script must be run from js/src/util to work correctly.
+ if "/".join(os.path.normpath(os.getcwd()).split(os.sep)[-3:]) != "js/src/util":
+ raise RuntimeError("%s must be run from js/src/util" % sys.argv[0])
+
+ parser = argparse.ArgumentParser(description="Update Unicode data.")
+
+ parser.add_argument(
+ "--version",
+ help='Optional Unicode version number. If specified, downloads the\
+        selected version from <https://unicode.org/Public>. If not specified,\
+        uses the existing local files to generate the Unicode data. The\
+ number must match a published Unicode version, e.g. use\
+ "--version=8.0.0" to download Unicode 8 files. Alternatively use\
+ "--version=UNIDATA" to download the latest published version.',
+ )
+
+ parser.set_defaults(func=update_unicode)
+
+ args = parser.parse_args()
+ args.func(args)