From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Sun, 7 Apr 2024 21:33:14 +0200
Subject: Adding upstream version 115.7.0esr.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 js/src/util/make_unicode.py | 1573 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1573 insertions(+)
 create mode 100755 js/src/util/make_unicode.py

(limited to 'js/src/util/make_unicode.py')

diff --git a/js/src/util/make_unicode.py b/js/src/util/make_unicode.py
new file mode 100755
index 0000000000..617268fa9c
--- /dev/null
+++ b/js/src/util/make_unicode.py
@@ -0,0 +1,1573 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Based upon makeunicodedata.py
+# (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py)
+# written by Fredrik Lundh (fredrik@pythonware.com)
+#
+#    Copyright (C) 2011 Tom Schuster <evilpies@gmail.com>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import csv
+import io
+import os
+import re
+import sys
+from contextlib import closing
+from functools import partial
+from itertools import chain, tee
+from operator import is_not, itemgetter
+from zipfile import ZipFile
+
+if sys.version_info.major == 2:
+    from itertools import ifilter as filter
+    from itertools import imap as map
+    from itertools import izip_longest as zip_longest
+
+    from urllib2 import urlopen
+
+    range = xrange
+else:
+    from itertools import zip_longest
+    from urllib.request import urlopen
+
+
+class codepoint_dict(dict):
+    def name(self, code_point):
+        (_, _, name, alias) = self[code_point]
+        return "{}{}".format(name, (" (" + alias + ")" if alias else ""))
+
+    def full_name(self, code_point):
+        (_, _, name, alias) = self[code_point]
+        return "U+{:04X} {}{}".format(
+            code_point, name, (" (" + alias + ")" if alias else "")
+        )
+
+
+# ECMAScript 2016
+# §11.2 White Space
+whitespace = [
+    # python doesn't support using control character names :(
+    0x9,  # CHARACTER TABULATION
+    0xB,  # LINE TABULATION
+    0xC,  # FORM FEED
+    ord("\N{SPACE}"),
+    ord("\N{NO-BREAK SPACE}"),
+    ord("\N{ZERO WIDTH NO-BREAK SPACE}"),  # also BOM
+]
+
+# §11.3 Line Terminators
+line_terminator = [
+    0xA,  # LINE FEED
+    0xD,  # CARRIAGE RETURN
+    ord("\N{LINE SEPARATOR}"),
+    ord("\N{PARAGRAPH SEPARATOR}"),
+]
+
+# These are also part of IdentifierPart §11.6 Names and Keywords
+compatibility_identifier_part = [
+    ord("\N{ZERO WIDTH NON-JOINER}"),
+    ord("\N{ZERO WIDTH JOINER}"),
+]
+
+FLAG_SPACE = 1 << 0
+FLAG_UNICODE_ID_START = 1 << 1
+FLAG_UNICODE_ID_CONTINUE_ONLY = 1 << 2
+
+MAX_BMP = 0xFFFF
+
+public_domain = """
+/*
+ * Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/licenses/publicdomain/
+ */
+"""
+
+mpl_license = """\
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * vim: set ts=8 sts=4 et sw=4 tw=99:
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+"""
+
+warning_message = """\
+/* Generated by make_unicode.py DO NOT MODIFY */
+"""
+
+unicode_version_message = """\
+/* Unicode version: {0} */
+"""
+
+
+def read_unicode_data(unicode_data):
+    """
+    If you want to understand how this wonderful file format works checkout
+      Unicode Standard Annex #44 - Unicode Character Database
+      http://www.unicode.org/reports/tr44/
+    """
+
+    reader = csv.reader(unicode_data, delimiter=str(";"))
+
+    while True:
+        row = next(reader, None)
+        if row is None:
+            return
+        name = row[1]
+
+        # We need to expand the UAX #44 4.2.3 Code Point Range
+        if name.startswith("<") and name.endswith("First>"):
+            next_row = next(reader)
+
+            for i in range(int(row[0], 16), int(next_row[0], 16) + 1):
+                row[0] = i
+                row[1] = name[1:-8]
+
+                yield row
+        else:
+            row[0] = int(row[0], 16)
+            yield row
+
+
+def read_case_folding(case_folding):
+    """
+    File format is:
+    <code>; <status>; <mapping>; # <name>
+    """
+
+    for line in case_folding:
+        if line == "\n" or line.startswith("#"):
+            continue
+        row = line.split("; ")
+        if row[1] in ["F", "T"]:
+            continue
+        assert row[1] in ["C", "S"], "expect either (C)ommon or (S)imple case foldings"
+        code = int(row[0], 16)
+        mapping = int(row[2], 16)
+        yield (code, mapping)
+
+
+def read_derived_core_properties(derived_core_properties):
+    for line in derived_core_properties:
+        if line == "\n" or line.startswith("#"):
+            continue
+        row = line.split("#")[0].split(";")
+        char_range = row[0].strip()
+        char_property = row[1].strip()
+        if ".." not in char_range:
+            yield (int(char_range, 16), char_property)
+        else:
+            [start, end] = char_range.split("..")
+            for char in range(int(start, 16), int(end, 16) + 1):
+                yield (char, char_property)
+
+
+def read_special_casing(special_casing):
+    # Format:
+    # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
+    for line in special_casing:
+        if line == "\n" or line.startswith("#"):
+            continue
+        row = line.split("#")[0].split(";")
+        code = int(row[0].strip(), 16)
+        lower = row[1].strip()
+        lower = [int(c, 16) for c in lower.split(" ")] if lower else []
+        upper = row[3].strip()
+        upper = [int(c, 16) for c in upper.split(" ")] if upper else []
+        languages = []
+        contexts = []
+        condition = row[4].strip()
+        if condition:
+            for cond in condition.split(" "):
+                if cond[0].islower():
+                    languages.append(cond)
+                else:
+                    contexts.append(cond)
+            pass
+        yield (code, lower, upper, languages, contexts)
+
+
+def int_ranges(ints):
+    """Yields consecutive ranges (inclusive) from integer values."""
+    (a, b) = tee(sorted(ints))
+    start = next(b)
+    for (curr, succ) in zip_longest(a, b):
+        if curr + 1 != succ:
+            yield (start, curr)
+            start = succ
+
+
+def utf16_encode(code):
+    NonBMPMin = 0x10000
+    LeadSurrogateMin = 0xD800
+    TrailSurrogateMin = 0xDC00
+
+    lead = (code - NonBMPMin) // 1024 + LeadSurrogateMin
+    trail = ((code - NonBMPMin) % 1024) + TrailSurrogateMin
+
+    return lead, trail
+
+
+def make_non_bmp_convert_macro(out_file, name, convert_map, codepoint_table):
+    # Find continuous range in convert_map.
+    convert_list = []
+    entry = None
+    for code in sorted(convert_map.keys()):
+        lead, trail = utf16_encode(code)
+        converted = convert_map[code]
+        diff = converted - code
+
+        if (
+            entry
+            and code == entry["code"] + entry["length"]
+            and diff == entry["diff"]
+            and lead == entry["lead"]
+        ):
+            entry["length"] += 1
+            continue
+
+        entry = {
+            "code": code,
+            "diff": diff,
+            "length": 1,
+            "lead": lead,
+            "trail": trail,
+        }
+        convert_list.append(entry)
+
+    # Generate macro call for each range.
+    lines = []
+    comment = []
+    for entry in convert_list:
+        from_code = entry["code"]
+        to_code = entry["code"] + entry["length"] - 1
+        diff = entry["diff"]
+
+        lead = entry["lead"]
+        from_trail = entry["trail"]
+        to_trail = entry["trail"] + entry["length"] - 1
+
+        lines.append(
+            "    MACRO(0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, {:d})".format(
+                from_code, to_code, lead, from_trail, to_trail, diff
+            )
+        )
+        comment.append(
+            "// {} .. {}".format(
+                codepoint_table.full_name(from_code), codepoint_table.full_name(to_code)
+            )
+        )
+
+    out_file.write("\n".join(comment))
+    out_file.write("\n")
+    out_file.write("#define FOR_EACH_NON_BMP_{}(MACRO) \\\n".format(name))
+    out_file.write(" \\\n".join(lines))
+    out_file.write("\n")
+
+
+def process_derived_core_properties(derived_core_properties):
+    id_start = set()
+    id_continue = set()
+
+    for (char, prop) in read_derived_core_properties(derived_core_properties):
+        if prop == "ID_Start":
+            id_start.add(char)
+        if prop == "ID_Continue":
+            id_continue.add(char)
+
+    return (id_start, id_continue)
+
+
+def process_unicode_data(unicode_data, derived_core_properties):
+    dummy = (0, 0, 0)
+    table = [dummy]
+    cache = {dummy: 0}
+    index = [0] * (MAX_BMP + 1)
+
+    codepoint_table = codepoint_dict()
+    test_space_table = []
+
+    non_bmp_lower_map = {}
+    non_bmp_upper_map = {}
+    non_bmp_id_start_set = {}
+    non_bmp_id_cont_set = {}
+    non_bmp_space_set = {}
+
+    (id_start, id_continue) = process_derived_core_properties(derived_core_properties)
+
+    for row in read_unicode_data(unicode_data):
+        code = row[0]
+        name = row[1]
+        category = row[2]
+        alias = row[-5]
+        uppercase = row[-3]
+        lowercase = row[-2]
+
+        if uppercase:
+            upper = int(uppercase, 16)
+        else:
+            upper = code
+
+        if lowercase:
+            lower = int(lowercase, 16)
+        else:
+            lower = code
+
+        codepoint_table[code] = (upper, lower, name, alias)
+
+        if code > MAX_BMP:
+            if code != lower:
+                non_bmp_lower_map[code] = lower
+            if code != upper:
+                non_bmp_upper_map[code] = upper
+            if category == "Zs":
+                non_bmp_space_set[code] = 1
+                test_space_table.append(code)
+            if code in id_start:
+                non_bmp_id_start_set[code] = 1
+            if code in id_continue:
+                non_bmp_id_cont_set[code] = 1
+            continue
+
+        assert lower <= MAX_BMP and upper <= MAX_BMP
+
+        flags = 0
+
+        # we combine whitespace and lineterminators because in pratice we don't need them separated
+        if category == "Zs" or code in whitespace or code in line_terminator:
+            flags |= FLAG_SPACE
+            test_space_table.append(code)
+
+        # §11.6 (IdentifierStart)
+        if code in id_start:
+            flags |= FLAG_UNICODE_ID_START
+
+        # §11.6 (IdentifierPart)
+        elif code in id_continue or code in compatibility_identifier_part:
+            flags |= FLAG_UNICODE_ID_CONTINUE_ONLY
+
+        up_d = upper - code
+        low_d = lower - code
+
+        assert up_d > -65535 and up_d < 65535
+        assert low_d > -65535 and low_d < 65535
+
+        upper = up_d & 0xFFFF
+        lower = low_d & 0xFFFF
+
+        item = (upper, lower, flags)
+
+        i = cache.get(item)
+        if i is None:
+            assert item not in table
+            cache[item] = i = len(table)
+            table.append(item)
+        index[code] = i
+
+    return (
+        table,
+        index,
+        non_bmp_lower_map,
+        non_bmp_upper_map,
+        non_bmp_space_set,
+        non_bmp_id_start_set,
+        non_bmp_id_cont_set,
+        codepoint_table,
+        test_space_table,
+    )
+
+
+def process_case_folding(case_folding):
+    folding_map = {}
+    rev_folding_map = {}
+    folding_dummy = (0,)
+    folding_table = [folding_dummy]
+    folding_cache = {folding_dummy: 0}
+    folding_index = [0] * (MAX_BMP + 1)
+
+    folding_tests = []
+    folding_codes = set()
+
+    for (code, mapping) in read_case_folding(case_folding):
+        folding_map[code] = mapping
+
+        if mapping not in rev_folding_map:
+            rev_folding_map[mapping] = [code]
+        else:
+            rev_folding_map[mapping].append(code)
+
+        folding_codes.add(code)
+        folding_codes.add(mapping)
+
+    for code in sorted(folding_codes):
+        if code in folding_map:
+            folding = folding_map[code]
+        else:
+            folding = code
+
+        if code in rev_folding_map:
+            rev_folding = rev_folding_map[code]
+        elif folding in rev_folding_map:
+            rev_folding = [c for c in rev_folding_map[folding] if c != code]
+        else:
+            rev_folding = []
+
+        if folding != code or len(rev_folding):
+            item = [code]
+            if folding != code:
+                item.append(folding)
+            folding_tests.append(item + rev_folding)
+
+        if code > MAX_BMP:
+            continue
+
+        folding_d = folding - code
+
+        assert folding_d > -65535 and folding_d < 65535
+
+        folding = folding_d & 0xFFFF
+
+        item = (folding,)
+
+        i = folding_cache.get(item)
+        if i is None:
+            assert item not in folding_table
+            folding_cache[item] = i = len(folding_table)
+            folding_table.append(item)
+        folding_index[code] = i
+    return (folding_table, folding_index, folding_tests)
+
+
+def process_special_casing(special_casing, table, index):
+    # Unconditional special casing.
+    unconditional_tolower = {}
+    unconditional_toupper = {}
+
+    # Conditional special casing, language independent.
+    conditional_tolower = {}
+    conditional_toupper = {}
+
+    # Conditional special casing, language dependent.
+    lang_conditional_tolower = {}
+    lang_conditional_toupper = {}
+
+    def caseInfo(code):
+        (upper, lower, flags) = table[index[code]]
+        return ((code + lower) & 0xFFFF, (code + upper) & 0xFFFF)
+
+    for (code, lower, upper, languages, contexts) in read_special_casing(
+        special_casing
+    ):
+        assert code <= MAX_BMP, "Unexpected character outside of BMP: %s" % code
+        assert len(languages) <= 1, "Expected zero or one language ids: %s" % languages
+        assert len(contexts) <= 1, (
+            "Expected zero or one casing contexts: %s" % languages
+        )
+
+        (default_lower, default_upper) = caseInfo(code)
+        special_lower = len(lower) != 1 or lower[0] != default_lower
+        special_upper = len(upper) != 1 or upper[0] != default_upper
+
+        # Invariant: If |code| has casing per UnicodeData.txt, then it also has
+        # casing rules in SpecialCasing.txt.
+        assert code == default_lower or len(lower) != 1 or code != lower[0]
+        assert code == default_upper or len(upper) != 1 or code != upper[0]
+
+        language = languages[0] if languages else None
+        context = contexts[0] if contexts else None
+
+        if not language and not context:
+            if special_lower:
+                unconditional_tolower[code] = lower
+            if special_upper:
+                unconditional_toupper[code] = upper
+        elif not language and context:
+            if special_lower:
+                conditional_tolower[code] = (lower, context)
+            if special_upper:
+                conditional_toupper[code] = (upper, context)
+        else:
+            if language not in lang_conditional_tolower:
+                lang_conditional_tolower[language] = {}
+                lang_conditional_toupper[language] = {}
+            if special_lower:
+                lang_conditional_tolower[language][code] = (lower, context)
+            if special_upper:
+                lang_conditional_toupper[language][code] = (upper, context)
+
+    # Certain special casing rules are inlined in jsstr.cpp, ensure these cases
+    # still match the current SpecialCasing.txt file.
+    def lowerCase(code):
+        (lower, _) = caseInfo(code)
+        return lower
+
+    def upperCase(code):
+        (_, upper) = caseInfo(code)
+        return upper
+
+    def ascii(char_dict):
+        return (ch for ch in char_dict.keys() if ch <= 0x7F)
+
+    def latin1(char_dict):
+        return (ch for ch in char_dict.keys() if ch <= 0xFF)
+
+    def is_empty(iterable):
+        return not any(True for _ in iterable)
+
+    def is_equals(iter1, iter2):
+        return all(x == y for (x, y) in zip_longest(iter1, iter2))
+
+    # Ensure no ASCII characters have special case mappings.
+    assert is_empty(ascii(unconditional_tolower))
+    assert is_empty(ascii(unconditional_toupper))
+    assert is_empty(ascii(conditional_tolower))
+    assert is_empty(ascii(conditional_toupper))
+
+    # Ensure no Latin1 characters have special lower case mappings.
+    assert is_empty(latin1(unconditional_tolower))
+    assert is_empty(latin1(conditional_tolower))
+
+    # Ensure no Latin1 characters have conditional special upper case mappings.
+    assert is_empty(latin1(conditional_toupper))
+
+    # Ensure U+00DF is the only Latin1 character with a special upper case mapping.
+    assert is_equals([0x00DF], latin1(unconditional_toupper))
+
+    # Ensure U+0130 is the only character with a special lower case mapping.
+    assert is_equals([0x0130], unconditional_tolower)
+
+    # Ensure no characters have language independent conditional upper case mappings.
+    assert is_empty(conditional_toupper)
+
+    # Ensure U+03A3 is the only character with language independent conditional lower case mapping.
+    assert is_equals([0x03A3], conditional_tolower)
+
+    # Verify U+0130 and U+03A3 have simple lower case mappings.
+    assert all(ch != lowerCase(ch) for ch in [0x0130, 0x03A3])
+
+    # Ensure Azeri, Lithuanian, and Turkish are the only languages with conditional case mappings.
+    assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_tolower.keys()))
+    assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_toupper.keys()))
+
+    # Maximum case mapping length is three characters.
+    assert (
+        max(
+            map(
+                len,
+                chain(
+                    unconditional_tolower.values(),
+                    unconditional_toupper.values(),
+                    map(itemgetter(0), conditional_tolower.values()),
+                    map(itemgetter(0), conditional_toupper.values()),
+                    map(
+                        itemgetter(0),
+                        chain.from_iterable(
+                            d.values() for d in lang_conditional_tolower.values()
+                        ),
+                    ),
+                    map(
+                        itemgetter(0),
+                        chain.from_iterable(
+                            d.values() for d in lang_conditional_toupper.values()
+                        ),
+                    ),
+                ),
+            )
+        )
+        <= 3
+    )
+
+    # Ensure all case mapping contexts are known (see Unicode 9.0, §3.13 Default Case Algorithms).
+    assert set(
+        [
+            "After_I",
+            "After_Soft_Dotted",
+            "Final_Sigma",
+            "More_Above",
+            "Not_Before_Dot",
+        ]
+    ).issuperset(
+        set(
+            filter(
+                partial(is_not, None),
+                chain(
+                    map(itemgetter(1), conditional_tolower.values()),
+                    map(itemgetter(1), conditional_toupper.values()),
+                    map(
+                        itemgetter(1),
+                        chain.from_iterable(
+                            d.values() for d in lang_conditional_tolower.values()
+                        ),
+                    ),
+                    map(
+                        itemgetter(1),
+                        chain.from_iterable(
+                            d.values() for d in lang_conditional_toupper.values()
+                        ),
+                    ),
+                ),
+            )
+        )
+    )
+
+    # Special casing for U+00DF (LATIN SMALL LETTER SHARP S).
+    assert upperCase(0x00DF) == 0x00DF and unconditional_toupper[0x00DF] == [
+        0x0053,
+        0x0053,
+    ]
+
+    # Special casing for U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE).
+    assert unconditional_tolower[0x0130] == [0x0069, 0x0307]
+
+    # Special casing for U+03A3 (GREEK CAPITAL LETTER SIGMA).
+    assert lowerCase(0x03A3) == 0x03C3 and conditional_tolower[0x03A3] == (
+        [0x03C2],
+        "Final_Sigma",
+    )
+
+    return (unconditional_tolower, unconditional_toupper)
+
+
+def make_non_bmp_file(version, non_bmp_lower_map, non_bmp_upper_map, codepoint_table):
+    file_name = "UnicodeNonBMP.h"
+    with io.open(file_name, mode="w", encoding="utf-8") as non_bmp_file:
+        non_bmp_file.write(mpl_license)
+        non_bmp_file.write("\n")
+        non_bmp_file.write(warning_message)
+        non_bmp_file.write(unicode_version_message.format(version))
+        non_bmp_file.write(
+            """
+#ifndef util_UnicodeNonBMP_h
+#define util_UnicodeNonBMP_h
+
+// |MACRO| receives the following arguments
+//   MACRO(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF)
+//     FROM:       code point where the range starts
+//     TO:         code point where the range ends
+//     LEAD:       common lead surrogate of FROM and TO
+//     TRAIL_FROM: trail surrogate of FROM
+//     TRAIL_FROM: trail surrogate of TO
+//     DIFF:       the difference between the code point in the range and
+//                 converted code point
+
+"""
+        )
+
+        make_non_bmp_convert_macro(
+            non_bmp_file, "LOWERCASE", non_bmp_lower_map, codepoint_table
+        )
+        non_bmp_file.write("\n")
+        make_non_bmp_convert_macro(
+            non_bmp_file, "UPPERCASE", non_bmp_upper_map, codepoint_table
+        )
+
+        non_bmp_file.write(
+            """
+#endif /* util_UnicodeNonBMP_h */
+"""
+        )
+
+
+def write_special_casing_methods(unconditional_toupper, codepoint_table, println):
+    def hexlit(n):
+        """Returns C++ hex-literal for |n|."""
+        return "0x{:04X}".format(n)
+
+    def describe_range(ranges, depth):
+        indent = depth * "    "
+        for (start, end) in ranges:
+            if start == end:
+                println(indent, "// {}".format(codepoint_table.full_name(start)))
+            else:
+                println(
+                    indent,
+                    "// {} .. {}".format(
+                        codepoint_table.full_name(start), codepoint_table.full_name(end)
+                    ),
+                )
+
+    def out_range(start, end):
+        """Tests if the input character isn't a member of the set {x | start <= x <= end}."""
+        if start == end:
+            return "ch != {}".format(hexlit(start))
+        return "ch < {} || ch > {}".format(hexlit(start), hexlit(end))
+
+    def in_range(start, end, parenthesize=False):
+        """Tests if the input character is in the set {x | start <= x <= end}."""
+        if start == end:
+            return "ch == {}".format(hexlit(start))
+        (left, right) = ("(", ")") if parenthesize else ("", "")
+        return "{}ch >= {} && ch <= {}{}".format(
+            left, hexlit(start), hexlit(end), right
+        )
+
+    def in_any_range(ranges, spaces):
+        """Tests if the input character is included in any of the given ranges."""
+        lines = [[]]
+        for (start, end) in ranges:
+            expr = in_range(start, end, parenthesize=True)
+            line = " || ".join(lines[-1] + [expr])
+            if len(line) < (100 - len(spaces) - len(" ||")):
+                lines[-1].append(expr)
+            else:
+                lines.append([expr])
+        return " ||\n{}".format(spaces).join(" || ".join(t) for t in lines)
+
+    def write_range_accept(parent_list, child_list, depth):
+        """Accepts the input character if it matches any code unit in |child_list|."""
+        (min_parent, max_parent) = (parent_list[0], parent_list[-1])
+        (min_child, max_child) = (child_list[0], child_list[-1])
+        assert min_child >= min_parent
+        assert max_child <= max_parent
+        indent = depth * "    "
+
+        child_ranges = list(int_ranges(child_list))
+        has_successor = max_child != max_parent
+
+        # If |child_list| is a contiguous list of code units, emit a simple
+        # range check: |min_child <= input <= max_child|.
+        if len(child_ranges) == 1:
+            describe_range(child_ranges, depth)
+            if has_successor:
+                println(indent, "if (ch <= {}) {{".format(hexlit(max_child)))
+                println(indent, "    return ch >= {};".format(hexlit(min_child)))
+                println(indent, "}")
+            else:
+                println(indent, "return {};".format(in_range(min_child, max_child)))
+            return
+
+        # Otherwise create a disjunction over the subranges in |child_ranges|.
+        if not has_successor:
+            spaces = indent + len("return ") * " "
+        else:
+            spaces = indent + len("    return ") * " "
+        range_test_expr = in_any_range(child_ranges, spaces)
+
+        if min_child != min_parent:
+            println(indent, "if (ch < {}) {{".format(hexlit(min_child)))
+            println(indent, "    return false;")
+            println(indent, "}")
+
+        # If there's no successor block, we can omit the |input <= max_child| check,
+        # because it was already checked when we emitted the parent range test.
+        if not has_successor:
+            describe_range(child_ranges, depth)
+            println(indent, "return {};".format(range_test_expr))
+        else:
+            println(indent, "if (ch <= {}) {{".format(hexlit(max_child)))
+            describe_range(child_ranges, depth + 1)
+            println(indent, "    return {};".format(range_test_expr))
+            println(indent, "}")
+
+    def write_ChangesWhenUpperCasedSpecialCasing():
+        """Checks if the input has a special upper case mapping."""
+        println("bool")
+        println("js::unicode::ChangesWhenUpperCasedSpecialCasing(char16_t ch)")
+        println("{")
+
+        assert unconditional_toupper, "|unconditional_toupper| is not empty"
+
+        # Sorted list of code units with special upper case mappings.
+        code_list = sorted(unconditional_toupper.keys())
+
+        # Fail-fast if the input character isn't a special casing character.
+        println("    if ({}) {{".format(out_range(code_list[0], code_list[-1])))
+        println("        return false;")
+        println("    }")
+
+        for i in range(0, 16):
+            # Check if the input characters is in the range:
+            # |start_point <= input < end_point|.
+            start_point = i << 12
+            end_point = (i + 1) << 12
+            matches = [cu for cu in code_list if start_point <= cu < end_point]
+
+            # Skip empty ranges.
+            if not matches:
+                continue
+
+            # If |matches| consists of only a few characters, directly check
+            # the input against the characters in |matches|.
+            if len(matches) <= 8:
+                write_range_accept(code_list, matches, depth=1)
+                continue
+
+            # Otherwise split into further subranges.
+
+            # Only enter the if-block if the input is less-or-equals to the
+            # largest value in the current range.
+            is_last_block = matches[-1] == code_list[-1]
+            if not is_last_block:
+                println("    if (ch <= {}) {{".format(hexlit(matches[-1])))
+            else:
+                println("    if (ch < {}) {{".format(hexlit(matches[0])))
+                println("        return false;")
+                println("    }")
+
+            for j in range(0, 16):
+                inner_start = start_point + (j << 8)
+                inner_end = start_point + ((j + 1) << 8)
+                inner_matches = [cu for cu in matches if inner_start <= cu < inner_end]
+
+                if inner_matches:
+                    d = 1 if is_last_block else 2
+                    write_range_accept(matches, inner_matches, depth=d)
+
+            if not is_last_block:
+                println("    }")
+
+        println("}")
+
+    def write_LengthUpperCaseSpecialCasing():
+        """Slow case: Special casing character was found, returns its mapping length."""
+        println("size_t")
+        println("js::unicode::LengthUpperCaseSpecialCasing(char16_t ch)")
+        println("{")
+
+        println("    switch(ch) {")
+        for (code, converted) in sorted(
+            unconditional_toupper.items(), key=itemgetter(0)
+        ):
+            println(
+                "      case {}: return {}; // {}".format(
+                    hexlit(code), len(converted), codepoint_table.name(code)
+                )
+            )
+        println("    }")
+        println("")
+        println('    MOZ_ASSERT_UNREACHABLE("Bad character input.");')
+        println("    return 0;")
+
+        println("}")
+
+    def write_AppendUpperCaseSpecialCasing():
+        """Slow case: Special casing character was found, append its mapping characters."""
+        println("void")
+        println(
+            "js::unicode::AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index)"  # NOQA: E501
+        )
+        println("{")
+
+        println("    switch(ch) {")
+        for (code, converted) in sorted(
+            unconditional_toupper.items(), key=itemgetter(0)
+        ):
+            println(
+                "      case {}: // {}".format(hexlit(code), codepoint_table.name(code))
+            )
+            for ch in converted:
+                println(
+                    "        elements[(*index)++] = {}; // {}".format(
+                        hexlit(ch), codepoint_table.name(ch)
+                    )
+                )
+            println("        return;")
+        println("    }")
+        println("")
+        println('    MOZ_ASSERT_UNREACHABLE("Bad character input.");')
+
+        println("}")
+
+    write_ChangesWhenUpperCasedSpecialCasing()
+    println("")
+    write_LengthUpperCaseSpecialCasing()
+    println("")
+    write_AppendUpperCaseSpecialCasing()
+
+
+def write_ascii_lookup_tables(table, index, write, println):
+    def is_id_compat(code):
+        return code == ord("\N{DOLLAR SIGN}") or code == ord("\N{LOW LINE}")
+
+    def is_id_start(code):
+        (upper, lower, flags) = table[index[code]]
+        return (flags & FLAG_UNICODE_ID_START) or is_id_compat(code)
+
+    def is_id_continue(code):
+        (upper, lower, flags) = table[index[code]]
+        return (flags & FLAG_UNICODE_ID_CONTINUE_ONLY) or is_id_start(code)
+
+    def is_space(code):
+        (upper, lower, flags) = table[index[code]]
+        return flags & FLAG_SPACE
+
+    def write_entries(name, predicate):
+        println("const bool unicode::{}[] = {{".format(name))
+        header = "".join("{0: <6}".format(x) for x in range(0, 10)).rstrip()
+        println("/*       {}  */".format(header))
+        for i in range(0, 13):
+            write("/* {0: >2} */".format(i))
+            for j in range(0, 10):
+                code = i * 10 + j
+                if code <= 0x7F:
+                    write(" {},".format("true" if predicate(code) else "____"))
+            println("")
+        println("};")
+
+    println("")
+    println("#define ____ false")
+
+    println(
+        """
+/*
+ * Identifier start chars:
+ * -      36:    $
+ * -  65..90: A..Z
+ * -      95:    _
+ * - 97..122: a..z
+ */"""
+    )
+    write_entries("js_isidstart", is_id_start)
+
+    println(
+        """
+/*
+ * Identifier chars:
+ * -      36:    $
+ * -  48..57: 0..9
+ * -  65..90: A..Z
+ * -      95:    _
+ * - 97..122: a..z
+ */"""
+    )
+    write_entries("js_isident", is_id_continue)
+
+    println(
+        """
+/* Whitespace chars: '\\t', '\\n', '\\v', '\\f', '\\r', ' '. */"""
+    )
+    write_entries("js_isspace", is_space)
+
+    println("")
+    println("#undef ____")
+
+
+def write_latin1_lookup_tables(table, index, write, println):
+    def case_info(code):
+        assert 0 <= code and code <= MAX_BMP
+        (upper, lower, flags) = table[index[code]]
+        return ((code + upper) & 0xFFFF, (code + lower) & 0xFFFF, flags)
+
+    def toLowerCase(code):
+        (_, lower, _) = case_info(code)
+        assert lower <= 0xFF, "lower-case of Latin-1 is always Latin-1"
+        return lower
+
+    def write_entries(name, mapper):
+        println("const JS::Latin1Char unicode::{}[] = {{".format(name))
+        header = "".join("{0: <6}".format(x) for x in range(0, 16)).rstrip()
+        println("/*       {}  */".format(header))
+        for i in range(0, 16):
+            write("/* {0: >2} */".format(i))
+            for j in range(0, 16):
+                code = i * 16 + j
+                if code <= 0xFF:
+                    write(" 0x{:02X},".format(mapper(code)))
+            println("")
+        println("};")
+
+    println("")
+    write_entries("latin1ToLowerCaseTable", toLowerCase)
+
+
+def make_bmp_mapping_test(
+    version, codepoint_table, unconditional_tolower, unconditional_toupper
+):
+    def unicodeEsc(n):
+        return "\\u{:04X}".format(n)
+
+    file_name = "../tests/non262/String/string-upper-lower-mapping.js"
+    with io.open(file_name, mode="w", encoding="utf-8") as output:
+        write = partial(print, file=output, sep="", end="")
+        println = partial(print, file=output, sep="", end="\n")
+
+        write(warning_message)
+        write(unicode_version_message.format(version))
+        write(public_domain)
+        println("var mapping = [")
+        for code in range(0, MAX_BMP + 1):
+            entry = codepoint_table.get(code)
+
+            if entry:
+                (upper, lower, _, _) = entry
+                upper = (
+                    unconditional_toupper[code]
+                    if code in unconditional_toupper
+                    else [upper]
+                )
+                lower = (
+                    unconditional_tolower[code]
+                    if code in unconditional_tolower
+                    else [lower]
+                )
+                println(
+                    '  ["{}", "{}"], /* {} */'.format(
+                        "".join(map(unicodeEsc, upper)),
+                        "".join(map(unicodeEsc, lower)),
+                        codepoint_table.name(code),
+                    )
+                )
+            else:
+                println('  ["{0}", "{0}"],'.format(unicodeEsc(code)))
+        println("];")
+        write(
+            """
+assertEq(mapping.length, 0x10000);
+for (var i = 0; i <= 0xffff; i++) {
+    var char = String.fromCharCode(i);
+    var info = mapping[i];
+
+    assertEq(char.toUpperCase(), info[0]);
+    assertEq(char.toLowerCase(), info[1]);
+}
+
+if (typeof reportCompare === "function")
+    reportCompare(true, true);
+"""
+        )
+
+
+def make_non_bmp_mapping_test(
+    version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table
+):
+    file_name = "../tests/non262/String/string-code-point-upper-lower-mapping.js"
+    with io.open(file_name, mode="w", encoding="utf-8") as test_non_bmp_mapping:
+        test_non_bmp_mapping.write(warning_message)
+        test_non_bmp_mapping.write(unicode_version_message.format(version))
+        test_non_bmp_mapping.write(public_domain)
+
+        for code in sorted(non_bmp_upper_map.keys()):
+            test_non_bmp_mapping.write(
+                """\
+assertEq(String.fromCodePoint(0x{:04X}).toUpperCase().codePointAt(0), 0x{:04X}); // {}, {}
+""".format(
+                    code,
+                    non_bmp_upper_map[code],
+                    codepoint_table.name(code),
+                    codepoint_table.name(non_bmp_upper_map[code]),
+                )
+            )
+
+        for code in sorted(non_bmp_lower_map.keys()):
+            test_non_bmp_mapping.write(
+                """\
+assertEq(String.fromCodePoint(0x{:04X}).toLowerCase().codePointAt(0), 0x{:04X}); // {}, {}
+""".format(
+                    code,
+                    non_bmp_lower_map[code],
+                    codepoint_table.name(code),
+                    codepoint_table.name(non_bmp_lower_map[code]),
+                )
+            )
+
+        test_non_bmp_mapping.write(
+            """
+if (typeof reportCompare === "function")
+    reportCompare(true, true);
+"""
+        )
+
+
+def make_space_test(version, test_space_table, codepoint_table):
+    def hex_and_name(c):
+        return "    0x{:04X} /* {} */".format(c, codepoint_table.name(c))
+
+    file_name = "../tests/non262/String/string-space-trim.js"
+    with io.open(file_name, mode="w", encoding="utf-8") as test_space:
+        test_space.write(warning_message)
+        test_space.write(unicode_version_message.format(version))
+        test_space.write(public_domain)
+        test_space.write("var onlySpace = String.fromCharCode(\n")
+        test_space.write(",\n".join(map(hex_and_name, test_space_table)))
+        test_space.write("\n);\n")
+        test_space.write(
+            """
+assertEq(onlySpace.trim(), "");
+assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
+assertEq(('aaaa' + onlySpace).trim(), 'aaaa');
+assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa');
+
+if (typeof reportCompare === "function")
+    reportCompare(true, true);
+"""
+        )
+
+
+def make_regexp_space_test(version, test_space_table, codepoint_table):
+    def hex_and_name(c):
+        return "    0x{:04X} /* {} */".format(c, codepoint_table.name(c))
+
+    file_name = "../tests/non262/RegExp/character-class-escape-s.js"
+    with io.open(file_name, mode="w", encoding="utf-8") as test_space:
+        test_space.write(warning_message)
+        test_space.write(unicode_version_message.format(version))
+        test_space.write(public_domain)
+        test_space.write("var onlySpace = String.fromCodePoint(\n")
+        test_space.write(",\n".join(map(hex_and_name, test_space_table)))
+        test_space.write("\n);\n")
+        test_space.write(
+            """
+assertEq(/^\s+$/.exec(onlySpace) !== null, true);
+assertEq(/^[\s]+$/.exec(onlySpace) !== null, true);
+assertEq(/^[^\s]+$/.exec(onlySpace) === null, true);
+
+assertEq(/^\S+$/.exec(onlySpace) === null, true);
+assertEq(/^[\S]+$/.exec(onlySpace) === null, true);
+assertEq(/^[^\S]+$/.exec(onlySpace) !== null, true);
+
+// Also test with Unicode RegExps.
+assertEq(/^\s+$/u.exec(onlySpace) !== null, true);
+assertEq(/^[\s]+$/u.exec(onlySpace) !== null, true);
+assertEq(/^[^\s]+$/u.exec(onlySpace) === null, true);
+
+assertEq(/^\S+$/u.exec(onlySpace) === null, true);
+assertEq(/^[\S]+$/u.exec(onlySpace) === null, true);
+assertEq(/^[^\S]+$/u.exec(onlySpace) !== null, true);
+
+if (typeof reportCompare === "function")
+    reportCompare(true, true);
+"""
+        )
+
+
+def make_icase_test(version, folding_tests, codepoint_table):
+    def char_hex(c):
+        return "0x{:04X}".format(c)
+
+    file_name = "../tests/non262/RegExp/unicode-ignoreCase.js"
+    with io.open(file_name, mode="w", encoding="utf-8") as test_icase:
+        test_icase.write(warning_message)
+        test_icase.write(unicode_version_message.format(version))
+        test_icase.write(public_domain)
+        test_icase.write(
+            """
+var BUGNUMBER = 1135377;
+var summary = "Implement RegExp unicode flag -- ignoreCase flag.";
+
+print(BUGNUMBER + ": " + summary);
+
+function test(code, ...equivs) {
+  var codeRe = new RegExp(String.fromCodePoint(code) + "+", "iu");
+  var ans = String.fromCodePoint(code) + equivs.map(c => String.fromCodePoint(c)).join("");
+  assertEqArray(codeRe.exec("<" + ans + ">"), [ans]);
+  codeRe = new RegExp("[" + String.fromCodePoint(code) + "]+", "iu");
+  assertEqArray(codeRe.exec("<" + ans + ">"), [ans]);
+}
+"""
+        )
+        for args in folding_tests:
+            test_icase.write(
+                "test({}); // {}\n".format(
+                    ", ".join(map(char_hex, args)),
+                    ", ".join(map(codepoint_table.name, args)),
+                )
+            )
+        test_icase.write(
+            """
+if (typeof reportCompare === "function")
+    reportCompare(true, true);
+"""
+        )
+
+
+def make_unicode_file(
+    version,
+    table,
+    index,
+    folding_table,
+    folding_index,
+    non_bmp_space_set,
+    non_bmp_id_start_set,
+    non_bmp_id_cont_set,
+    unconditional_toupper,
+    codepoint_table,
+):
+    index1, index2, shift = splitbins(index)
+
+    # Don't forget to update CharInfo in Unicode.h if you need to change this
+    assert shift == 6
+
+    folding_index1, folding_index2, folding_shift = splitbins(folding_index)
+
+    # Don't forget to update CaseFoldInfo in Unicode.h if you need to change this
+    assert folding_shift == 5
+
+    # verify correctness
+    for char in index:
+        test = table[index[char]]
+
+        idx = index1[char >> shift]
+        idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]
+
+        assert test == table[idx]
+
+    # verify correctness
+    for char in folding_index:
+        test = folding_table[folding_index[char]]
+
+        idx = folding_index1[char >> folding_shift]
+        idx = folding_index2[
+            (idx << folding_shift) + (char & ((1 << folding_shift) - 1))
+        ]
+
+        assert test == folding_table[idx]
+
+    comment = """
+/*
+ * So how does indexing work?
+ * First let's have a look at a char16_t, 16-bits:
+ *              [................]
+ * Step 1:
+ *  Extracting the upper 11 bits from the char16_t.
+ *   upper = char >>  5 ([***********.....])
+ * Step 2:
+ *  Using these bits to get an reduced index from index1.
+ *   index = index1[upper]
+ * Step 3:
+ *  Combining the index and the bottom 5 bits of the original char16_t.
+ *   real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++])
+ *
+ * The advantage here is that the biggest number in index1 doesn't need 10 bits,
+ * but 7 and we save some memory.
+ *
+ * Step 4:
+ *  Get the character informations by looking up real_index in js_charinfo.
+ *
+ * Pseudocode of generation:
+ *
+ * let table be the mapping of char16_t => js_charinfo_index
+ * let index1 be an empty array
+ * let index2 be an empty array
+ * let cache be a hash map
+ *
+ * while shift is less then maximal amount you can shift 0xffff before it's 0
+ *  let chunks be table split in chunks of size 2**shift
+ *
+ *  for every chunk in chunks
+ *   if chunk is in cache
+ *    let index be cache[chunk]
+ *   else
+ *    let index be the max key of index2 + 1
+ *    for element in chunk
+ *     push element to index2
+ *    put index as chunk in cache
+ *
+ *   push index >> shift to index1
+ *
+ *  increase shift
+ *  stop if you found the best shift
+ */
+"""
+
+    def dump(data, name, println):
+        println("const uint8_t unicode::{}[] = {{".format(name))
+
+        line = pad = " " * 4
+        lines = []
+        for entry in data:
+            assert entry < 256
+            s = str(entry)
+            s = s.rjust(3)
+
+            if len(line + s) + 5 > 99:
+                lines.append(line.rstrip())
+                line = pad + s + ", "
+            else:
+                line = line + s + ", "
+        lines.append(line.rstrip())
+
+        println("\n".join(lines))
+        println("};")
+
+    def write_table(data_type, name, tbl, idx1_name, idx1, idx2_name, idx2, println):
+        println("const {} unicode::{}[] = {{".format(data_type, name))
+        for d in tbl:
+            println("    {{ {} }},".format(", ".join(str(e) for e in d)))
+        println("};")
+        println("")
+
+        dump(idx1, idx1_name, println)
+        println("")
+        dump(idx2, idx2_name, println)
+        println("")
+
+    def write_supplemental_identifier_method(name, group_set, println):
+        println("bool")
+        println("js::unicode::{}(char32_t codePoint)".format(name))
+        println("{")
+        for (from_code, to_code) in int_ranges(group_set.keys()):
+            println(
+                "    if (codePoint >= 0x{:X} && codePoint <= 0x{:X}) {{ // {} .. {}".format(
+                    from_code,
+                    to_code,
+                    codepoint_table.name(from_code),
+                    codepoint_table.name(to_code),
+                )
+            )
+            println("        return true;")
+            println("    }")
+        println("    return false;")
+        println("}")
+        println("")
+
+    file_name = "Unicode.cpp"
+    with io.open(file_name, "w", encoding="utf-8") as data_file:
+        write = partial(print, file=data_file, sep="", end="")
+        println = partial(print, file=data_file, sep="", end="\n")
+
+        write(warning_message)
+        write(unicode_version_message.format(version))
+        write(public_domain)
+        println('#include "util/Unicode.h"')
+        println("")
+        println("using namespace js;")
+        println("using namespace js::unicode;")
+        write(comment)
+
+        write_table(
+            "CharacterInfo",
+            "js_charinfo",
+            table,
+            "index1",
+            index1,
+            "index2",
+            index2,
+            println,
+        )
+
+        write_table(
+            "FoldingInfo",
+            "js_foldinfo",
+            folding_table,
+            "folding_index1",
+            folding_index1,
+            "folding_index2",
+            folding_index2,
+            println,
+        )
+
+        # If the following assert fails, it means space character is added to
+        # non-BMP area.  In that case the following code should be uncommented
+        # and the corresponding code should be added to frontend.  (At least
+        # unicode::IsSpace will require updating to handle this.)
+        assert len(non_bmp_space_set.keys()) == 0
+
+        write_supplemental_identifier_method(
+            "IsIdentifierStartNonBMP", non_bmp_id_start_set, println
+        )
+
+        write_supplemental_identifier_method(
+            "IsIdentifierPartNonBMP", non_bmp_id_cont_set, println
+        )
+
+        write_special_casing_methods(unconditional_toupper, codepoint_table, println)
+
+        write_ascii_lookup_tables(table, index, write, println)
+
+        write_latin1_lookup_tables(table, index, write, println)
+
+
+def getsize(data):
+    """return smallest possible integer size for the given array"""
+    maxdata = max(data)
+    assert maxdata < 2 ** 32
+
+    if maxdata < 256:
+        return 1
+    elif maxdata < 65536:
+        return 2
+    else:
+        return 4
+
+
+def splitbins(t):
+    """t -> (t1, t2, shift).  Split a table to save space.
+
+    t is a sequence of ints.  This function can be useful to save space if
+    many of the ints are the same.  t1 and t2 are lists of ints, and shift
+    is an int, chosen to minimize the combined size of t1 and t2 (in C
+    code), and where for each i in range(len(t)),
+        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
+    where mask is a bitmask isolating the last "shift" bits.
+    """
+
+    def dump(t1, t2, shift, bytes):
+        print(
+            "%d+%d bins at shift %d; %d bytes" % (len(t1), len(t2), shift, bytes),
+            file=sys.stderr,
+        )
+        print("Size of original table:", len(t) * getsize(t), "bytes", file=sys.stderr)
+
+    n = len(t) - 1  # last valid index
+    maxshift = 0  # the most we can shift n and still have something left
+    if n > 0:
+        while n >> 1:
+            n >>= 1
+            maxshift += 1
+    del n
+    bytes = sys.maxsize  # smallest total size so far
+    t = tuple(t)  # so slices can be dict keys
+    for shift in range(maxshift + 1):
+        t1 = []
+        t2 = []
+        size = 2 ** shift
+        bincache = {}
+
+        for i in range(0, len(t), size):
+            bin = t[i : i + size]
+
+            index = bincache.get(bin)
+            if index is None:
+                index = len(t2)
+                bincache[bin] = index
+                t2.extend(bin)
+            t1.append(index >> shift)
+
+        # determine memory size
+        b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
+        if b < bytes:
+            best = t1, t2, shift
+            bytes = b
+    t1, t2, shift = best
+
+    print("Best:", end=" ", file=sys.stderr)
+    dump(t1, t2, shift, bytes)
+
+    # exhaustively verify that the decomposition is correct
+    mask = 2 ** shift - 1
+    for i in range(len(t)):
+        assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
+    return best
+
+
+def update_unicode(args):
+    base_path = os.getcwd()
+
+    version = args.version
+    if version is not None:
+        baseurl = "https://unicode.org/Public"
+        if version == "UNIDATA":
+            url = "%s/%s" % (baseurl, version)
+        else:
+            url = "%s/%s/ucd" % (baseurl, version)
+
+    print("Arguments:")
+    if version is not None:
+        print("\tVersion: %s" % version)
+        print("\tDownload url: %s" % url)
+
+        request_url = "{}/UCD.zip".format(url)
+        with closing(urlopen(request_url)) as downloaded_file:
+            downloaded_data = io.BytesIO(downloaded_file.read())
+
+        with ZipFile(downloaded_data) as zip_file:
+            for fname in [
+                "UnicodeData.txt",
+                "CaseFolding.txt",
+                "DerivedCoreProperties.txt",
+                "SpecialCasing.txt",
+            ]:
+                zip_file.extract(fname, path=base_path)
+    else:
+        print("\tUsing local files.")
+        print("\tAlways make sure you have the newest Unicode files!")
+    print("")
+
+    def version_from_file(f, fname):
+        pat_version = re.compile(r"# %s-(?P<version>\d+\.\d+\.\d+).txt" % fname)
+        return pat_version.match(f.readline()).group("version")
+
+    with io.open(
+        os.path.join(base_path, "UnicodeData.txt"), "r", encoding="utf-8"
+    ) as unicode_data, io.open(
+        os.path.join(base_path, "CaseFolding.txt"), "r", encoding="utf-8"
+    ) as case_folding, io.open(
+        os.path.join(base_path, "DerivedCoreProperties.txt"), "r", encoding="utf-8"
+    ) as derived_core_properties, io.open(
+        os.path.join(base_path, "SpecialCasing.txt"), "r", encoding="utf-8"
+    ) as special_casing:
+        unicode_version = version_from_file(
+            derived_core_properties, "DerivedCoreProperties"
+        )
+
+        print("Processing...")
+        (
+            table,
+            index,
+            non_bmp_lower_map,
+            non_bmp_upper_map,
+            non_bmp_space_set,
+            non_bmp_id_start_set,
+            non_bmp_id_cont_set,
+            codepoint_table,
+            test_space_table,
+        ) = process_unicode_data(unicode_data, derived_core_properties)
+        (folding_table, folding_index, folding_tests) = process_case_folding(
+            case_folding
+        )
+        (unconditional_tolower, unconditional_toupper) = process_special_casing(
+            special_casing, table, index
+        )
+
+    print("Generating...")
+    make_unicode_file(
+        unicode_version,
+        table,
+        index,
+        folding_table,
+        folding_index,
+        non_bmp_space_set,
+        non_bmp_id_start_set,
+        non_bmp_id_cont_set,
+        unconditional_toupper,
+        codepoint_table,
+    )
+    make_non_bmp_file(
+        unicode_version, non_bmp_lower_map, non_bmp_upper_map, codepoint_table
+    )
+
+    make_bmp_mapping_test(
+        unicode_version, codepoint_table, unconditional_tolower, unconditional_toupper
+    )
+    make_non_bmp_mapping_test(
+        unicode_version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table
+    )
+    make_space_test(unicode_version, test_space_table, codepoint_table)
+    make_regexp_space_test(unicode_version, test_space_table, codepoint_table)
+    make_icase_test(unicode_version, folding_tests, codepoint_table)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    # This script must be run from js/src/util to work correctly.
+    if "/".join(os.path.normpath(os.getcwd()).split(os.sep)[-3:]) != "js/src/util":
+        raise RuntimeError("%s must be run from js/src/util" % sys.argv[0])
+
+    parser = argparse.ArgumentParser(description="Update Unicode data.")
+
+    parser.add_argument(
+        "--version",
+        help='Optional Unicode version number. If specified, downloads the\
+                              selected version from <https://unicode.org/Public>. If not specified\
+                              uses the existing local files to generate the Unicode data. The\
+                              number must match a published Unicode version, e.g. use\
+                              "--version=8.0.0" to download Unicode 8 files. Alternatively use\
+                              "--version=UNIDATA" to download the latest published version.',
+    )
+
+    parser.set_defaults(func=update_unicode)
+
+    args = parser.parse_args()
+    args.func(args)
-- 
cgit v1.2.3