Diffstat (limited to 'js/src/util/make_unicode.py')
-rwxr-xr-x | js/src/util/make_unicode.py | 1573
1 file changed, 1573 insertions, 0 deletions
diff --git a/js/src/util/make_unicode.py b/js/src/util/make_unicode.py new file mode 100755 index 0000000000..617268fa9c --- /dev/null +++ b/js/src/util/make_unicode.py @@ -0,0 +1,1573 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Based upon makeunicodedata.py +# (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py) +# written by Fredrik Lundh (fredrik@pythonware.com) +# +# Copyright (C) 2011 Tom Schuster <evilpies@gmail.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import csv +import io +import os +import re +import sys +from contextlib import closing +from functools import partial +from itertools import chain, tee +from operator import is_not, itemgetter +from zipfile import ZipFile + +if sys.version_info.major == 2: + from itertools import ifilter as filter + from itertools import imap as map + from itertools import izip_longest as zip_longest + + from urllib2 import urlopen + + range = xrange +else: + from itertools import zip_longest + from urllib.request import urlopen + + +class codepoint_dict(dict): + def name(self, code_point): + (_, _, name, alias) = self[code_point] + return "{}{}".format(name, (" (" + alias + ")" if alias else "")) + + def full_name(self, code_point): + (_, _, name, alias) = self[code_point] + return "U+{:04X} {}{}".format( + code_point, name, (" (" + alias + ")" if alias else "") + ) + + +# ECMAScript 2016 +# §11.2 White Space +whitespace = [ + # python doesn't support using control character names :( + 0x9, # CHARACTER TABULATION + 0xB, # LINE TABULATION + 0xC, # FORM FEED + ord("\N{SPACE}"), + ord("\N{NO-BREAK SPACE}"), + ord("\N{ZERO WIDTH NO-BREAK SPACE}"), # also BOM +] + +# §11.3 Line Terminators +line_terminator = [ + 0xA, # LINE FEED + 0xD, # CARRIAGE RETURN + ord("\N{LINE SEPARATOR}"), + ord("\N{PARAGRAPH SEPARATOR}"), +] + +# These are also part of IdentifierPart §11.6 Names and Keywords +compatibility_identifier_part = [ + ord("\N{ZERO WIDTH NON-JOINER}"), + ord("\N{ZERO WIDTH JOINER}"), +] + +FLAG_SPACE = 1 << 0 +FLAG_UNICODE_ID_START = 1 << 1 +FLAG_UNICODE_ID_CONTINUE_ONLY = 1 << 2 + +MAX_BMP = 0xFFFF + +public_domain = """ +/* + * Any copyright is dedicated to the Public Domain. + * http://creativecommons.org/licenses/publicdomain/ + */ +""" + +mpl_license = """\ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * vim: set ts=8 sts=4 et sw=4 tw=99: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +""" + +warning_message = """\ +/* Generated by make_unicode.py DO NOT MODIFY */ +""" + +unicode_version_message = """\ +/* Unicode version: {0} */ +""" + + +def read_unicode_data(unicode_data): + """ + If you want to understand how this wonderful file format works checkout + Unicode Standard Annex #44 - Unicode Character Database + http://www.unicode.org/reports/tr44/ + """ + + reader = csv.reader(unicode_data, delimiter=str(";")) + + while True: + row = next(reader, None) + if row is None: + return + name = row[1] + + # We need to expand the UAX #44 4.2.3 Code Point Range + if name.startswith("<") and name.endswith("First>"): + next_row = next(reader) + + for i in range(int(row[0], 16), int(next_row[0], 16) + 1): + row[0] = i + row[1] = name[1:-8] + + yield row + else: + row[0] = int(row[0], 16) + yield row + + +def read_case_folding(case_folding): + """ + File format is: + <code>; <status>; <mapping>; # <name> + """ + + for line in case_folding: + if line == "\n" or line.startswith("#"): + continue + row = line.split("; ") + if row[1] in ["F", "T"]: + continue + assert row[1] in ["C", "S"], "expect either (C)ommon or (S)imple case foldings" + code = int(row[0], 16) + mapping = int(row[2], 16) + yield (code, mapping) + + +def read_derived_core_properties(derived_core_properties): + for line in derived_core_properties: + if line == "\n" or line.startswith("#"): + continue + row = line.split("#")[0].split(";") + char_range = row[0].strip() + char_property = row[1].strip() + if ".." not in char_range: + yield (int(char_range, 16), char_property) + else: + [start, end] = char_range.split("..") + for char in range(int(start, 16), int(end, 16) + 1): + yield (char, char_property) + + +def read_special_casing(special_casing): + # Format: + # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment> + for line in special_casing: + if line == "\n" or line.startswith("#"): + continue + row = line.split("#")[0].split(";") + code = int(row[0].strip(), 16) + lower = row[1].strip() + lower = [int(c, 16) for c in lower.split(" ")] if lower else [] + upper = row[3].strip() + upper = [int(c, 16) for c in upper.split(" ")] if upper else [] + languages = [] + contexts = [] + condition = row[4].strip() + if condition: + for cond in condition.split(" "): + if cond[0].islower(): + languages.append(cond) + else: + contexts.append(cond) + pass + yield (code, lower, upper, languages, contexts) + + +def int_ranges(ints): + """Yields consecutive ranges (inclusive) from integer values.""" + (a, b) = tee(sorted(ints)) + start = next(b) + for (curr, succ) in zip_longest(a, b): + if curr + 1 != succ: + yield (start, curr) + start = succ + + +def utf16_encode(code): + NonBMPMin = 0x10000 + LeadSurrogateMin = 0xD800 + TrailSurrogateMin = 0xDC00 + + lead = (code - NonBMPMin) // 1024 + LeadSurrogateMin + trail = ((code - NonBMPMin) % 1024) + TrailSurrogateMin + + return lead, trail + + +def make_non_bmp_convert_macro(out_file, name, convert_map, codepoint_table): + # Find continuous range in convert_map. + convert_list = [] + entry = None + for code in sorted(convert_map.keys()): + lead, trail = utf16_encode(code) + converted = convert_map[code] + diff = converted - code + + if ( + entry + and code == entry["code"] + entry["length"] + and diff == entry["diff"] + and lead == entry["lead"] + ): + entry["length"] += 1 + continue + + entry = { + "code": code, + "diff": diff, + "length": 1, + "lead": lead, + "trail": trail, + } + convert_list.append(entry) + + # Generate macro call for each range. 
+ lines = [] + comment = [] + for entry in convert_list: + from_code = entry["code"] + to_code = entry["code"] + entry["length"] - 1 + diff = entry["diff"] + + lead = entry["lead"] + from_trail = entry["trail"] + to_trail = entry["trail"] + entry["length"] - 1 + + lines.append( + " MACRO(0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, {:d})".format( + from_code, to_code, lead, from_trail, to_trail, diff + ) + ) + comment.append( + "// {} .. {}".format( + codepoint_table.full_name(from_code), codepoint_table.full_name(to_code) + ) + ) + + out_file.write("\n".join(comment)) + out_file.write("\n") + out_file.write("#define FOR_EACH_NON_BMP_{}(MACRO) \\\n".format(name)) + out_file.write(" \\\n".join(lines)) + out_file.write("\n") + + +def process_derived_core_properties(derived_core_properties): + id_start = set() + id_continue = set() + + for (char, prop) in read_derived_core_properties(derived_core_properties): + if prop == "ID_Start": + id_start.add(char) + if prop == "ID_Continue": + id_continue.add(char) + + return (id_start, id_continue) + + +def process_unicode_data(unicode_data, derived_core_properties): + dummy = (0, 0, 0) + table = [dummy] + cache = {dummy: 0} + index = [0] * (MAX_BMP + 1) + + codepoint_table = codepoint_dict() + test_space_table = [] + + non_bmp_lower_map = {} + non_bmp_upper_map = {} + non_bmp_id_start_set = {} + non_bmp_id_cont_set = {} + non_bmp_space_set = {} + + (id_start, id_continue) = process_derived_core_properties(derived_core_properties) + + for row in read_unicode_data(unicode_data): + code = row[0] + name = row[1] + category = row[2] + alias = row[-5] + uppercase = row[-3] + lowercase = row[-2] + + if uppercase: + upper = int(uppercase, 16) + else: + upper = code + + if lowercase: + lower = int(lowercase, 16) + else: + lower = code + + codepoint_table[code] = (upper, lower, name, alias) + + if code > MAX_BMP: + if code != lower: + non_bmp_lower_map[code] = lower + if code != upper: + non_bmp_upper_map[code] = upper + if category == "Zs": + non_bmp_space_set[code] = 1 + test_space_table.append(code) + if code in id_start: + non_bmp_id_start_set[code] = 1 + if code in id_continue: + non_bmp_id_cont_set[code] = 1 + continue + + assert lower <= MAX_BMP and upper <= MAX_BMP + + flags = 0 + + # we combine whitespace and lineterminators because in pratice we don't need them separated + if category == "Zs" or code in whitespace or code in line_terminator: + flags |= FLAG_SPACE + test_space_table.append(code) + + # §11.6 (IdentifierStart) + if code in id_start: + flags |= FLAG_UNICODE_ID_START + + # §11.6 (IdentifierPart) + elif code in id_continue or code in compatibility_identifier_part: + flags |= FLAG_UNICODE_ID_CONTINUE_ONLY + + up_d = upper - code + low_d = lower - code + + assert up_d > -65535 and up_d < 65535 + assert low_d > -65535 and low_d < 65535 + + upper = up_d & 0xFFFF + lower = low_d & 0xFFFF + + item = (upper, lower, flags) + + i = cache.get(item) + if i is None: + assert item not in table + cache[item] = i = len(table) + table.append(item) + index[code] = i + + return ( + table, + index, + non_bmp_lower_map, + non_bmp_upper_map, + non_bmp_space_set, + non_bmp_id_start_set, + non_bmp_id_cont_set, + codepoint_table, + test_space_table, + ) + + +def process_case_folding(case_folding): + folding_map = {} + rev_folding_map = {} + folding_dummy = (0,) + folding_table = [folding_dummy] + folding_cache = {folding_dummy: 0} + folding_index = [0] * (MAX_BMP + 1) + + folding_tests = [] + folding_codes = set() + + for (code, mapping) in 
read_case_folding(case_folding): + folding_map[code] = mapping + + if mapping not in rev_folding_map: + rev_folding_map[mapping] = [code] + else: + rev_folding_map[mapping].append(code) + + folding_codes.add(code) + folding_codes.add(mapping) + + for code in sorted(folding_codes): + if code in folding_map: + folding = folding_map[code] + else: + folding = code + + if code in rev_folding_map: + rev_folding = rev_folding_map[code] + elif folding in rev_folding_map: + rev_folding = [c for c in rev_folding_map[folding] if c != code] + else: + rev_folding = [] + + if folding != code or len(rev_folding): + item = [code] + if folding != code: + item.append(folding) + folding_tests.append(item + rev_folding) + + if code > MAX_BMP: + continue + + folding_d = folding - code + + assert folding_d > -65535 and folding_d < 65535 + + folding = folding_d & 0xFFFF + + item = (folding,) + + i = folding_cache.get(item) + if i is None: + assert item not in folding_table + folding_cache[item] = i = len(folding_table) + folding_table.append(item) + folding_index[code] = i + return (folding_table, folding_index, folding_tests) + + +def process_special_casing(special_casing, table, index): + # Unconditional special casing. + unconditional_tolower = {} + unconditional_toupper = {} + + # Conditional special casing, language independent. + conditional_tolower = {} + conditional_toupper = {} + + # Conditional special casing, language dependent. + lang_conditional_tolower = {} + lang_conditional_toupper = {} + + def caseInfo(code): + (upper, lower, flags) = table[index[code]] + return ((code + lower) & 0xFFFF, (code + upper) & 0xFFFF) + + for (code, lower, upper, languages, contexts) in read_special_casing( + special_casing + ): + assert code <= MAX_BMP, "Unexpected character outside of BMP: %s" % code + assert len(languages) <= 1, "Expected zero or one language ids: %s" % languages + assert len(contexts) <= 1, ( + "Expected zero or one casing contexts: %s" % languages + ) + + (default_lower, default_upper) = caseInfo(code) + special_lower = len(lower) != 1 or lower[0] != default_lower + special_upper = len(upper) != 1 or upper[0] != default_upper + + # Invariant: If |code| has casing per UnicodeData.txt, then it also has + # casing rules in SpecialCasing.txt. + assert code == default_lower or len(lower) != 1 or code != lower[0] + assert code == default_upper or len(upper) != 1 or code != upper[0] + + language = languages[0] if languages else None + context = contexts[0] if contexts else None + + if not language and not context: + if special_lower: + unconditional_tolower[code] = lower + if special_upper: + unconditional_toupper[code] = upper + elif not language and context: + if special_lower: + conditional_tolower[code] = (lower, context) + if special_upper: + conditional_toupper[code] = (upper, context) + else: + if language not in lang_conditional_tolower: + lang_conditional_tolower[language] = {} + lang_conditional_toupper[language] = {} + if special_lower: + lang_conditional_tolower[language][code] = (lower, context) + if special_upper: + lang_conditional_toupper[language][code] = (upper, context) + + # Certain special casing rules are inlined in jsstr.cpp, ensure these cases + # still match the current SpecialCasing.txt file. 
+ def lowerCase(code): + (lower, _) = caseInfo(code) + return lower + + def upperCase(code): + (_, upper) = caseInfo(code) + return upper + + def ascii(char_dict): + return (ch for ch in char_dict.keys() if ch <= 0x7F) + + def latin1(char_dict): + return (ch for ch in char_dict.keys() if ch <= 0xFF) + + def is_empty(iterable): + return not any(True for _ in iterable) + + def is_equals(iter1, iter2): + return all(x == y for (x, y) in zip_longest(iter1, iter2)) + + # Ensure no ASCII characters have special case mappings. + assert is_empty(ascii(unconditional_tolower)) + assert is_empty(ascii(unconditional_toupper)) + assert is_empty(ascii(conditional_tolower)) + assert is_empty(ascii(conditional_toupper)) + + # Ensure no Latin1 characters have special lower case mappings. + assert is_empty(latin1(unconditional_tolower)) + assert is_empty(latin1(conditional_tolower)) + + # Ensure no Latin1 characters have conditional special upper case mappings. + assert is_empty(latin1(conditional_toupper)) + + # Ensure U+00DF is the only Latin1 character with a special upper case mapping. + assert is_equals([0x00DF], latin1(unconditional_toupper)) + + # Ensure U+0130 is the only character with a special lower case mapping. + assert is_equals([0x0130], unconditional_tolower) + + # Ensure no characters have language independent conditional upper case mappings. + assert is_empty(conditional_toupper) + + # Ensure U+03A3 is the only character with language independent conditional lower case mapping. + assert is_equals([0x03A3], conditional_tolower) + + # Verify U+0130 and U+03A3 have simple lower case mappings. + assert all(ch != lowerCase(ch) for ch in [0x0130, 0x03A3]) + + # Ensure Azeri, Lithuanian, and Turkish are the only languages with conditional case mappings. + assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_tolower.keys())) + assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_toupper.keys())) + + # Maximum case mapping length is three characters. + assert ( + max( + map( + len, + chain( + unconditional_tolower.values(), + unconditional_toupper.values(), + map(itemgetter(0), conditional_tolower.values()), + map(itemgetter(0), conditional_toupper.values()), + map( + itemgetter(0), + chain.from_iterable( + d.values() for d in lang_conditional_tolower.values() + ), + ), + map( + itemgetter(0), + chain.from_iterable( + d.values() for d in lang_conditional_toupper.values() + ), + ), + ), + ) + ) + <= 3 + ) + + # Ensure all case mapping contexts are known (see Unicode 9.0, §3.13 Default Case Algorithms). + assert set( + [ + "After_I", + "After_Soft_Dotted", + "Final_Sigma", + "More_Above", + "Not_Before_Dot", + ] + ).issuperset( + set( + filter( + partial(is_not, None), + chain( + map(itemgetter(1), conditional_tolower.values()), + map(itemgetter(1), conditional_toupper.values()), + map( + itemgetter(1), + chain.from_iterable( + d.values() for d in lang_conditional_tolower.values() + ), + ), + map( + itemgetter(1), + chain.from_iterable( + d.values() for d in lang_conditional_toupper.values() + ), + ), + ), + ) + ) + ) + + # Special casing for U+00DF (LATIN SMALL LETTER SHARP S). + assert upperCase(0x00DF) == 0x00DF and unconditional_toupper[0x00DF] == [ + 0x0053, + 0x0053, + ] + + # Special casing for U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE). + assert unconditional_tolower[0x0130] == [0x0069, 0x0307] + + # Special casing for U+03A3 (GREEK CAPITAL LETTER SIGMA). 
+ assert lowerCase(0x03A3) == 0x03C3 and conditional_tolower[0x03A3] == ( + [0x03C2], + "Final_Sigma", + ) + + return (unconditional_tolower, unconditional_toupper) + + +def make_non_bmp_file(version, non_bmp_lower_map, non_bmp_upper_map, codepoint_table): + file_name = "UnicodeNonBMP.h" + with io.open(file_name, mode="w", encoding="utf-8") as non_bmp_file: + non_bmp_file.write(mpl_license) + non_bmp_file.write("\n") + non_bmp_file.write(warning_message) + non_bmp_file.write(unicode_version_message.format(version)) + non_bmp_file.write( + """ +#ifndef util_UnicodeNonBMP_h +#define util_UnicodeNonBMP_h + +// |MACRO| receives the following arguments +// MACRO(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) +// FROM: code point where the range starts +// TO: code point where the range ends +// LEAD: common lead surrogate of FROM and TO +// TRAIL_FROM: trail surrogate of FROM +// TRAIL_FROM: trail surrogate of TO +// DIFF: the difference between the code point in the range and +// converted code point + +""" + ) + + make_non_bmp_convert_macro( + non_bmp_file, "LOWERCASE", non_bmp_lower_map, codepoint_table + ) + non_bmp_file.write("\n") + make_non_bmp_convert_macro( + non_bmp_file, "UPPERCASE", non_bmp_upper_map, codepoint_table + ) + + non_bmp_file.write( + """ +#endif /* util_UnicodeNonBMP_h */ +""" + ) + + +def write_special_casing_methods(unconditional_toupper, codepoint_table, println): + def hexlit(n): + """Returns C++ hex-literal for |n|.""" + return "0x{:04X}".format(n) + + def describe_range(ranges, depth): + indent = depth * " " + for (start, end) in ranges: + if start == end: + println(indent, "// {}".format(codepoint_table.full_name(start))) + else: + println( + indent, + "// {} .. {}".format( + codepoint_table.full_name(start), codepoint_table.full_name(end) + ), + ) + + def out_range(start, end): + """Tests if the input character isn't a member of the set {x | start <= x <= end}.""" + if start == end: + return "ch != {}".format(hexlit(start)) + return "ch < {} || ch > {}".format(hexlit(start), hexlit(end)) + + def in_range(start, end, parenthesize=False): + """Tests if the input character is in the set {x | start <= x <= end}.""" + if start == end: + return "ch == {}".format(hexlit(start)) + (left, right) = ("(", ")") if parenthesize else ("", "") + return "{}ch >= {} && ch <= {}{}".format( + left, hexlit(start), hexlit(end), right + ) + + def in_any_range(ranges, spaces): + """Tests if the input character is included in any of the given ranges.""" + lines = [[]] + for (start, end) in ranges: + expr = in_range(start, end, parenthesize=True) + line = " || ".join(lines[-1] + [expr]) + if len(line) < (100 - len(spaces) - len(" ||")): + lines[-1].append(expr) + else: + lines.append([expr]) + return " ||\n{}".format(spaces).join(" || ".join(t) for t in lines) + + def write_range_accept(parent_list, child_list, depth): + """Accepts the input character if it matches any code unit in |child_list|.""" + (min_parent, max_parent) = (parent_list[0], parent_list[-1]) + (min_child, max_child) = (child_list[0], child_list[-1]) + assert min_child >= min_parent + assert max_child <= max_parent + indent = depth * " " + + child_ranges = list(int_ranges(child_list)) + has_successor = max_child != max_parent + + # If |child_list| is a contiguous list of code units, emit a simple + # range check: |min_child <= input <= max_child|. 
+ if len(child_ranges) == 1: + describe_range(child_ranges, depth) + if has_successor: + println(indent, "if (ch <= {}) {{".format(hexlit(max_child))) + println(indent, " return ch >= {};".format(hexlit(min_child))) + println(indent, "}") + else: + println(indent, "return {};".format(in_range(min_child, max_child))) + return + + # Otherwise create a disjunction over the subranges in |child_ranges|. + if not has_successor: + spaces = indent + len("return ") * " " + else: + spaces = indent + len(" return ") * " " + range_test_expr = in_any_range(child_ranges, spaces) + + if min_child != min_parent: + println(indent, "if (ch < {}) {{".format(hexlit(min_child))) + println(indent, " return false;") + println(indent, "}") + + # If there's no successor block, we can omit the |input <= max_child| check, + # because it was already checked when we emitted the parent range test. + if not has_successor: + describe_range(child_ranges, depth) + println(indent, "return {};".format(range_test_expr)) + else: + println(indent, "if (ch <= {}) {{".format(hexlit(max_child))) + describe_range(child_ranges, depth + 1) + println(indent, " return {};".format(range_test_expr)) + println(indent, "}") + + def write_ChangesWhenUpperCasedSpecialCasing(): + """Checks if the input has a special upper case mapping.""" + println("bool") + println("js::unicode::ChangesWhenUpperCasedSpecialCasing(char16_t ch)") + println("{") + + assert unconditional_toupper, "|unconditional_toupper| is not empty" + + # Sorted list of code units with special upper case mappings. + code_list = sorted(unconditional_toupper.keys()) + + # Fail-fast if the input character isn't a special casing character. + println(" if ({}) {{".format(out_range(code_list[0], code_list[-1]))) + println(" return false;") + println(" }") + + for i in range(0, 16): + # Check if the input characters is in the range: + # |start_point <= input < end_point|. + start_point = i << 12 + end_point = (i + 1) << 12 + matches = [cu for cu in code_list if start_point <= cu < end_point] + + # Skip empty ranges. + if not matches: + continue + + # If |matches| consists of only a few characters, directly check + # the input against the characters in |matches|. + if len(matches) <= 8: + write_range_accept(code_list, matches, depth=1) + continue + + # Otherwise split into further subranges. + + # Only enter the if-block if the input is less-or-equals to the + # largest value in the current range. 
+ is_last_block = matches[-1] == code_list[-1] + if not is_last_block: + println(" if (ch <= {}) {{".format(hexlit(matches[-1]))) + else: + println(" if (ch < {}) {{".format(hexlit(matches[0]))) + println(" return false;") + println(" }") + + for j in range(0, 16): + inner_start = start_point + (j << 8) + inner_end = start_point + ((j + 1) << 8) + inner_matches = [cu for cu in matches if inner_start <= cu < inner_end] + + if inner_matches: + d = 1 if is_last_block else 2 + write_range_accept(matches, inner_matches, depth=d) + + if not is_last_block: + println(" }") + + println("}") + + def write_LengthUpperCaseSpecialCasing(): + """Slow case: Special casing character was found, returns its mapping length.""" + println("size_t") + println("js::unicode::LengthUpperCaseSpecialCasing(char16_t ch)") + println("{") + + println(" switch(ch) {") + for (code, converted) in sorted( + unconditional_toupper.items(), key=itemgetter(0) + ): + println( + " case {}: return {}; // {}".format( + hexlit(code), len(converted), codepoint_table.name(code) + ) + ) + println(" }") + println("") + println(' MOZ_ASSERT_UNREACHABLE("Bad character input.");') + println(" return 0;") + + println("}") + + def write_AppendUpperCaseSpecialCasing(): + """Slow case: Special casing character was found, append its mapping characters.""" + println("void") + println( + "js::unicode::AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index)" # NOQA: E501 + ) + println("{") + + println(" switch(ch) {") + for (code, converted) in sorted( + unconditional_toupper.items(), key=itemgetter(0) + ): + println( + " case {}: // {}".format(hexlit(code), codepoint_table.name(code)) + ) + for ch in converted: + println( + " elements[(*index)++] = {}; // {}".format( + hexlit(ch), codepoint_table.name(ch) + ) + ) + println(" return;") + println(" }") + println("") + println(' MOZ_ASSERT_UNREACHABLE("Bad character input.");') + + println("}") + + write_ChangesWhenUpperCasedSpecialCasing() + println("") + write_LengthUpperCaseSpecialCasing() + println("") + write_AppendUpperCaseSpecialCasing() + + +def write_ascii_lookup_tables(table, index, write, println): + def is_id_compat(code): + return code == ord("\N{DOLLAR SIGN}") or code == ord("\N{LOW LINE}") + + def is_id_start(code): + (upper, lower, flags) = table[index[code]] + return (flags & FLAG_UNICODE_ID_START) or is_id_compat(code) + + def is_id_continue(code): + (upper, lower, flags) = table[index[code]] + return (flags & FLAG_UNICODE_ID_CONTINUE_ONLY) or is_id_start(code) + + def is_space(code): + (upper, lower, flags) = table[index[code]] + return flags & FLAG_SPACE + + def write_entries(name, predicate): + println("const bool unicode::{}[] = {{".format(name)) + header = "".join("{0: <6}".format(x) for x in range(0, 10)).rstrip() + println("/* {} */".format(header)) + for i in range(0, 13): + write("/* {0: >2} */".format(i)) + for j in range(0, 10): + code = i * 10 + j + if code <= 0x7F: + write(" {},".format("true" if predicate(code) else "____")) + println("") + println("};") + + println("") + println("#define ____ false") + + println( + """ +/* + * Identifier start chars: + * - 36: $ + * - 65..90: A..Z + * - 95: _ + * - 97..122: a..z + */""" + ) + write_entries("js_isidstart", is_id_start) + + println( + """ +/* + * Identifier chars: + * - 36: $ + * - 48..57: 0..9 + * - 65..90: A..Z + * - 95: _ + * - 97..122: a..z + */""" + ) + write_entries("js_isident", is_id_continue) + + println( + """ +/* Whitespace chars: '\\t', '\\n', '\\v', '\\f', '\\r', ' '. 
*/""" + ) + write_entries("js_isspace", is_space) + + println("") + println("#undef ____") + + +def write_latin1_lookup_tables(table, index, write, println): + def case_info(code): + assert 0 <= code and code <= MAX_BMP + (upper, lower, flags) = table[index[code]] + return ((code + upper) & 0xFFFF, (code + lower) & 0xFFFF, flags) + + def toLowerCase(code): + (_, lower, _) = case_info(code) + assert lower <= 0xFF, "lower-case of Latin-1 is always Latin-1" + return lower + + def write_entries(name, mapper): + println("const JS::Latin1Char unicode::{}[] = {{".format(name)) + header = "".join("{0: <6}".format(x) for x in range(0, 16)).rstrip() + println("/* {} */".format(header)) + for i in range(0, 16): + write("/* {0: >2} */".format(i)) + for j in range(0, 16): + code = i * 16 + j + if code <= 0xFF: + write(" 0x{:02X},".format(mapper(code))) + println("") + println("};") + + println("") + write_entries("latin1ToLowerCaseTable", toLowerCase) + + +def make_bmp_mapping_test( + version, codepoint_table, unconditional_tolower, unconditional_toupper +): + def unicodeEsc(n): + return "\\u{:04X}".format(n) + + file_name = "../tests/non262/String/string-upper-lower-mapping.js" + with io.open(file_name, mode="w", encoding="utf-8") as output: + write = partial(print, file=output, sep="", end="") + println = partial(print, file=output, sep="", end="\n") + + write(warning_message) + write(unicode_version_message.format(version)) + write(public_domain) + println("var mapping = [") + for code in range(0, MAX_BMP + 1): + entry = codepoint_table.get(code) + + if entry: + (upper, lower, _, _) = entry + upper = ( + unconditional_toupper[code] + if code in unconditional_toupper + else [upper] + ) + lower = ( + unconditional_tolower[code] + if code in unconditional_tolower + else [lower] + ) + println( + ' ["{}", "{}"], /* {} */'.format( + "".join(map(unicodeEsc, upper)), + "".join(map(unicodeEsc, lower)), + codepoint_table.name(code), + ) + ) + else: + println(' ["{0}", "{0}"],'.format(unicodeEsc(code))) + println("];") + write( + """ +assertEq(mapping.length, 0x10000); +for (var i = 0; i <= 0xffff; i++) { + var char = String.fromCharCode(i); + var info = mapping[i]; + + assertEq(char.toUpperCase(), info[0]); + assertEq(char.toLowerCase(), info[1]); +} + +if (typeof reportCompare === "function") + reportCompare(true, true); +""" + ) + + +def make_non_bmp_mapping_test( + version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table +): + file_name = "../tests/non262/String/string-code-point-upper-lower-mapping.js" + with io.open(file_name, mode="w", encoding="utf-8") as test_non_bmp_mapping: + test_non_bmp_mapping.write(warning_message) + test_non_bmp_mapping.write(unicode_version_message.format(version)) + test_non_bmp_mapping.write(public_domain) + + for code in sorted(non_bmp_upper_map.keys()): + test_non_bmp_mapping.write( + """\ +assertEq(String.fromCodePoint(0x{:04X}).toUpperCase().codePointAt(0), 0x{:04X}); // {}, {} +""".format( + code, + non_bmp_upper_map[code], + codepoint_table.name(code), + codepoint_table.name(non_bmp_upper_map[code]), + ) + ) + + for code in sorted(non_bmp_lower_map.keys()): + test_non_bmp_mapping.write( + """\ +assertEq(String.fromCodePoint(0x{:04X}).toLowerCase().codePointAt(0), 0x{:04X}); // {}, {} +""".format( + code, + non_bmp_lower_map[code], + codepoint_table.name(code), + codepoint_table.name(non_bmp_lower_map[code]), + ) + ) + + test_non_bmp_mapping.write( + """ +if (typeof reportCompare === "function") + reportCompare(true, true); +""" + ) + + +def 
make_space_test(version, test_space_table, codepoint_table): + def hex_and_name(c): + return " 0x{:04X} /* {} */".format(c, codepoint_table.name(c)) + + file_name = "../tests/non262/String/string-space-trim.js" + with io.open(file_name, mode="w", encoding="utf-8") as test_space: + test_space.write(warning_message) + test_space.write(unicode_version_message.format(version)) + test_space.write(public_domain) + test_space.write("var onlySpace = String.fromCharCode(\n") + test_space.write(",\n".join(map(hex_and_name, test_space_table))) + test_space.write("\n);\n") + test_space.write( + """ +assertEq(onlySpace.trim(), ""); +assertEq((onlySpace + 'aaaa').trim(), 'aaaa'); +assertEq(('aaaa' + onlySpace).trim(), 'aaaa'); +assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa'); + +if (typeof reportCompare === "function") + reportCompare(true, true); +""" + ) + + +def make_regexp_space_test(version, test_space_table, codepoint_table): + def hex_and_name(c): + return " 0x{:04X} /* {} */".format(c, codepoint_table.name(c)) + + file_name = "../tests/non262/RegExp/character-class-escape-s.js" + with io.open(file_name, mode="w", encoding="utf-8") as test_space: + test_space.write(warning_message) + test_space.write(unicode_version_message.format(version)) + test_space.write(public_domain) + test_space.write("var onlySpace = String.fromCodePoint(\n") + test_space.write(",\n".join(map(hex_and_name, test_space_table))) + test_space.write("\n);\n") + test_space.write( + """ +assertEq(/^\s+$/.exec(onlySpace) !== null, true); +assertEq(/^[\s]+$/.exec(onlySpace) !== null, true); +assertEq(/^[^\s]+$/.exec(onlySpace) === null, true); + +assertEq(/^\S+$/.exec(onlySpace) === null, true); +assertEq(/^[\S]+$/.exec(onlySpace) === null, true); +assertEq(/^[^\S]+$/.exec(onlySpace) !== null, true); + +// Also test with Unicode RegExps. 
+assertEq(/^\s+$/u.exec(onlySpace) !== null, true); +assertEq(/^[\s]+$/u.exec(onlySpace) !== null, true); +assertEq(/^[^\s]+$/u.exec(onlySpace) === null, true); + +assertEq(/^\S+$/u.exec(onlySpace) === null, true); +assertEq(/^[\S]+$/u.exec(onlySpace) === null, true); +assertEq(/^[^\S]+$/u.exec(onlySpace) !== null, true); + +if (typeof reportCompare === "function") + reportCompare(true, true); +""" + ) + + +def make_icase_test(version, folding_tests, codepoint_table): + def char_hex(c): + return "0x{:04X}".format(c) + + file_name = "../tests/non262/RegExp/unicode-ignoreCase.js" + with io.open(file_name, mode="w", encoding="utf-8") as test_icase: + test_icase.write(warning_message) + test_icase.write(unicode_version_message.format(version)) + test_icase.write(public_domain) + test_icase.write( + """ +var BUGNUMBER = 1135377; +var summary = "Implement RegExp unicode flag -- ignoreCase flag."; + +print(BUGNUMBER + ": " + summary); + +function test(code, ...equivs) { + var codeRe = new RegExp(String.fromCodePoint(code) + "+", "iu"); + var ans = String.fromCodePoint(code) + equivs.map(c => String.fromCodePoint(c)).join(""); + assertEqArray(codeRe.exec("<" + ans + ">"), [ans]); + codeRe = new RegExp("[" + String.fromCodePoint(code) + "]+", "iu"); + assertEqArray(codeRe.exec("<" + ans + ">"), [ans]); +} +""" + ) + for args in folding_tests: + test_icase.write( + "test({}); // {}\n".format( + ", ".join(map(char_hex, args)), + ", ".join(map(codepoint_table.name, args)), + ) + ) + test_icase.write( + """ +if (typeof reportCompare === "function") + reportCompare(true, true); +""" + ) + + +def make_unicode_file( + version, + table, + index, + folding_table, + folding_index, + non_bmp_space_set, + non_bmp_id_start_set, + non_bmp_id_cont_set, + unconditional_toupper, + codepoint_table, +): + index1, index2, shift = splitbins(index) + + # Don't forget to update CharInfo in Unicode.h if you need to change this + assert shift == 6 + + folding_index1, folding_index2, folding_shift = splitbins(folding_index) + + # Don't forget to update CaseFoldInfo in Unicode.h if you need to change this + assert folding_shift == 5 + + # verify correctness + for char in index: + test = table[index[char]] + + idx = index1[char >> shift] + idx = index2[(idx << shift) + (char & ((1 << shift) - 1))] + + assert test == table[idx] + + # verify correctness + for char in folding_index: + test = folding_table[folding_index[char]] + + idx = folding_index1[char >> folding_shift] + idx = folding_index2[ + (idx << folding_shift) + (char & ((1 << folding_shift) - 1)) + ] + + assert test == folding_table[idx] + + comment = """ +/* + * So how does indexing work? + * First let's have a look at a char16_t, 16-bits: + * [................] + * Step 1: + * Extracting the upper 11 bits from the char16_t. + * upper = char >> 5 ([***********.....]) + * Step 2: + * Using these bits to get an reduced index from index1. + * index = index1[upper] + * Step 3: + * Combining the index and the bottom 5 bits of the original char16_t. + * real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++]) + * + * The advantage here is that the biggest number in index1 doesn't need 10 bits, + * but 7 and we save some memory. + * + * Step 4: + * Get the character informations by looking up real_index in js_charinfo. 
+ * + * Pseudocode of generation: + * + * let table be the mapping of char16_t => js_charinfo_index + * let index1 be an empty array + * let index2 be an empty array + * let cache be a hash map + * + * while shift is less then maximal amount you can shift 0xffff before it's 0 + * let chunks be table split in chunks of size 2**shift + * + * for every chunk in chunks + * if chunk is in cache + * let index be cache[chunk] + * else + * let index be the max key of index2 + 1 + * for element in chunk + * push element to index2 + * put index as chunk in cache + * + * push index >> shift to index1 + * + * increase shift + * stop if you found the best shift + */ +""" + + def dump(data, name, println): + println("const uint8_t unicode::{}[] = {{".format(name)) + + line = pad = " " * 4 + lines = [] + for entry in data: + assert entry < 256 + s = str(entry) + s = s.rjust(3) + + if len(line + s) + 5 > 99: + lines.append(line.rstrip()) + line = pad + s + ", " + else: + line = line + s + ", " + lines.append(line.rstrip()) + + println("\n".join(lines)) + println("};") + + def write_table(data_type, name, tbl, idx1_name, idx1, idx2_name, idx2, println): + println("const {} unicode::{}[] = {{".format(data_type, name)) + for d in tbl: + println(" {{ {} }},".format(", ".join(str(e) for e in d))) + println("};") + println("") + + dump(idx1, idx1_name, println) + println("") + dump(idx2, idx2_name, println) + println("") + + def write_supplemental_identifier_method(name, group_set, println): + println("bool") + println("js::unicode::{}(char32_t codePoint)".format(name)) + println("{") + for (from_code, to_code) in int_ranges(group_set.keys()): + println( + " if (codePoint >= 0x{:X} && codePoint <= 0x{:X}) {{ // {} .. {}".format( + from_code, + to_code, + codepoint_table.name(from_code), + codepoint_table.name(to_code), + ) + ) + println(" return true;") + println(" }") + println(" return false;") + println("}") + println("") + + file_name = "Unicode.cpp" + with io.open(file_name, "w", encoding="utf-8") as data_file: + write = partial(print, file=data_file, sep="", end="") + println = partial(print, file=data_file, sep="", end="\n") + + write(warning_message) + write(unicode_version_message.format(version)) + write(public_domain) + println('#include "util/Unicode.h"') + println("") + println("using namespace js;") + println("using namespace js::unicode;") + write(comment) + + write_table( + "CharacterInfo", + "js_charinfo", + table, + "index1", + index1, + "index2", + index2, + println, + ) + + write_table( + "FoldingInfo", + "js_foldinfo", + folding_table, + "folding_index1", + folding_index1, + "folding_index2", + folding_index2, + println, + ) + + # If the following assert fails, it means space character is added to + # non-BMP area. In that case the following code should be uncommented + # and the corresponding code should be added to frontend. (At least + # unicode::IsSpace will require updating to handle this.) 
+ assert len(non_bmp_space_set.keys()) == 0 + + write_supplemental_identifier_method( + "IsIdentifierStartNonBMP", non_bmp_id_start_set, println + ) + + write_supplemental_identifier_method( + "IsIdentifierPartNonBMP", non_bmp_id_cont_set, println + ) + + write_special_casing_methods(unconditional_toupper, codepoint_table, println) + + write_ascii_lookup_tables(table, index, write, println) + + write_latin1_lookup_tables(table, index, write, println) + + +def getsize(data): + """return smallest possible integer size for the given array""" + maxdata = max(data) + assert maxdata < 2 ** 32 + + if maxdata < 256: + return 1 + elif maxdata < 65536: + return 2 + else: + return 4 + + +def splitbins(t): + """t -> (t1, t2, shift). Split a table to save space. + + t is a sequence of ints. This function can be useful to save space if + many of the ints are the same. t1 and t2 are lists of ints, and shift + is an int, chosen to minimize the combined size of t1 and t2 (in C + code), and where for each i in range(len(t)), + t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] + where mask is a bitmask isolating the last "shift" bits. + """ + + def dump(t1, t2, shift, bytes): + print( + "%d+%d bins at shift %d; %d bytes" % (len(t1), len(t2), shift, bytes), + file=sys.stderr, + ) + print("Size of original table:", len(t) * getsize(t), "bytes", file=sys.stderr) + + n = len(t) - 1 # last valid index + maxshift = 0 # the most we can shift n and still have something left + if n > 0: + while n >> 1: + n >>= 1 + maxshift += 1 + del n + bytes = sys.maxsize # smallest total size so far + t = tuple(t) # so slices can be dict keys + for shift in range(maxshift + 1): + t1 = [] + t2 = [] + size = 2 ** shift + bincache = {} + + for i in range(0, len(t), size): + bin = t[i : i + size] + + index = bincache.get(bin) + if index is None: + index = len(t2) + bincache[bin] = index + t2.extend(bin) + t1.append(index >> shift) + + # determine memory size + b = len(t1) * getsize(t1) + len(t2) * getsize(t2) + if b < bytes: + best = t1, t2, shift + bytes = b + t1, t2, shift = best + + print("Best:", end=" ", file=sys.stderr) + dump(t1, t2, shift, bytes) + + # exhaustively verify that the decomposition is correct + mask = 2 ** shift - 1 + for i in range(len(t)): + assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] + return best + + +def update_unicode(args): + base_path = os.getcwd() + + version = args.version + if version is not None: + baseurl = "https://unicode.org/Public" + if version == "UNIDATA": + url = "%s/%s" % (baseurl, version) + else: + url = "%s/%s/ucd" % (baseurl, version) + + print("Arguments:") + if version is not None: + print("\tVersion: %s" % version) + print("\tDownload url: %s" % url) + + request_url = "{}/UCD.zip".format(url) + with closing(urlopen(request_url)) as downloaded_file: + downloaded_data = io.BytesIO(downloaded_file.read()) + + with ZipFile(downloaded_data) as zip_file: + for fname in [ + "UnicodeData.txt", + "CaseFolding.txt", + "DerivedCoreProperties.txt", + "SpecialCasing.txt", + ]: + zip_file.extract(fname, path=base_path) + else: + print("\tUsing local files.") + print("\tAlways make sure you have the newest Unicode files!") + print("") + + def version_from_file(f, fname): + pat_version = re.compile(r"# %s-(?P<version>\d+\.\d+\.\d+).txt" % fname) + return pat_version.match(f.readline()).group("version") + + with io.open( + os.path.join(base_path, "UnicodeData.txt"), "r", encoding="utf-8" + ) as unicode_data, io.open( + os.path.join(base_path, "CaseFolding.txt"), "r", encoding="utf-8" + ) 
as case_folding, io.open( + os.path.join(base_path, "DerivedCoreProperties.txt"), "r", encoding="utf-8" + ) as derived_core_properties, io.open( + os.path.join(base_path, "SpecialCasing.txt"), "r", encoding="utf-8" + ) as special_casing: + unicode_version = version_from_file( + derived_core_properties, "DerivedCoreProperties" + ) + + print("Processing...") + ( + table, + index, + non_bmp_lower_map, + non_bmp_upper_map, + non_bmp_space_set, + non_bmp_id_start_set, + non_bmp_id_cont_set, + codepoint_table, + test_space_table, + ) = process_unicode_data(unicode_data, derived_core_properties) + (folding_table, folding_index, folding_tests) = process_case_folding( + case_folding + ) + (unconditional_tolower, unconditional_toupper) = process_special_casing( + special_casing, table, index + ) + + print("Generating...") + make_unicode_file( + unicode_version, + table, + index, + folding_table, + folding_index, + non_bmp_space_set, + non_bmp_id_start_set, + non_bmp_id_cont_set, + unconditional_toupper, + codepoint_table, + ) + make_non_bmp_file( + unicode_version, non_bmp_lower_map, non_bmp_upper_map, codepoint_table + ) + + make_bmp_mapping_test( + unicode_version, codepoint_table, unconditional_tolower, unconditional_toupper + ) + make_non_bmp_mapping_test( + unicode_version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table + ) + make_space_test(unicode_version, test_space_table, codepoint_table) + make_regexp_space_test(unicode_version, test_space_table, codepoint_table) + make_icase_test(unicode_version, folding_tests, codepoint_table) + + +if __name__ == "__main__": + import argparse + + # This script must be run from js/src/util to work correctly. + if "/".join(os.path.normpath(os.getcwd()).split(os.sep)[-3:]) != "js/src/util": + raise RuntimeError("%s must be run from js/src/util" % sys.argv[0]) + + parser = argparse.ArgumentParser(description="Update Unicode data.") + + parser.add_argument( + "--version", + help='Optional Unicode version number. If specified, downloads the\ + selected version from <https://unicode.org/Public>. If not specified\ + uses the existing local files to generate the Unicode data. The\ + number must match a published Unicode version, e.g. use\ + "--version=8.0.0" to download Unicode 8 files. Alternatively use\ + "--version=UNIDATA" to download the latest published version.', + ) + + parser.set_defaults(func=update_unicode) + + args = parser.parse_args() + args.func(args) |
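
A note on the UAX #44 range rows that read_unicode_data() above expands: a "<Name, First>"/"<Name, Last>" pair of rows in UnicodeData.txt stands for every code point between the two entries. Below is a minimal Python 3 sketch of just that expansion, assuming abbreviated sample rows (only the first three UnicodeData.txt fields) and a hypothetical expand_ranges() helper:

import csv
import io

# Two abbreviated UnicodeData.txt rows marking a code point range
# (sample data; the real file carries 15 fields per row).
sample = io.StringIO(
    "4E00;<CJK Ideograph, First>;Lo\n"
    "9FEF;<CJK Ideograph, Last>;Lo\n"
)

def expand_ranges(lines):
    reader = csv.reader(lines, delimiter=";")
    for row in reader:
        name = row[1]
        if name.startswith("<") and name.endswith("First>"):
            # The next row is the matching "<Name, Last>" entry.
            last = next(reader)
            for cp in range(int(row[0], 16), int(last[0], 16) + 1):
                yield cp, name[1:-8]  # strip "<" and ", First>"
        else:
            yield int(row[0], 16), name

rows = list(expand_ranges(sample))
assert rows[0] == (0x4E00, "CJK Ideograph")
assert len(rows) == 0x9FEF - 0x4E00 + 1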
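
The utf16_encode() helper splits a supplementary-plane code point (>= 0x10000) into a UTF-16 surrogate pair; make_non_bmp_convert_macro() leans on it to group non-BMP case mappings that share a lead surrogate. A round-trip check of that arithmetic, with a hypothetical utf16_decode() added here only for the inverse direction:

def utf16_encode(code):
    # As in the script: split code - 0x10000 into a 10/10 bit pair.
    NonBMPMin = 0x10000
    LeadSurrogateMin = 0xD800
    TrailSurrogateMin = 0xDC00
    lead = (code - NonBMPMin) // 1024 + LeadSurrogateMin
    trail = ((code - NonBMPMin) % 1024) + TrailSurrogateMin
    return lead, trail

def utf16_decode(lead, trail):
    # Inverse mapping, per the UTF-16 definition.
    return 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00)

assert utf16_encode(0x1F600) == (0xD83D, 0xDE00)  # GRINNING FACE
for cp in (0x10000, 0x10428, 0x1D7FF, 0x10FFFF):
    assert utf16_decode(*utf16_encode(cp)) == cp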
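
Finally, the generated tables depend on the two-level lookup that splitbins() builds and that the "So how does indexing work?" comment walks through: index1 is consulted with the upper bits of the char16_t, and index2 with that result combined with the lower bits. A self-contained sketch of the compression, assuming a made-up toy index; unlike the real splitbins() it scores candidate splits by entry count instead of byte width (getsize):

def splitbins(t):
    """Split t into (t1, t2, shift) so that
    t[i] == t2[(t1[i >> shift] << shift) + (i & ((1 << shift) - 1))]."""
    best = None
    n = len(t)
    for shift in range(max(n - 1, 1).bit_length() + 1):
        size = 1 << shift
        t1, t2, cache = [], [], {}
        for i in range(0, n, size):
            chunk = tuple(t[i:i + size])
            if chunk not in cache:          # deduplicate repeated chunks
                cache[chunk] = len(t2)
                t2.extend(chunk)
            t1.append(cache[chunk] >> shift)
        cost = len(t1) + len(t2)            # the real script weighs byte sizes
        if best is None or cost < best[0]:
            best = (cost, t1, t2, shift)
    return best[1], best[2], best[3]

# Toy char16_t -> charinfo index with lots of repetition.
index = [0] * 64 + [1, 2, 1, 2] * 16 + [0] * 64
t1, t2, shift = splitbins(index)
mask = (1 << shift) - 1
for ch in range(len(index)):
    assert index[ch] == t2[(t1[ch >> shift] << shift) + (ch & mask)]
print(len(index), "entries ->", len(t1), "+", len(t2), "at shift", shift)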