diff options
Diffstat (limited to 'js/src/irregexp/imported/gen-regexp-special-case.cc')
-rw-r--r-- | js/src/irregexp/imported/gen-regexp-special-case.cc | 214 |
1 files changed, 214 insertions, 0 deletions
diff --git a/js/src/irregexp/imported/gen-regexp-special-case.cc b/js/src/irregexp/imported/gen-regexp-special-case.cc new file mode 100644 index 0000000000..8f6557ed30 --- /dev/null +++ b/js/src/irregexp/imported/gen-regexp-special-case.cc @@ -0,0 +1,214 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <fstream> +#include <iomanip> +#include <iostream> +#include <sstream> + +#include "irregexp/imported/special-case.h" +#include "unicode/usetiter.h" + +namespace v8 { +namespace internal { + +static const base::uc32 kSurrogateStart = 0xd800; +static const base::uc32 kSurrogateEnd = 0xdfff; +static const base::uc32 kNonBmpStart = 0x10000; + +// The following code generates "src/regexp/special-case.cc". +void PrintSet(std::ofstream& out, const char* name, + const icu::UnicodeSet& set) { + out << "icu::UnicodeSet Build" << name << "() {\n" + << " icu::UnicodeSet set;\n"; + for (int32_t i = 0; i < set.getRangeCount(); i++) { + if (set.getRangeStart(i) == set.getRangeEnd(i)) { + out << " set.add(0x" << set.getRangeStart(i) << ");\n"; + } else { + out << " set.add(0x" << set.getRangeStart(i) << ", 0x" + << set.getRangeEnd(i) << ");\n"; + } + } + out << " set.freeze();\n" + << " return set;\n" + << "}\n\n"; + + out << "struct " << name << "Data {\n" + << " " << name << "Data() : set(Build" << name << "()) {}\n" + << " const icu::UnicodeSet set;\n" + << "};\n\n"; + + out << "//static\n" + << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n" + << " static base::LazyInstance<" << name << "Data>::type set =\n" + << " LAZY_INSTANCE_INITIALIZER;\n" + << " return set.Pointer()->set;\n" + << "}\n\n"; +} + +void PrintSpecial(std::ofstream& out) { + icu::UnicodeSet current; + icu::UnicodeSet special_add; + icu::UnicodeSet ignore; + UErrorCode status = U_ZERO_ERROR; + icu::UnicodeSet upper("[\\p{Lu}]", status); + CHECK(U_SUCCESS(status)); + + // Iterate through all chars in BMP except surrogates. + for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) { + if (i >= static_cast<UChar32>(kSurrogateStart) && + i <= static_cast<UChar32>(kSurrogateEnd)) { + continue; // Ignore surrogate range + } + current.set(i, i); + current.closeOver(USET_CASE_INSENSITIVE); + + // Check to see if all characters in the case-folding equivalence + // class as defined by UnicodeSet::closeOver all map to the same + // canonical value. + UChar32 canonical = RegExpCaseFolding::Canonicalize(i); + bool class_has_matching_canonical_char = false; + bool class_has_non_matching_canonical_char = false; + for (int32_t j = 0; j < current.getRangeCount(); j++) { + for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j); + c++) { + if (c == i) { + continue; + } + UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c); + if (canonical == other_canonical) { + class_has_matching_canonical_char = true; + } else { + class_has_non_matching_canonical_char = true; + } + } + } + // If any other character in i's equivalence class has a + // different canonical value, then i needs special handling. If + // no other character shares a canonical value with i, we can + // ignore i when adding alternatives for case-independent + // comparison. If at least one other character shares a + // canonical value, then i needs special handling. + if (class_has_non_matching_canonical_char) { + if (class_has_matching_canonical_char) { + special_add.add(i); + } else { + ignore.add(i); + } + } + } + + // Verify that no Unicode equivalence class contains two non-trivial + // JS equivalence classes. Every character in SpecialAddSet has the + // same canonical value as every other non-IgnoreSet character in + // its Unicode equivalence class. Therefore, if we call closeOver on + // a set containing no IgnoreSet characters, the only characters + // that must be removed from the result are in IgnoreSet. This fact + // is used in CharacterRange::AddCaseEquivalents. + for (int32_t i = 0; i < special_add.getRangeCount(); i++) { + for (UChar32 c = special_add.getRangeStart(i); + c <= special_add.getRangeEnd(i); c++) { + UChar32 canonical = RegExpCaseFolding::Canonicalize(c); + current.set(c, c); + current.closeOver(USET_CASE_INSENSITIVE); + current.removeAll(ignore); + for (int32_t j = 0; j < current.getRangeCount(); j++) { + for (UChar32 c2 = current.getRangeStart(j); + c2 <= current.getRangeEnd(j); c2++) { + CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2)); + } + } + } + } + + PrintSet(out, "IgnoreSet", ignore); + PrintSet(out, "SpecialAddSet", special_add); +} + +void PrintUnicodeSpecial(std::ofstream& out) { + icu::UnicodeSet non_simple_folding; + icu::UnicodeSet current; + UErrorCode status = U_ZERO_ERROR; + // Look at all characters except white spaces. + icu::UnicodeSet interestingCP(u"[^[:White_Space:]]", status); + CHECK_EQ(status, U_ZERO_ERROR); + icu::UnicodeSetIterator iter(interestingCP); + while (iter.next()) { + UChar32 c = iter.getCodepoint(); + current.set(c, c); + current.closeOver(USET_CASE_INSENSITIVE).removeAllStrings(); + CHECK(!current.isBogus()); + // Remove characters from the closeover that have a simple case folding. + icu::UnicodeSet toRemove; + icu::UnicodeSetIterator closeOverIter(current); + while (closeOverIter.next()) { + UChar32 closeOverChar = closeOverIter.getCodepoint(); + UChar32 closeOverSCF = u_foldCase(closeOverChar, U_FOLD_CASE_DEFAULT); + if (closeOverChar != closeOverSCF) { + toRemove.add(closeOverChar); + } + } + CHECK(!toRemove.isBogus()); + current.removeAll(toRemove); + + // The current character and its simple case folding are also always OK. + UChar32 scf = u_foldCase(c, U_FOLD_CASE_DEFAULT); + current.remove(c); + current.remove(scf); + + // If there are any characters remaining, they were added due to full case + // foldings and shouldn't match the current charcter according to the spec. + if (!current.isEmpty()) { + // Ensure that the character doesn't have a simple case folding. + // Otherwise the current approach of simply removing the character from + // the set before calling closeOver won't work. + CHECK_EQ(c, scf); + non_simple_folding.add(c); + } + } + CHECK(!non_simple_folding.isBogus()); + + PrintSet(out, "UnicodeNonSimpleCloseOverSet", non_simple_folding); +} + +void WriteHeader(const char* header_filename) { + std::ofstream out(header_filename); + out << std::hex << std::setfill('0') << std::setw(4); + out << "// Copyright 2020 the V8 project authors. All rights reserved.\n" + << "// Use of this source code is governed by a BSD-style license that\n" + << "// can be found in the LICENSE file.\n\n" + << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n" + << "// The following functions are used to build UnicodeSets\n" + << "// for special cases where the case-folding algorithm used by\n" + << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n" + << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n" + << "// Semantics: Canonicalize) step 3.\n\n" + << "#ifdef V8_INTL_SUPPORT\n" + << "#include \"src/base/lazy-instance.h\"\n\n" + << "#include \"src/regexp/special-case.h\"\n\n" + << "#include \"unicode/uniset.h\"\n" + << "namespace v8 {\n" + << "namespace internal {\n\n"; + + PrintSpecial(out); + PrintUnicodeSpecial(out); + + out << "\n" + << "} // namespace internal\n" + << "} // namespace v8\n" + << "#endif // V8_INTL_SUPPORT\n"; +} + +} // namespace internal +} // namespace v8 + +int main(int argc, const char** argv) { + if (argc != 2) { + std::cerr << "Usage: " << argv[0] << " <output filename>\n"; + std::exit(1); + } + v8::internal::WriteHeader(argv[1]); + + return 0; +} |