diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
commit | 2aa4a82499d4becd2284cdb482213d541b8804dd (patch) | |
tree | b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/unicode-segmentation/scripts/unicode_gen_breaktests.py | |
parent | Initial commit. (diff) | |
download | firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.tar.xz firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.zip |
Adding upstream version 86.0.1. (refs: upstream/86.0.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/unicode-segmentation/scripts/unicode_gen_breaktests.py')
-rwxr-xr-x | third_party/rust/unicode-segmentation/scripts/unicode_gen_breaktests.py | 197 |
1 files changed, 197 insertions, 0 deletions
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - auxiliary/GraphemeBreakTest.txt
# - auxiliary/WordBreakTest.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the testdata.rs file into git.

import fileinput
import os
import re

import unicode

# Markers used by the Unicode break-test files.
BREAK_MARK = '÷'      # a boundary is allowed here
NO_BREAK_MARK = '×'   # no boundary here


def load_test_data(f, optsplit=()):
    """Parse one Unicode break-test file into a list of test cases.

    `f` is the path of the table (e.g. "auxiliary/WordBreakTest.txt").  It is
    handed to `unicode.fetch` (presumably this places the file in the current
    directory -- confirm in unicode.py) and the file named by the basename of
    `f` is then read line by line.

    `optsplit` is a collection of rule numbers (strings such as '9.1'); a
    no-break position justified by one of these rules is still recorded as a
    split, see `process_split_info`.

    Returns a list of `(chars, info)` tuples, where `chars` is a list of
    code-point lists (one per segment) and `info` holds the rule number
    recorded at each split, so `len(chars) - 1 == len(info)`.
    """
    # group(1): the code-point/marker field between the outer '÷' marks
    # group(2): the rule annotations of the trailing comment, starting at the
    #           first marker after '÷ [0.2]' and ending before '÷ [0.3]'
    test_re = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")

    unicode.fetch(f)
    outls = []
    for line in fileinput.input(os.path.basename(f)):
        # lines that include a test start with the ÷ character
        # (startswith works whether the marker is one character or two bytes)
        if not line.startswith(BREAK_MARK):
            continue

        m = test_re.match(line)
        if not m:
            print("error: no match on line where test was expected: %s" % line)
            continue

        # process the characters in this test case
        chars = process_split_string(m.group(1))
        # skip test case if it contains invalid characters (viz., surrogates)
        if not chars:
            continue

        # now process test cases
        chars, info = process_split_info(m.group(2), chars, optsplit)

        # make sure that we have break info for each break!
        assert len(chars) - 1 == len(info), "break info does not match breaks"

        outls.append((chars, info))

    return outls


def process_split_info(s, c, o):
    """Regroup the segments in `c` according to the rule annotations in `s`.

    `s` is a sequence of '(÷|×) [x.y] ...' annotations, `c` the list of
    code-point lists produced by `process_split_string`, and `o` a collection
    of rule numbers that count as splits even when marked '×'.

    `c` is consumed in place (its items are popped from the front, and merged
    segments are extended in place).

    Returns `(segments, rules)`: the regrouped code-point lists and the rule
    number recorded at each split between them.
    """
    outcs = []
    outis = []
    workcs = c.pop(0)

    # are we on a × or a ÷?
    is_x = s.startswith(NO_BREAK_MARK)

    # find each instance of '(÷|×) [x.y] '
    while s:
        # find the currently considered rule number
        rule = s[s.index('[') + 1:s.index(']')]

        # if it's '× [a.b]' where 'a.b' is in o, then
        # we consider it a split even though it's not
        # marked as one
        # if it's ÷ then it's always a split
        if not is_x or rule in o:
            outis.append(rule)
            outcs.append(workcs)
            workcs = c.pop(0)
        else:
            workcs.extend(c.pop(0))

        # advance to the next marker; start at 1 to step past the current one
        idx = 1
        while idx < len(s):
            if s.startswith(NO_BREAK_MARK, idx):
                is_x = True
                break
            if s.startswith(BREAK_MARK, idx):
                is_x = False
                break
            idx += 1
        s = s[idx:]

    outcs.append(workcs)
    return (outcs, outis)


def process_split_string(s):
    """Split a field of hex code points and ÷/× markers into segments.

    Every marker (either kind) closes the current segment.  Returns a list of
    code-point lists, or an empty list when any code point is rejected by
    `unicode.is_surrogate`, so the caller can skip that test case.
    """
    outls = []
    workls = []

    for token in s.split():
        if token in (BREAK_MARK, NO_BREAK_MARK):
            outls.append(workls)
            workls = []
            continue

        ival = int(token, 16)

        if unicode.is_surrogate(ival):
            return []

        workls.append(ival)

    if workls:
        outls.append(workls)

    return outls


def _escape_codepoints(codepoints):
    """Render code points as the body of a Rust string literal (\\u{..})."""
    return "".join("\\u{%x}" % cp for cp in codepoints)


def showfun(x):
    """Format one test tuple as a Rust tuple literal.

    `x[0]` is the flat list of code points; every further element is a list of
    segments (each a list of code points).  The result has the form
    `("...",&["..",".."],&["..",".."])`.
    """
    slices = "],&[".join(
        ",".join('"%s"' % _escape_codepoints(seg) for seg in segments)
        for segments in x[1:]
    )
    return '("%s",&[%s])' % (_escape_codepoints(x[0]), slices)


def create_grapheme_data(f):
    """Write the TEST_SAME and TEST_DIFF grapheme tables to the open file `f`.

    Test cases whose extended grapheme clusters equal the clusters loaded with
    rules 9.1/9.2 treated as splits go to TEST_SAME; the others go to
    TEST_DIFF together with both segmentations.
    """
    # rules 9.1 and 9.2 are for extended graphemes only
    optsplits = ['9.1', '9.2']
    d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)

    test_same = []
    test_diff = []

    for (c, i) in d:
        allchars = [cn for s in c for cn in s]
        extgraphs = []

        # re-join the segments that were only split because of 9.1/9.2;
        # i[n] is the rule recorded between c[n] and c[n+1]
        extwork = list(c[0])
        for rule, segment in zip(i, c[1:]):
            if rule not in optsplits:
                extgraphs.append(extwork)
                extwork = []
            extwork.extend(segment)

        # these are the extended grapheme clusters
        extgraphs.append(extwork)

        if extgraphs == c:
            test_same.append((allchars, c))
        else:
            test_diff.append((allchars, extgraphs, c))

    stype = "&'static [(&'static str, &'static [&'static str])]"
    dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
    # NOTE(review): the leading whitespace inside the two literals below is
    # reproduced as it appears in the rendered diff, which may have collapsed
    # runs of spaces -- confirm against the upstream file.
    f.write(" // official Unicode test data\n")
    f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
    unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
    unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)


def create_words_data(f):
    """Write the TEST_WORD word-break table to the open file `f`."""
    d = load_test_data("auxiliary/WordBreakTest.txt")

    test = [([cn for s in c for cn in s], c) for (c, _) in d]

    wtype = "&'static [(&'static str, &'static [&'static str])]"
    # NOTE(review): leading whitespace in these literals is as shown in the
    # rendered diff and may have been collapsed -- confirm against upstream.
    f.write(" // official Unicode test data\n")
    f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
    unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)


if __name__ == "__main__":
    with open("testdata.rs", "w") as rf:
        rf.write(unicode.preamble)
        create_grapheme_data(rf)
        create_words_data(rf)