diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 03:57:31 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 03:57:31 +0000 |
commit | dc0db358abe19481e475e10c32149b53370f1a1c (patch) | |
tree | ab8ce99c4b255ce46f99ef402c27916055b899ee /vendor/unicode-segmentation | |
parent | Releasing progress-linux version 1.71.1+dfsg1-2~progress7.99u1. (diff) | |
download | rustc-dc0db358abe19481e475e10c32149b53370f1a1c.tar.xz rustc-dc0db358abe19481e475e10c32149b53370f1a1c.zip |
Merging upstream version 1.72.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/unicode-segmentation')
-rw-r--r-- | vendor/unicode-segmentation/.cargo-checksum.json | 2 | ||||
-rw-r--r-- | vendor/unicode-segmentation/Cargo.toml | 2 | ||||
-rw-r--r-- | vendor/unicode-segmentation/README.md | 2 | ||||
-rw-r--r-- | vendor/unicode-segmentation/benches/unicode_words.rs | 55 | ||||
-rw-r--r-- | vendor/unicode-segmentation/benches/word_bounds.rs | 55 | ||||
-rwxr-xr-x | vendor/unicode-segmentation/scripts/unicode.py | 62 | ||||
-rw-r--r-- | vendor/unicode-segmentation/src/tables.rs | 346 |
7 files changed, 442 insertions, 82 deletions
diff --git a/vendor/unicode-segmentation/.cargo-checksum.json b/vendor/unicode-segmentation/.cargo-checksum.json index acc5d5d0e..faa400016 100644 --- a/vendor/unicode-segmentation/.cargo-checksum.json +++ b/vendor/unicode-segmentation/.cargo-checksum.json @@ -1 +1 @@ -{"files":{"COPYRIGHT":"23860c2a7b5d96b21569afedf033469bab9fe14a1b24a35068b8641c578ce24d","Cargo.toml":"55e5a65c91693dd47a27409e54ad6d5ce805ce003b822e4a568bfd070725e956","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"7b63ecd5f1902af1b63729947373683c32745c16a10e8e6292e2e2dcd7e90ae0","README.md":"efe7aa058e004e12d683039dbc4440e2fec3088364201a620703acedbeef8cb2","benches/graphemes.rs":"88a9f672ea7a03cc15fae36ce544a6e7234e532359402483978858ccda47db3d","benches/unicode_words.rs":"95c3a178ebe07c8cb2c560546ee911bfc4f1e1db81a6cd2c1cef1c99ed2a421a","benches/word_bounds.rs":"66acf40c0a4b06cdb6dd97c1759aba8dea961bb30cd7f223de3ebff8198520b2","scripts/unicode.py":"d4ba970a0419f33d20f3deb888be12427bfbb40aa25a5719968600d45cf4dadb","scripts/unicode_gen_breaktests.py":"ee96982d8959bec75c2382233cfca7e239f12a89a1be5fbf942601a215bb9283","src/grapheme.rs":"b5a32bdbb529e9417e8ada8d92656339b6ffb4e9bed8e6d32a0409c13a03050b","src/lib.rs":"572789173717edd0fe037ae656530663406951636c548e6793711b7d5caad910","src/sentence.rs":"aac52f69207e0b68925ab0c6c18cc36ed3da8e918006d96d724f0f19d4d9d643","src/tables.rs":"ba9fa1774b6294ed14565ec6be0f2ec316759d54e3af7c002b6848973d7b1f3c","src/test.rs":"f039fa285d510244672a067bdbe98ce7ff940e4f2ff82926466e012ac48ad95a","src/testdata.rs":"533c02ecace1bec3d46b65d101c7619bc83a2fb2c187a2c960346533c09a0e3e","src/word.rs":"6eeea9351c12f0a4404606596a487e0e8aa948ba4b134c7cb827ee41557a39fe"},"package":"0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"}
\ No newline at end of file +{"files":{"COPYRIGHT":"23860c2a7b5d96b21569afedf033469bab9fe14a1b24a35068b8641c578ce24d","Cargo.toml":"3de4086a8d886795bd6f4322b3f42ee8b84eb1c9ab6651fb95c0469d1cf508f1","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"7b63ecd5f1902af1b63729947373683c32745c16a10e8e6292e2e2dcd7e90ae0","README.md":"8d05f9f7aafc8af56853b802e14d0e2e73a56021d8b5481b69e169fd3e2c44cc","benches/graphemes.rs":"88a9f672ea7a03cc15fae36ce544a6e7234e532359402483978858ccda47db3d","benches/unicode_words.rs":"42df2c8e9bfb54baf31c8327e27915683c25b90d55dc04311c28a0c2a9d185e3","benches/word_bounds.rs":"0453e37477134063178e104305b0a5f27ce74aa342234a6208095623c45de0d4","scripts/unicode.py":"e38d564c2f778aeb2cd17ce6c4395e5f5abd85ff8dd5292d76a93c16f4e38eb9","scripts/unicode_gen_breaktests.py":"ee96982d8959bec75c2382233cfca7e239f12a89a1be5fbf942601a215bb9283","src/grapheme.rs":"b5a32bdbb529e9417e8ada8d92656339b6ffb4e9bed8e6d32a0409c13a03050b","src/lib.rs":"572789173717edd0fe037ae656530663406951636c548e6793711b7d5caad910","src/sentence.rs":"aac52f69207e0b68925ab0c6c18cc36ed3da8e918006d96d724f0f19d4d9d643","src/tables.rs":"547636e4d6d685286f98c7f1db5ebde8e254ec811dbec4b7c2edf74c1454740d","src/test.rs":"f039fa285d510244672a067bdbe98ce7ff940e4f2ff82926466e012ac48ad95a","src/testdata.rs":"533c02ecace1bec3d46b65d101c7619bc83a2fb2c187a2c960346533c09a0e3e","src/word.rs":"6eeea9351c12f0a4404606596a487e0e8aa948ba4b134c7cb827ee41557a39fe"},"package":"1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"}
\ No newline at end of file diff --git a/vendor/unicode-segmentation/Cargo.toml b/vendor/unicode-segmentation/Cargo.toml index 0da56c81e..83dea2895 100644 --- a/vendor/unicode-segmentation/Cargo.toml +++ b/vendor/unicode-segmentation/Cargo.toml @@ -12,7 +12,7 @@ [package] edition = "2018" name = "unicode-segmentation" -version = "1.10.0" +version = "1.10.1" authors = [ "kwantam <kwantam@gmail.com>", "Manish Goregaokar <manishsmail@gmail.com>", diff --git a/vendor/unicode-segmentation/README.md b/vendor/unicode-segmentation/README.md index 48d9a9205..ef61ebd10 100644 --- a/vendor/unicode-segmentation/README.md +++ b/vendor/unicode-segmentation/README.md @@ -38,7 +38,7 @@ to your `Cargo.toml`: ```toml [dependencies] -unicode-segmentation = "1.9.0" +unicode-segmentation = "1.10.1" ``` # Change Log diff --git a/vendor/unicode-segmentation/benches/unicode_words.rs b/vendor/unicode-segmentation/benches/unicode_words.rs index c87851a37..a7f8f4142 100644 --- a/vendor/unicode-segmentation/benches/unicode_words.rs +++ b/vendor/unicode-segmentation/benches/unicode_words.rs @@ -1,55 +1,52 @@ -#[macro_use] -extern crate bencher; -extern crate unicode_segmentation; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use bencher::Bencher; use std::fs; use unicode_segmentation::UnicodeSegmentation; -fn unicode_words(bench: &mut Bencher, path: &str) { +fn unicode_words(c: &mut Criterion, lang: &str, path: &str) { let text = fs::read_to_string(path).unwrap(); - bench.iter(|| { - for w in text.unicode_words() { - bencher::black_box(w); - } + c.bench_function(&format!("unicode_words_{}", lang), |bench| { + bench.iter(|| { + for w in text.unicode_words() { + black_box(w); + } + }) }); - - bench.bytes = text.len() as u64; } -fn unicode_words_arabic(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/arabic.txt"); +fn unicode_words_arabic(c: &mut Criterion) { + unicode_words(c, "arabic", "benches/texts/arabic.txt"); } -fn unicode_words_english(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/english.txt"); +fn unicode_words_english(c: &mut Criterion) { + unicode_words(c, "english", "benches/texts/english.txt"); } -fn unicode_words_hindi(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/hindi.txt"); +fn unicode_words_hindi(c: &mut Criterion) { + unicode_words(c, "hindi", "benches/texts/hindi.txt"); } -fn unicode_words_japanese(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/japanese.txt"); +fn unicode_words_japanese(c: &mut Criterion) { + unicode_words(c, "japanese", "benches/texts/japanese.txt"); } -fn unicode_words_korean(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/korean.txt"); +fn unicode_words_korean(c: &mut Criterion) { + unicode_words(c, "korean", "benches/texts/korean.txt"); } -fn unicode_words_mandarin(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/mandarin.txt"); +fn unicode_words_mandarin(c: &mut Criterion) { + unicode_words(c, "mandarin", "benches/texts/mandarin.txt"); } -fn unicode_words_russian(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/russian.txt"); +fn unicode_words_russian(c: &mut Criterion) { + unicode_words(c, "russian", "benches/texts/russian.txt"); } -fn unicode_words_source_code(bench: &mut Bencher) { - unicode_words(bench, "benches/texts/source_code.txt"); +fn unicode_words_source_code(c: &mut Criterion) { + unicode_words(c, "source_code", "benches/texts/source_code.txt"); } -benchmark_group!( +criterion_group!( benches, unicode_words_arabic, unicode_words_english, @@ -61,4 +58,4 @@ benchmark_group!( unicode_words_source_code, ); -benchmark_main!(benches); +criterion_main!(benches); diff --git a/vendor/unicode-segmentation/benches/word_bounds.rs b/vendor/unicode-segmentation/benches/word_bounds.rs index 6b01ddb10..cae7a8819 100644 --- a/vendor/unicode-segmentation/benches/word_bounds.rs +++ b/vendor/unicode-segmentation/benches/word_bounds.rs @@ -1,55 +1,52 @@ -#[macro_use] -extern crate bencher; -extern crate unicode_segmentation; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use bencher::Bencher; use std::fs; use unicode_segmentation::UnicodeSegmentation; -fn word_bounds(bench: &mut Bencher, path: &str) { +fn word_bounds(c: &mut Criterion, lang: &str, path: &str) { let text = fs::read_to_string(path).unwrap(); - bench.iter(|| { - for w in text.split_word_bounds() { - bencher::black_box(w); - } + c.bench_function(&format!("word_bounds_{}", lang), |bench| { + bench.iter(|| { + for w in text.split_word_bounds() { + black_box(w); + } + }); }); - - bench.bytes = text.len() as u64; } -fn word_bounds_arabic(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/arabic.txt"); +fn word_bounds_arabic(c: &mut Criterion) { + word_bounds(c, "arabic", "benches/texts/arabic.txt"); } -fn word_bounds_english(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/english.txt"); +fn word_bounds_english(c: &mut Criterion) { + word_bounds(c, "english", "benches/texts/english.txt"); } -fn word_bounds_hindi(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/hindi.txt"); +fn word_bounds_hindi(c: &mut Criterion) { + word_bounds(c, "hindi", "benches/texts/hindi.txt"); } -fn word_bounds_japanese(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/japanese.txt"); +fn word_bounds_japanese(c: &mut Criterion) { + word_bounds(c, "japanese", "benches/texts/japanese.txt"); } -fn word_bounds_korean(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/korean.txt"); +fn word_bounds_korean(c: &mut Criterion) { + word_bounds(c, "korean", "benches/texts/korean.txt"); } -fn word_bounds_mandarin(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/mandarin.txt"); +fn word_bounds_mandarin(c: &mut Criterion) { + word_bounds(c, "mandarin", "benches/texts/mandarin.txt"); } -fn word_bounds_russian(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/russian.txt"); +fn word_bounds_russian(c: &mut Criterion) { + word_bounds(c, "russian", "benches/texts/russian.txt"); } -fn word_bounds_source_code(bench: &mut Bencher) { - word_bounds(bench, "benches/texts/source_code.txt"); +fn word_bounds_source_code(c: &mut Criterion) { + word_bounds(c, "source_code", "benches/texts/source_code.txt"); } -benchmark_group!( +criterion_group!( benches, word_bounds_arabic, word_bounds_english, @@ -61,4 +58,4 @@ benchmark_group!( word_bounds_source_code, ); -benchmark_main!(benches); +criterion_main!(benches); diff --git a/vendor/unicode-segmentation/scripts/unicode.py b/vendor/unicode-segmentation/scripts/unicode.py index 7aed85e7c..18cea99c4 100755 --- a/vendor/unicode-segmentation/scripts/unicode.py +++ b/vendor/unicode-segmentation/scripts/unicode.py @@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name): pub enum %sCat { """ % (name, Name, Name)) + # We don't want the lookup table to be too large so choose a reasonable + # cutoff. 0x20000 is selected because most of the range table entries are + # within the interval of [0x0, 0x20000] + lookup_value_cutoff = 0x20000 + + # Length of lookup table. It has to be a divisor of `lookup_value_cutoff`. + lookup_table_len = 0x400 + + lookup_interval = round(lookup_value_cutoff / lookup_table_len) + + # Lookup table is a mapping from `character code / lookup_interval` to + # the index in the range table that covers the `character code`. + lookup_table = [0] * lookup_table_len + j = 0 + for i in range(0, lookup_table_len): + lookup_from = i * lookup_interval + while j < len(break_table): + (_, entry_to, _) = break_table[j] + if entry_to >= lookup_from: + break + j += 1 + lookup_table[i] = j + break_cats.append("Any") break_cats.sort() for cat in break_cats: f.write((" %sC_" % Name[0]) + cat + ",\n") f.write(""" } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -293,8 +316,8 @@ def emit_break_module(f, break_table, break_cats, name): } Err(idx) => { ( - if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, - r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower }, + r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper), %sC_Any, ) } @@ -302,10 +325,39 @@ def emit_break_module(f, break_table, break_cats, name): } pub fn %s_category(c: char) -> (u32, u32, %sCat) { - bsearch_range_value_table(c, %s_cat_table) + // Perform a quick O(1) lookup in a precomputed table to determine + // the slice of the range table to search in. + let lookup_interval = 0x%x; + let idx = (c as u32 / lookup_interval) as usize; + let range = %s_cat_lookup.get(idx..(idx + 2)).map_or( + // If the `idx` is outside of the precomputed table - use the slice + // starting from the last covered index in the precomputed table and + // ending with the length of the range table. + %d..%d, + |r| (r[0] as usize)..((r[1] + 1) as usize) + ); + + // Compute pessimistic default lower and upper bounds on the category. + // If character doesn't map to any range and there is no adjacent range + // in the table slice - these bounds has to apply. + let lower = idx as u32 * lookup_interval; + let upper = lower + lookup_interval - 1; + bsearch_range_value_table(c, &%s_cat_table[range], lower, upper) } -""" % (Name, Name, Name[0], name, Name, name)) +""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name)) + + + if len(break_table) <= 0xff: + lookup_type = "u8" + elif len(break_table) <= 0xffff: + lookup_type = "u16" + else: + lookup_type = "u32" + + emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type, + pfun=lambda x: "%d" % x, + is_pub=False, is_const=True) emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name, pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]), diff --git a/vendor/unicode-segmentation/src/tables.rs b/vendor/unicode-segmentation/src/tables.rs index 5a811c922..ca83b503a 100644 --- a/vendor/unicode-segmentation/src/tables.rs +++ b/vendor/unicode-segmentation/src/tables.rs @@ -365,7 +365,7 @@ pub mod grapheme { GC_ZWJ, } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (u32, u32, GraphemeCat) { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)], default_lower: u32, default_upper: u32) -> (u32, u32, GraphemeCat) { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -378,8 +378,8 @@ pub mod grapheme { } Err(idx) => { ( - if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, - r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower }, + r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper), GC_Any, ) } @@ -387,9 +387,93 @@ pub mod grapheme { } pub fn grapheme_category(c: char) -> (u32, u32, GraphemeCat) { - bsearch_range_value_table(c, grapheme_cat_table) + // Perform a quick O(1) lookup in a precomputed table to determine + // the slice of the range table to search in. + let lookup_interval = 0x80; + let idx = (c as u32 / lookup_interval) as usize; + let range = grapheme_cat_lookup.get(idx..(idx + 2)).map_or( + // If the `idx` is outside of the precomputed table - use the slice + // starting from the last covered index in the precomputed table and + // ending with the length of the range table. + 1443..1449, + |r| (r[0] as usize)..((r[1] + 1) as usize) + ); + + // Compute pessimistic default lower and upper bounds on the category. + // If character doesn't map to any range and there is no adjacent range + // in the table slice - these bounds has to apply. + let lower = idx as u32 * lookup_interval; + let upper = lower + lookup_interval - 1; + bsearch_range_value_table(c, &grapheme_cat_table[range], lower, upper) } + const grapheme_cat_lookup: &'static [u16] = &[ + 0, 5, 9, 9, 9, 9, 9, 10, 10, 10, 11, 11, 16, 21, 26, 29, 32, 37, 41, 53, 65, 75, 86, 97, + 106, 116, 131, 143, 153, 157, 161, 168, 173, 183, 188, 189, 191, 191, 191, 192, 192, 192, + 192, 192, 192, 192, 192, 198, 206, 209, 211, 219, 219, 232, 233, 242, 258, 262, 270, 270, + 271, 271, 271, 271, 271, 279, 280, 282, 284, 284, 284, 286, 290, 290, 291, 291, 295, 297, + 298, 313, 317, 317, 317, 318, 318, 318, 318, 322, 322, 322, 323, 324, 325, 325, 325, 325, + 325, 328, 329, 329, 329, 329, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 333, 335, 335, 335, 342, 347, 351, 360, 369, 379, 379, 386, 395, 405, 413, + 423, 431, 441, 450, 459, 469, 477, 487, 495, 505, 514, 523, 533, 541, 551, 559, 569, 578, + 587, 597, 605, 615, 623, 633, 642, 651, 661, 669, 679, 687, 697, 706, 715, 725, 733, 743, + 751, 761, 770, 779, 789, 797, 807, 815, 825, 834, 843, 853, 861, 871, 879, 889, 898, 907, + 917, 925, 935, 943, 953, 962, 971, 981, 989, 999, 1007, 1017, 1026, 1035, 1045, 1053, 1063, + 1071, 1081, 1090, 1099, 1109, 1117, 1127, 1135, 1145, 1154, 1163, 1173, 1181, 1186, 1186, + 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, + 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, + 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, + 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, + 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1187, 1187, 1187, 1187, 1187, 1187, + 1189, 1190, 1190, 1192, 1192, 1192, 1192, 1193, 1193, 1194, 1195, 1195, 1195, 1195, 1195, + 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1200, 1201, 1201, 1201, 1201, 1201, + 1202, 1202, 1202, 1204, 1205, 1206, 1212, 1221, 1227, 1236, 1244, 1247, 1260, 1260, 1267, + 1278, 1278, 1286, 1292, 1299, 1303, 1303, 1307, 1307, 1318, 1324, 1333, 1337, 1337, 1337, + 1342, 1349, 1355, 1361, 1361, 1363, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, + 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, + 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, + 1372, 1372, 1372, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1376, 1377, 1377, 1377, 1377, 1377, 1377, 1377, + 1377, 1378, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, + 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, + 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1386, 1386, + 1386, 1386, 1392, 1395, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, + 1396, 1396, 1396, 1396, 1396, 1399, 1402, 1402, 1402, 1402, 1402, 1402, 1402, 1402, 1402, + 1402, 1402, 1407, 1408, 1409, 1409, 1409, 1411, 1411, 1411, 1411, 1412, 1412, 1412, 1412, + 1412, 1412, 1412, 1412, 1413, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, + 1414, 1414, 1414, 1414, 1414, 1415, 1419, 1423, 1428, 1428, 1428, 1430, 1430, 1430, 1431, + 1431, 1432, 1433, 1434, 1435, 1438, 1440, 1442, 1442, 1442, 1443, 1443, 1443, 1443, 1443, + 1443, 1443, 1443, 1443, 1443 + ]; + const grapheme_cat_table: &'static [(char, char, GraphemeCat)] = &[ ('\u{0}', '\u{9}', GC_Control), ('\u{a}', '\u{a}', GC_LF), ('\u{b}', '\u{c}', GC_Control), ('\u{d}', '\u{d}', GC_CR), ('\u{e}', '\u{1f}', GC_Control), ('\u{7f}', '\u{9f}', @@ -1028,7 +1112,7 @@ pub mod word { WC_ZWJ, } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (u32, u32, WordCat) { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)], default_lower: u32, default_upper: u32) -> (u32, u32, WordCat) { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -1041,8 +1125,8 @@ pub mod word { } Err(idx) => { ( - if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, - r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower }, + r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper), WC_Any, ) } @@ -1050,9 +1134,87 @@ pub mod word { } pub fn word_category(c: char) -> (u32, u32, WordCat) { - bsearch_range_value_table(c, word_cat_table) + // Perform a quick O(1) lookup in a precomputed table to determine + // the slice of the range table to search in. + let lookup_interval = 0x80; + let idx = (c as u32 / lookup_interval) as usize; + let range = word_cat_lookup.get(idx..(idx + 2)).map_or( + // If the `idx` is outside of the precomputed table - use the slice + // starting from the last covered index in the precomputed table and + // ending with the length of the range table. + 1050..1053, + |r| (r[0] as usize)..((r[1] + 1) as usize) + ); + + // Compute pessimistic default lower and upper bounds on the category. + // If character doesn't map to any range and there is no adjacent range + // in the table slice - these bounds has to apply. + let lower = idx as u32 * lookup_interval; + let upper = lower + lookup_interval - 1; + bsearch_range_value_table(c, &word_cat_table[range], lower, upper) } + const word_cat_lookup: &'static [u16] = &[ + 0, 14, 22, 22, 22, 22, 24, 30, 36, 36, 38, 43, 55, 66, 78, 83, 93, 104, 111, 121, 143, 162, + 180, 198, 215, 231, 250, 266, 278, 282, 286, 295, 301, 308, 316, 316, 316, 321, 329, 333, + 336, 336, 336, 336, 336, 338, 342, 351, 354, 359, 365, 369, 370, 375, 378, 384, 391, 397, + 409, 409, 411, 411, 411, 420, 430, 449, 451, 464, 465, 465, 465, 465, 465, 465, 466, 466, + 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 470, 476, 486, 487, + 487, 487, 487, 492, 496, 497, 500, 500, 501, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 504, 504, 504, 511, 515, 515, 519, 529, 538, 544, 551, 559, 568, 574, 578, 578, + 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, + 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, + 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, + 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, + 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 581, 581, 581, 581, + 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, + 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, + 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, + 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 592, 593, 593, 593, 594, + 597, 609, 611, 620, 628, 634, 635, 636, 637, 637, 640, 644, 648, 648, 652, 655, 662, 662, + 662, 665, 668, 675, 678, 680, 682, 692, 696, 699, 700, 701, 703, 706, 706, 706, 710, 714, + 718, 726, 734, 744, 753, 759, 767, 785, 785, 791, 796, 796, 801, 805, 809, 811, 811, 813, + 815, 828, 835, 844, 848, 848, 848, 854, 857, 869, 875, 875, 877, 885, 886, 886, 886, 886, + 886, 886, 886, 886, 887, 888, 888, 889, 889, 889, 889, 889, 889, 889, 889, 889, 889, 889, + 889, 889, 889, 889, 889, 889, 889, 889, 889, 889, 890, 890, 890, 890, 890, 890, 890, 890, + 890, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, + 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, + 895, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, + 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, + 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, + 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, + 896, 899, 903, 908, 909, 909, 909, 909, 909, 910, 910, 913, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 923, 924, 924, 927, + 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, + 927, 927, 927, 929, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, + 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, + 933, 933, 933, 933, 933, 935, 935, 935, 935, 938, 941, 942, 942, 942, 942, 943, 951, 960, + 960, 960, 964, 968, 973, 973, 973, 973, 973, 976, 979, 979, 979, 979, 979, 979, 979, 979, + 979, 981, 981, 987, 988, 993, 993, 993, 998, 998, 998, 998, 1001, 1001, 1001, 1001, 1001, + 1001, 1005, 1005, 1007, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1039, + 1044, 1044, 1044, 1044, 1044, 1046, 1048, 1048, 1048, 1048, 1049, 1049, 1049, 1049, 1049, + 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1050, 1050, 1050, 1050, + 1050, 1050, 1050, 1050 + ]; + const word_cat_table: &'static [(char, char, WordCat)] = &[ ('\u{a}', '\u{a}', WC_LF), ('\u{b}', '\u{c}', WC_Newline), ('\u{d}', '\u{d}', WC_CR), ('\u{20}', '\u{20}', WC_WSegSpace), ('\u{22}', '\u{22}', WC_Double_Quote), ('\u{27}', @@ -1530,7 +1692,7 @@ pub mod emoji { EC_Extended_Pictographic, } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (u32, u32, EmojiCat) { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)], default_lower: u32, default_upper: u32) -> (u32, u32, EmojiCat) { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -1543,8 +1705,8 @@ pub mod emoji { } Err(idx) => { ( - if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, - r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower }, + r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper), EC_Any, ) } @@ -1552,9 +1714,73 @@ pub mod emoji { } pub fn emoji_category(c: char) -> (u32, u32, EmojiCat) { - bsearch_range_value_table(c, emoji_cat_table) + // Perform a quick O(1) lookup in a precomputed table to determine + // the slice of the range table to search in. + let lookup_interval = 0x80; + let idx = (c as u32 / lookup_interval) as usize; + let range = emoji_cat_lookup.get(idx..(idx + 2)).map_or( + // If the `idx` is outside of the precomputed table - use the slice + // starting from the last covered index in the precomputed table and + // ending with the length of the range table. + 77..78, + |r| (r[0] as usize)..((r[1] + 1) as usize) + ); + + // Compute pessimistic default lower and upper bounds on the category. + // If character doesn't map to any range and there is no adjacent range + // in the table slice - these bounds has to apply. + let lower = idx as u32 * lookup_interval; + let upper = lower + lookup_interval - 1; + bsearch_range_value_table(c, &emoji_cat_table[range], lower, upper) } + const emoji_cat_lookup: &'static [u8] = &[ + 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 4, 4, 6, 8, 8, 8, 10, 14, 14, 15, 15, 19, 21, 22, 37, 41, 41, 41, 42, 42, 42, 42, + 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 48, 48, 48, 48, 48, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, 55, 58, 63, 63, 63, 64, 64, 64, 65, 65, 66, 67, + 68, 69, 72, 74, 76, 76, 76, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77 + ]; + const emoji_cat_table: &'static [(char, char, EmojiCat)] = &[ ('\u{a9}', '\u{a9}', EC_Extended_Pictographic), ('\u{ae}', '\u{ae}', EC_Extended_Pictographic), ('\u{203c}', '\u{203c}', EC_Extended_Pictographic), ('\u{2049}', @@ -1633,7 +1859,7 @@ pub mod sentence { SC_Upper, } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (u32, u32, SentenceCat) { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)], default_lower: u32, default_upper: u32) -> (u32, u32, SentenceCat) { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -1646,8 +1872,8 @@ pub mod sentence { } Err(idx) => { ( - if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, - r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower }, + r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper), SC_Any, ) } @@ -1655,9 +1881,97 @@ pub mod sentence { } pub fn sentence_category(c: char) -> (u32, u32, SentenceCat) { - bsearch_range_value_table(c, sentence_cat_table) + // Perform a quick O(1) lookup in a precomputed table to determine + // the slice of the range table to search in. + let lookup_interval = 0x80; + let idx = (c as u32 / lookup_interval) as usize; + let range = sentence_cat_lookup.get(idx..(idx + 2)).map_or( + // If the `idx` is outside of the precomputed table - use the slice + // starting from the last covered index in the precomputed table and + // ending with the length of the range table. + 2410..2421, + |r| (r[0] as usize)..((r[1] + 1) as usize) + ); + + // Compute pessimistic default lower and upper bounds on the category. + // If character doesn't map to any range and there is no adjacent range + // in the table slice - these bounds has to apply. + let lower = idx as u32 * lookup_interval; + let upper = lower + lookup_interval - 1; + bsearch_range_value_table(c, &sentence_cat_table[range], lower, upper) } + const sentence_cat_lookup: &'static [u16] = &[ + 0, 19, 31, 154, 247, 314, 323, 333, 375, 409, 528, 579, 588, 599, 612, 618, 629, 643, 650, + 661, 683, 702, 720, 738, 755, 771, 790, 806, 818, 825, 840, 850, 856, 871, 882, 882, 882, + 887, 895, 901, 904, 904, 904, 904, 904, 907, 912, 922, 928, 937, 943, 950, 953, 959, 964, + 973, 980, 988, 1000, 1000, 1002, 1130, 1249, 1267, 1288, 1308, 1311, 1336, 1340, 1340, 1340, + 1342, 1342, 1342, 1344, 1344, 1344, 1344, 1344, 1346, 1348, 1348, 1348, 1348, 1351, 1351, + 1351, 1351, 1351, 1369, 1476, 1482, 1492, 1501, 1501, 1501, 1501, 1512, 1517, 1518, 1521, + 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, + 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, + 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, + 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1525, 1525, 1525, 1580, 1613, 1696, 1769, 1780, 1790, 1797, 1808, + 1819, 1836, 1843, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, + 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, + 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, + 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, + 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, + 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, + 1849, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, + 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, + 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, + 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, + 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1853, 1854, 1864, 1865, 1865, + 1865, 1867, 1870, 1886, 1888, 1905, 1913, 1919, 1920, 1921, 1922, 1922, 1925, 1929, 1933, + 1935, 1939, 1942, 1949, 1949, 1949, 1952, 1957, 1964, 1967, 1969, 1971, 1982, 1986, 1989, + 1990, 1991, 1993, 1996, 1996, 1996, 2000, 2005, 2010, 2019, 2028, 2039, 2051, 2059, 2068, + 2086, 2086, 2093, 2098, 2098, 2105, 2110, 2114, 2119, 2119, 2121, 2124, 2139, 2146, 2156, + 2161, 2161, 2161, 2168, 2171, 2183, 2189, 2189, 2192, 2201, 2202, 2202, 2202, 2202, 2202, + 2202, 2202, 2202, 2203, 2204, 2204, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, + 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2206, 2206, 2206, + 2206, 2206, 2206, 2206, 2206, 2206, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, + 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, + 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2212, 2212, 2212, + 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, + 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, + 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, + 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, + 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2216, 2221, 2228, 2229, 2229, 2229, + 2229, 2229, 2231, 2232, 2235, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, + 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, + 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, + 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2243, 2243, 2243, 2243, 2243, 2243, 2243, + 2243, 2243, 2243, 2244, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, + 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, + 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, + 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, + 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2248, 2248, + 2248, 2253, 2253, 2253, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, + 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2256, 2261, 2261, 2261, 2261, 2261, 2261, + 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, + 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, + 2261, 2263, 2263, 2263, 2263, 2266, 2269, 2270, 2270, 2270, 2270, 2275, 2288, 2300, 2305, + 2310, 2316, 2322, 2330, 2330, 2330, 2330, 2330, 2333, 2337, 2337, 2337, 2337, 2337, 2337, + 2337, 2337, 2337, 2341, 2341, 2347, 2348, 2353, 2353, 2353, 2358, 2358, 2358, 2358, 2361, + 2361, 2361, 2361, 2361, 2361, 2365, 2365, 2367, 2372, 2372, 2372, 2372, 2372, 2372, 2372, + 2372, 2372, 2372, 2400, 2405, 2405, 2405, 2405, 2405, 2407, 2408, 2408, 2408, 2408, 2408, + 2408, 2408, 2408, 2408, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, + 2410, 2410, 2410, 2410, 2410, 2410, 2410, 2410 + ]; + const sentence_cat_table: &'static [(char, char, SentenceCat)] = &[ ('\u{9}', '\u{9}', SC_Sp), ('\u{a}', '\u{a}', SC_LF), ('\u{b}', '\u{c}', SC_Sp), ('\u{d}', '\u{d}', SC_CR), ('\u{20}', '\u{20}', SC_Sp), ('\u{21}', '\u{21}', SC_STerm), ('\u{22}', |