summaryrefslogtreecommitdiffstats
path: root/vendor/unicode-segmentation
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-30 03:57:31 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-30 03:57:31 +0000
commitdc0db358abe19481e475e10c32149b53370f1a1c (patch)
treeab8ce99c4b255ce46f99ef402c27916055b899ee /vendor/unicode-segmentation
parentReleasing progress-linux version 1.71.1+dfsg1-2~progress7.99u1. (diff)
downloadrustc-dc0db358abe19481e475e10c32149b53370f1a1c.tar.xz
rustc-dc0db358abe19481e475e10c32149b53370f1a1c.zip
Merging upstream version 1.72.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/unicode-segmentation')
-rw-r--r--vendor/unicode-segmentation/.cargo-checksum.json2
-rw-r--r--vendor/unicode-segmentation/Cargo.toml2
-rw-r--r--vendor/unicode-segmentation/README.md2
-rw-r--r--vendor/unicode-segmentation/benches/unicode_words.rs55
-rw-r--r--vendor/unicode-segmentation/benches/word_bounds.rs55
-rwxr-xr-xvendor/unicode-segmentation/scripts/unicode.py62
-rw-r--r--vendor/unicode-segmentation/src/tables.rs346
7 files changed, 442 insertions, 82 deletions
diff --git a/vendor/unicode-segmentation/.cargo-checksum.json b/vendor/unicode-segmentation/.cargo-checksum.json
index acc5d5d0e..faa400016 100644
--- a/vendor/unicode-segmentation/.cargo-checksum.json
+++ b/vendor/unicode-segmentation/.cargo-checksum.json
@@ -1 +1 @@
-{"files":{"COPYRIGHT":"23860c2a7b5d96b21569afedf033469bab9fe14a1b24a35068b8641c578ce24d","Cargo.toml":"55e5a65c91693dd47a27409e54ad6d5ce805ce003b822e4a568bfd070725e956","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"7b63ecd5f1902af1b63729947373683c32745c16a10e8e6292e2e2dcd7e90ae0","README.md":"efe7aa058e004e12d683039dbc4440e2fec3088364201a620703acedbeef8cb2","benches/graphemes.rs":"88a9f672ea7a03cc15fae36ce544a6e7234e532359402483978858ccda47db3d","benches/unicode_words.rs":"95c3a178ebe07c8cb2c560546ee911bfc4f1e1db81a6cd2c1cef1c99ed2a421a","benches/word_bounds.rs":"66acf40c0a4b06cdb6dd97c1759aba8dea961bb30cd7f223de3ebff8198520b2","scripts/unicode.py":"d4ba970a0419f33d20f3deb888be12427bfbb40aa25a5719968600d45cf4dadb","scripts/unicode_gen_breaktests.py":"ee96982d8959bec75c2382233cfca7e239f12a89a1be5fbf942601a215bb9283","src/grapheme.rs":"b5a32bdbb529e9417e8ada8d92656339b6ffb4e9bed8e6d32a0409c13a03050b","src/lib.rs":"572789173717edd0fe037ae656530663406951636c548e6793711b7d5caad910","src/sentence.rs":"aac52f69207e0b68925ab0c6c18cc36ed3da8e918006d96d724f0f19d4d9d643","src/tables.rs":"ba9fa1774b6294ed14565ec6be0f2ec316759d54e3af7c002b6848973d7b1f3c","src/test.rs":"f039fa285d510244672a067bdbe98ce7ff940e4f2ff82926466e012ac48ad95a","src/testdata.rs":"533c02ecace1bec3d46b65d101c7619bc83a2fb2c187a2c960346533c09a0e3e","src/word.rs":"6eeea9351c12f0a4404606596a487e0e8aa948ba4b134c7cb827ee41557a39fe"},"package":"0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"} \ No newline at end of file
+{"files":{"COPYRIGHT":"23860c2a7b5d96b21569afedf033469bab9fe14a1b24a35068b8641c578ce24d","Cargo.toml":"3de4086a8d886795bd6f4322b3f42ee8b84eb1c9ab6651fb95c0469d1cf508f1","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"7b63ecd5f1902af1b63729947373683c32745c16a10e8e6292e2e2dcd7e90ae0","README.md":"8d05f9f7aafc8af56853b802e14d0e2e73a56021d8b5481b69e169fd3e2c44cc","benches/graphemes.rs":"88a9f672ea7a03cc15fae36ce544a6e7234e532359402483978858ccda47db3d","benches/unicode_words.rs":"42df2c8e9bfb54baf31c8327e27915683c25b90d55dc04311c28a0c2a9d185e3","benches/word_bounds.rs":"0453e37477134063178e104305b0a5f27ce74aa342234a6208095623c45de0d4","scripts/unicode.py":"e38d564c2f778aeb2cd17ce6c4395e5f5abd85ff8dd5292d76a93c16f4e38eb9","scripts/unicode_gen_breaktests.py":"ee96982d8959bec75c2382233cfca7e239f12a89a1be5fbf942601a215bb9283","src/grapheme.rs":"b5a32bdbb529e9417e8ada8d92656339b6ffb4e9bed8e6d32a0409c13a03050b","src/lib.rs":"572789173717edd0fe037ae656530663406951636c548e6793711b7d5caad910","src/sentence.rs":"aac52f69207e0b68925ab0c6c18cc36ed3da8e918006d96d724f0f19d4d9d643","src/tables.rs":"547636e4d6d685286f98c7f1db5ebde8e254ec811dbec4b7c2edf74c1454740d","src/test.rs":"f039fa285d510244672a067bdbe98ce7ff940e4f2ff82926466e012ac48ad95a","src/testdata.rs":"533c02ecace1bec3d46b65d101c7619bc83a2fb2c187a2c960346533c09a0e3e","src/word.rs":"6eeea9351c12f0a4404606596a487e0e8aa948ba4b134c7cb827ee41557a39fe"},"package":"1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"} \ No newline at end of file
diff --git a/vendor/unicode-segmentation/Cargo.toml b/vendor/unicode-segmentation/Cargo.toml
index 0da56c81e..83dea2895 100644
--- a/vendor/unicode-segmentation/Cargo.toml
+++ b/vendor/unicode-segmentation/Cargo.toml
@@ -12,7 +12,7 @@
[package]
edition = "2018"
name = "unicode-segmentation"
-version = "1.10.0"
+version = "1.10.1"
authors = [
"kwantam <kwantam@gmail.com>",
"Manish Goregaokar <manishsmail@gmail.com>",
diff --git a/vendor/unicode-segmentation/README.md b/vendor/unicode-segmentation/README.md
index 48d9a9205..ef61ebd10 100644
--- a/vendor/unicode-segmentation/README.md
+++ b/vendor/unicode-segmentation/README.md
@@ -38,7 +38,7 @@ to your `Cargo.toml`:
```toml
[dependencies]
-unicode-segmentation = "1.9.0"
+unicode-segmentation = "1.10.1"
```
# Change Log
diff --git a/vendor/unicode-segmentation/benches/unicode_words.rs b/vendor/unicode-segmentation/benches/unicode_words.rs
index c87851a37..a7f8f4142 100644
--- a/vendor/unicode-segmentation/benches/unicode_words.rs
+++ b/vendor/unicode-segmentation/benches/unicode_words.rs
@@ -1,55 +1,52 @@
-#[macro_use]
-extern crate bencher;
-extern crate unicode_segmentation;
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use bencher::Bencher;
use std::fs;
use unicode_segmentation::UnicodeSegmentation;
-fn unicode_words(bench: &mut Bencher, path: &str) {
+fn unicode_words(c: &mut Criterion, lang: &str, path: &str) {
let text = fs::read_to_string(path).unwrap();
- bench.iter(|| {
- for w in text.unicode_words() {
- bencher::black_box(w);
- }
+ c.bench_function(&format!("unicode_words_{}", lang), |bench| {
+ bench.iter(|| {
+ for w in text.unicode_words() {
+ black_box(w);
+ }
+ })
});
-
- bench.bytes = text.len() as u64;
}
-fn unicode_words_arabic(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/arabic.txt");
+fn unicode_words_arabic(c: &mut Criterion) {
+ unicode_words(c, "arabic", "benches/texts/arabic.txt");
}
-fn unicode_words_english(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/english.txt");
+fn unicode_words_english(c: &mut Criterion) {
+ unicode_words(c, "english", "benches/texts/english.txt");
}
-fn unicode_words_hindi(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/hindi.txt");
+fn unicode_words_hindi(c: &mut Criterion) {
+ unicode_words(c, "hindi", "benches/texts/hindi.txt");
}
-fn unicode_words_japanese(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/japanese.txt");
+fn unicode_words_japanese(c: &mut Criterion) {
+ unicode_words(c, "japanese", "benches/texts/japanese.txt");
}
-fn unicode_words_korean(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/korean.txt");
+fn unicode_words_korean(c: &mut Criterion) {
+ unicode_words(c, "korean", "benches/texts/korean.txt");
}
-fn unicode_words_mandarin(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/mandarin.txt");
+fn unicode_words_mandarin(c: &mut Criterion) {
+ unicode_words(c, "mandarin", "benches/texts/mandarin.txt");
}
-fn unicode_words_russian(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/russian.txt");
+fn unicode_words_russian(c: &mut Criterion) {
+ unicode_words(c, "russian", "benches/texts/russian.txt");
}
-fn unicode_words_source_code(bench: &mut Bencher) {
- unicode_words(bench, "benches/texts/source_code.txt");
+fn unicode_words_source_code(c: &mut Criterion) {
+ unicode_words(c, "source_code", "benches/texts/source_code.txt");
}
-benchmark_group!(
+criterion_group!(
benches,
unicode_words_arabic,
unicode_words_english,
@@ -61,4 +58,4 @@ benchmark_group!(
unicode_words_source_code,
);
-benchmark_main!(benches);
+criterion_main!(benches);
diff --git a/vendor/unicode-segmentation/benches/word_bounds.rs b/vendor/unicode-segmentation/benches/word_bounds.rs
index 6b01ddb10..cae7a8819 100644
--- a/vendor/unicode-segmentation/benches/word_bounds.rs
+++ b/vendor/unicode-segmentation/benches/word_bounds.rs
@@ -1,55 +1,52 @@
-#[macro_use]
-extern crate bencher;
-extern crate unicode_segmentation;
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use bencher::Bencher;
use std::fs;
use unicode_segmentation::UnicodeSegmentation;
-fn word_bounds(bench: &mut Bencher, path: &str) {
+fn word_bounds(c: &mut Criterion, lang: &str, path: &str) {
let text = fs::read_to_string(path).unwrap();
- bench.iter(|| {
- for w in text.split_word_bounds() {
- bencher::black_box(w);
- }
+ c.bench_function(&format!("word_bounds_{}", lang), |bench| {
+ bench.iter(|| {
+ for w in text.split_word_bounds() {
+ black_box(w);
+ }
+ });
});
-
- bench.bytes = text.len() as u64;
}
-fn word_bounds_arabic(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/arabic.txt");
+fn word_bounds_arabic(c: &mut Criterion) {
+ word_bounds(c, "arabic", "benches/texts/arabic.txt");
}
-fn word_bounds_english(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/english.txt");
+fn word_bounds_english(c: &mut Criterion) {
+ word_bounds(c, "english", "benches/texts/english.txt");
}
-fn word_bounds_hindi(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/hindi.txt");
+fn word_bounds_hindi(c: &mut Criterion) {
+ word_bounds(c, "hindi", "benches/texts/hindi.txt");
}
-fn word_bounds_japanese(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/japanese.txt");
+fn word_bounds_japanese(c: &mut Criterion) {
+ word_bounds(c, "japanese", "benches/texts/japanese.txt");
}
-fn word_bounds_korean(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/korean.txt");
+fn word_bounds_korean(c: &mut Criterion) {
+ word_bounds(c, "korean", "benches/texts/korean.txt");
}
-fn word_bounds_mandarin(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/mandarin.txt");
+fn word_bounds_mandarin(c: &mut Criterion) {
+ word_bounds(c, "mandarin", "benches/texts/mandarin.txt");
}
-fn word_bounds_russian(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/russian.txt");
+fn word_bounds_russian(c: &mut Criterion) {
+ word_bounds(c, "russian", "benches/texts/russian.txt");
}
-fn word_bounds_source_code(bench: &mut Bencher) {
- word_bounds(bench, "benches/texts/source_code.txt");
+fn word_bounds_source_code(c: &mut Criterion) {
+ word_bounds(c, "source_code", "benches/texts/source_code.txt");
}
-benchmark_group!(
+criterion_group!(
benches,
word_bounds_arabic,
word_bounds_english,
@@ -61,4 +58,4 @@ benchmark_group!(
word_bounds_source_code,
);
-benchmark_main!(benches);
+criterion_main!(benches);
diff --git a/vendor/unicode-segmentation/scripts/unicode.py b/vendor/unicode-segmentation/scripts/unicode.py
index 7aed85e7c..18cea99c4 100755
--- a/vendor/unicode-segmentation/scripts/unicode.py
+++ b/vendor/unicode-segmentation/scripts/unicode.py
@@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name):
pub enum %sCat {
""" % (name, Name, Name))
+ # We don't want the lookup table to be too large so choose a reasonable
+ # cutoff. 0x20000 is selected because most of the range table entries are
+ # within the interval of [0x0, 0x20000]
+ lookup_value_cutoff = 0x20000
+
+ # Length of lookup table. It has to be a divisor of `lookup_value_cutoff`.
+ lookup_table_len = 0x400
+
+ lookup_interval = round(lookup_value_cutoff / lookup_table_len)
+
+ # Lookup table is a mapping from `character code / lookup_interval` to
+ # the index in the range table that covers the `character code`.
+ lookup_table = [0] * lookup_table_len
+ j = 0
+ for i in range(0, lookup_table_len):
+ lookup_from = i * lookup_interval
+ while j < len(break_table):
+ (_, entry_to, _) = break_table[j]
+ if entry_to >= lookup_from:
+ break
+ j += 1
+ lookup_table[i] = j
+
break_cats.append("Any")
break_cats.sort()
for cat in break_cats:
f.write((" %sC_" % Name[0]) + cat + ",\n")
f.write(""" }
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -293,8 +316,8 @@ def emit_break_module(f, break_table, break_cats, name):
}
Err(idx) => {
(
- if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
- r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+ if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+ r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
%sC_Any,
)
}
@@ -302,10 +325,39 @@ def emit_break_module(f, break_table, break_cats, name):
}
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
- bsearch_range_value_table(c, %s_cat_table)
+ // Perform a quick O(1) lookup in a precomputed table to determine
+ // the slice of the range table to search in.
+ let lookup_interval = 0x%x;
+ let idx = (c as u32 / lookup_interval) as usize;
+ let range = %s_cat_lookup.get(idx..(idx + 2)).map_or(
+ // If the `idx` is outside of the precomputed table - use the slice
+ // starting from the last covered index in the precomputed table and
+ // ending with the length of the range table.
+ %d..%d,
+ |r| (r[0] as usize)..((r[1] + 1) as usize)
+ );
+
+ // Compute pessimistic default lower and upper bounds on the category.
+ // If character doesn't map to any range and there is no adjacent range
+ // in the table slice - these bounds has to apply.
+ let lower = idx as u32 * lookup_interval;
+ let upper = lower + lookup_interval - 1;
+ bsearch_range_value_table(c, &%s_cat_table[range], lower, upper)
}
-""" % (Name, Name, Name[0], name, Name, name))
+""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name))
+
+
+ if len(break_table) <= 0xff:
+ lookup_type = "u8"
+ elif len(break_table) <= 0xffff:
+ lookup_type = "u16"
+ else:
+ lookup_type = "u32"
+
+ emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
+ pfun=lambda x: "%d" % x,
+ is_pub=False, is_const=True)
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
diff --git a/vendor/unicode-segmentation/src/tables.rs b/vendor/unicode-segmentation/src/tables.rs
index 5a811c922..ca83b503a 100644
--- a/vendor/unicode-segmentation/src/tables.rs
+++ b/vendor/unicode-segmentation/src/tables.rs
@@ -365,7 +365,7 @@ pub mod grapheme {
GC_ZWJ,
}
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (u32, u32, GraphemeCat) {
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)], default_lower: u32, default_upper: u32) -> (u32, u32, GraphemeCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -378,8 +378,8 @@ pub mod grapheme {
}
Err(idx) => {
(
- if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
- r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+ if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+ r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
GC_Any,
)
}
@@ -387,9 +387,93 @@ pub mod grapheme {
}
pub fn grapheme_category(c: char) -> (u32, u32, GraphemeCat) {
- bsearch_range_value_table(c, grapheme_cat_table)
+ // Perform a quick O(1) lookup in a precomputed table to determine
+ // the slice of the range table to search in.
+ let lookup_interval = 0x80;
+ let idx = (c as u32 / lookup_interval) as usize;
+ let range = grapheme_cat_lookup.get(idx..(idx + 2)).map_or(
+ // If the `idx` is outside of the precomputed table - use the slice
+ // starting from the last covered index in the precomputed table and
+ // ending with the length of the range table.
+ 1443..1449,
+ |r| (r[0] as usize)..((r[1] + 1) as usize)
+ );
+
+ // Compute pessimistic default lower and upper bounds on the category.
+ // If character doesn't map to any range and there is no adjacent range
+ // in the table slice - these bounds has to apply.
+ let lower = idx as u32 * lookup_interval;
+ let upper = lower + lookup_interval - 1;
+ bsearch_range_value_table(c, &grapheme_cat_table[range], lower, upper)
}
+ const grapheme_cat_lookup: &'static [u16] = &[
+ 0, 5, 9, 9, 9, 9, 9, 10, 10, 10, 11, 11, 16, 21, 26, 29, 32, 37, 41, 53, 65, 75, 86, 97,
+ 106, 116, 131, 143, 153, 157, 161, 168, 173, 183, 188, 189, 191, 191, 191, 192, 192, 192,
+ 192, 192, 192, 192, 192, 198, 206, 209, 211, 219, 219, 232, 233, 242, 258, 262, 270, 270,
+ 271, 271, 271, 271, 271, 279, 280, 282, 284, 284, 284, 286, 290, 290, 291, 291, 295, 297,
+ 298, 313, 317, 317, 317, 318, 318, 318, 318, 322, 322, 322, 323, 324, 325, 325, 325, 325,
+ 325, 328, 329, 329, 329, 329, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331,
+ 331, 331, 331, 333, 335, 335, 335, 342, 347, 351, 360, 369, 379, 379, 386, 395, 405, 413,
+ 423, 431, 441, 450, 459, 469, 477, 487, 495, 505, 514, 523, 533, 541, 551, 559, 569, 578,
+ 587, 597, 605, 615, 623, 633, 642, 651, 661, 669, 679, 687, 697, 706, 715, 725, 733, 743,
+ 751, 761, 770, 779, 789, 797, 807, 815, 825, 834, 843, 853, 861, 871, 879, 889, 898, 907,
+ 917, 925, 935, 943, 953, 962, 971, 981, 989, 999, 1007, 1017, 1026, 1035, 1045, 1053, 1063,
+ 1071, 1081, 1090, 1099, 1109, 1117, 1127, 1135, 1145, 1154, 1163, 1173, 1181, 1186, 1186,
+ 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186,
+ 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186,
+ 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186,
+ 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186,
+ 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1187, 1187, 1187, 1187, 1187, 1187,
+ 1189, 1190, 1190, 1192, 1192, 1192, 1192, 1193, 1193, 1194, 1195, 1195, 1195, 1195, 1195,
+ 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1200, 1201, 1201, 1201, 1201, 1201,
+ 1202, 1202, 1202, 1204, 1205, 1206, 1212, 1221, 1227, 1236, 1244, 1247, 1260, 1260, 1267,
+ 1278, 1278, 1286, 1292, 1299, 1303, 1303, 1307, 1307, 1318, 1324, 1333, 1337, 1337, 1337,
+ 1342, 1349, 1355, 1361, 1361, 1363, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372,
+ 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372,
+ 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372,
+ 1372, 1372, 1372, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375,
+ 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1376, 1377, 1377, 1377, 1377, 1377, 1377, 1377,
+ 1377, 1378, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382,
+ 1382, 1382, 1382, 1382, 1382, 1382, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384,
+ 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384,
+ 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1386, 1386,
+ 1386, 1386, 1392, 1395, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396,
+ 1396, 1396, 1396, 1396, 1396, 1399, 1402, 1402, 1402, 1402, 1402, 1402, 1402, 1402, 1402,
+ 1402, 1402, 1407, 1408, 1409, 1409, 1409, 1411, 1411, 1411, 1411, 1412, 1412, 1412, 1412,
+ 1412, 1412, 1412, 1412, 1413, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414,
+ 1414, 1414, 1414, 1414, 1414, 1415, 1419, 1423, 1428, 1428, 1428, 1430, 1430, 1430, 1431,
+ 1431, 1432, 1433, 1434, 1435, 1438, 1440, 1442, 1442, 1442, 1443, 1443, 1443, 1443, 1443,
+ 1443, 1443, 1443, 1443, 1443
+ ];
+
const grapheme_cat_table: &'static [(char, char, GraphemeCat)] = &[
('\u{0}', '\u{9}', GC_Control), ('\u{a}', '\u{a}', GC_LF), ('\u{b}', '\u{c}', GC_Control),
('\u{d}', '\u{d}', GC_CR), ('\u{e}', '\u{1f}', GC_Control), ('\u{7f}', '\u{9f}',
@@ -1028,7 +1112,7 @@ pub mod word {
WC_ZWJ,
}
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (u32, u32, WordCat) {
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)], default_lower: u32, default_upper: u32) -> (u32, u32, WordCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -1041,8 +1125,8 @@ pub mod word {
}
Err(idx) => {
(
- if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
- r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+ if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+ r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
WC_Any,
)
}
@@ -1050,9 +1134,87 @@ pub mod word {
}
pub fn word_category(c: char) -> (u32, u32, WordCat) {
- bsearch_range_value_table(c, word_cat_table)
+ // Perform a quick O(1) lookup in a precomputed table to determine
+ // the slice of the range table to search in.
+ let lookup_interval = 0x80;
+ let idx = (c as u32 / lookup_interval) as usize;
+ let range = word_cat_lookup.get(idx..(idx + 2)).map_or(
+ // If the `idx` is outside of the precomputed table - use the slice
+ // starting from the last covered index in the precomputed table and
+ // ending with the length of the range table.
+ 1050..1053,
+ |r| (r[0] as usize)..((r[1] + 1) as usize)
+ );
+
+ // Compute pessimistic default lower and upper bounds on the category.
+ // If character doesn't map to any range and there is no adjacent range
+ // in the table slice - these bounds has to apply.
+ let lower = idx as u32 * lookup_interval;
+ let upper = lower + lookup_interval - 1;
+ bsearch_range_value_table(c, &word_cat_table[range], lower, upper)
}
+ const word_cat_lookup: &'static [u16] = &[
+ 0, 14, 22, 22, 22, 22, 24, 30, 36, 36, 38, 43, 55, 66, 78, 83, 93, 104, 111, 121, 143, 162,
+ 180, 198, 215, 231, 250, 266, 278, 282, 286, 295, 301, 308, 316, 316, 316, 321, 329, 333,
+ 336, 336, 336, 336, 336, 338, 342, 351, 354, 359, 365, 369, 370, 375, 378, 384, 391, 397,
+ 409, 409, 411, 411, 411, 420, 430, 449, 451, 464, 465, 465, 465, 465, 465, 465, 466, 466,
+ 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 470, 476, 486, 487,
+ 487, 487, 487, 492, 496, 497, 500, 500, 501, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502,
+ 502, 502, 504, 504, 504, 511, 515, 515, 519, 529, 538, 544, 551, 559, 568, 574, 578, 578,
+ 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578,
+ 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578,
+ 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578,
+ 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578,
+ 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 581, 581, 581, 581,
+ 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581,
+ 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581,
+ 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581,
+ 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 592, 593, 593, 593, 594,
+ 597, 609, 611, 620, 628, 634, 635, 636, 637, 637, 640, 644, 648, 648, 652, 655, 662, 662,
+ 662, 665, 668, 675, 678, 680, 682, 692, 696, 699, 700, 701, 703, 706, 706, 706, 710, 714,
+ 718, 726, 734, 744, 753, 759, 767, 785, 785, 791, 796, 796, 801, 805, 809, 811, 811, 813,
+ 815, 828, 835, 844, 848, 848, 848, 854, 857, 869, 875, 875, 877, 885, 886, 886, 886, 886,
+ 886, 886, 886, 886, 887, 888, 888, 889, 889, 889, 889, 889, 889, 889, 889, 889, 889, 889,
+ 889, 889, 889, 889, 889, 889, 889, 889, 889, 889, 890, 890, 890, 890, 890, 890, 890, 890,
+ 890, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895,
+ 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895,
+ 895, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896,
+ 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896,
+ 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896,
+ 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896,
+ 896, 899, 903, 908, 909, 909, 909, 909, 909, 910, 910, 913, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920,
+ 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 923, 924, 924, 927,
+ 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927,
+ 927, 927, 927, 929, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933,
+ 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933,
+ 933, 933, 933, 933, 933, 935, 935, 935, 935, 938, 941, 942, 942, 942, 942, 943, 951, 960,
+ 960, 960, 964, 968, 973, 973, 973, 973, 973, 976, 979, 979, 979, 979, 979, 979, 979, 979,
+ 979, 981, 981, 987, 988, 993, 993, 993, 998, 998, 998, 998, 1001, 1001, 1001, 1001, 1001,
+ 1001, 1005, 1005, 1007, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1039,
+ 1044, 1044, 1044, 1044, 1044, 1046, 1048, 1048, 1048, 1048, 1049, 1049, 1049, 1049, 1049,
+ 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1050, 1050, 1050, 1050,
+ 1050, 1050, 1050, 1050
+ ];
+
const word_cat_table: &'static [(char, char, WordCat)] = &[
('\u{a}', '\u{a}', WC_LF), ('\u{b}', '\u{c}', WC_Newline), ('\u{d}', '\u{d}', WC_CR),
('\u{20}', '\u{20}', WC_WSegSpace), ('\u{22}', '\u{22}', WC_Double_Quote), ('\u{27}',
@@ -1530,7 +1692,7 @@ pub mod emoji {
EC_Extended_Pictographic,
}
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (u32, u32, EmojiCat) {
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)], default_lower: u32, default_upper: u32) -> (u32, u32, EmojiCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -1543,8 +1705,8 @@ pub mod emoji {
}
Err(idx) => {
(
- if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
- r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+ if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+ r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
EC_Any,
)
}
@@ -1552,9 +1714,73 @@ pub mod emoji {
}
pub fn emoji_category(c: char) -> (u32, u32, EmojiCat) {
- bsearch_range_value_table(c, emoji_cat_table)
+ // Perform a quick O(1) lookup in a precomputed table to determine
+ // the slice of the range table to search in.
+ let lookup_interval = 0x80;
+ let idx = (c as u32 / lookup_interval) as usize;
+ let range = emoji_cat_lookup.get(idx..(idx + 2)).map_or(
+ // If the `idx` is outside of the precomputed table - use the slice
+ // starting from the last covered index in the precomputed table and
+ // ending with the length of the range table.
+ 77..78,
+ |r| (r[0] as usize)..((r[1] + 1) as usize)
+ );
+
+ // Compute pessimistic default lower and upper bounds on the category.
+ // If character doesn't map to any range and there is no adjacent range
+ // in the table slice - these bounds has to apply.
+ let lower = idx as u32 * lookup_interval;
+ let upper = lower + lookup_interval - 1;
+ bsearch_range_value_table(c, &emoji_cat_table[range], lower, upper)
}
+ const emoji_cat_lookup: &'static [u8] = &[
+ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 4, 4, 6, 8, 8, 8, 10, 14, 14, 15, 15, 19, 21, 22, 37, 41, 41, 41, 42, 42, 42, 42,
+ 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 48, 48, 48, 48, 48, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, 55, 58, 63, 63, 63, 64, 64, 64, 65, 65, 66, 67,
+ 68, 69, 72, 74, 76, 76, 76, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77
+ ];
+
const emoji_cat_table: &'static [(char, char, EmojiCat)] = &[
('\u{a9}', '\u{a9}', EC_Extended_Pictographic), ('\u{ae}', '\u{ae}',
EC_Extended_Pictographic), ('\u{203c}', '\u{203c}', EC_Extended_Pictographic), ('\u{2049}',
@@ -1633,7 +1859,7 @@ pub mod sentence {
SC_Upper,
}
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (u32, u32, SentenceCat) {
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)], default_lower: u32, default_upper: u32) -> (u32, u32, SentenceCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
@@ -1646,8 +1872,8 @@ pub mod sentence {
}
Err(idx) => {
(
- if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
- r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
+ if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
+ r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
SC_Any,
)
}
@@ -1655,9 +1881,97 @@ pub mod sentence {
}
pub fn sentence_category(c: char) -> (u32, u32, SentenceCat) {
- bsearch_range_value_table(c, sentence_cat_table)
+ // Perform a quick O(1) lookup in a precomputed table to determine
+ // the slice of the range table to search in.
+ let lookup_interval = 0x80;
+ let idx = (c as u32 / lookup_interval) as usize;
+ let range = sentence_cat_lookup.get(idx..(idx + 2)).map_or(
+ // If the `idx` is outside of the precomputed table - use the slice
+ // starting from the last covered index in the precomputed table and
+ // ending with the length of the range table.
+ 2410..2421,
+ |r| (r[0] as usize)..((r[1] + 1) as usize)
+ );
+
+ // Compute pessimistic default lower and upper bounds on the category.
+ // If character doesn't map to any range and there is no adjacent range
+ // in the table slice - these bounds has to apply.
+ let lower = idx as u32 * lookup_interval;
+ let upper = lower + lookup_interval - 1;
+ bsearch_range_value_table(c, &sentence_cat_table[range], lower, upper)
}
+ const sentence_cat_lookup: &'static [u16] = &[
+ 0, 19, 31, 154, 247, 314, 323, 333, 375, 409, 528, 579, 588, 599, 612, 618, 629, 643, 650,
+ 661, 683, 702, 720, 738, 755, 771, 790, 806, 818, 825, 840, 850, 856, 871, 882, 882, 882,
+ 887, 895, 901, 904, 904, 904, 904, 904, 907, 912, 922, 928, 937, 943, 950, 953, 959, 964,
+ 973, 980, 988, 1000, 1000, 1002, 1130, 1249, 1267, 1288, 1308, 1311, 1336, 1340, 1340, 1340,
+ 1342, 1342, 1342, 1344, 1344, 1344, 1344, 1344, 1346, 1348, 1348, 1348, 1348, 1351, 1351,
+ 1351, 1351, 1351, 1369, 1476, 1482, 1492, 1501, 1501, 1501, 1501, 1512, 1517, 1518, 1521,
+ 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521,
+ 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521,
+ 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521,
+ 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522,
+ 1522, 1522, 1522, 1522, 1525, 1525, 1525, 1580, 1613, 1696, 1769, 1780, 1790, 1797, 1808,
+ 1819, 1836, 1843, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+ 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+ 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+ 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+ 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+ 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849,
+ 1849, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852,
+ 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852,
+ 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852,
+ 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852,
+ 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1853, 1854, 1864, 1865, 1865,
+ 1865, 1867, 1870, 1886, 1888, 1905, 1913, 1919, 1920, 1921, 1922, 1922, 1925, 1929, 1933,
+ 1935, 1939, 1942, 1949, 1949, 1949, 1952, 1957, 1964, 1967, 1969, 1971, 1982, 1986, 1989,
+ 1990, 1991, 1993, 1996, 1996, 1996, 2000, 2005, 2010, 2019, 2028, 2039, 2051, 2059, 2068,
+ 2086, 2086, 2093, 2098, 2098, 2105, 2110, 2114, 2119, 2119, 2121, 2124, 2139, 2146, 2156,
+ 2161, 2161, 2161, 2168, 2171, 2183, 2189, 2189, 2192, 2201, 2202, 2202, 2202, 2202, 2202,
+ 2202, 2202, 2202, 2203, 2204, 2204, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205,
+ 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2206, 2206, 2206,
+ 2206, 2206, 2206, 2206, 2206, 2206, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211,
+ 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211,
+ 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2212, 2212, 2212,
+ 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212,
+ 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212,
+ 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212,
+ 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212,
+ 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2216, 2221, 2228, 2229, 2229, 2229,
+ 2229, 2229, 2231, 2232, 2235, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242,
+ 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242,
+ 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242,
+ 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2243, 2243, 2243, 2243, 2243, 2243, 2243,
+ 2243, 2243, 2243, 2244, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245,
+ 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245,
+ 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245,
+ 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245,
+ 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2248, 2248,
+ 2248, 2253, 2253, 2253, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254,
+ 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2256, 2261, 2261, 2261, 2261, 2261, 2261,
+ 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261,
+ 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261,
+ 2261, 2263, 2263, 2263, 2263, 2266, 2269, 2270, 2270, 2270, 2270, 2275, 2288, 2300, 2305,
+ 2310, 2316, 2322, 2330, 2330, 2330, 2330, 2330, 2333, 2337, 2337, 2337, 2337, 2337, 2337,
+ 2337, 2337, 2337, 2341, 2341, 2347, 2348, 2353, 2353, 2353, 2358, 2358, 2358, 2358, 2361,
+ 2361, 2361, 2361, 2361, 2361, 2365, 2365, 2367, 2372, 2372, 2372, 2372, 2372, 2372, 2372,
+ 2372, 2372, 2372, 2400, 2405, 2405, 2405, 2405, 2405, 2407, 2408, 2408, 2408, 2408, 2408,
+ 2408, 2408, 2408, 2408, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409,
+ 2410, 2410, 2410, 2410, 2410, 2410, 2410, 2410
+ ];
+
const sentence_cat_table: &'static [(char, char, SentenceCat)] = &[
('\u{9}', '\u{9}', SC_Sp), ('\u{a}', '\u{a}', SC_LF), ('\u{b}', '\u{c}', SC_Sp), ('\u{d}',
'\u{d}', SC_CR), ('\u{20}', '\u{20}', SC_Sp), ('\u{21}', '\u{21}', SC_STerm), ('\u{22}',