summaryrefslogtreecommitdiffstats
path: root/src/tools/unicode-table-generator
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
commit698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree173a775858bd501c378080a10dca74132f05bc50 /src/tools/unicode-table-generator
parentInitial commit. (diff)
downloadrustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz
rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/tools/unicode-table-generator')
-rw-r--r--src/tools/unicode-table-generator/Cargo.toml9
-rw-r--r--src/tools/unicode-table-generator/src/case_mapping.rs70
-rw-r--r--src/tools/unicode-table-generator/src/main.rs439
-rw-r--r--src/tools/unicode-table-generator/src/range_search.rs93
-rw-r--r--src/tools/unicode-table-generator/src/raw_emitter.rs394
-rw-r--r--src/tools/unicode-table-generator/src/skiplist.rs98
-rw-r--r--src/tools/unicode-table-generator/src/unicode_download.rs46
7 files changed, 1149 insertions, 0 deletions
diff --git a/src/tools/unicode-table-generator/Cargo.toml b/src/tools/unicode-table-generator/Cargo.toml
new file mode 100644
index 000000000..ef01877c0
--- /dev/null
+++ b/src/tools/unicode-table-generator/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "unicode-bdd"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+ucd-parse = "0.1.3"
diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs
new file mode 100644
index 000000000..992aac1f8
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/case_mapping.rs
@@ -0,0 +1,70 @@
+use crate::{fmt_list, UnicodeData};
+use std::fmt;
+
+pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
+ let mut file = String::new();
+
+ file.push_str(HEADER.trim_start());
+
+ let decl_type = "&[(char, [char; 3])]";
+
+ file.push_str(&format!(
+ "static LOWERCASE_TABLE: {} = &[{}];",
+ decl_type,
+ fmt_list(data.to_lower.iter().map(to_mapping))
+ ));
+ file.push_str("\n\n");
+ file.push_str(&format!(
+ "static UPPERCASE_TABLE: {} = &[{}];",
+ decl_type,
+ fmt_list(data.to_upper.iter().map(to_mapping))
+ ));
+ file
+}
+
+fn to_mapping((key, (a, b, c)): (&u32, &(u32, u32, u32))) -> (CharEscape, [CharEscape; 3]) {
+ (
+ CharEscape(std::char::from_u32(*key).unwrap()),
+ [
+ CharEscape(std::char::from_u32(*a).unwrap()),
+ CharEscape(std::char::from_u32(*b).unwrap()),
+ CharEscape(std::char::from_u32(*c).unwrap()),
+ ],
+ )
+}
+
+struct CharEscape(char);
+
+impl fmt::Debug for CharEscape {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "'{}'", self.0.escape_default())
+ }
+}
+
+static HEADER: &str = r"
+pub fn to_lower(c: char) -> [char; 3] {
+ if c.is_ascii() {
+ [(c as u8).to_ascii_lowercase() as char, '\0', '\0']
+ } else {
+ match bsearch_case_table(c, LOWERCASE_TABLE) {
+ None => [c, '\0', '\0'],
+ Some(index) => LOWERCASE_TABLE[index].1,
+ }
+ }
+}
+
+pub fn to_upper(c: char) -> [char; 3] {
+ if c.is_ascii() {
+ [(c as u8).to_ascii_uppercase() as char, '\0', '\0']
+ } else {
+ match bsearch_case_table(c, UPPERCASE_TABLE) {
+ None => [c, '\0', '\0'],
+ Some(index) => UPPERCASE_TABLE[index].1,
+ }
+ }
+}
+
+fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
+ table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
+}
+";
diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs
new file mode 100644
index 000000000..4720ee702
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/main.rs
@@ -0,0 +1,439 @@
+//! This implements the core logic of the compression scheme used to compactly
+//! encode Unicode properties.
+//!
+//! We have two primary goals with the encoding: we want to be compact, because
+//! these tables often end up in ~every Rust program (especially the
+//! grapheme_extend table, used for str debugging), including those for embedded
+//! targets (where space is important). We also want to be relatively fast,
+//! though this is more of a nice to have rather than a key design constraint.
+//! It is expected that libraries/applications which are performance-sensitive
+//! to Unicode property lookups are extremely rare, and those that care may find
+//! the tradeoff of the raw bitsets worth it. For most applications, a
+//! relatively fast but much smaller (and as such less cache-impacting, etc.)
+//! data set is likely preferable.
+//!
+//! We have two separate encoding schemes: a skiplist-like approach, and a
+//! compressed bitset. The datasets we consider mostly use the skiplist (it's
+//! smaller) but the lowercase and uppercase sets are sufficiently sparse for
+//! the bitset to be worthwhile -- for those sets the bitset is a 2x size win.
+//! Since the bitset is also faster, this seems an obvious choice. (As a
+//! historical note, the bitset was also the prior implementation, so its
+//! relative complexity had already been paid).
+//!
+//! ## The bitset
+//!
+//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
+//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
+//! over 17 kilobytes of data per character set -- way too much for our
+//! purposes.
+//!
+//! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF,
+//! not skipping the small 'gap') is associated into words (u64) and
+//! deduplicated. On random data, this would be useless; on our data, this is
+//! incredibly beneficial -- our data sets have (far) less than 256 unique
+//! words.
+//!
+//! This gives us an array that maps `u8 -> word`; the current algorithm does
+//! not handle the case of more than 256 unique words, but we are relatively far
+//! from coming that close.
+//!
+//! With that scheme, we now have a single byte for every 64 codepoints.
+//!
+//! We further chunk these by some constant N (between 1 and 64 per group,
+//! dynamically chosen for smallest size), and again deduplicate and store in an
+//! array (u8 -> [u8; N]).
+//!
+//! The bytes of this array map into the words from the bitset above, but we
+//! apply another trick here: some of these words are similar enough that they
+//! can be represented by some function of another word. The particular
+//! functions chosen are rotation, inversion, and shifting (right).
+//!
+//! ## The skiplist
+//!
+//! The skip list arose out of the desire for an even smaller encoding than the
+//! bitset -- and was the answer to the question "what is the smallest
+//! representation we can imagine?". However, it is not necessarily the
+//! smallest, and if you have a better proposal, please do suggest it!
+//!
+//! This is a relatively straightforward encoding. First, we break up all the
+//! ranges in the input data into offsets from each other, essentially a gap
+//! encoding. In practice, most gaps are small -- less than u8::MAX -- so we
+//! store those directly. We make use of the larger gaps (which are nicely
+//! interspersed already) throughout the dataset to index this data set.
+//!
+//! In particular, each run of small gaps (terminating in a large gap) is
+//! indexed in a separate dataset. That data set stores an index into the
+//! primary offset list and a prefix sum of that offset list. These are packed
+//! into a single u32 (11 bits for the offset, 21 bits for the prefix sum).
+//!
+//! Lookup proceeds via a binary search in the index and then a straightforward
+//! linear scan (adding up the offsets) until we reach the needle, and then the
+//! index of that offset is utilized as the answer to whether we're in the set
+//! or not.
+
+use std::collections::{BTreeMap, HashMap};
+use std::ops::Range;
+use ucd_parse::Codepoints;
+
+mod case_mapping;
+mod raw_emitter;
+mod skiplist;
+mod unicode_download;
+
+use raw_emitter::{emit_codepoints, RawEmitter};
+
+static PROPERTIES: &[&str] = &[
+ "Alphabetic",
+ "Lowercase",
+ "Uppercase",
+ "Cased",
+ "Case_Ignorable",
+ "Grapheme_Extend",
+ "White_Space",
+ "Cc",
+ "N",
+];
+
+struct UnicodeData {
+ ranges: Vec<(&'static str, Vec<Range<u32>>)>,
+ to_upper: BTreeMap<u32, (u32, u32, u32)>,
+ to_lower: BTreeMap<u32, (u32, u32, u32)>,
+}
+
+fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<(u32, u32, u32)> {
+ let mut a = None;
+ let mut b = None;
+ let mut c = None;
+
+ for codepoint in codepoints {
+ if origin == codepoint.value() {
+ return None;
+ }
+
+ if a.is_none() {
+ a = Some(codepoint.value());
+ } else if b.is_none() {
+ b = Some(codepoint.value());
+ } else if c.is_none() {
+ c = Some(codepoint.value());
+ } else {
+ panic!("more than 3 mapped codepoints")
+ }
+ }
+
+ Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)))
+}
+
+static UNICODE_DIRECTORY: &str = "unicode-downloads";
+
+fn load_data() -> UnicodeData {
+ unicode_download::fetch_latest();
+
+ let mut properties = HashMap::new();
+ for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() {
+ if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
+ properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
+ }
+ }
+ for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() {
+ if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
+ properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
+ }
+ }
+
+ let mut to_lower = BTreeMap::new();
+ let mut to_upper = BTreeMap::new();
+ for row in ucd_parse::UnicodeDataExpander::new(
+ ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(),
+ ) {
+ let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) {
+ "N"
+ } else {
+ row.general_category.as_str()
+ };
+ if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) {
+ properties
+ .entry(*name)
+ .or_insert_with(Vec::new)
+ .push(Codepoints::Single(row.codepoint));
+ }
+
+ if let Some(mapped) = row.simple_lowercase_mapping {
+ if mapped != row.codepoint {
+ to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0));
+ }
+ }
+ if let Some(mapped) = row.simple_uppercase_mapping {
+ if mapped != row.codepoint {
+ to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0));
+ }
+ }
+ }
+
+ for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() {
+ if !row.conditions.is_empty() {
+ // Skip conditional case mappings
+ continue;
+ }
+
+ let key = row.codepoint.value();
+ if let Some(lower) = to_mapping(key, row.lowercase) {
+ to_lower.insert(key, lower);
+ }
+ if let Some(upper) = to_mapping(key, row.uppercase) {
+ to_upper.insert(key, upper);
+ }
+ }
+
+ let mut properties: HashMap<&'static str, Vec<Range<u32>>> = properties
+ .into_iter()
+ .map(|(k, v)| {
+ (
+ k,
+ v.into_iter()
+ .flat_map(|codepoints| match codepoints {
+ Codepoints::Single(c) => c
+ .scalar()
+ .map(|ch| (ch as u32..ch as u32 + 1))
+ .into_iter()
+ .collect::<Vec<_>>(),
+ Codepoints::Range(c) => c
+ .into_iter()
+ .flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1)))
+ .collect::<Vec<_>>(),
+ })
+ .collect::<Vec<Range<u32>>>(),
+ )
+ })
+ .collect();
+
+ for ranges in properties.values_mut() {
+ merge_ranges(ranges);
+ }
+
+ let mut properties = properties.into_iter().collect::<Vec<_>>();
+ properties.sort_by_key(|p| p.0);
+ UnicodeData { ranges: properties, to_lower, to_upper }
+}
+
+fn main() {
+ let write_location = std::env::args().nth(1).unwrap_or_else(|| {
+ eprintln!("Must provide path to write unicode tables to");
+ eprintln!(
+ "e.g. {} library/core/unicode/unicode_data.rs",
+ std::env::args().next().unwrap_or_default()
+ );
+ std::process::exit(1);
+ });
+
+ // Optional test path, which is a Rust source file testing that the unicode
+ // property lookups are correct.
+ let test_path = std::env::args().nth(2);
+
+ let unicode_data = load_data();
+ let ranges_by_property = &unicode_data.ranges;
+
+ if let Some(path) = test_path {
+ std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
+ }
+
+ let mut total_bytes = 0;
+ let mut modules = Vec::new();
+ for (property, ranges) in ranges_by_property {
+ let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
+ let mut emitter = RawEmitter::new();
+ emit_codepoints(&mut emitter, &ranges);
+
+ modules.push((property.to_lowercase().to_string(), emitter.file));
+ println!(
+ "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}",
+ property,
+ emitter.bytes_used,
+ datapoints,
+ ranges.len(),
+ ranges.first().unwrap().start,
+ ranges.last().unwrap().end,
+ emitter.desc,
+ );
+ total_bytes += emitter.bytes_used;
+ }
+
+ let mut table_file = String::new();
+
+ table_file.push_str(
+ "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
+ );
+
+ // Include the range search function
+ table_file.push('\n');
+ table_file.push_str(include_str!("range_search.rs"));
+ table_file.push('\n');
+
+ table_file.push_str(&version());
+
+ table_file.push('\n');
+
+ modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data)));
+
+ for (name, contents) in modules {
+ table_file.push_str("#[rustfmt::skip]\n");
+ table_file.push_str(&format!("pub mod {name} {{\n"));
+ for line in contents.lines() {
+ if !line.trim().is_empty() {
+ table_file.push_str(" ");
+ table_file.push_str(&line);
+ }
+ table_file.push('\n');
+ }
+ table_file.push_str("}\n\n");
+ }
+
+ std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap();
+
+ println!("Total table sizes: {total_bytes} bytes");
+}
+
+fn version() -> String {
+ let mut out = String::new();
+ out.push_str("pub const UNICODE_VERSION: (u8, u8, u8) = ");
+
+ let readme =
+ std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt"))
+ .unwrap();
+
+ let prefix = "for Version ";
+ let start = readme.find(prefix).unwrap() + prefix.len();
+ let end = readme.find(" of the Unicode Standard.").unwrap();
+ let version =
+ readme[start..end].split('.').map(|v| v.parse::<u32>().expect(&v)).collect::<Vec<_>>();
+ let [major, minor, micro] = [version[0], version[1], version[2]];
+
+ out.push_str(&format!("({major}, {minor}, {micro});\n"));
+ out
+}
+
+fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
+ let pieces = values.into_iter().map(|b| format!("{:?}, ", b)).collect::<Vec<_>>();
+ let mut out = String::new();
+ let mut line = String::from("\n ");
+ for piece in pieces {
+ if line.len() + piece.len() < 98 {
+ line.push_str(&piece);
+ } else {
+ out.push_str(line.trim_end());
+ out.push('\n');
+ line = format!(" {piece}");
+ }
+ }
+ out.push_str(line.trim_end());
+ out.push('\n');
+ out
+}
+
+fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String {
+ let mut s = String::new();
+ s.push_str("#![allow(incomplete_features, unused)]\n");
+ s.push_str("#![feature(const_generics)]\n\n");
+ s.push_str("\n#[allow(unused)]\nuse std::hint;\n");
+ s.push_str(&format!("#[path = \"{data_path}\"]\n"));
+ s.push_str("mod unicode_data;\n\n");
+
+ s.push_str("\nfn main() {\n");
+
+ for (property, ranges) in ranges {
+ s.push_str(&format!(r#" println!("Testing {}");"#, property));
+ s.push('\n');
+ s.push_str(&format!(" {}_true();\n", property.to_lowercase()));
+ s.push_str(&format!(" {}_false();\n", property.to_lowercase()));
+ let mut is_true = Vec::new();
+ let mut is_false = Vec::new();
+ for ch_num in 0..(std::char::MAX as u32) {
+ if std::char::from_u32(ch_num).is_none() {
+ continue;
+ }
+ if ranges.iter().any(|r| r.contains(&ch_num)) {
+ is_true.push(ch_num);
+ } else {
+ is_false.push(ch_num);
+ }
+ }
+
+ s.push_str(&format!(" fn {}_true() {{\n", property.to_lowercase()));
+ generate_asserts(&mut s, property, &is_true, true);
+ s.push_str(" }\n\n");
+ s.push_str(&format!(" fn {}_false() {{\n", property.to_lowercase()));
+ generate_asserts(&mut s, property, &is_false, false);
+ s.push_str(" }\n\n");
+ }
+
+ s.push_str("}");
+ s
+}
+
+fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) {
+ for range in ranges_from_set(points) {
+ if range.end == range.start + 1 {
+ s.push_str(&format!(
+ " assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n",
+ if truthy { "" } else { "!" },
+ property.to_lowercase(),
+ std::char::from_u32(range.start).unwrap(),
+ range.start,
+ ));
+ } else {
+ s.push_str(&format!(" for chn in {:?}u32 {{\n", range));
+ s.push_str(&format!(
+ " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
+ if truthy { "" } else { "!" },
+ property.to_lowercase(),
+ ));
+ s.push_str(" }\n");
+ }
+ }
+}
+
+fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
+ let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::<Vec<Range<u32>>>();
+ merge_ranges(&mut ranges);
+ ranges
+}
+
+fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
+ loop {
+ let mut new_ranges = Vec::new();
+ let mut idx_iter = 0..(ranges.len() - 1);
+ let mut should_insert_last = true;
+ while let Some(idx) = idx_iter.next() {
+ let cur = ranges[idx].clone();
+ let next = ranges[idx + 1].clone();
+ if cur.end == next.start {
+ if idx_iter.next().is_none() {
+ // We're merging the last element
+ should_insert_last = false;
+ }
+ new_ranges.push(cur.start..next.end);
+ } else {
+ // We're *not* merging the last element
+ should_insert_last = true;
+ new_ranges.push(cur);
+ }
+ }
+ if should_insert_last {
+ new_ranges.push(ranges.last().unwrap().clone());
+ }
+ if new_ranges.len() == ranges.len() {
+ *ranges = new_ranges;
+ break;
+ } else {
+ *ranges = new_ranges;
+ }
+ }
+
+ let mut last_end = None;
+ for range in ranges {
+ if let Some(last) = last_end {
+ assert!(range.start > last, "{:?}", range);
+ }
+ last_end = Some(range.end);
+ }
+}
diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs
new file mode 100644
index 000000000..39b47ce70
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/range_search.rs
@@ -0,0 +1,93 @@
+#[inline(always)]
+fn bitset_search<
+ const N: usize,
+ const CHUNK_SIZE: usize,
+ const N1: usize,
+ const CANONICAL: usize,
+ const CANONICALIZED: usize,
+>(
+ needle: u32,
+ chunk_idx_map: &[u8; N],
+ bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
+ bitset_canonical: &[u64; CANONICAL],
+ bitset_canonicalized: &[(u8, u8); CANONICALIZED],
+) -> bool {
+ let bucket_idx = (needle / 64) as usize;
+ let chunk_map_idx = bucket_idx / CHUNK_SIZE;
+ let chunk_piece = bucket_idx % CHUNK_SIZE;
+ let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
+ v
+ } else {
+ return false;
+ };
+ let idx = bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize;
+ let word = if let Some(word) = bitset_canonical.get(idx) {
+ *word
+ } else {
+ let (real_idx, mapping) = bitset_canonicalized[idx - bitset_canonical.len()];
+ let mut word = bitset_canonical[real_idx as usize];
+ let should_invert = mapping & (1 << 6) != 0;
+ if should_invert {
+ word = !word;
+ }
+ // Lower 6 bits
+ let quantity = mapping & ((1 << 6) - 1);
+ if mapping & (1 << 7) != 0 {
+ // shift
+ word >>= quantity as u64;
+ } else {
+ word = word.rotate_left(quantity as u32);
+ }
+ word
+ };
+ (word & (1 << (needle % 64) as u64)) != 0
+}
+
+fn decode_prefix_sum(short_offset_run_header: u32) -> u32 {
+ short_offset_run_header & ((1 << 21) - 1)
+}
+
+fn decode_length(short_offset_run_header: u32) -> usize {
+ (short_offset_run_header >> 21) as usize
+}
+
+#[inline(always)]
+fn skip_search<const SOR: usize, const OFFSETS: usize>(
+ needle: u32,
+ short_offset_runs: &[u32; SOR],
+ offsets: &[u8; OFFSETS],
+) -> bool {
+ // Note that this *cannot* be past the end of the array, as the last
+ // element is greater than std::char::MAX (the largest possible needle).
+ //
+ // So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct
+ // location cannot be past it, so Err(idx) != length either.
+ //
+ // This means that we can avoid bounds checking for the accesses below, too.
+ let last_idx =
+ match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) {
+ Ok(idx) => idx + 1,
+ Err(idx) => idx,
+ };
+
+ let mut offset_idx = decode_length(short_offset_runs[last_idx]);
+ let length = if let Some(next) = short_offset_runs.get(last_idx + 1) {
+ decode_length(*next) - offset_idx
+ } else {
+ offsets.len() - offset_idx
+ };
+ let prev =
+ last_idx.checked_sub(1).map(|prev| decode_prefix_sum(short_offset_runs[prev])).unwrap_or(0);
+
+ let total = needle - prev;
+ let mut prefix_sum = 0;
+ for _ in 0..(length - 1) {
+ let offset = offsets[offset_idx];
+ prefix_sum += offset as u32;
+ if prefix_sum > total {
+ break;
+ }
+ offset_idx += 1;
+ }
+ offset_idx % 2 == 1
+}
diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs
new file mode 100644
index 000000000..ab8eaee95
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -0,0 +1,394 @@
+use crate::fmt_list;
+use std::collections::{BTreeMap, BTreeSet, HashMap};
+use std::convert::TryFrom;
+use std::fmt::{self, Write};
+use std::ops::Range;
+
+#[derive(Clone)]
+pub struct RawEmitter {
+ pub file: String,
+ pub desc: String,
+ pub bytes_used: usize,
+}
+
+impl RawEmitter {
+ pub fn new() -> RawEmitter {
+ RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() }
+ }
+
+ fn blank_line(&mut self) {
+ if self.file.is_empty() || self.file.ends_with("\n\n") {
+ return;
+ }
+ writeln!(&mut self.file).unwrap();
+ }
+
+ fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
+ let last_code_point = ranges.last().unwrap().end;
+ // bitset for every bit in the codepoint range
+ //
+ // + 2 to ensure an all zero word to use for padding
+ let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
+ for range in ranges {
+ for codepoint in range.clone() {
+ let bucket = codepoint as usize / 64;
+ let bit = codepoint as u64 % 64;
+ buckets[bucket] |= 1 << bit;
+ }
+ }
+
+ let mut words = buckets;
+ // Ensure that there's a zero word in the dataset, used for padding and
+ // such.
+ words.push(0);
+ let unique_words =
+ words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
+ if unique_words.len() > u8::MAX as usize {
+ return Err(format!("cannot pack {} into 8 bits", unique_words.len()));
+ }
+ // needed for the chunk mapping to work
+ assert_eq!(unique_words[0], 0, "has a zero word");
+ let canonicalized = Canonicalized::canonicalize(&unique_words);
+
+ let word_indices = canonicalized.unique_mapping.clone();
+ let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
+
+ let mut best = None;
+ for length in 1..=64 {
+ let mut temp = self.clone();
+ temp.emit_chunk_map(word_indices[&0], &compressed_words, length);
+ if let Some((_, size)) = best {
+ if temp.bytes_used < size {
+ best = Some((length, temp.bytes_used));
+ }
+ } else {
+ best = Some((length, temp.bytes_used));
+ }
+ }
+ self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);
+
+ struct Bits(u64);
+ impl fmt::Debug for Bits {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "0b{:064b}", self.0)
+ }
+ }
+
+ writeln!(
+ &mut self.file,
+ "static BITSET_CANONICAL: [u64; {}] = [{}];",
+ canonicalized.canonical_words.len(),
+ fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
+ )
+ .unwrap();
+ self.bytes_used += 8 * canonicalized.canonical_words.len();
+ writeln!(
+ &mut self.file,
+ "static BITSET_MAPPING: [(u8, u8); {}] = [{}];",
+ canonicalized.canonicalized_words.len(),
+ fmt_list(&canonicalized.canonicalized_words),
+ )
+ .unwrap();
+ // 8 bit index into shifted words, 7 bits for shift + optional flip
+ // We only need it for the words that we removed by applying a shift and
+ // flip to them.
+ self.bytes_used += 2 * canonicalized.canonicalized_words.len();
+
+ self.blank_line();
+
+ writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+ writeln!(&mut self.file, " super::bitset_search(",).unwrap();
+ writeln!(&mut self.file, " c as u32,").unwrap();
+ writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
+ writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
+ writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap();
+ writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
+ writeln!(&mut self.file, " )").unwrap();
+ writeln!(&mut self.file, "}}").unwrap();
+
+ Ok(())
+ }
+
+ fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
+ let mut compressed_words = compressed_words.to_vec();
+ for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) {
+ // pad out bitset index with zero words so we have all chunks of
+ // chunkchunk_length
+ compressed_words.push(zero_at);
+ }
+
+ let mut chunks = BTreeSet::new();
+ for chunk in compressed_words.chunks(chunk_length) {
+ chunks.insert(chunk);
+ }
+ let chunk_map = chunks
+ .clone()
+ .into_iter()
+ .enumerate()
+ .map(|(idx, chunk)| (chunk, idx))
+ .collect::<HashMap<_, _>>();
+ let mut chunk_indices = Vec::new();
+ for chunk in compressed_words.chunks(chunk_length) {
+ chunk_indices.push(chunk_map[chunk]);
+ }
+
+ writeln!(
+ &mut self.file,
+ "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
+ chunk_indices.len(),
+ fmt_list(&chunk_indices),
+ )
+ .unwrap();
+ self.bytes_used += chunk_indices.len();
+ writeln!(
+ &mut self.file,
+ "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];",
+ chunk_length,
+ chunks.len(),
+ fmt_list(chunks.iter()),
+ )
+ .unwrap();
+ self.bytes_used += chunk_length * chunks.len();
+ }
+}
+
+pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
+ emitter.blank_line();
+
+ let mut bitset = emitter.clone();
+ let bitset_ok = bitset.emit_bitset(&ranges).is_ok();
+
+ let mut skiplist = emitter.clone();
+ skiplist.emit_skiplist(&ranges);
+
+ if bitset_ok && bitset.bytes_used <= skiplist.bytes_used {
+ *emitter = bitset;
+ emitter.desc = String::from("bitset");
+ } else {
+ *emitter = skiplist;
+ emitter.desc = String::from("skiplist");
+ }
+}
+
+struct Canonicalized {
+ canonical_words: Vec<u64>,
+ canonicalized_words: Vec<(u8, u8)>,
+
+ /// Maps an input unique word to the associated index (u8) which is into
+ /// canonical_words or canonicalized_words (in order).
+ unique_mapping: HashMap<u64, u8>,
+}
+
+impl Canonicalized {
+ fn canonicalize(unique_words: &[u64]) -> Self {
+ #[derive(Copy, Clone, Debug)]
+ enum Mapping {
+ Rotate(u32),
+ Invert,
+ RotateAndInvert(u32),
+ ShiftRight(u32),
+ }
+
+ // key is the word being mapped to
+ let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
+ for &a in unique_words {
+ 'b: for &b in unique_words {
+ // skip self
+ if a == b {
+ continue;
+ }
+
+ // All possible distinct rotations
+ for rotation in 1..64 {
+ if a.rotate_right(rotation) == b {
+ mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
+ // We're not interested in further mappings between a and b
+ continue 'b;
+ }
+ }
+
+ if (!a) == b {
+ mappings.entry(b).or_default().push((a, Mapping::Invert));
+ // We're not interested in further mappings between a and b
+ continue 'b;
+ }
+
+ // All possible distinct rotations, inverted
+ for rotation in 1..64 {
+ if (!a.rotate_right(rotation)) == b {
+ mappings
+ .entry(b)
+ .or_default()
+ .push((a, Mapping::RotateAndInvert(rotation)));
+ // We're not interested in further mappings between a and b
+ continue 'b;
+ }
+ }
+
+ // All possible shifts
+ for shift_by in 1..64 {
+ if a == (b >> shift_by) {
+ mappings
+ .entry(b)
+ .or_default()
+ .push((a, Mapping::ShiftRight(shift_by as u32)));
+ // We're not interested in further mappings between a and b
+ continue 'b;
+ }
+ }
+ }
+ }
+ // These are the bitset words which will be represented "raw" (as a u64)
+ let mut canonical_words = Vec::new();
+ // These are mapped words, which will be represented by an index into
+ // the canonical_words and a Mapping; u16 when encoded.
+ let mut canonicalized_words = Vec::new();
+ let mut unique_mapping = HashMap::new();
+
+ #[derive(Debug, PartialEq, Eq)]
+ enum UniqueMapping {
+ Canonical(usize),
+ Canonicalized(usize),
+ }
+
+ // Map 0 first, so that it is the first canonical word.
+ // This is realistically not inefficient because 0 is not mapped to by
+ // anything else (a shift pattern could do it, but would be wasteful).
+ //
+ // However, 0s are quite common in the overall dataset, and it is quite
+ // wasteful to have to go through a mapping function to determine that
+ // we have a zero.
+ //
+ // FIXME: Experiment with choosing most common words in overall data set
+ // for canonical when possible.
+ while let Some((&to, _)) = mappings
+ .iter()
+ .find(|(&to, _)| to == 0)
+ .or_else(|| mappings.iter().max_by_key(|m| m.1.len()))
+ {
+ // Get the mapping with the most entries. Currently, no mapping can
+ // only exist transitively (i.e., there is no A, B, C such that A
+ // does not map to C and but A maps to B maps to C), so this is
+ // guaranteed to be acceptable.
+ //
+ // In the future, we may need a more sophisticated algorithm to
+ // identify which keys to prefer as canonical.
+ let mapped_from = mappings.remove(&to).unwrap();
+ for (from, how) in &mapped_from {
+ // Remove the entries which mapped to this one.
+ // Noting that it should be associated with the Nth canonical word.
+ //
+ // We do not assert that this is present, because there may be
+ // no mappings to the `from` word; that's fine.
+ mappings.remove(from);
+ assert_eq!(
+ unique_mapping
+ .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
+ None
+ );
+ canonicalized_words.push((canonical_words.len(), *how));
+
+ // Remove the now-canonicalized word from other mappings,
+ // to ensure that we deprioritize them in the next iteration of
+ // the while loop.
+ for mapped in mappings.values_mut() {
+ let mut i = 0;
+ while i != mapped.len() {
+ if mapped[i].0 == *from {
+ mapped.remove(i);
+ } else {
+ i += 1;
+ }
+ }
+ }
+ }
+ assert!(
+ unique_mapping
+ .insert(to, UniqueMapping::Canonical(canonical_words.len()))
+ .is_none()
+ );
+ canonical_words.push(to);
+
+ // Remove the now-canonical word from other mappings, to ensure that
+ // we deprioritize them in the next iteration of the while loop.
+ for mapped in mappings.values_mut() {
+ let mut i = 0;
+ while i != mapped.len() {
+ if mapped[i].0 == to {
+ mapped.remove(i);
+ } else {
+ i += 1;
+ }
+ }
+ }
+ }
+
+ // Any words which we couldn't shrink, just stick into the canonical
+ // words.
+ //
+ // FIXME: work harder -- there are more possibilities for mapping
+ // functions (e.g., multiplication, shifting instead of rotation, etc.)
+ // We'll probably always have some slack though so this loop will still
+ // be needed.
+ for &w in unique_words {
+ if !unique_mapping.contains_key(&w) {
+ assert!(
+ unique_mapping
+ .insert(w, UniqueMapping::Canonical(canonical_words.len()))
+ .is_none()
+ );
+ canonical_words.push(w);
+ }
+ }
+ assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
+ assert_eq!(unique_mapping.len(), unique_words.len());
+
+ let unique_mapping = unique_mapping
+ .into_iter()
+ .map(|(key, value)| {
+ (
+ key,
+ match value {
+ UniqueMapping::Canonicalized(idx) => {
+ u8::try_from(canonical_words.len() + idx).unwrap()
+ }
+ UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
+ },
+ )
+ })
+ .collect::<HashMap<_, _>>();
+
+ let mut distinct_indices = BTreeSet::new();
+ for &w in unique_words {
+ let idx = unique_mapping.get(&w).unwrap();
+ assert!(distinct_indices.insert(idx));
+ }
+
+ const LOWER_6: u32 = (1 << 6) - 1;
+
+ let canonicalized_words = canonicalized_words
+ .into_iter()
+ .map(|v| {
+ (
+ u8::try_from(v.0).unwrap(),
+ match v.1 {
+ Mapping::RotateAndInvert(amount) => {
+ assert_eq!(amount, amount & LOWER_6);
+ 1 << 6 | (amount as u8)
+ }
+ Mapping::Rotate(amount) => {
+ assert_eq!(amount, amount & LOWER_6);
+ amount as u8
+ }
+ Mapping::Invert => 1 << 6,
+ Mapping::ShiftRight(shift_by) => {
+ assert_eq!(shift_by, shift_by & LOWER_6);
+ 1 << 7 | (shift_by as u8)
+ }
+ },
+ )
+ })
+ .collect::<Vec<(u8, u8)>>();
+ Canonicalized { unique_mapping, canonical_words, canonicalized_words }
+ }
+}
diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs
new file mode 100644
index 000000000..6e439968c
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/skiplist.rs
@@ -0,0 +1,98 @@
+use crate::fmt_list;
+use crate::raw_emitter::RawEmitter;
+use std::convert::TryInto;
+use std::fmt::Write as _;
+use std::ops::Range;
+
+/// This will get packed into a single u32 before inserting into the data set.
+#[derive(Debug, PartialEq)]
+struct ShortOffsetRunHeader {
+ /// Note, we only allow for 21 bits here.
+ prefix_sum: u32,
+
+ /// Note, we actually only allow for 11 bits here. This should be enough --
+ /// our largest sets are around ~1400 offsets long.
+ start_idx: u16,
+}
+
+impl ShortOffsetRunHeader {
+ fn pack(&self) -> u32 {
+ assert!(self.start_idx < (1 << 11));
+ assert!(self.prefix_sum < (1 << 21));
+
+ (self.start_idx as u32) << 21 | self.prefix_sum
+ }
+}
+
+impl RawEmitter {
+ pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) {
+ let mut offsets = Vec::<u32>::new();
+ let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>();
+ let mut offset = 0;
+ for pt in points {
+ let delta = pt - offset;
+ offsets.push(delta);
+ offset = pt;
+ }
+ // Guaranteed to terminate, as it's impossible to subtract a value this
+ // large from a valid char.
+ offsets.push(std::char::MAX as u32 + 1);
+ let mut coded_offsets: Vec<u8> = Vec::new();
+ let mut short_offset_runs: Vec<ShortOffsetRunHeader> = vec![];
+ let mut iter = offsets.iter().cloned();
+ let mut prefix_sum = 0;
+ loop {
+ let mut any_elements = false;
+ let mut inserted = false;
+ let start = coded_offsets.len();
+ for offset in iter.by_ref() {
+ any_elements = true;
+ prefix_sum += offset;
+ if let Ok(offset) = offset.try_into() {
+ coded_offsets.push(offset);
+ } else {
+ short_offset_runs.push(ShortOffsetRunHeader {
+ start_idx: start.try_into().unwrap(),
+ prefix_sum,
+ });
+ // This is just needed to maintain indices even/odd
+ // correctly.
+ coded_offsets.push(0);
+ inserted = true;
+ break;
+ }
+ }
+ if !any_elements {
+ break;
+ }
+ // We always append the huge char::MAX offset to the end which
+ // should never be able to fit into the u8 offsets.
+ assert!(inserted);
+ }
+
+ writeln!(
+ &mut self.file,
+ "static SHORT_OFFSET_RUNS: [u32; {}] = [{}];",
+ short_offset_runs.len(),
+ fmt_list(short_offset_runs.iter().map(|v| v.pack()))
+ )
+ .unwrap();
+ self.bytes_used += 4 * short_offset_runs.len();
+ writeln!(
+ &mut self.file,
+ "static OFFSETS: [u8; {}] = [{}];",
+ coded_offsets.len(),
+ fmt_list(&coded_offsets)
+ )
+ .unwrap();
+ self.bytes_used += coded_offsets.len();
+
+ writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+ writeln!(&mut self.file, " super::skip_search(",).unwrap();
+ writeln!(&mut self.file, " c as u32,").unwrap();
+ writeln!(&mut self.file, " &SHORT_OFFSET_RUNS,").unwrap();
+ writeln!(&mut self.file, " &OFFSETS,").unwrap();
+ writeln!(&mut self.file, " )").unwrap();
+ writeln!(&mut self.file, "}}").unwrap();
+ }
+}
diff --git a/src/tools/unicode-table-generator/src/unicode_download.rs b/src/tools/unicode-table-generator/src/unicode_download.rs
new file mode 100644
index 000000000..9b2e0a258
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/unicode_download.rs
@@ -0,0 +1,46 @@
+use crate::UNICODE_DIRECTORY;
+use std::path::Path;
+use std::process::Command;
+
+static URL_PREFIX: &str = "https://www.unicode.org/Public/UCD/latest/ucd/";
+
+static README: &str = "ReadMe.txt";
+
+static RESOURCES: &[&str] =
+ &["DerivedCoreProperties.txt", "PropList.txt", "UnicodeData.txt", "SpecialCasing.txt"];
+
+pub fn fetch_latest() {
+ let directory = Path::new(UNICODE_DIRECTORY);
+ if directory.exists() {
+ eprintln!(
+ "Not refetching unicode data, already exists, please delete {directory:?} to regenerate",
+ );
+ return;
+ }
+ if let Err(e) = std::fs::create_dir_all(directory) {
+ panic!("Failed to create {UNICODE_DIRECTORY:?}: {e}");
+ }
+ let output = Command::new("curl").arg(URL_PREFIX.to_owned() + README).output().unwrap();
+ if !output.status.success() {
+ panic!(
+ "Failed to run curl to fetch readme: stderr: {}",
+ String::from_utf8_lossy(&output.stderr)
+ );
+ }
+ let current = std::fs::read_to_string(directory.join(README)).unwrap_or_default();
+ if current.as_bytes() != &output.stdout[..] {
+ std::fs::write(directory.join(README), output.stdout).unwrap();
+ }
+
+ for resource in RESOURCES {
+ let output = Command::new("curl").arg(URL_PREFIX.to_owned() + resource).output().unwrap();
+ if !output.status.success() {
+ panic!(
+ "Failed to run curl to fetch {}: stderr: {}",
+ resource,
+ String::from_utf8_lossy(&output.stderr)
+ );
+ }
+ std::fs::write(directory.join(resource), output.stdout).unwrap();
+ }
+}