diff options
Diffstat (limited to '')
-rw-r--r-- | src/tools/unicode-table-generator/Cargo.toml | 9 | ||||
-rw-r--r-- | src/tools/unicode-table-generator/src/case_mapping.rs | 70 | ||||
-rw-r--r-- | src/tools/unicode-table-generator/src/main.rs | 439 | ||||
-rw-r--r-- | src/tools/unicode-table-generator/src/range_search.rs | 93 | ||||
-rw-r--r-- | src/tools/unicode-table-generator/src/raw_emitter.rs | 394 | ||||
-rw-r--r-- | src/tools/unicode-table-generator/src/skiplist.rs | 98 | ||||
-rw-r--r-- | src/tools/unicode-table-generator/src/unicode_download.rs | 46 |
7 files changed, 1149 insertions, 0 deletions
diff --git a/src/tools/unicode-table-generator/Cargo.toml b/src/tools/unicode-table-generator/Cargo.toml new file mode 100644 index 000000000..ef01877c0 --- /dev/null +++ b/src/tools/unicode-table-generator/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "unicode-bdd" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +ucd-parse = "0.1.3" diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs new file mode 100644 index 000000000..992aac1f8 --- /dev/null +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -0,0 +1,70 @@ +use crate::{fmt_list, UnicodeData}; +use std::fmt; + +pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String { + let mut file = String::new(); + + file.push_str(HEADER.trim_start()); + + let decl_type = "&[(char, [char; 3])]"; + + file.push_str(&format!( + "static LOWERCASE_TABLE: {} = &[{}];", + decl_type, + fmt_list(data.to_lower.iter().map(to_mapping)) + )); + file.push_str("\n\n"); + file.push_str(&format!( + "static UPPERCASE_TABLE: {} = &[{}];", + decl_type, + fmt_list(data.to_upper.iter().map(to_mapping)) + )); + file +} + +fn to_mapping((key, (a, b, c)): (&u32, &(u32, u32, u32))) -> (CharEscape, [CharEscape; 3]) { + ( + CharEscape(std::char::from_u32(*key).unwrap()), + [ + CharEscape(std::char::from_u32(*a).unwrap()), + CharEscape(std::char::from_u32(*b).unwrap()), + CharEscape(std::char::from_u32(*c).unwrap()), + ], + ) +} + +struct CharEscape(char); + +impl fmt::Debug for CharEscape { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "'{}'", self.0.escape_default()) + } +} + +static HEADER: &str = r" +pub fn to_lower(c: char) -> [char; 3] { + if c.is_ascii() { + [(c as u8).to_ascii_lowercase() as char, '\0', '\0'] + } else { + match bsearch_case_table(c, LOWERCASE_TABLE) { + None => [c, '\0', '\0'], + Some(index) => LOWERCASE_TABLE[index].1, + } + } +} + +pub fn to_upper(c: char) -> [char; 3] { + if c.is_ascii() { + [(c as u8).to_ascii_uppercase() as char, '\0', '\0'] + } else { + match bsearch_case_table(c, UPPERCASE_TABLE) { + None => [c, '\0', '\0'], + Some(index) => UPPERCASE_TABLE[index].1, + } + } +} + +fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> { + table.binary_search_by(|&(key, _)| key.cmp(&c)).ok() +} +"; diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs new file mode 100644 index 000000000..4720ee702 --- /dev/null +++ b/src/tools/unicode-table-generator/src/main.rs @@ -0,0 +1,439 @@ +//! This implements the core logic of the compression scheme used to compactly +//! encode Unicode properties. +//! +//! We have two primary goals with the encoding: we want to be compact, because +//! these tables often end up in ~every Rust program (especially the +//! grapheme_extend table, used for str debugging), including those for embedded +//! targets (where space is important). We also want to be relatively fast, +//! though this is more of a nice to have rather than a key design constraint. +//! It is expected that libraries/applications which are performance-sensitive +//! to Unicode property lookups are extremely rare, and those that care may find +//! the tradeoff of the raw bitsets worth it. For most applications, a +//! relatively fast but much smaller (and as such less cache-impacting, etc.) +//! data set is likely preferable. +//! +//! We have two separate encoding schemes: a skiplist-like approach, and a +//! compressed bitset. The datasets we consider mostly use the skiplist (it's +//! smaller) but the lowercase and uppercase sets are sufficiently sparse for +//! the bitset to be worthwhile -- for those sets the bitset is a 2x size win. +//! Since the bitset is also faster, this seems an obvious choice. (As a +//! historical note, the bitset was also the prior implementation, so its +//! relative complexity had already been paid). +//! +//! ## The bitset +//! +//! The primary idea is that we 'flatten' the Unicode ranges into an enormous +//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need +//! over 17 kilobytes of data per character set -- way too much for our +//! purposes. +//! +//! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF, +//! not skipping the small 'gap') is associated into words (u64) and +//! deduplicated. On random data, this would be useless; on our data, this is +//! incredibly beneficial -- our data sets have (far) less than 256 unique +//! words. +//! +//! This gives us an array that maps `u8 -> word`; the current algorithm does +//! not handle the case of more than 256 unique words, but we are relatively far +//! from coming that close. +//! +//! With that scheme, we now have a single byte for every 64 codepoints. +//! +//! We further chunk these by some constant N (between 1 and 64 per group, +//! dynamically chosen for smallest size), and again deduplicate and store in an +//! array (u8 -> [u8; N]). +//! +//! The bytes of this array map into the words from the bitset above, but we +//! apply another trick here: some of these words are similar enough that they +//! can be represented by some function of another word. The particular +//! functions chosen are rotation, inversion, and shifting (right). +//! +//! ## The skiplist +//! +//! The skip list arose out of the desire for an even smaller encoding than the +//! bitset -- and was the answer to the question "what is the smallest +//! representation we can imagine?". However, it is not necessarily the +//! smallest, and if you have a better proposal, please do suggest it! +//! +//! This is a relatively straightforward encoding. First, we break up all the +//! ranges in the input data into offsets from each other, essentially a gap +//! encoding. In practice, most gaps are small -- less than u8::MAX -- so we +//! store those directly. We make use of the larger gaps (which are nicely +//! interspersed already) throughout the dataset to index this data set. +//! +//! In particular, each run of small gaps (terminating in a large gap) is +//! indexed in a separate dataset. That data set stores an index into the +//! primary offset list and a prefix sum of that offset list. These are packed +//! into a single u32 (11 bits for the offset, 21 bits for the prefix sum). +//! +//! Lookup proceeds via a binary search in the index and then a straightforward +//! linear scan (adding up the offsets) until we reach the needle, and then the +//! index of that offset is utilized as the answer to whether we're in the set +//! or not. + +use std::collections::{BTreeMap, HashMap}; +use std::ops::Range; +use ucd_parse::Codepoints; + +mod case_mapping; +mod raw_emitter; +mod skiplist; +mod unicode_download; + +use raw_emitter::{emit_codepoints, RawEmitter}; + +static PROPERTIES: &[&str] = &[ + "Alphabetic", + "Lowercase", + "Uppercase", + "Cased", + "Case_Ignorable", + "Grapheme_Extend", + "White_Space", + "Cc", + "N", +]; + +struct UnicodeData { + ranges: Vec<(&'static str, Vec<Range<u32>>)>, + to_upper: BTreeMap<u32, (u32, u32, u32)>, + to_lower: BTreeMap<u32, (u32, u32, u32)>, +} + +fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<(u32, u32, u32)> { + let mut a = None; + let mut b = None; + let mut c = None; + + for codepoint in codepoints { + if origin == codepoint.value() { + return None; + } + + if a.is_none() { + a = Some(codepoint.value()); + } else if b.is_none() { + b = Some(codepoint.value()); + } else if c.is_none() { + c = Some(codepoint.value()); + } else { + panic!("more than 3 mapped codepoints") + } + } + + Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0))) +} + +static UNICODE_DIRECTORY: &str = "unicode-downloads"; + +fn load_data() -> UnicodeData { + unicode_download::fetch_latest(); + + let mut properties = HashMap::new(); + for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() { + if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) { + properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints); + } + } + for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() { + if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) { + properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints); + } + } + + let mut to_lower = BTreeMap::new(); + let mut to_upper = BTreeMap::new(); + for row in ucd_parse::UnicodeDataExpander::new( + ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(), + ) { + let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) { + "N" + } else { + row.general_category.as_str() + }; + if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) { + properties + .entry(*name) + .or_insert_with(Vec::new) + .push(Codepoints::Single(row.codepoint)); + } + + if let Some(mapped) = row.simple_lowercase_mapping { + if mapped != row.codepoint { + to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0)); + } + } + if let Some(mapped) = row.simple_uppercase_mapping { + if mapped != row.codepoint { + to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0)); + } + } + } + + for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() { + if !row.conditions.is_empty() { + // Skip conditional case mappings + continue; + } + + let key = row.codepoint.value(); + if let Some(lower) = to_mapping(key, row.lowercase) { + to_lower.insert(key, lower); + } + if let Some(upper) = to_mapping(key, row.uppercase) { + to_upper.insert(key, upper); + } + } + + let mut properties: HashMap<&'static str, Vec<Range<u32>>> = properties + .into_iter() + .map(|(k, v)| { + ( + k, + v.into_iter() + .flat_map(|codepoints| match codepoints { + Codepoints::Single(c) => c + .scalar() + .map(|ch| (ch as u32..ch as u32 + 1)) + .into_iter() + .collect::<Vec<_>>(), + Codepoints::Range(c) => c + .into_iter() + .flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1))) + .collect::<Vec<_>>(), + }) + .collect::<Vec<Range<u32>>>(), + ) + }) + .collect(); + + for ranges in properties.values_mut() { + merge_ranges(ranges); + } + + let mut properties = properties.into_iter().collect::<Vec<_>>(); + properties.sort_by_key(|p| p.0); + UnicodeData { ranges: properties, to_lower, to_upper } +} + +fn main() { + let write_location = std::env::args().nth(1).unwrap_or_else(|| { + eprintln!("Must provide path to write unicode tables to"); + eprintln!( + "e.g. {} library/core/unicode/unicode_data.rs", + std::env::args().next().unwrap_or_default() + ); + std::process::exit(1); + }); + + // Optional test path, which is a Rust source file testing that the unicode + // property lookups are correct. + let test_path = std::env::args().nth(2); + + let unicode_data = load_data(); + let ranges_by_property = &unicode_data.ranges; + + if let Some(path) = test_path { + std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap(); + } + + let mut total_bytes = 0; + let mut modules = Vec::new(); + for (property, ranges) in ranges_by_property { + let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>(); + let mut emitter = RawEmitter::new(); + emit_codepoints(&mut emitter, &ranges); + + modules.push((property.to_lowercase().to_string(), emitter.file)); + println!( + "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}", + property, + emitter.bytes_used, + datapoints, + ranges.len(), + ranges.first().unwrap().start, + ranges.last().unwrap().end, + emitter.desc, + ); + total_bytes += emitter.bytes_used; + } + + let mut table_file = String::new(); + + table_file.push_str( + "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n", + ); + + // Include the range search function + table_file.push('\n'); + table_file.push_str(include_str!("range_search.rs")); + table_file.push('\n'); + + table_file.push_str(&version()); + + table_file.push('\n'); + + modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data))); + + for (name, contents) in modules { + table_file.push_str("#[rustfmt::skip]\n"); + table_file.push_str(&format!("pub mod {name} {{\n")); + for line in contents.lines() { + if !line.trim().is_empty() { + table_file.push_str(" "); + table_file.push_str(&line); + } + table_file.push('\n'); + } + table_file.push_str("}\n\n"); + } + + std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap(); + + println!("Total table sizes: {total_bytes} bytes"); +} + +fn version() -> String { + let mut out = String::new(); + out.push_str("pub const UNICODE_VERSION: (u8, u8, u8) = "); + + let readme = + std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt")) + .unwrap(); + + let prefix = "for Version "; + let start = readme.find(prefix).unwrap() + prefix.len(); + let end = readme.find(" of the Unicode Standard.").unwrap(); + let version = + readme[start..end].split('.').map(|v| v.parse::<u32>().expect(&v)).collect::<Vec<_>>(); + let [major, minor, micro] = [version[0], version[1], version[2]]; + + out.push_str(&format!("({major}, {minor}, {micro});\n")); + out +} + +fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String { + let pieces = values.into_iter().map(|b| format!("{:?}, ", b)).collect::<Vec<_>>(); + let mut out = String::new(); + let mut line = String::from("\n "); + for piece in pieces { + if line.len() + piece.len() < 98 { + line.push_str(&piece); + } else { + out.push_str(line.trim_end()); + out.push('\n'); + line = format!(" {piece}"); + } + } + out.push_str(line.trim_end()); + out.push('\n'); + out +} + +fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String { + let mut s = String::new(); + s.push_str("#![allow(incomplete_features, unused)]\n"); + s.push_str("#![feature(const_generics)]\n\n"); + s.push_str("\n#[allow(unused)]\nuse std::hint;\n"); + s.push_str(&format!("#[path = \"{data_path}\"]\n")); + s.push_str("mod unicode_data;\n\n"); + + s.push_str("\nfn main() {\n"); + + for (property, ranges) in ranges { + s.push_str(&format!(r#" println!("Testing {}");"#, property)); + s.push('\n'); + s.push_str(&format!(" {}_true();\n", property.to_lowercase())); + s.push_str(&format!(" {}_false();\n", property.to_lowercase())); + let mut is_true = Vec::new(); + let mut is_false = Vec::new(); + for ch_num in 0..(std::char::MAX as u32) { + if std::char::from_u32(ch_num).is_none() { + continue; + } + if ranges.iter().any(|r| r.contains(&ch_num)) { + is_true.push(ch_num); + } else { + is_false.push(ch_num); + } + } + + s.push_str(&format!(" fn {}_true() {{\n", property.to_lowercase())); + generate_asserts(&mut s, property, &is_true, true); + s.push_str(" }\n\n"); + s.push_str(&format!(" fn {}_false() {{\n", property.to_lowercase())); + generate_asserts(&mut s, property, &is_false, false); + s.push_str(" }\n\n"); + } + + s.push_str("}"); + s +} + +fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) { + for range in ranges_from_set(points) { + if range.end == range.start + 1 { + s.push_str(&format!( + " assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n", + if truthy { "" } else { "!" }, + property.to_lowercase(), + std::char::from_u32(range.start).unwrap(), + range.start, + )); + } else { + s.push_str(&format!(" for chn in {:?}u32 {{\n", range)); + s.push_str(&format!( + " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n", + if truthy { "" } else { "!" }, + property.to_lowercase(), + )); + s.push_str(" }\n"); + } + } +} + +fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> { + let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::<Vec<Range<u32>>>(); + merge_ranges(&mut ranges); + ranges +} + +fn merge_ranges(ranges: &mut Vec<Range<u32>>) { + loop { + let mut new_ranges = Vec::new(); + let mut idx_iter = 0..(ranges.len() - 1); + let mut should_insert_last = true; + while let Some(idx) = idx_iter.next() { + let cur = ranges[idx].clone(); + let next = ranges[idx + 1].clone(); + if cur.end == next.start { + if idx_iter.next().is_none() { + // We're merging the last element + should_insert_last = false; + } + new_ranges.push(cur.start..next.end); + } else { + // We're *not* merging the last element + should_insert_last = true; + new_ranges.push(cur); + } + } + if should_insert_last { + new_ranges.push(ranges.last().unwrap().clone()); + } + if new_ranges.len() == ranges.len() { + *ranges = new_ranges; + break; + } else { + *ranges = new_ranges; + } + } + + let mut last_end = None; + for range in ranges { + if let Some(last) = last_end { + assert!(range.start > last, "{:?}", range); + } + last_end = Some(range.end); + } +} diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs new file mode 100644 index 000000000..39b47ce70 --- /dev/null +++ b/src/tools/unicode-table-generator/src/range_search.rs @@ -0,0 +1,93 @@ +#[inline(always)] +fn bitset_search< + const N: usize, + const CHUNK_SIZE: usize, + const N1: usize, + const CANONICAL: usize, + const CANONICALIZED: usize, +>( + needle: u32, + chunk_idx_map: &[u8; N], + bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1], + bitset_canonical: &[u64; CANONICAL], + bitset_canonicalized: &[(u8, u8); CANONICALIZED], +) -> bool { + let bucket_idx = (needle / 64) as usize; + let chunk_map_idx = bucket_idx / CHUNK_SIZE; + let chunk_piece = bucket_idx % CHUNK_SIZE; + let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) { + v + } else { + return false; + }; + let idx = bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize; + let word = if let Some(word) = bitset_canonical.get(idx) { + *word + } else { + let (real_idx, mapping) = bitset_canonicalized[idx - bitset_canonical.len()]; + let mut word = bitset_canonical[real_idx as usize]; + let should_invert = mapping & (1 << 6) != 0; + if should_invert { + word = !word; + } + // Lower 6 bits + let quantity = mapping & ((1 << 6) - 1); + if mapping & (1 << 7) != 0 { + // shift + word >>= quantity as u64; + } else { + word = word.rotate_left(quantity as u32); + } + word + }; + (word & (1 << (needle % 64) as u64)) != 0 +} + +fn decode_prefix_sum(short_offset_run_header: u32) -> u32 { + short_offset_run_header & ((1 << 21) - 1) +} + +fn decode_length(short_offset_run_header: u32) -> usize { + (short_offset_run_header >> 21) as usize +} + +#[inline(always)] +fn skip_search<const SOR: usize, const OFFSETS: usize>( + needle: u32, + short_offset_runs: &[u32; SOR], + offsets: &[u8; OFFSETS], +) -> bool { + // Note that this *cannot* be past the end of the array, as the last + // element is greater than std::char::MAX (the largest possible needle). + // + // So, we cannot have found it (i.e. Ok(idx) + 1 != length) and the correct + // location cannot be past it, so Err(idx) != length either. + // + // This means that we can avoid bounds checking for the accesses below, too. + let last_idx = + match short_offset_runs.binary_search_by_key(&(needle << 11), |header| header << 11) { + Ok(idx) => idx + 1, + Err(idx) => idx, + }; + + let mut offset_idx = decode_length(short_offset_runs[last_idx]); + let length = if let Some(next) = short_offset_runs.get(last_idx + 1) { + decode_length(*next) - offset_idx + } else { + offsets.len() - offset_idx + }; + let prev = + last_idx.checked_sub(1).map(|prev| decode_prefix_sum(short_offset_runs[prev])).unwrap_or(0); + + let total = needle - prev; + let mut prefix_sum = 0; + for _ in 0..(length - 1) { + let offset = offsets[offset_idx]; + prefix_sum += offset as u32; + if prefix_sum > total { + break; + } + offset_idx += 1; + } + offset_idx % 2 == 1 +} diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs new file mode 100644 index 000000000..ab8eaee95 --- /dev/null +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -0,0 +1,394 @@ +use crate::fmt_list; +use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::convert::TryFrom; +use std::fmt::{self, Write}; +use std::ops::Range; + +#[derive(Clone)] +pub struct RawEmitter { + pub file: String, + pub desc: String, + pub bytes_used: usize, +} + +impl RawEmitter { + pub fn new() -> RawEmitter { + RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() } + } + + fn blank_line(&mut self) { + if self.file.is_empty() || self.file.ends_with("\n\n") { + return; + } + writeln!(&mut self.file).unwrap(); + } + + fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> { + let last_code_point = ranges.last().unwrap().end; + // bitset for every bit in the codepoint range + // + // + 2 to ensure an all zero word to use for padding + let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2]; + for range in ranges { + for codepoint in range.clone() { + let bucket = codepoint as usize / 64; + let bit = codepoint as u64 % 64; + buckets[bucket] |= 1 << bit; + } + } + + let mut words = buckets; + // Ensure that there's a zero word in the dataset, used for padding and + // such. + words.push(0); + let unique_words = + words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>(); + if unique_words.len() > u8::MAX as usize { + return Err(format!("cannot pack {} into 8 bits", unique_words.len())); + } + // needed for the chunk mapping to work + assert_eq!(unique_words[0], 0, "has a zero word"); + let canonicalized = Canonicalized::canonicalize(&unique_words); + + let word_indices = canonicalized.unique_mapping.clone(); + let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>(); + + let mut best = None; + for length in 1..=64 { + let mut temp = self.clone(); + temp.emit_chunk_map(word_indices[&0], &compressed_words, length); + if let Some((_, size)) = best { + if temp.bytes_used < size { + best = Some((length, temp.bytes_used)); + } + } else { + best = Some((length, temp.bytes_used)); + } + } + self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0); + + struct Bits(u64); + impl fmt::Debug for Bits { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "0b{:064b}", self.0) + } + } + + writeln!( + &mut self.file, + "static BITSET_CANONICAL: [u64; {}] = [{}];", + canonicalized.canonical_words.len(), + fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))), + ) + .unwrap(); + self.bytes_used += 8 * canonicalized.canonical_words.len(); + writeln!( + &mut self.file, + "static BITSET_MAPPING: [(u8, u8); {}] = [{}];", + canonicalized.canonicalized_words.len(), + fmt_list(&canonicalized.canonicalized_words), + ) + .unwrap(); + // 8 bit index into shifted words, 7 bits for shift + optional flip + // We only need it for the words that we removed by applying a shift and + // flip to them. + self.bytes_used += 2 * canonicalized.canonicalized_words.len(); + + self.blank_line(); + + writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " super::bitset_search(",).unwrap(); + writeln!(&mut self.file, " c as u32,").unwrap(); + writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap(); + writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap(); + writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap(); + writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap(); + writeln!(&mut self.file, " )").unwrap(); + writeln!(&mut self.file, "}}").unwrap(); + + Ok(()) + } + + fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) { + let mut compressed_words = compressed_words.to_vec(); + for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) { + // pad out bitset index with zero words so we have all chunks of + // chunkchunk_length + compressed_words.push(zero_at); + } + + let mut chunks = BTreeSet::new(); + for chunk in compressed_words.chunks(chunk_length) { + chunks.insert(chunk); + } + let chunk_map = chunks + .clone() + .into_iter() + .enumerate() + .map(|(idx, chunk)| (chunk, idx)) + .collect::<HashMap<_, _>>(); + let mut chunk_indices = Vec::new(); + for chunk in compressed_words.chunks(chunk_length) { + chunk_indices.push(chunk_map[chunk]); + } + + writeln!( + &mut self.file, + "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];", + chunk_indices.len(), + fmt_list(&chunk_indices), + ) + .unwrap(); + self.bytes_used += chunk_indices.len(); + writeln!( + &mut self.file, + "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];", + chunk_length, + chunks.len(), + fmt_list(chunks.iter()), + ) + .unwrap(); + self.bytes_used += chunk_length * chunks.len(); + } +} + +pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) { + emitter.blank_line(); + + let mut bitset = emitter.clone(); + let bitset_ok = bitset.emit_bitset(&ranges).is_ok(); + + let mut skiplist = emitter.clone(); + skiplist.emit_skiplist(&ranges); + + if bitset_ok && bitset.bytes_used <= skiplist.bytes_used { + *emitter = bitset; + emitter.desc = String::from("bitset"); + } else { + *emitter = skiplist; + emitter.desc = String::from("skiplist"); + } +} + +struct Canonicalized { + canonical_words: Vec<u64>, + canonicalized_words: Vec<(u8, u8)>, + + /// Maps an input unique word to the associated index (u8) which is into + /// canonical_words or canonicalized_words (in order). + unique_mapping: HashMap<u64, u8>, +} + +impl Canonicalized { + fn canonicalize(unique_words: &[u64]) -> Self { + #[derive(Copy, Clone, Debug)] + enum Mapping { + Rotate(u32), + Invert, + RotateAndInvert(u32), + ShiftRight(u32), + } + + // key is the word being mapped to + let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new(); + for &a in unique_words { + 'b: for &b in unique_words { + // skip self + if a == b { + continue; + } + + // All possible distinct rotations + for rotation in 1..64 { + if a.rotate_right(rotation) == b { + mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation))); + // We're not interested in further mappings between a and b + continue 'b; + } + } + + if (!a) == b { + mappings.entry(b).or_default().push((a, Mapping::Invert)); + // We're not interested in further mappings between a and b + continue 'b; + } + + // All possible distinct rotations, inverted + for rotation in 1..64 { + if (!a.rotate_right(rotation)) == b { + mappings + .entry(b) + .or_default() + .push((a, Mapping::RotateAndInvert(rotation))); + // We're not interested in further mappings between a and b + continue 'b; + } + } + + // All possible shifts + for shift_by in 1..64 { + if a == (b >> shift_by) { + mappings + .entry(b) + .or_default() + .push((a, Mapping::ShiftRight(shift_by as u32))); + // We're not interested in further mappings between a and b + continue 'b; + } + } + } + } + // These are the bitset words which will be represented "raw" (as a u64) + let mut canonical_words = Vec::new(); + // These are mapped words, which will be represented by an index into + // the canonical_words and a Mapping; u16 when encoded. + let mut canonicalized_words = Vec::new(); + let mut unique_mapping = HashMap::new(); + + #[derive(Debug, PartialEq, Eq)] + enum UniqueMapping { + Canonical(usize), + Canonicalized(usize), + } + + // Map 0 first, so that it is the first canonical word. + // This is realistically not inefficient because 0 is not mapped to by + // anything else (a shift pattern could do it, but would be wasteful). + // + // However, 0s are quite common in the overall dataset, and it is quite + // wasteful to have to go through a mapping function to determine that + // we have a zero. + // + // FIXME: Experiment with choosing most common words in overall data set + // for canonical when possible. + while let Some((&to, _)) = mappings + .iter() + .find(|(&to, _)| to == 0) + .or_else(|| mappings.iter().max_by_key(|m| m.1.len())) + { + // Get the mapping with the most entries. Currently, no mapping can + // only exist transitively (i.e., there is no A, B, C such that A + // does not map to C and but A maps to B maps to C), so this is + // guaranteed to be acceptable. + // + // In the future, we may need a more sophisticated algorithm to + // identify which keys to prefer as canonical. + let mapped_from = mappings.remove(&to).unwrap(); + for (from, how) in &mapped_from { + // Remove the entries which mapped to this one. + // Noting that it should be associated with the Nth canonical word. + // + // We do not assert that this is present, because there may be + // no mappings to the `from` word; that's fine. + mappings.remove(from); + assert_eq!( + unique_mapping + .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())), + None + ); + canonicalized_words.push((canonical_words.len(), *how)); + + // Remove the now-canonicalized word from other mappings, + // to ensure that we deprioritize them in the next iteration of + // the while loop. + for mapped in mappings.values_mut() { + let mut i = 0; + while i != mapped.len() { + if mapped[i].0 == *from { + mapped.remove(i); + } else { + i += 1; + } + } + } + } + assert!( + unique_mapping + .insert(to, UniqueMapping::Canonical(canonical_words.len())) + .is_none() + ); + canonical_words.push(to); + + // Remove the now-canonical word from other mappings, to ensure that + // we deprioritize them in the next iteration of the while loop. + for mapped in mappings.values_mut() { + let mut i = 0; + while i != mapped.len() { + if mapped[i].0 == to { + mapped.remove(i); + } else { + i += 1; + } + } + } + } + + // Any words which we couldn't shrink, just stick into the canonical + // words. + // + // FIXME: work harder -- there are more possibilities for mapping + // functions (e.g., multiplication, shifting instead of rotation, etc.) + // We'll probably always have some slack though so this loop will still + // be needed. + for &w in unique_words { + if !unique_mapping.contains_key(&w) { + assert!( + unique_mapping + .insert(w, UniqueMapping::Canonical(canonical_words.len())) + .is_none() + ); + canonical_words.push(w); + } + } + assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len()); + assert_eq!(unique_mapping.len(), unique_words.len()); + + let unique_mapping = unique_mapping + .into_iter() + .map(|(key, value)| { + ( + key, + match value { + UniqueMapping::Canonicalized(idx) => { + u8::try_from(canonical_words.len() + idx).unwrap() + } + UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(), + }, + ) + }) + .collect::<HashMap<_, _>>(); + + let mut distinct_indices = BTreeSet::new(); + for &w in unique_words { + let idx = unique_mapping.get(&w).unwrap(); + assert!(distinct_indices.insert(idx)); + } + + const LOWER_6: u32 = (1 << 6) - 1; + + let canonicalized_words = canonicalized_words + .into_iter() + .map(|v| { + ( + u8::try_from(v.0).unwrap(), + match v.1 { + Mapping::RotateAndInvert(amount) => { + assert_eq!(amount, amount & LOWER_6); + 1 << 6 | (amount as u8) + } + Mapping::Rotate(amount) => { + assert_eq!(amount, amount & LOWER_6); + amount as u8 + } + Mapping::Invert => 1 << 6, + Mapping::ShiftRight(shift_by) => { + assert_eq!(shift_by, shift_by & LOWER_6); + 1 << 7 | (shift_by as u8) + } + }, + ) + }) + .collect::<Vec<(u8, u8)>>(); + Canonicalized { unique_mapping, canonical_words, canonicalized_words } + } +} diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs new file mode 100644 index 000000000..6e439968c --- /dev/null +++ b/src/tools/unicode-table-generator/src/skiplist.rs @@ -0,0 +1,98 @@ +use crate::fmt_list; +use crate::raw_emitter::RawEmitter; +use std::convert::TryInto; +use std::fmt::Write as _; +use std::ops::Range; + +/// This will get packed into a single u32 before inserting into the data set. +#[derive(Debug, PartialEq)] +struct ShortOffsetRunHeader { + /// Note, we only allow for 21 bits here. + prefix_sum: u32, + + /// Note, we actually only allow for 11 bits here. This should be enough -- + /// our largest sets are around ~1400 offsets long. + start_idx: u16, +} + +impl ShortOffsetRunHeader { + fn pack(&self) -> u32 { + assert!(self.start_idx < (1 << 11)); + assert!(self.prefix_sum < (1 << 21)); + + (self.start_idx as u32) << 21 | self.prefix_sum + } +} + +impl RawEmitter { + pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) { + let mut offsets = Vec::<u32>::new(); + let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>(); + let mut offset = 0; + for pt in points { + let delta = pt - offset; + offsets.push(delta); + offset = pt; + } + // Guaranteed to terminate, as it's impossible to subtract a value this + // large from a valid char. + offsets.push(std::char::MAX as u32 + 1); + let mut coded_offsets: Vec<u8> = Vec::new(); + let mut short_offset_runs: Vec<ShortOffsetRunHeader> = vec![]; + let mut iter = offsets.iter().cloned(); + let mut prefix_sum = 0; + loop { + let mut any_elements = false; + let mut inserted = false; + let start = coded_offsets.len(); + for offset in iter.by_ref() { + any_elements = true; + prefix_sum += offset; + if let Ok(offset) = offset.try_into() { + coded_offsets.push(offset); + } else { + short_offset_runs.push(ShortOffsetRunHeader { + start_idx: start.try_into().unwrap(), + prefix_sum, + }); + // This is just needed to maintain indices even/odd + // correctly. + coded_offsets.push(0); + inserted = true; + break; + } + } + if !any_elements { + break; + } + // We always append the huge char::MAX offset to the end which + // should never be able to fit into the u8 offsets. + assert!(inserted); + } + + writeln!( + &mut self.file, + "static SHORT_OFFSET_RUNS: [u32; {}] = [{}];", + short_offset_runs.len(), + fmt_list(short_offset_runs.iter().map(|v| v.pack())) + ) + .unwrap(); + self.bytes_used += 4 * short_offset_runs.len(); + writeln!( + &mut self.file, + "static OFFSETS: [u8; {}] = [{}];", + coded_offsets.len(), + fmt_list(&coded_offsets) + ) + .unwrap(); + self.bytes_used += coded_offsets.len(); + + writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " super::skip_search(",).unwrap(); + writeln!(&mut self.file, " c as u32,").unwrap(); + writeln!(&mut self.file, " &SHORT_OFFSET_RUNS,").unwrap(); + writeln!(&mut self.file, " &OFFSETS,").unwrap(); + writeln!(&mut self.file, " )").unwrap(); + writeln!(&mut self.file, "}}").unwrap(); + } +} diff --git a/src/tools/unicode-table-generator/src/unicode_download.rs b/src/tools/unicode-table-generator/src/unicode_download.rs new file mode 100644 index 000000000..9b2e0a258 --- /dev/null +++ b/src/tools/unicode-table-generator/src/unicode_download.rs @@ -0,0 +1,46 @@ +use crate::UNICODE_DIRECTORY; +use std::path::Path; +use std::process::Command; + +static URL_PREFIX: &str = "https://www.unicode.org/Public/UCD/latest/ucd/"; + +static README: &str = "ReadMe.txt"; + +static RESOURCES: &[&str] = + &["DerivedCoreProperties.txt", "PropList.txt", "UnicodeData.txt", "SpecialCasing.txt"]; + +pub fn fetch_latest() { + let directory = Path::new(UNICODE_DIRECTORY); + if directory.exists() { + eprintln!( + "Not refetching unicode data, already exists, please delete {directory:?} to regenerate", + ); + return; + } + if let Err(e) = std::fs::create_dir_all(directory) { + panic!("Failed to create {UNICODE_DIRECTORY:?}: {e}"); + } + let output = Command::new("curl").arg(URL_PREFIX.to_owned() + README).output().unwrap(); + if !output.status.success() { + panic!( + "Failed to run curl to fetch readme: stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + let current = std::fs::read_to_string(directory.join(README)).unwrap_or_default(); + if current.as_bytes() != &output.stdout[..] { + std::fs::write(directory.join(README), output.stdout).unwrap(); + } + + for resource in RESOURCES { + let output = Command::new("curl").arg(URL_PREFIX.to_owned() + resource).output().unwrap(); + if !output.status.success() { + panic!( + "Failed to run curl to fetch {}: stderr: {}", + resource, + String::from_utf8_lossy(&output.stderr) + ); + } + std::fs::write(directory.join(resource), output.stdout).unwrap(); + } +} |