diff options
Diffstat (limited to 'src/tools/unicode-table-generator')
5 files changed, 133 insertions, 35 deletions
diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs new file mode 100644 index 000000000..02c754230 --- /dev/null +++ b/src/tools/unicode-table-generator/src/cascading_map.rs @@ -0,0 +1,78 @@ +use crate::fmt_list; +use crate::raw_emitter::RawEmitter; +use std::collections::HashMap; +use std::fmt::Write as _; +use std::ops::Range; + +impl RawEmitter { + pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool { + let mut map: [u8; 256] = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + + let points = ranges + .iter() + .flat_map(|r| (r.start..r.end).into_iter().collect::<Vec<u32>>()) + .collect::<Vec<u32>>(); + + println!("there are {} points", points.len()); + + // how many distinct ranges need to be counted? + let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new(); + for point in points { + // assert that there is no whitespace over the 0x3000 range. + assert!(point <= 0x3000, "the highest unicode whitespace value has changed"); + let high_bytes = point as usize >> 8; + let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_insert_with(Vec::new); + codepoints.push(point); + } + + let mut bit_for_high_byte = 1u8; + let mut arms = Vec::<String>::new(); + + let mut high_bytes: Vec<usize> = + codepoints_by_high_bytes.keys().map(|k| k.clone()).collect(); + high_bytes.sort(); + for high_byte in high_bytes { + let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap(); + if codepoints.len() == 1 { + let ch = codepoints.pop().unwrap(); + arms.push(format!("{} => c as u32 == {:#04x}", high_byte, ch)); + continue; + } + // more than 1 codepoint in this arm + for codepoint in codepoints { + map[(*codepoint & 0xff) as usize] |= bit_for_high_byte; + } + arms.push(format!( + "{} => WHITESPACE_MAP[c as usize & 0xff] & {} != 0", + high_byte, bit_for_high_byte + )); + bit_for_high_byte <<= 1; + } + + writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter())) + .unwrap(); + self.bytes_used += 256; + + writeln!(&mut self.file, "#[inline]").unwrap(); + writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap(); + for arm in arms { + writeln!(&mut self.file, " {},", arm).unwrap(); + } + writeln!(&mut self.file, " _ => false,").unwrap(); + writeln!(&mut self.file, " }}").unwrap(); + writeln!(&mut self.file, "}}").unwrap(); + + true + } +} diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 4720ee702..2fe578acd 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -75,12 +75,13 @@ use std::collections::{BTreeMap, HashMap}; use std::ops::Range; use ucd_parse::Codepoints; +mod cascading_map; mod case_mapping; mod raw_emitter; mod skiplist; mod unicode_download; -use raw_emitter::{emit_codepoints, RawEmitter}; +use raw_emitter::{emit_codepoints, emit_whitespace, RawEmitter}; static PROPERTIES: &[&str] = &[ "Alphabetic", @@ -220,7 +221,7 @@ fn main() { let write_location = std::env::args().nth(1).unwrap_or_else(|| { eprintln!("Must provide path to write unicode tables to"); eprintln!( - "e.g. {} library/core/unicode/unicode_data.rs", + "e.g. {} library/core/src/unicode/unicode_data.rs", std::env::args().next().unwrap_or_default() ); std::process::exit(1); @@ -241,8 +242,13 @@ fn main() { let mut modules = Vec::new(); for (property, ranges) in ranges_by_property { let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>(); + let mut emitter = RawEmitter::new(); - emit_codepoints(&mut emitter, &ranges); + if property == &"White_Space" { + emit_whitespace(&mut emitter, &ranges); + } else { + emit_codepoints(&mut emitter, &ranges); + } modules.push((property.to_lowercase().to_string(), emitter.file)); println!( diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs index 39b47ce70..3a5b869f7 100644 --- a/src/tools/unicode-table-generator/src/range_search.rs +++ b/src/tools/unicode-table-generator/src/range_search.rs @@ -1,5 +1,6 @@ +#[rustc_const_unstable(feature = "const_unicode_case_lookup", issue = "101400")] #[inline(always)] -fn bitset_search< +const fn bitset_search< const N: usize, const CHUNK_SIZE: usize, const N1: usize, @@ -15,14 +16,18 @@ fn bitset_search< let bucket_idx = (needle / 64) as usize; let chunk_map_idx = bucket_idx / CHUNK_SIZE; let chunk_piece = bucket_idx % CHUNK_SIZE; - let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) { - v + // FIXME: const-hack: Revert to `slice::get` after `const_slice_index` + // feature stabilizes. + let chunk_idx = if chunk_map_idx < chunk_idx_map.len() { + chunk_idx_map[chunk_map_idx] } else { return false; }; let idx = bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize; - let word = if let Some(word) = bitset_canonical.get(idx) { - *word + // FIXME: const-hack: Revert to `slice::get` after `const_slice_index` + // feature stabilizes. + let word = if idx < bitset_canonical.len() { + bitset_canonical[idx] } else { let (real_idx, mapping) = bitset_canonicalized[idx - bitset_canonical.len()]; let mut word = bitset_canonical[real_idx as usize]; diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index ab8eaee95..890ff986c 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -76,7 +76,7 @@ impl RawEmitter { writeln!( &mut self.file, - "static BITSET_CANONICAL: [u64; {}] = [{}];", + "const BITSET_CANONICAL: &'static [u64; {}] = &[{}];", canonicalized.canonical_words.len(), fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))), ) @@ -84,7 +84,7 @@ impl RawEmitter { self.bytes_used += 8 * canonicalized.canonical_words.len(); writeln!( &mut self.file, - "static BITSET_MAPPING: [(u8, u8); {}] = [{}];", + "const BITSET_MAPPING: &'static [(u8, u8); {}] = &[{}];", canonicalized.canonicalized_words.len(), fmt_list(&canonicalized.canonicalized_words), ) @@ -96,7 +96,12 @@ impl RawEmitter { self.blank_line(); - writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!( + &mut self.file, + r#"#[rustc_const_unstable(feature = "const_unicode_case_lookup", issue = "101400")]"# + ) + .unwrap(); + writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap(); writeln!(&mut self.file, " super::bitset_search(",).unwrap(); writeln!(&mut self.file, " c as u32,").unwrap(); writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap(); @@ -121,12 +126,8 @@ impl RawEmitter { for chunk in compressed_words.chunks(chunk_length) { chunks.insert(chunk); } - let chunk_map = chunks - .clone() - .into_iter() - .enumerate() - .map(|(idx, chunk)| (chunk, idx)) - .collect::<HashMap<_, _>>(); + let chunk_map = + chunks.iter().enumerate().map(|(idx, &chunk)| (chunk, idx)).collect::<HashMap<_, _>>(); let mut chunk_indices = Vec::new(); for chunk in compressed_words.chunks(chunk_length) { chunk_indices.push(chunk_map[chunk]); @@ -134,7 +135,7 @@ impl RawEmitter { writeln!( &mut self.file, - "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];", + "const BITSET_CHUNKS_MAP: &'static [u8; {}] = &[{}];", chunk_indices.len(), fmt_list(&chunk_indices), ) @@ -142,7 +143,7 @@ impl RawEmitter { self.bytes_used += chunk_indices.len(); writeln!( &mut self.file, - "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];", + "const BITSET_INDEX_CHUNKS: &'static [[u8; {}]; {}] = &[{}];", chunk_length, chunks.len(), fmt_list(chunks.iter()), @@ -170,6 +171,15 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) { } } +pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) { + emitter.blank_line(); + + let mut cascading = emitter.clone(); + cascading.emit_cascading_map(&ranges); + *emitter = cascading; + emitter.desc = String::from("cascading"); +} + struct Canonicalized { canonical_words: Vec<u64>, canonicalized_words: Vec<(u8, u8)>, diff --git a/src/tools/unicode-table-generator/src/unicode_download.rs b/src/tools/unicode-table-generator/src/unicode_download.rs index 9b2e0a258..714bb5338 100644 --- a/src/tools/unicode-table-generator/src/unicode_download.rs +++ b/src/tools/unicode-table-generator/src/unicode_download.rs @@ -1,6 +1,6 @@ use crate::UNICODE_DIRECTORY; use std::path::Path; -use std::process::Command; +use std::process::{Command, Output}; static URL_PREFIX: &str = "https://www.unicode.org/Public/UCD/latest/ucd/"; @@ -9,6 +9,18 @@ static README: &str = "ReadMe.txt"; static RESOURCES: &[&str] = &["DerivedCoreProperties.txt", "PropList.txt", "UnicodeData.txt", "SpecialCasing.txt"]; +#[track_caller] +fn fetch(url: &str) -> Output { + let output = Command::new("curl").arg(URL_PREFIX.to_owned() + url).output().unwrap(); + if !output.status.success() { + panic!( + "Failed to run curl to fetch {url}: stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + output +} + pub fn fetch_latest() { let directory = Path::new(UNICODE_DIRECTORY); if directory.exists() { @@ -20,27 +32,14 @@ pub fn fetch_latest() { if let Err(e) = std::fs::create_dir_all(directory) { panic!("Failed to create {UNICODE_DIRECTORY:?}: {e}"); } - let output = Command::new("curl").arg(URL_PREFIX.to_owned() + README).output().unwrap(); - if !output.status.success() { - panic!( - "Failed to run curl to fetch readme: stderr: {}", - String::from_utf8_lossy(&output.stderr) - ); - } + let output = fetch(README); let current = std::fs::read_to_string(directory.join(README)).unwrap_or_default(); if current.as_bytes() != &output.stdout[..] { std::fs::write(directory.join(README), output.stdout).unwrap(); } for resource in RESOURCES { - let output = Command::new("curl").arg(URL_PREFIX.to_owned() + resource).output().unwrap(); - if !output.status.success() { - panic!( - "Failed to run curl to fetch {}: stderr: {}", - resource, - String::from_utf8_lossy(&output.stderr) - ); - } + let output = fetch(resource); std::fs::write(directory.join(resource), output.stdout).unwrap(); } } |