summaryrefslogtreecommitdiffstats
path: root/src/tools/unicode-table-generator/src/cascading_map.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/tools/unicode-table-generator/src/cascading_map.rs')
-rw-r--r--src/tools/unicode-table-generator/src/cascading_map.rs78
1 files changed, 78 insertions, 0 deletions
diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs
new file mode 100644
index 000000000..02c754230
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/cascading_map.rs
@@ -0,0 +1,78 @@
+use crate::fmt_list;
+use crate::raw_emitter::RawEmitter;
+use std::collections::HashMap;
+use std::fmt::Write as _;
+use std::ops::Range;
+
+impl RawEmitter {
+ pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool {
+ let mut map: [u8; 256] = [
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ];
+
+ let points = ranges
+ .iter()
+ .flat_map(|r| (r.start..r.end).into_iter().collect::<Vec<u32>>())
+ .collect::<Vec<u32>>();
+
+ println!("there are {} points", points.len());
+
+ // how many distinct ranges need to be counted?
+ let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new();
+ for point in points {
+ // assert that there is no whitespace over the 0x3000 range.
+ assert!(point <= 0x3000, "the highest unicode whitespace value has changed");
+ let high_bytes = point as usize >> 8;
+ let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_insert_with(Vec::new);
+ codepoints.push(point);
+ }
+
+ let mut bit_for_high_byte = 1u8;
+ let mut arms = Vec::<String>::new();
+
+ let mut high_bytes: Vec<usize> =
+ codepoints_by_high_bytes.keys().map(|k| k.clone()).collect();
+ high_bytes.sort();
+ for high_byte in high_bytes {
+ let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
+ if codepoints.len() == 1 {
+ let ch = codepoints.pop().unwrap();
+ arms.push(format!("{} => c as u32 == {:#04x}", high_byte, ch));
+ continue;
+ }
+ // more than 1 codepoint in this arm
+ for codepoint in codepoints {
+ map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
+ }
+ arms.push(format!(
+ "{} => WHITESPACE_MAP[c as usize & 0xff] & {} != 0",
+ high_byte, bit_for_high_byte
+ ));
+ bit_for_high_byte <<= 1;
+ }
+
+ writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter()))
+ .unwrap();
+ self.bytes_used += 256;
+
+ writeln!(&mut self.file, "#[inline]").unwrap();
+ writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+ writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
+ for arm in arms {
+ writeln!(&mut self.file, " {},", arm).unwrap();
+ }
+ writeln!(&mut self.file, " _ => false,").unwrap();
+ writeln!(&mut self.file, " }}").unwrap();
+ writeln!(&mut self.file, "}}").unwrap();
+
+ true
+ }
+}