Diffstat (limited to 'src/tools/unicode-table-generator/src')
-rw-r--r--   src/tools/unicode-table-generator/src/cascading_map.rs      78
-rw-r--r--   src/tools/unicode-table-generator/src/main.rs                12
-rw-r--r--   src/tools/unicode-table-generator/src/range_search.rs        15
-rw-r--r--   src/tools/unicode-table-generator/src/raw_emitter.rs         32
-rw-r--r--   src/tools/unicode-table-generator/src/unicode_download.rs    31
5 files changed, 133 insertions(+), 35 deletions(-)
diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs
new file mode 100644
index 000000000..02c754230
--- /dev/null
+++ b/src/tools/unicode-table-generator/src/cascading_map.rs
@@ -0,0 +1,78 @@
+use crate::fmt_list;
+use crate::raw_emitter::RawEmitter;
+use std::collections::HashMap;
+use std::fmt::Write as _;
+use std::ops::Range;
+
+impl RawEmitter {
+ pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool {
+ let mut map: [u8; 256] = [
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ ];
+
+ let points = ranges
+ .iter()
+ .flat_map(|r| (r.start..r.end).into_iter().collect::<Vec<u32>>())
+ .collect::<Vec<u32>>();
+
+ println!("there are {} points", points.len());
+
+ // how many distinct ranges need to be counted?
+ let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new();
+ for point in points {
+ // assert that there is no whitespace over the 0x3000 range.
+ assert!(point <= 0x3000, "the highest unicode whitespace value has changed");
+ let high_bytes = point as usize >> 8;
+ let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_insert_with(Vec::new);
+ codepoints.push(point);
+ }
+
+ let mut bit_for_high_byte = 1u8;
+ let mut arms = Vec::<String>::new();
+
+ let mut high_bytes: Vec<usize> =
+ codepoints_by_high_bytes.keys().map(|k| k.clone()).collect();
+ high_bytes.sort();
+ for high_byte in high_bytes {
+ let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
+ if codepoints.len() == 1 {
+ let ch = codepoints.pop().unwrap();
+ arms.push(format!("{} => c as u32 == {:#04x}", high_byte, ch));
+ continue;
+ }
+ // more than 1 codepoint in this arm
+ for codepoint in codepoints {
+ map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
+ }
+ arms.push(format!(
+ "{} => WHITESPACE_MAP[c as usize & 0xff] & {} != 0",
+ high_byte, bit_for_high_byte
+ ));
+ bit_for_high_byte <<= 1;
+ }
+
+ writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter()))
+ .unwrap();
+ self.bytes_used += 256;
+
+ writeln!(&mut self.file, "#[inline]").unwrap();
+ writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+ writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
+ for arm in arms {
+ writeln!(&mut self.file, " {},", arm).unwrap();
+ }
+ writeln!(&mut self.file, " _ => false,").unwrap();
+ writeln!(&mut self.file, " }}").unwrap();
+ writeln!(&mut self.file, "}}").unwrap();
+
+ true
+ }
+}
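
For reference, the emitter above writes output of roughly this shape into the generated unicode_data.rs module. The concrete match arms, bit values, and table contents depend on the White_Space ranges at generation time, so the values below are an illustrative sketch rather than the committed output:

    static WHITESPACE_MAP: [u8; 256] = [0; 256]; // placeholder; the generated table sets one bit per low byte

    #[inline]
    pub fn lookup(c: char) -> bool {
        match c as u32 >> 8 {
            // high bytes with several whitespace codepoints share the bitmap...
            0 => WHITESPACE_MAP[c as usize & 0xff] & 1 != 0,
            32 => WHITESPACE_MAP[c as usize & 0xff] & 2 != 0,
            // ...while high bytes with a single codepoint compare directly
            22 => c as u32 == 0x1680,
            48 => c as u32 == 0x3000,
            _ => false,
        }
    }

Each match arm handles one "high byte" (c >> 8): arms containing several whitespace codepoints index the shared 256-byte bitmap by the low byte, and arms containing exactly one codepoint become a direct comparison.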
diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs
index 4720ee702..2fe578acd 100644
--- a/src/tools/unicode-table-generator/src/main.rs
+++ b/src/tools/unicode-table-generator/src/main.rs
@@ -75,12 +75,13 @@ use std::collections::{BTreeMap, HashMap};
use std::ops::Range;
use ucd_parse::Codepoints;
+mod cascading_map;
mod case_mapping;
mod raw_emitter;
mod skiplist;
mod unicode_download;
-use raw_emitter::{emit_codepoints, RawEmitter};
+use raw_emitter::{emit_codepoints, emit_whitespace, RawEmitter};
static PROPERTIES: &[&str] = &[
"Alphabetic",
@@ -220,7 +221,7 @@ fn main() {
let write_location = std::env::args().nth(1).unwrap_or_else(|| {
eprintln!("Must provide path to write unicode tables to");
eprintln!(
- "e.g. {} library/core/unicode/unicode_data.rs",
+ "e.g. {} library/core/src/unicode/unicode_data.rs",
std::env::args().next().unwrap_or_default()
);
std::process::exit(1);
@@ -241,8 +242,13 @@ fn main() {
let mut modules = Vec::new();
for (property, ranges) in ranges_by_property {
let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
+
let mut emitter = RawEmitter::new();
- emit_codepoints(&mut emitter, &ranges);
+ if property == &"White_Space" {
+ emit_whitespace(&mut emitter, &ranges);
+ } else {
+ emit_codepoints(&mut emitter, &ranges);
+ }
modules.push((property.to_lowercase().to_string(), emitter.file));
println!(
diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/src/tools/unicode-table-generator/src/range_search.rs
index 39b47ce70..3a5b869f7 100644
--- a/src/tools/unicode-table-generator/src/range_search.rs
+++ b/src/tools/unicode-table-generator/src/range_search.rs
@@ -1,5 +1,6 @@
+#[rustc_const_unstable(feature = "const_unicode_case_lookup", issue = "101400")]
#[inline(always)]
-fn bitset_search<
+const fn bitset_search<
const N: usize,
const CHUNK_SIZE: usize,
const N1: usize,
@@ -15,14 +16,18 @@ fn bitset_search<
let bucket_idx = (needle / 64) as usize;
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
let chunk_piece = bucket_idx % CHUNK_SIZE;
- let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
- v
+ // FIXME: const-hack: Revert to `slice::get` after `const_slice_index`
+ // feature stabilizes.
+ let chunk_idx = if chunk_map_idx < chunk_idx_map.len() {
+ chunk_idx_map[chunk_map_idx]
} else {
return false;
};
let idx = bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize;
- let word = if let Some(word) = bitset_canonical.get(idx) {
- *word
+ // FIXME: const-hack: Revert to `slice::get` after `const_slice_index`
+ // feature stabilizes.
+ let word = if idx < bitset_canonical.len() {
+ bitset_canonical[idx]
} else {
let (real_idx, mapping) = bitset_canonicalized[idx - bitset_canonical.len()];
let mut word = bitset_canonical[real_idx as usize];
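
Both hunks above apply the same const-hack: at the time of this change, slice::get could not be called from a const fn on stable (it is gated behind the const_slice_index feature, per the FIXME comments), whereas an explicit length check followed by plain indexing is const-compatible. A minimal sketch of the substitution, outside the generator (names here are illustrative, not from the patch):

    // Before (not const-callable while `const_slice_index` is unstable):
    //     let word = if let Some(&w) = table.get(idx) { w } else { return 0 };
    // After (const-compatible: explicit bounds check plus plain indexing):
    const fn word_at(table: &[u64; 4], idx: usize) -> u64 {
        if idx < table.len() { table[idx] } else { 0 }
    }

    const TABLE: &[u64; 4] = &[1, 2, 4, 8];
    const SECOND: u64 = word_at(TABLE, 1); // evaluated at compile time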
diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs
index ab8eaee95..890ff986c 100644
--- a/src/tools/unicode-table-generator/src/raw_emitter.rs
+++ b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -76,7 +76,7 @@ impl RawEmitter {
writeln!(
&mut self.file,
- "static BITSET_CANONICAL: [u64; {}] = [{}];",
+ "const BITSET_CANONICAL: &'static [u64; {}] = &[{}];",
canonicalized.canonical_words.len(),
fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
)
@@ -84,7 +84,7 @@ impl RawEmitter {
self.bytes_used += 8 * canonicalized.canonical_words.len();
writeln!(
&mut self.file,
- "static BITSET_MAPPING: [(u8, u8); {}] = [{}];",
+ "const BITSET_MAPPING: &'static [(u8, u8); {}] = &[{}];",
canonicalized.canonicalized_words.len(),
fmt_list(&canonicalized.canonicalized_words),
)
@@ -96,7 +96,12 @@ impl RawEmitter {
self.blank_line();
- writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+ writeln!(
+ &mut self.file,
+ r#"#[rustc_const_unstable(feature = "const_unicode_case_lookup", issue = "101400")]"#
+ )
+ .unwrap();
+ writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
writeln!(&mut self.file, " super::bitset_search(",).unwrap();
writeln!(&mut self.file, " c as u32,").unwrap();
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
@@ -121,12 +126,8 @@ impl RawEmitter {
for chunk in compressed_words.chunks(chunk_length) {
chunks.insert(chunk);
}
- let chunk_map = chunks
- .clone()
- .into_iter()
- .enumerate()
- .map(|(idx, chunk)| (chunk, idx))
- .collect::<HashMap<_, _>>();
+ let chunk_map =
+ chunks.iter().enumerate().map(|(idx, &chunk)| (chunk, idx)).collect::<HashMap<_, _>>();
let mut chunk_indices = Vec::new();
for chunk in compressed_words.chunks(chunk_length) {
chunk_indices.push(chunk_map[chunk]);
@@ -134,7 +135,7 @@ impl RawEmitter {
writeln!(
&mut self.file,
- "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
+ "const BITSET_CHUNKS_MAP: &'static [u8; {}] = &[{}];",
chunk_indices.len(),
fmt_list(&chunk_indices),
)
@@ -142,7 +143,7 @@ impl RawEmitter {
self.bytes_used += chunk_indices.len();
writeln!(
&mut self.file,
- "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];",
+ "const BITSET_INDEX_CHUNKS: &'static [[u8; {}]; {}] = &[{}];",
chunk_length,
chunks.len(),
fmt_list(chunks.iter()),
@@ -170,6 +171,15 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
}
}
+pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
+ emitter.blank_line();
+
+ let mut cascading = emitter.clone();
+ cascading.emit_cascading_map(&ranges);
+ *emitter = cascading;
+ emitter.desc = String::from("cascading");
+}
+
struct Canonicalized {
canonical_words: Vec<u64>,
canonicalized_words: Vec<(u8, u8)>,
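
The rewrites from `static NAME: [T; N]` to `const NAME: &'static [T; N]` follow from making the generated `lookup` a `const fn`: on stable Rust at the time of this change, a const fn could not read from a `static` item, but it can read a `const`. Declaring the tables as constant references (rather than by-value constant arrays) keeps each table as a single promoted allocation instead of copying the array into every use site. A minimal illustration of the distinction, under those assumptions:

    // Not accepted on stable at the time: a const fn cannot read a `static`.
    //     static TABLE: [u64; 2] = [1, 2];
    //     const fn get(i: usize) -> u64 { TABLE[i] }

    // Accepted: `const` data is readable from a const fn, and the reference
    // keeps the table as one promoted 'static allocation.
    const TABLE: &'static [u64; 2] = &[1, 2];

    const fn get(i: usize) -> u64 {
        TABLE[i]
    }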
diff --git a/src/tools/unicode-table-generator/src/unicode_download.rs b/src/tools/unicode-table-generator/src/unicode_download.rs
index 9b2e0a258..714bb5338 100644
--- a/src/tools/unicode-table-generator/src/unicode_download.rs
+++ b/src/tools/unicode-table-generator/src/unicode_download.rs
@@ -1,6 +1,6 @@
use crate::UNICODE_DIRECTORY;
use std::path::Path;
-use std::process::Command;
+use std::process::{Command, Output};
static URL_PREFIX: &str = "https://www.unicode.org/Public/UCD/latest/ucd/";
@@ -9,6 +9,18 @@ static README: &str = "ReadMe.txt";
static RESOURCES: &[&str] =
&["DerivedCoreProperties.txt", "PropList.txt", "UnicodeData.txt", "SpecialCasing.txt"];
+#[track_caller]
+fn fetch(url: &str) -> Output {
+ let output = Command::new("curl").arg(URL_PREFIX.to_owned() + url).output().unwrap();
+ if !output.status.success() {
+ panic!(
+ "Failed to run curl to fetch {url}: stderr: {}",
+ String::from_utf8_lossy(&output.stderr)
+ );
+ }
+ output
+}
+
pub fn fetch_latest() {
let directory = Path::new(UNICODE_DIRECTORY);
if directory.exists() {
@@ -20,27 +32,14 @@ pub fn fetch_latest() {
if let Err(e) = std::fs::create_dir_all(directory) {
panic!("Failed to create {UNICODE_DIRECTORY:?}: {e}");
}
- let output = Command::new("curl").arg(URL_PREFIX.to_owned() + README).output().unwrap();
- if !output.status.success() {
- panic!(
- "Failed to run curl to fetch readme: stderr: {}",
- String::from_utf8_lossy(&output.stderr)
- );
- }
+ let output = fetch(README);
let current = std::fs::read_to_string(directory.join(README)).unwrap_or_default();
if current.as_bytes() != &output.stdout[..] {
std::fs::write(directory.join(README), output.stdout).unwrap();
}
for resource in RESOURCES {
- let output = Command::new("curl").arg(URL_PREFIX.to_owned() + resource).output().unwrap();
- if !output.status.success() {
- panic!(
- "Failed to run curl to fetch {}: stderr: {}",
- resource,
- String::from_utf8_lossy(&output.stderr)
- );
- }
+ let output = fetch(resource);
std::fs::write(directory.join(resource), output.stdout).unwrap();
}
}
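
The `#[track_caller]` attribute on the new `fetch` helper keeps the diagnostics of the old inline error handling: if curl fails, the reported panic location points at the `fetch(README)` or `fetch(resource)` call site rather than at the `panic!` inside the helper. A standalone sketch of that behaviour, with a hypothetical helper name:

    use std::panic::Location;

    #[track_caller]
    fn checked_fetch(name: &str) -> String {
        // With #[track_caller], a panic here is attributed to the caller's
        // file and line, i.e. the specific checked_fetch(...) call that failed.
        if name.is_empty() {
            panic!("empty resource name (requested from {})", Location::caller());
        }
        format!("contents of {name}")
    }

    fn main() {
        let readme = checked_fetch("ReadMe.txt");
        println!("{}", readme.len());
    }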