Author:    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
Committer: Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
Commit:    36d22d82aa202bb199967e9512281e9a53db42c9
Tree:      105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/mapped_hyph/src
Parent:    Initial commit.
Adding upstream version 115.7.0esr.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/mapped_hyph/src')
-rw-r--r--  third_party/rust/mapped_hyph/src/bin/hyf_compile.rs |  44
-rw-r--r--  third_party/rust/mapped_hyph/src/builder.rs          | 509
-rw-r--r--  third_party/rust/mapped_hyph/src/ffi.rs              | 250
-rw-r--r--  third_party/rust/mapped_hyph/src/lib.rs              | 642
-rw-r--r--  third_party/rust/mapped_hyph/src/main.rs             |  67
5 files changed, 1512 insertions(+), 0 deletions(-)
diff --git a/third_party/rust/mapped_hyph/src/bin/hyf_compile.rs b/third_party/rust/mapped_hyph/src/bin/hyf_compile.rs
new file mode 100644
index 0000000000..257c747f54
--- /dev/null
+++ b/third_party/rust/mapped_hyph/src/bin/hyf_compile.rs
@@ -0,0 +1,44 @@
+// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+extern crate log;
+extern crate mapped_hyph;
+
+use std::env;
+use std::fs::File;
+
+struct Logger {}
+
+impl log::Log for Logger {
+ fn enabled(&self, _: &log::Metadata) -> bool {
+ true
+ }
+
+ fn log(&self, record: &log::Record) {
+ eprintln!("{} - {}", record.level(), record.args());
+ }
+
+ fn flush(&self) {}
+}
+
+static LOGGER: Logger = Logger {};
+
+fn main() -> std::io::Result<()> {
+ unsafe { log::set_logger_racy(&LOGGER).unwrap() };
+
+ let args: Vec<String> = env::args().collect();
+ if args.len() == 3 {
+ let in_file = File::open(&args[1])?;
+ let mut out_file = File::create(&args[2])?;
+ mapped_hyph::builder::compile(&in_file, &mut out_file, true)?;
+ } else {
+ println!("usage: hyf_compile <pattern-file> <output-file>");
+ }
+ Ok(())
+}
diff --git a/third_party/rust/mapped_hyph/src/builder.rs b/third_party/rust/mapped_hyph/src/builder.rs
new file mode 100644
index 0000000000..e19a0087fd
--- /dev/null
+++ b/third_party/rust/mapped_hyph/src/builder.rs
@@ -0,0 +1,509 @@
+// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+/// Functions to compile human-readable patterns into a mapped_hyph
+/// flattened representation of the hyphenation state machine.
+
+use std::io::{Read,BufRead,BufReader,Write,Error,ErrorKind};
+use std::collections::HashMap;
+use std::convert::TryInto;
+use std::hash::{Hash,Hasher};
+
+// Wrap a HashMap so that we can implement the Hash trait.
+#[derive(PartialEq, Eq, Clone)]
+struct TransitionMap (HashMap<u8,i32>);
+
+impl TransitionMap {
+ fn new() -> TransitionMap {
+ TransitionMap(HashMap::<u8,i32>::new())
+ }
+}
+
+impl Hash for TransitionMap {
+ fn hash<H: Hasher>(&self, state: &mut H) {
+ // We only look at the values here; that's likely to be enough
+ // for a reasonable hash.
+ let mut transitions: Vec<&i32> = self.0.values().collect();
+ transitions.sort();
+ for t in transitions {
+ t.hash(state);
+ }
+ }
+}
+
+#[derive(PartialEq, Eq, Hash, Clone)]
+struct State {
+ match_string: Option<Vec<u8>>,
+ #[allow(dead_code)]
+ repl_string: Option<Vec<u8>>,
+ #[allow(dead_code)]
+ repl_index: i32,
+ #[allow(dead_code)]
+ repl_cut: i32,
+ fallback_state: i32,
+ transitions: TransitionMap,
+}
+
+impl State {
+ fn new() -> State {
+ State {
+ match_string: None,
+ repl_string: None,
+ repl_index: -1,
+ repl_cut: -1,
+ fallback_state: -1,
+ transitions: TransitionMap::new(),
+ }
+ }
+}
+
+/// Structure returned by the read_dic_file() function; an array
+/// of these can then be passed to write_hyf_file()
+/// to create the flattened output.
+struct LevelBuilder {
+ states: Vec<State>,
+ str_to_state: HashMap<Vec<u8>,i32>,
+ encoding: Option<String>,
+ nohyphen: Option<String>,
+ lh_min: u8,
+ rh_min: u8,
+ clh_min: u8,
+ crh_min: u8,
+}
+
+impl LevelBuilder {
+ fn new() -> LevelBuilder {
+ let mut result = LevelBuilder {
+ states: Vec::<State>::new(),
+ str_to_state: HashMap::<Vec<u8>,i32>::new(),
+ encoding: None,
+ nohyphen: None,
+ lh_min: 0,
+ rh_min: 0,
+ clh_min: 0,
+ crh_min: 0,
+ };
+ // Initialize the builder with an empty start state.
+ result.str_to_state.insert(vec![], 0);
+ result.states.push(State::new());
+ result
+ }
+
+ fn find_state_number_for(&mut self, text: &[u8]) -> i32 {
+ let count = self.states.len() as i32;
+ let index = *self.str_to_state.entry(text.to_vec()).or_insert(count);
+ if index == count {
+ self.states.push(State::new());
+ }
+ index
+ }
+
+ fn add_pattern(&mut self, pattern: &str) {
+ let mut bytes = pattern.as_bytes();
+ let mut text = Vec::<u8>::with_capacity(bytes.len());
+ let mut digits = Vec::<u8>::with_capacity(bytes.len() + 1);
+ let mut repl_str = None;
+ let mut repl_index = 0;
+ let mut repl_cut = 0;
+
+ // Check for replacement rule (non-standard hyphenation spelling change).
+ if let Some(slash) = bytes.iter().position(|x| *x == b'/') {
+ let parts = bytes.split_at(slash);
+ bytes = parts.0;
+ let mut it = parts.1[1 ..].split(|x| *x == b',');
+ if let Some(repl) = it.next() {
+ repl_str = Some(repl.to_vec());
+ }
+ if let Some(num) = it.next() {
+ repl_index = std::str::from_utf8(num).unwrap().parse::<i32>().unwrap() - 1;
+ }
+ if let Some(num) = it.next() {
+ repl_cut = std::str::from_utf8(num).unwrap().parse::<i32>().unwrap();
+ }
+ }
+
+ // Separate the input pattern into parallel arrays of text (bytes) and digits.
+ let mut got_digit = false;
+ for byte in bytes {
+ if *byte <= b'9' && *byte >= b'0' {
+ if got_digit {
+ warn!("invalid pattern \"{}\": consecutive digits", pattern);
+ return;
+ }
+ digits.push(*byte);
+ got_digit = true;
+ } else {
+ text.push(*byte);
+ if got_digit {
+ got_digit = false;
+ } else {
+ digits.push(b'0');
+ }
+ }
+ }
+ if !got_digit {
+ digits.push(b'0');
+ }
+
+ if repl_str.is_none() {
+ // Optimize away leading zeroes from the digits array.
+ while !digits.is_empty() && digits[0] == b'0' {
+ digits.remove(0);
+ }
+ } else {
+ // Convert repl_index and repl_cut from Unicode char to byte indexing.
+ let start = if text[0] == b'.' { 1 } else { 0 };
+ if start == 1 {
+ if digits[0] != b'0' {
+ warn!("invalid pattern \"{}\": unexpected digit before start of word", pattern);
+ return;
+ }
+ digits.remove(0);
+ }
+ let word = std::str::from_utf8(&text[start..]).unwrap();
+ let mut chars: Vec<_> = word.char_indices().collect();
+ chars.push((word.len(), '.'));
+ repl_cut = chars[(repl_index + repl_cut) as usize].0 as i32 - chars[repl_index as usize].0 as i32;
+ repl_index = chars[repl_index as usize].0 as i32;
+ }
+
+ // Create the new state, or add pattern into an existing state
+ // (which should not already have a match_string).
+ let mut state_num = self.find_state_number_for(&text);
+ let mut state = &mut self.states[state_num as usize];
+ if state.match_string.is_some() {
+ warn!("duplicate pattern \"{}\" discarded", pattern);
+ return;
+ }
+ if !digits.is_empty() {
+ state.match_string = Some(digits);
+ }
+ if repl_str.is_some() {
+ state.repl_string = repl_str;
+ state.repl_index = repl_index;
+ state.repl_cut = repl_cut;
+ }
+
+ // Set up prefix transitions, inserting additional states as needed.
+ while !text.is_empty() {
+ let last_state = state_num;
+ let ch = *text.last().unwrap();
+ text.truncate(text.len() - 1);
+ state_num = self.find_state_number_for(&text);
+ if let Some(exists) = self.states[state_num as usize].transitions.0.insert(ch, last_state) {
+ assert_eq!(exists, last_state, "overwriting existing transition at pattern \"{}\"", pattern);
+ break;
+ }
+ }
+ }
+
+ fn merge_duplicate_states(&mut self) {
+ // We loop here because when we eliminate a duplicate and update the transitions
+ // that referenced it, we may thereby create new duplicates that another pass
+ // will find and compress further.
+ loop {
+ let orig_len = self.states.len();
+ // Used to map State records to the (first) index at which they occur.
+ let mut state_to_index = HashMap::<&State,i32>::new();
+ // Mapping of old->new state indexes, and whether each old state is
+ // a duplicate that should be dropped.
+ let mut mappings = Vec::<(i32,bool)>::with_capacity(orig_len);
+ let mut next_new_index: i32 = 0;
+ for index in 0 .. self.states.len() {
+ // Find existing index for this state, or allocate the next new index to it.
+ let new_index = *state_to_index.entry(&self.states[index]).or_insert(next_new_index);
+ // Record the mapping, and whether the state was a duplicate.
+ mappings.push((new_index, new_index != next_new_index));
+ // If we used next_new_index for this state, increment it.
+ if new_index == next_new_index {
+ next_new_index += 1;
+ }
+ }
+ // If we didn't find any duplicates, next_new_index will have kept pace with
+ // index, so we know we're finished.
+ if next_new_index as usize == self.states.len() {
+ break;
+ }
+ // Iterate over all the states, either deleting them or updating indexes
+ // according to the mapping we created; then repeat the search.
+ for index in (0 .. self.states.len()).rev() {
+ if mappings[index].1 {
+ self.states.remove(index);
+ } else {
+ let state = &mut self.states[index];
+ if state.fallback_state != -1 {
+ state.fallback_state = mappings[state.fallback_state as usize].0;
+ }
+ for t in state.transitions.0.iter_mut() {
+ *t.1 = mappings[*t.1 as usize].0;
+ }
+ }
+ }
+ }
+ }
+
+ fn flatten(&self) -> Vec<u8> {
+ // Calculate total space needed for state data, and build the state_to_offset table.
+ let mut state_data_size = 0;
+ let mut state_to_offset = Vec::<usize>::with_capacity(self.states.len());
+ for state in &self.states {
+ state_to_offset.push(state_data_size);
+ state_data_size += if state.repl_string.is_some() { 12 } else { 8 };
+ state_data_size += state.transitions.0.len() * 4;
+ }
+
+ // Helper to map a state index to its offset in the final data block.
+ let get_state_offset_for = |state_index: i32| -> u32 {
+ if state_index < 0 {
+ return super::INVALID_STATE_OFFSET;
+ }
+ state_to_offset[state_index as usize] as u32
+ };
+
+ // Helper to map a byte string to its offset in the final data block, and
+ // store the bytes into string_data unless using an already-existing string.
+ let mut string_to_offset = HashMap::<Vec<u8>,usize>::new();
+ let mut string_data = Vec::<u8>::new();
+ let mut get_string_offset_for = |bytes: &Option<Vec<u8>>| -> u16 {
+ if bytes.is_none() {
+ return super::INVALID_STRING_OFFSET;
+ }
+ assert!(bytes.as_ref().unwrap().len() < 256);
+ let new_offset = string_data.len();
+ let offset = *string_to_offset.entry(bytes.as_ref().unwrap().clone()).or_insert(new_offset);
+ if offset == new_offset {
+ string_data.push(bytes.as_ref().unwrap().len() as u8);
+ string_data.extend_from_slice(bytes.as_ref().unwrap().as_ref());
+ }
+ offset.try_into().unwrap()
+ };
+
+ // Handle nohyphen string list if present, converting comma separators to NULs
+ // and trimming any surplus whitespace.
+ let mut nohyphen_string_offset: u16 = super::INVALID_STRING_OFFSET;
+ let mut nohyphen_count: u16 = 0;
+ if self.nohyphen.is_some() {
+ let nohyphen_strings: Vec<_> = self.nohyphen.as_ref().unwrap().split(',').map(|x| x.trim()).collect();
+ nohyphen_count = nohyphen_strings.len().try_into().unwrap();
+ nohyphen_string_offset = get_string_offset_for(&Some(nohyphen_strings.join("\0").as_bytes().to_vec()));
+ }
+
+ let mut state_data = Vec::<u8>::with_capacity(state_data_size);
+ for state in &self.states {
+ state_data.extend(&get_state_offset_for(state.fallback_state).to_le_bytes());
+ state_data.extend(&get_string_offset_for(&state.match_string).to_le_bytes());
+ state_data.push(state.transitions.0.len() as u8);
+ // Determine whether to use an extended state record, and if so add the
+ // replacement string and index fields.
+ if state.repl_string.is_none() {
+ state_data.push(0);
+ } else {
+ state_data.push(1);
+ state_data.extend(&get_string_offset_for(&state.repl_string).to_le_bytes());
+ state_data.push(state.repl_index as u8);
+ state_data.push(state.repl_cut as u8);
+ }
+ // Collect transitions into an array so we can sort them.
+ let mut transitions = vec![];
+ for (key, value) in state.transitions.0.iter() {
+ transitions.push((*key, get_state_offset_for(*value)))
+ }
+ transitions.sort();
+ for t in transitions {
+ // New state offset is stored as a 24-bit value, so we do this manually.
+ state_data.push((t.1 & 0xff) as u8);
+ state_data.push(((t.1 >> 8) & 0xff) as u8);
+ state_data.push(((t.1 >> 16) & 0xff) as u8);
+ state_data.push(t.0);
+ }
+ }
+ assert_eq!(state_data.len(), state_data_size);
+
+ // Pad string data to a 4-byte boundary
+ while string_data.len() & 3 != 0 {
+ string_data.push(0);
+ }
+
+ let total_size = super::LEVEL_HEADER_SIZE as usize + state_data_size + string_data.len();
+ let mut result = Vec::<u8>::with_capacity(total_size);
+
+ let state_data_base: u32 = super::LEVEL_HEADER_SIZE as u32;
+ let string_data_base: u32 = state_data_base + state_data_size as u32;
+
+ result.extend(&state_data_base.to_le_bytes());
+ result.extend(&string_data_base.to_le_bytes());
+ result.extend(&nohyphen_string_offset.to_le_bytes());
+ result.extend(&nohyphen_count.to_le_bytes());
+ result.push(self.lh_min);
+ result.push(self.rh_min);
+ result.push(self.clh_min);
+ result.push(self.crh_min);
+
+ result.extend(state_data.iter());
+ result.extend(string_data.iter());
+
+ assert_eq!(result.len(), total_size);
+
+ result
+ }
+}
+
+/// Read a libhyphen-style pattern file and create the corresponding state
+/// machine transitions, etc.
+/// The returned Vec can be passed to write_hyf_file() to generate a flattened
+/// representation of the state machine in mapped_hyph's binary format.
+fn read_dic_file<T: Read>(dic_file: T, compress: bool) -> Result<Vec<LevelBuilder>, &'static str> {
+ let reader = BufReader::new(dic_file);
+
+ let mut builders = Vec::<LevelBuilder>::new();
+ builders.push(LevelBuilder::new());
+ let mut builder = &mut builders[0];
+
+ for (index, line) in reader.lines().enumerate() {
+ let mut trimmed = line.unwrap().trim().to_string();
+ // Strip comments.
+ if let Some(i) = trimmed.find('%') {
+ trimmed = trimmed[..i].trim().to_string();
+ }
+ // Ignore empty lines.
+ if trimmed.is_empty() {
+ continue;
+ }
+ // Uppercase indicates keyword rather than pattern.
+ if trimmed.as_bytes()[0] >= b'A' && trimmed.as_bytes()[0] <= b'Z' {
+ // First line is encoding; we only support UTF-8.
+ if builder.encoding.is_none() {
+ if trimmed != "UTF-8" {
+ return Err("Only UTF-8 patterns are accepted!");
+ };
+ builder.encoding = Some(trimmed);
+ continue;
+ }
+ // Check for valid keyword-value pairs.
+ if trimmed.contains(' ') {
+ let parts: Vec<&str> = trimmed.split(' ').collect();
+ if parts.len() != 2 {
+ warn!("unrecognized keyword/values: {}", trimmed);
+ continue;
+ }
+ let keyword = parts[0];
+ let value = parts[1];
+ match keyword {
+ "LEFTHYPHENMIN" => builder.lh_min = value.parse::<u8>().unwrap(),
+ "RIGHTHYPHENMIN" => builder.rh_min = value.parse::<u8>().unwrap(),
+ "COMPOUNDLEFTHYPHENMIN" => builder.clh_min = value.parse::<u8>().unwrap(),
+ "COMPOUNDRIGHTHYPHENMIN" => builder.crh_min = value.parse::<u8>().unwrap(),
+ "NOHYPHEN" => builder.nohyphen = Some(trimmed),
+ _ => warn!("unknown keyword: {}", trimmed),
+ }
+ continue;
+ }
+ // Start a new hyphenation level?
+ if trimmed == "NEXTLEVEL" {
+ builders.push(LevelBuilder::new());
+ builder = builders.last_mut().unwrap();
+ continue;
+ }
+ warn!("unknown keyword: {}", trimmed);
+ continue;
+ }
+ // Patterns should always be provided in lowercase; complain if not, and discard
+ // the bad pattern.
+ if trimmed != trimmed.to_lowercase() {
+ warn!("pattern \"{}\" not lowercased at line {}", trimmed, index);
+ continue;
+ }
+ builder.add_pattern(&trimmed);
+ }
+
+ // Create a default first (compound-word) level if only one level was provided.
+ // (Maybe this should be optional? Currently just copying libhyphen behavior.)
+ if builders.len() == 1 {
+ let (lh_min, rh_min, clh_min, crh_min) =
+ (builders[0].lh_min, builders[0].rh_min, builders[0].clh_min, builders[0].crh_min);
+ builders.insert(0, LevelBuilder::new());
+ builder = builders.first_mut().unwrap();
+ builder.add_pattern("1-1");
+ builder.add_pattern("1'1");
+ builder.add_pattern("1\u{2013}1"); // en-dash
+ builder.add_pattern("1\u{2019}1"); // curly apostrophe
+ builder.nohyphen = Some("',\u{2013},\u{2019},-".to_string());
+ builder.lh_min = lh_min;
+ builder.rh_min = rh_min;
+ builder.clh_min = if clh_min > 0 { clh_min } else if lh_min > 0 { lh_min } else { 3 };
+ builder.crh_min = if crh_min > 0 { crh_min } else if rh_min > 0 { rh_min } else { 3 };
+ }
+
+ // Put in fallback states in each builder.
+ for builder in &mut builders {
+ for (key, state_index) in builder.str_to_state.iter() {
+ if key.is_empty() {
+ continue;
+ }
+ let mut fallback_key = key.clone();
+ while !fallback_key.is_empty() {
+ fallback_key.remove(0);
+ if builder.str_to_state.contains_key(&fallback_key) {
+ break;
+ }
+ }
+ builder.states[*state_index as usize].fallback_state = builder.str_to_state[&fallback_key];
+ }
+ }
+
+ if compress {
+ // Merge duplicate states to reduce size.
+ for builder in &mut builders {
+ builder.merge_duplicate_states();
+ }
+ }
+
+ Ok(builders)
+}
+
+/// Write out the state machines representing a set of hyphenation rules
+/// to the given output stream.
+fn write_hyf_file<T: Write>(hyf_file: &mut T, levels: Vec<LevelBuilder>) -> std::io::Result<()> {
+ if levels.is_empty() {
+ return Err(Error::from(ErrorKind::InvalidData));
+ }
+ let mut flattened = vec![];
+ for level in levels {
+ flattened.push(level.flatten());
+ }
+ // Write file header: magic number, count of levels.
+ hyf_file.write_all(&[b'H', b'y', b'f', b'0'])?;
+ let level_count: u32 = flattened.len() as u32;
+ hyf_file.write_all(&level_count.to_le_bytes())?;
+ // Write array of offsets to each level. First level will begin immediately
+ // after the array of offsets.
+ let mut offset: u32 = super::FILE_HEADER_SIZE as u32 + 4 * level_count;
+ for flat in &flattened {
+ hyf_file.write_all(&offset.to_le_bytes())?;
+ offset += flat.len() as u32;
+ }
+ // Write the flattened data for each level.
+ for flat in &flattened {
+ hyf_file.write_all(&flat)?;
+ }
+ Ok(())
+}
+
+/// The public API to the compilation process: reads `dic_file` and writes compiled tables
+/// to `hyf_file`. The `compress` param determines whether extra processing to reduce the
+/// size of the output is performed.
+pub fn compile<T1: Read, T2: Write>(dic_file: T1, hyf_file: &mut T2, compress: bool) -> std::io::Result<()> {
+ match read_dic_file(dic_file, compress) {
+ Ok(dic) => write_hyf_file(hyf_file, dic),
+ Err(e) => {
+ warn!("parse error: {}", e);
+ return Err(Error::from(ErrorKind::InvalidData))
+ }
+ }
+}
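As a brief aside, the public `compile` entry point above works with any `Read`/`Write`
pair, so patterns can be compiled entirely in memory (this is the same path the FFI
wrapper in ffi.rs takes). A minimal sketch, with purely illustrative pattern data:

    extern crate mapped_hyph;

    use mapped_hyph::Hyphenator;

    fn main() -> std::io::Result<()> {
        // Hypothetical pattern data; a real file would carry a full pattern set.
        let patterns: &[u8] = b"UTF-8\nLEFTHYPHENMIN 2\nRIGHTHYPHENMIN 2\n1co\n4m1p\n";
        // `&[u8]` implements Read and `Vec<u8>` implements Write, so no files are needed.
        let mut compiled: Vec<u8> = Vec::new();
        mapped_hyph::builder::compile(patterns, &mut compiled, true)?;
        // The compiled buffer can be handed straight to the runtime side.
        let hyph = Hyphenator::new(&compiled);
        assert!(hyph.is_valid_hyphenator());
        println!("{}", hyph.hyphenate_word("compute", '-'));
        Ok(())
    }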
diff --git a/third_party/rust/mapped_hyph/src/ffi.rs b/third_party/rust/mapped_hyph/src/ffi.rs
new file mode 100644
index 0000000000..1b546e2567
--- /dev/null
+++ b/third_party/rust/mapped_hyph/src/ffi.rs
@@ -0,0 +1,250 @@
+// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::slice;
+use std::str;
+use std::ffi::CStr;
+use std::fs::File;
+use std::io::Read;
+use std::os::raw::c_char;
+use std::str::Utf8Error;
+
+use memmap2::Mmap;
+
+use super::Hyphenator;
+
+/// Opaque type representing a hyphenation dictionary loaded from a file,
+/// for use in FFI function signatures.
+pub struct HyphDic;
+
+/// Opaque type representing a compiled dictionary in a memory buffer.
+pub struct CompiledData;
+
+// Helper to convert word and hyphen buffer parameters from raw C pointer/length
+// pairs to the Rust types expected by mapped_hyph.
+unsafe fn params_from_c<'a>(word: *const c_char, word_len: u32,
+ hyphens: *mut u8, hyphens_len: u32) ->
+ (Result<&'a str, Utf8Error>, &'a mut [u8]) {
+ (str::from_utf8(slice::from_raw_parts(word as *const u8, word_len as usize)),
+ slice::from_raw_parts_mut(hyphens, hyphens_len as usize))
+}
+
+/// C-callable function to load a hyphenation dictionary from a file at `path`.
+///
+/// Returns null on failure.
+///
+/// This does not fully validate that the file contains usable hyphenation
+/// data; it only opens the file (read-only), mmaps it into memory, and
+/// does some minimal sanity-checking that it *might* be valid.
+///
+/// The returned `HyphDic` must be released with `mapped_hyph_free_dictionary`.
+///
+/// # Safety
+/// The given `path` must be a valid pointer to a NUL-terminated (C-style)
+/// string.
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_load_dictionary(path: *const c_char) -> *const HyphDic {
+ let path_str = match CStr::from_ptr(path).to_str() {
+ Ok(str) => str,
+ Err(_) => return std::ptr::null(),
+ };
+ let hyph = Box::new(match super::load_file(path_str) {
+ Some(dic) => dic,
+ _ => return std::ptr::null(),
+ });
+ Box::into_raw(hyph) as *const HyphDic
+}
+
+/// C-callable function to free a hyphenation dictionary
+/// that was loaded by `mapped_hyph_load_dictionary`.
+///
+/// # Safety
+/// The `dic` parameter must be a `HyphDic` pointer obtained from
+/// `mapped_hyph_load_dictionary`, and not previously freed.
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_free_dictionary(dic: *mut HyphDic) {
+ Box::from_raw(dic);
+}
+
+/// C-callable function to find hyphenation values for a given `word`,
+/// using a dictionary loaded via `mapped_hyph_load_dictionary`.
+///
+/// The `word` must be UTF-8-encoded, and is `word_len` bytes (not characters)
+/// long.
+///
+/// Caller must supply the `hyphens` output buffer for results; its size is
+/// given in `hyphens_len`.
+/// It should be at least `word_len` elements long.
+///
+/// Returns -1 if `word` is not valid UTF-8, or the output `hyphens` buffer is
+/// too small.
+/// Otherwise returns the number of potential hyphenation positions found.
+///
+/// # Panics
+/// This function may panic if the given dictionary is not valid.
+///
+/// # Safety
+/// The `dic` parameter must be a `HyphDic` pointer obtained from
+/// `mapped_hyph_load_dictionary`.
+///
+/// The `word` and `hyphens` parameters must be valid pointers to memory buffers
+/// of at least the respective sizes `word_len` and `hyphens_len`.
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_find_hyphen_values_dic(dic: *const HyphDic,
+ word: *const c_char, word_len: u32,
+ hyphens: *mut u8, hyphens_len: u32) -> i32 {
+ if word_len > hyphens_len {
+ return -1;
+ }
+ let (word_str, hyphen_buf) = params_from_c(word, word_len, hyphens, hyphens_len);
+ if word_str.is_err() {
+ return -1;
+ }
+ Hyphenator::new(&*(dic as *const Mmap))
+ .find_hyphen_values(word_str.unwrap(), hyphen_buf) as i32
+}
+
+/// C-callable function to find hyphenation values for a given `word`,
+/// using a dictionary loaded and owned by the caller.
+///
+/// The dictionary is supplied as a raw memory buffer `dic_buf` of size
+/// `dic_len`.
+///
+/// The `word` must be UTF-8-encoded, and is `word_len` bytes (not characters)
+/// long.
+///
+/// Caller must supply the `hyphens` output buffer for results; its size is
+/// given in `hyphens_len`.
+/// It should be at least `word_len` elements long.
+///
+/// Returns -1 if `word` is not valid UTF-8, or the output `hyphens` buffer is
+/// too small.
+/// Otherwise returns the number of potential hyphenation positions found.
+///
+/// # Panics
+/// This function may panic if the given dictionary is not valid.
+///
+/// # Safety
+/// The `dic_buf` parameter must be a valid pointer to a memory block of size
+/// at least `dic_len`.
+///
+/// The `word` and `hyphens` parameters must be valid pointers to memory buffers
+/// of at least the respective sizes `word_len` and `hyphens_len`.
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_find_hyphen_values_raw(dic_buf: *const u8, dic_len: u32,
+ word: *const c_char, word_len: u32,
+ hyphens: *mut u8, hyphens_len: u32) -> i32 {
+ if word_len > hyphens_len {
+ return -1;
+ }
+ let (word_str, hyphen_buf) = params_from_c(word, word_len, hyphens, hyphens_len);
+ if word_str.is_err() {
+ return -1;
+ }
+ Hyphenator::new(slice::from_raw_parts(dic_buf, dic_len as usize))
+ .find_hyphen_values(word_str.unwrap(), hyphen_buf) as i32
+}
+
+/// C-callable function to check if a given memory buffer `dic_buf` of size
+/// `dic_len` is potentially usable as a hyphenation dictionary.
+///
+/// Returns `true` if the given memory buffer looks like it may be a valid
+/// hyphenation dictionary, `false` if it is clearly not usable.
+///
+/// # Safety
+/// The `dic_buf` parameter must be a valid pointer to a memory block of size
+/// at least `dic_len`.
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_is_valid_hyphenator(dic_buf: *const u8, dic_len: u32) -> bool {
+ if dic_buf.is_null() {
+ return false;
+ }
+ let dic = Hyphenator::new(slice::from_raw_parts(dic_buf, dic_len as usize));
+ dic.is_valid_hyphenator()
+}
+
+/// C-callable function to free a CompiledData object created by
+/// a `mapped_hyph_compile_...` function (below).
+///
+/// # Safety
+/// The `data` parameter must be a `CompiledData` pointer obtained from
+/// a `mapped_hyph_compile_...` function, and not previously freed.
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_free_compiled_data(data: *mut CompiledData) {
+ Box::from_raw(data);
+}
+
+// Helper for the compilation functions (from either memory buffer or file path).
+fn compile_and_wrap<T: Read>(input: T, compress: bool) -> *const CompiledData {
+ let mut compiled: Vec<u8> = vec![];
+ if super::builder::compile(input, &mut compiled, compress).is_err() {
+ return std::ptr::null();
+ }
+ compiled.shrink_to_fit();
+
+ // Create a persistent heap reference to the compiled data, and return a pointer to it.
+ Box::into_raw(Box::new(compiled)) as *const CompiledData
+}
+
+/// C-callable function to compile hyphenation patterns from `pattern_buf` and return
+/// the compiled data in a memory buffer, suitable to be stored somewhere or passed
+/// to `mapped_hyph_find_hyphen_values_raw` to perform hyphenation.
+///
+/// The returned `CompiledData` must be released with `mapped_hyph_free_compiled_data`.
+///
+/// # Safety
+/// The `pattern_buf` parameter must be a valid pointer to a memory block of size
+/// at least `pattern_len`.
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_compile_buffer(pattern_buf: *const u8, pattern_len: u32, compress: bool) -> *const CompiledData {
+ compile_and_wrap(slice::from_raw_parts(pattern_buf, pattern_len as usize), compress)
+}
+
+/// C-callable function to compile hyphenation patterns from a file to a memory buffer.
+///
+/// The returned `CompiledData` must be released with `mapped_hyph_free_compiled_data`.
+///
+/// # Safety
+/// The given `path` must be a valid pointer to a NUL-terminated (C-style) string.
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_compile_file(path: *const c_char, compress: bool) -> *const CompiledData {
+ // Try to open the file at the given path, returning null on failure.
+ let path_str = match CStr::from_ptr(path).to_str() {
+ Ok(str) => str,
+ Err(_) => return std::ptr::null(),
+ };
+ let in_file = match File::open(path_str) {
+ Ok(file) => file,
+ Err(_) => return std::ptr::null(),
+ };
+ compile_and_wrap(&in_file, compress)
+}
+
+/// Get the size of the compiled table buffer in a `CompiledData` object.
+///
+/// # Safety
+/// The `data` parameter must be a `CompiledData` pointer obtained from
+/// a `mapped_hyph_compile_...` function, and not previously freed.
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_compiled_data_size(data: *const CompiledData) -> u32 {
+ (&*(data as *const Vec<u8>)).len() as u32
+}
+
+/// Get a pointer to the raw data held by a `CompiledData` object.
+///
+/// # Safety
+/// The `data` parameter must be a `CompiledData` pointer obtained from
+/// a `mapped_hyph_compile_...` function, and not previously freed.
+///
+/// The returned pointer only remains valid as long as the `CompiledData` has not
+/// been released (by passing it to `mapped_hyph_free_compiled_data`).
+#[no_mangle]
+pub unsafe extern "C" fn mapped_hyph_compiled_data_ptr(data: *const CompiledData) -> *const u8 {
+ (&*(data as *const Vec<u8>)).as_ptr()
+}
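For reference, a consumer of the C ABI above follows the same sequence a C caller
would: compile (or load) a dictionary, query the buffer, hyphenate, then free. A small
sketch exercising it from Rust, with an illustrative pattern buffer standing in for a
real pattern file:

    extern crate mapped_hyph;

    use std::os::raw::c_char;
    use mapped_hyph::ffi::*;

    fn main() {
        // Hypothetical patterns; real callers would pass a complete pattern file.
        let patterns = b"UTF-8\n1ba\n";
        unsafe {
            let data = mapped_hyph_compile_buffer(patterns.as_ptr(), patterns.len() as u32, true);
            assert!(!data.is_null());
            let dic_ptr = mapped_hyph_compiled_data_ptr(data);
            let dic_len = mapped_hyph_compiled_data_size(data);

            let word = "alphabet";
            let mut hyphens = vec![0u8; word.len()];
            let breaks = mapped_hyph_find_hyphen_values_raw(
                dic_ptr, dic_len,
                word.as_ptr() as *const c_char, word.len() as u32,
                hyphens.as_mut_ptr(), hyphens.len() as u32);
            assert!(breaks >= 0);

            mapped_hyph_free_compiled_data(data as *mut CompiledData);
        }
    }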
diff --git a/third_party/rust/mapped_hyph/src/lib.rs b/third_party/rust/mapped_hyph/src/lib.rs
new file mode 100644
index 0000000000..848c93d257
--- /dev/null
+++ b/third_party/rust/mapped_hyph/src/lib.rs
@@ -0,0 +1,642 @@
+// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#[macro_use]
+extern crate arrayref;
+extern crate memmap2;
+#[macro_use]
+extern crate log;
+
+use std::slice;
+use std::str;
+use std::cmp::max;
+use std::fs::File;
+use std::mem;
+
+use memmap2::Mmap;
+
+// Make submodules available publicly.
+pub mod builder;
+pub mod ffi;
+
+// 4-byte identification expected at beginning of a compiled dictionary file.
+// (This will be updated if an incompatible change to the format is made in
+// some future revision.)
+const MAGIC_NUMBER: [u8; 4] = [b'H', b'y', b'f', b'0'];
+
+const INVALID_STRING_OFFSET: u16 = 0xffff;
+const INVALID_STATE_OFFSET: u32 = 0x00ff_ffff;
+
+const FILE_HEADER_SIZE: usize = 8; // 4-byte magic number, 4-byte count of levels
+const LEVEL_HEADER_SIZE: usize = 16;
+
+// Transition actually holds a 24-bit new state offset and an 8-bit input byte
+// to match. We will be interpreting byte ranges as Transition arrays (in the
+// State::transitions() method below), so use repr(C) to ensure we have the
+// memory layout we expect.
+// Transition records do not depend on any specific alignment.
+#[repr(C)]
+#[derive(Debug,Copy,Clone)]
+struct Transition(u8, u8, u8, u8);
+
+impl Transition {
+ fn new_state_offset(&self) -> usize {
+ // Read a 24-bit little-endian number from three bytes.
+ self.0 as usize + ((self.1 as usize) << 8) + ((self.2 as usize) << 16)
+ }
+ fn match_byte(&self) -> u8 {
+ self.3
+ }
+}
+
+// State is an area of the Level's data block that begins with a fixed header,
+// followed by an array of transitions. The total size of each State's data
+// depends on the number of transitions in the state. Only the basic header
+// is defined by the struct here; the rest of the state is accessed via
+// pointer magic.
+// There are two versions of State, a basic version that supports only simple
+// hyphenation (no associated spelling change), and an extended version that
+// adds the replacement-string fields to support spelling changes at the
+// hyphenation point. Check is_extended() to know which version is present.
+// State records are NOT necessarily 4-byte aligned, so multi-byte fields
+// should be read with care.
+#[derive(Debug,Copy,Clone)]
+#[repr(C)]
+struct State {
+ fallback_state: [u8; 4],
+ match_string_offset: [u8; 2],
+ num_transitions: u8,
+ is_extended: u8,
+}
+
+#[repr(C)]
+struct StateExtended {
+ state: State,
+ repl_string_offset: [u8; 2],
+ repl_index: i8,
+ repl_cut: i8,
+}
+
+impl State {
+ // Accessors for the various State header fields; see file format description.
+ fn fallback_state(&self) -> usize {
+ u32::from_le_bytes(self.fallback_state) as usize
+ }
+ fn match_string_offset(&self) -> usize {
+ u16::from_le_bytes(self.match_string_offset) as usize
+ }
+ fn num_transitions(&self) -> u8 {
+ self.num_transitions
+ }
+ fn is_extended(&self) -> bool {
+ self.is_extended != 0
+ }
+ // Accessors that are only valid if is_extended() is true.
+ // These use `unsafe` to dereference a pointer to the relevant field;
+ // this is OK because Level::get_state always validates the total state size
+ // before returning a state reference, so these pointers will be valid for
+ // any extended state it returns.
+ #[allow(dead_code)]
+ fn as_extended(&self) -> &StateExtended {
+ debug_assert!(self.is_extended());
+ unsafe { mem::transmute(self) }
+ }
+ #[allow(dead_code)]
+ fn repl_string_offset(&self) -> usize {
+ u16::from_le_bytes(self.as_extended().repl_string_offset) as usize
+ }
+ #[allow(dead_code)]
+ fn repl_index(&self) -> i8 {
+ self.as_extended().repl_index
+ }
+ #[allow(dead_code)]
+ fn repl_cut(&self) -> i8 {
+ self.as_extended().repl_cut
+ }
+ // Return the state's Transitions as a slice reference.
+ fn transitions(&self) -> &[Transition] {
+ let count = self.num_transitions() as usize;
+ if count == 0 {
+ return &[];
+ }
+ let transition_offset = if self.is_extended() { mem::size_of::<StateExtended>() } else { mem::size_of::<State>() } as isize;
+ // We know the `offset` here will not look beyond the valid range of memory
+ // because Level::get_state() checks the state length (accounting for the
+ // number of transitions) before returning a State reference.
+ let trans_ptr = unsafe { (self as *const State as *const u8).offset(transition_offset) as *const Transition };
+ // Again, because Level::get_state() already checked the state length, we know
+ // this slice address and count will be valid.
+ unsafe { slice::from_raw_parts(trans_ptr, count) }
+ }
+ // Look up the Transition for a given input byte, or None.
+ fn transition_for(&self, b: u8) -> Option<Transition> {
+ // The transitions array is sorted by match_byte() value, but there are
+ // usually very few entries; benchmarking showed that using binary_search_by
+ // here gave no benefit (possibly slightly slower).
+ self.transitions().iter().copied().find(|t| t.match_byte() == b)
+ }
+ // Just for debugging use...
+ #[allow(dead_code)]
+ fn deep_show(&self, prefix: &str, dic: &Level) {
+ if self.match_string_offset() != INVALID_STRING_OFFSET as usize {
+ let match_string = dic.string_at_offset(self.match_string_offset());
+ println!("{}match: {}", prefix, str::from_utf8(match_string).unwrap());
+ }
+ for t in self.transitions() {
+ println!("{}{} ->", prefix, t.match_byte() as char);
+ let next_prefix = format!("{} ", prefix);
+ dic.get_state(t.new_state_offset()).unwrap().deep_show(&next_prefix, &dic);
+ }
+ }
+}
+
+// We count the presentation-form ligature characters U+FB00..FB06 as multiple
+// chars for the purposes of lefthyphenmin/righthyphenmin. In UTF-8, all these
+// ligature characters are 3-byte sequences beginning with <0xEF, 0xAC>; this
+// helper returns the "decomposed length" of the ligature given its trailing
+// byte.
+fn lig_length(trail_byte: u8) -> usize {
+ // This is only called on valid UTF-8 where we already know trail_byte
+ // must be >= 0x80.
+ // Ligature lengths: ff fi fl ffi ffl long-st st
+ const LENGTHS: [u8; 7] = [ 2u8, 2u8, 2u8, 3u8, 3u8, 2u8, 2u8 ];
+ if trail_byte > 0x86 {
+ return 1;
+ }
+ LENGTHS[trail_byte as usize - 0x80] as usize
+}
+
+fn is_utf8_trail_byte(byte: u8) -> bool {
+ (byte & 0xC0) == 0x80
+}
+
+fn is_ascii_digit(byte: u8) -> bool {
+ byte <= b'9' && byte >= b'0'
+}
+
+fn is_odd(byte: u8) -> bool {
+ (byte & 0x01) == 0x01
+}
+
+// A hyphenation Level has a header followed by State records and packed string
+// data. The total size of the slice depends on the number and size of the
+// States and Strings it contains.
+// Note that the data of the Level may not have any specific alignment!
+#[derive(Debug,Copy,Clone)]
+struct Level<'a> {
+ data: &'a [u8],
+ // Header fields cached by the constructor for faster access:
+ state_data_base_: usize,
+ string_data_base_: usize,
+}
+
+impl Level<'_> {
+ // Constructor that initializes our cache variables.
+ fn new(data: &[u8]) -> Level {
+ Level {
+ data,
+ state_data_base_: u32::from_le_bytes(*array_ref!(data, 0, 4)) as usize,
+ string_data_base_: u32::from_le_bytes(*array_ref!(data, 4, 4)) as usize,
+ }
+ }
+
+ // Accessors for Level header fields; see file format description.
+ fn state_data_base(&self) -> usize {
+ self.state_data_base_ // cached by constructor
+ }
+ fn string_data_base(&self) -> usize {
+ self.string_data_base_ // cached by constructor
+ }
+ fn nohyphen_string_offset(&self) -> usize {
+ u16::from_le_bytes(*array_ref!(self.data, 8, 2)) as usize
+ }
+ #[allow(dead_code)]
+ fn nohyphen_count(&self) -> u16 {
+ u16::from_le_bytes(*array_ref!(self.data, 10, 2))
+ }
+ fn lh_min(&self) -> usize {
+ max(1, self.data[12] as usize)
+ }
+ fn rh_min(&self) -> usize {
+ max(1, self.data[13] as usize)
+ }
+ fn clh_min(&self) -> usize {
+ max(1, self.data[14] as usize)
+ }
+ fn crh_min(&self) -> usize {
+ max(1, self.data[15] as usize)
+ }
+ fn word_boundary_mins(&self) -> (usize, usize, usize, usize) {
+ (self.lh_min(), self.rh_min(), self.clh_min(), self.crh_min())
+ }
+ // Strings are represented as offsets from the Level's string_data_base.
+ // This returns a byte slice referencing the string at a given offset,
+ // or an empty slice if invalid.
+ fn string_at_offset(&self, offset: usize) -> &'_ [u8] {
+ if offset == INVALID_STRING_OFFSET as usize {
+ return &[];
+ }
+ let string_base = self.string_data_base() as usize + offset;
+ // TODO: move this to the validation function.
+ debug_assert!(string_base < self.data.len());
+ if string_base + 1 > self.data.len() {
+ return &[];
+ }
+ let len = self.data[string_base] as usize;
+ // TODO: move this to the validation function.
+ debug_assert!(string_base + 1 + len <= self.data.len());
+ if string_base + 1 + len > self.data.len() {
+ return &[];
+ }
+ self.data.get(string_base + 1 .. string_base + 1 + len).unwrap()
+ }
+ // The nohyphen field actually contains multiple NUL-separated substrings;
+ // return them as a vector of individual byte slices.
+ fn nohyphen(&self) -> Vec<&[u8]> {
+ let string_offset = self.nohyphen_string_offset();
+ let nohyph_str = self.string_at_offset(string_offset as usize);
+ if nohyph_str.is_empty() {
+ return vec![];
+ }
+ nohyph_str.split(|&b| b == 0).collect()
+ }
+ // States are represented as an offset from the Level's state_data_base.
+ // This returns a reference to the State at a given offset, or None if invalid.
+ fn get_state(&self, offset: usize) -> Option<&State> {
+ if offset == INVALID_STATE_OFFSET as usize {
+ return None;
+ }
+ debug_assert_eq!(offset & 3, 0);
+ let state_base = self.state_data_base() + offset;
+ // TODO: move this to the validation function.
+ debug_assert!(state_base + mem::size_of::<State>() <= self.string_data_base());
+ if state_base + mem::size_of::<State>() > self.string_data_base() {
+ return None;
+ }
+ let state_ptr = &self.data[state_base] as *const u8 as *const State;
+ // This is safe because we just checked against self.string_data_base() above.
+ let state = unsafe { state_ptr.as_ref().unwrap() };
+ let length = if state.is_extended() { mem::size_of::<StateExtended>() } else { mem::size_of::<State>() }
+ + mem::size_of::<Transition>() * state.num_transitions() as usize;
+ // TODO: move this to the validation function.
+ debug_assert!(state_base + length <= self.string_data_base());
+ if state_base + length > self.string_data_base() {
+ return None;
+ }
+ // This is safe because we checked the full state length against self.string_data_base().
+ unsafe { state_ptr.as_ref() }
+ }
+ // Sets hyphenation values (odd = potential break, even = no break) in values[],
+ // and returns the change in the number of odd values present, so the caller can
+ // keep track of the total number of potential breaks in the word.
+ fn find_hyphen_values(&self, word: &str, values: &mut [u8], lh_min: usize, rh_min: usize) -> isize {
+ // Bail out immediately if the word is too short to hyphenate.
+ if word.len() < lh_min + rh_min {
+ return 0;
+ }
+ let start_state = self.get_state(0);
+ let mut st = start_state;
+ let mut hyph_count = 0;
+ for i in 0 .. word.len() + 2 {
+ // Loop over the word by bytes, with a virtual '.' added at each end
+ // to match word-boundary patterns.
+ let b = if i == 0 || i == word.len() + 1 { b'.' } else { word.as_bytes()[i - 1] };
+ loop {
+ // Loop to repeatedly fall back if we don't find a matching transition.
+ // Note that this could infinite-loop if there is a state whose fallback
+ // points to itself (or a cycle of fallbacks), but this would represent
+ // a table compilation error.
+ // (A potential validation function could check for fallback cycles.)
+ if st.is_none() {
+ st = start_state;
+ break;
+ }
+ let state = st.unwrap();
+ if let Some(tr) = state.transition_for(b) {
+ // Found a transition for the current byte. Look up the new state;
+ // if it has a match_string, merge its weights into `values`.
+ st = self.get_state(tr.new_state_offset());
+ if let Some(state) = st {
+ let match_offset = state.match_string_offset();
+ if match_offset != INVALID_STRING_OFFSET as usize {
+ if state.is_extended() {
+ debug_assert!(false, "extended hyphenation not supported by this function");
+ } else {
+ let match_str = self.string_at_offset(match_offset);
+ let offset = i + 1 - match_str.len();
+ assert!(offset + match_str.len() <= word.len() + 2);
+ for (j, ch) in match_str.iter().enumerate() {
+ let index = offset + j;
+ if index >= lh_min && index <= word.len() - rh_min {
+ // lh_min and rh_min are guaranteed to be >= 1,
+ // so this will not try to access outside values[].
+ let old_value = values[index - 1];
+ let value = ch - b'0';
+ if value > old_value {
+ if is_odd(old_value) != is_odd(value) {
+ // Adjust hyph_count for the change we're making
+ hyph_count += if is_odd(value) { 1 } else { -1 };
+ }
+ values[index - 1] = value;
+ }
+ }
+ }
+ }
+ }
+ }
+ // We have handled the current input byte; leave the fallback loop
+ // and get next input.
+ break;
+ }
+ // No transition for the current byte; go to fallback state and try again.
+ st = self.get_state(state.fallback_state());
+ }
+ }
+
+ // If the word was not purely ASCII, or if the word begins/ends with
+ // digits, the use of lh_min and rh_min above may not have correctly
+ // excluded enough positions, so we need to fix things up here.
+ let mut index = 0;
+ let mut count = 0;
+ let word_bytes = word.as_bytes();
+ let mut clear_hyphen_at = |i| { if is_odd(values[i]) { hyph_count -= 1; } values[i] = 0; };
+ // Handle lh_min.
+ while count < lh_min - 1 && index < word_bytes.len() {
+ let byte = word_bytes[index];
+ clear_hyphen_at(index);
+ if byte < 0x80 {
+ index += 1;
+ if is_ascii_digit(byte) {
+ continue; // ASCII digits don't count
+ }
+ } else if byte == 0xEF && word_bytes[index + 1] == 0xAC {
+ // Unicode presentation-form ligature characters, which we count as
+ // multiple chars for the purpose of lh_min/rh_min, all begin with
+ // 0xEF, 0xAC in UTF-8.
+ count += lig_length(word_bytes[index + 2]);
+ clear_hyphen_at(index + 1);
+ clear_hyphen_at(index + 2);
+ index += 3;
+ continue;
+ } else {
+ index += 1;
+ while index < word_bytes.len() && is_utf8_trail_byte(word_bytes[index]) {
+ clear_hyphen_at(index);
+ index += 1;
+ }
+ }
+ count += 1;
+ }
+
+ // Handle rh_min.
+ count = 0;
+ index = word.len();
+ while count < rh_min && index > 0 {
+ index -= 1;
+ let byte = word_bytes[index];
+ if index < word.len() - 1 {
+ clear_hyphen_at(index);
+ }
+ if byte < 0x80 {
+ // Only count if not an ASCII digit
+ if !is_ascii_digit(byte) {
+ count += 1;
+ }
+ continue;
+ }
+ if is_utf8_trail_byte(byte) {
+ continue;
+ }
+ if byte == 0xEF && word_bytes[index + 1] == 0xAC {
+ // Presentation-form ligatures count as multiple chars.
+ count += lig_length(word_bytes[index + 2]);
+ continue;
+ }
+ count += 1;
+ }
+
+ hyph_count
+ }
+}
+
+/// Hyphenation engine encapsulating a language-specific set of patterns (rules)
+/// that identify possible break positions within a word.
+pub struct Hyphenator<'a>(&'a [u8]);
+
+impl Hyphenator<'_> {
+ /// Return a Hyphenator that wraps the given buffer.
+ /// This does *not* check that the given buffer is in fact a valid hyphenation table.
+ /// Use `is_valid_hyphenator()` to determine whether it is usable.
+ /// (Calling hyphenation methods on a Hyphenator that wraps arbitrary,
+ /// unvalidated data is not unsafe, but may panic.)
+ pub fn new(buffer: &[u8]) -> Hyphenator {
+ Hyphenator(buffer)
+ }
+
+ // Internal implementation details
+ fn magic_number(&self) -> &[u8] {
+ &self.0[0 .. 4]
+ }
+ fn num_levels(&self) -> usize {
+ u32::from_le_bytes(*array_ref!(self.0, 4, 4)) as usize
+ }
+ fn level(&self, i: usize) -> Level {
+ let offset = u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i, 4)) as usize;
+ let limit = if i == self.num_levels() - 1 {
+ self.0.len()
+ } else {
+ u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i + 4, 4)) as usize
+ };
+ debug_assert!(offset + LEVEL_HEADER_SIZE <= limit && limit <= self.0.len());
+ debug_assert_eq!(offset & 3, 0);
+ debug_assert_eq!(limit & 3, 0);
+ Level::new(&self.0[offset .. limit])
+ }
+
+ /// Identify acceptable hyphenation positions in the given `word`.
+ ///
+ /// The caller-supplied `values` must be at least as long as the `word`.
+ ///
+ /// On return, any elements with an odd value indicate positions in the word
+ /// after which a hyphen could be inserted.
+ ///
+ /// Returns the number of possible hyphenation positions that were found.
+ ///
+ /// # Panics
+ /// If the given `values` slice is too small to hold the results.
+ ///
+ /// If the block of memory represented by `self.0` is not in fact a valid
+ /// hyphenation dictionary, this function may panic with an overflow or
+ /// array bounds violation.
+ pub fn find_hyphen_values(&self, word: &str, values: &mut [u8]) -> isize {
+ assert!(values.len() >= word.len());
+ values.iter_mut().for_each(|x| *x = 0);
+ let top_level = self.level(0);
+ let (lh_min, rh_min, clh_min, crh_min) = top_level.word_boundary_mins();
+ if word.len() < lh_min + rh_min {
+ return 0;
+ }
+ let mut hyph_count = top_level.find_hyphen_values(word, values, lh_min, rh_min);
+ let compound = hyph_count > 0;
+ // Subsequent levels are applied to fragments between potential breaks
+ // already found:
+ for l in 1 .. self.num_levels() {
+ let level = self.level(l);
+ if hyph_count > 0 {
+ let mut begin = 0;
+ let mut lh = lh_min;
+ // lh_min and rh_min are both guaranteed to be greater than zero,
+ // so this loop will not reach fully to the end of the word.
+ for i in lh_min - 1 .. word.len() - rh_min {
+ if is_odd(values[i]) {
+ if i > begin {
+ // We've found a component of a compound;
+ // clear the corresponding values and apply the new level.
+ // (These values must be even, so hyph_count is unchanged.)
+ values[begin .. i].iter_mut().for_each(|x| {
+ *x = 0;
+ });
+ hyph_count += level.find_hyphen_values(&word[begin ..= i],
+ &mut values[begin ..= i],
+ lh, crh_min);
+ }
+ begin = i + 1;
+ lh = clh_min;
+ }
+ }
+ if begin == 0 {
+ // No compound-word breaks were found, just apply level to the whole word.
+ hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min);
+ } else if begin < word.len() {
+ // Handle trailing component of compound.
+ hyph_count += level.find_hyphen_values(&word[begin .. word.len()],
+ &mut values[begin .. word.len()],
+ clh_min, rh_min);
+ }
+ } else {
+ hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min);
+ }
+ }
+
+ // Only need to check nohyphen strings if top-level (compound) breaks were found.
+ if compound && hyph_count > 0 {
+ let nohyph = top_level.nohyphen();
+ if !nohyph.is_empty() {
+ for i in lh_min ..= word.len() - rh_min {
+ if is_odd(values[i - 1]) {
+ for nh in &nohyph {
+ if i + nh.len() <= word.len() && *nh == &word.as_bytes()[i .. i + nh.len()] {
+ values[i - 1] = 0;
+ hyph_count -= 1;
+ break;
+ }
+ if nh.len() <= i && *nh == &word.as_bytes()[i - nh.len() .. i] {
+ values[i - 1] = 0;
+ hyph_count -= 1;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ hyph_count
+ }
+
+ /// Generate the hyphenated form of a `word` by inserting the given `hyphen_char`
+ /// at each valid break position.
+ ///
+ /// # Panics
+ /// If the block of memory represented by `self` is not in fact a valid
+ /// hyphenation dictionary, this function may panic with an overflow or
+ /// array bounds violation.
+ ///
+ /// Also panics if the length of the hyphenated word would overflow `usize`.
+ pub fn hyphenate_word(&self, word: &str, hyphchar: char) -> String {
+ let mut values = vec![0u8; word.len()];
+ let hyph_count = self.find_hyphen_values(word, &mut values);
+ if hyph_count <= 0 {
+ return word.to_string();
+ }
+ // We know how long the result will be, so we can preallocate here.
+ let result_len = word.len() + hyph_count as usize * hyphchar.len_utf8();
+ let mut result = String::with_capacity(result_len);
+ let mut n = 0;
+ for ch in word.char_indices() {
+ if ch.0 > 0 && is_odd(values[ch.0 - 1]) {
+ result.push(hyphchar);
+ n += 1;
+ }
+ result.push(ch.1);
+ }
+ debug_assert_eq!(n, hyph_count);
+ debug_assert_eq!(result_len, result.len());
+ result
+ }
+
+ /// Check if the block of memory looks like it could be a valid hyphenation
+ /// table.
+ pub fn is_valid_hyphenator(&self) -> bool {
+ // Size must be at least 4 bytes for magic_number + 4 bytes num_levels;
+ // smaller than this cannot be safely inspected.
+ if self.0.len() < FILE_HEADER_SIZE {
+ return false;
+ }
+ if self.magic_number() != MAGIC_NUMBER {
+ return false;
+ }
+ // For each level, there's a 4-byte offset in the header, and the level
+ // has its own 16-byte header, so we can check a minimum size again here.
+ let num_levels = self.num_levels();
+ if self.0.len() < FILE_HEADER_SIZE + LEVEL_HEADER_SIZE * num_levels {
+ return false;
+ }
+ // Check that state_data_base and string_data_base for each hyphenation
+ // level are within range.
+ for l in 0 .. num_levels {
+ let level = self.level(l);
+ if level.state_data_base() < LEVEL_HEADER_SIZE ||
+ level.state_data_base() > level.string_data_base() ||
+ level.string_data_base() > level.data.len() {
+ return false;
+ }
+ // TODO: consider doing more extensive validation of states and
+ // strings within the level?
+ }
+ // It's still possible the dic is internally broken, but at least it's
+ // worth trying to use it!
+ true
+ }
+}
+
+/// Load the compiled hyphenation file at `dic_path`, if present.
+///
+/// Returns `None` if the specified file cannot be opened or mapped,
+/// otherwise returns a `memmap2::Mmap` mapping the file.
+///
+/// # Safety
+///
+/// This is unsafe for the same reason `Mmap::map()` is unsafe:
+/// mapped_hyph does not guarantee safety if the mapped file is modified
+/// (e.g. by another process) while we're using it.
+///
+/// This verifies that the file looks superficially like it may be a
+/// compiled hyphenation table, but does *not* fully check the validity
+/// of the file contents! Calling hyphenation functions with the returned
+/// data is not unsafe, but may panic if the data is invalid.
+pub unsafe fn load_file(dic_path: &str) -> Option<Mmap> {
+ let file = File::open(dic_path).ok()?;
+ let dic = Mmap::map(&file).ok()?;
+ let hyph = Hyphenator(&*dic);
+ if hyph.is_valid_hyphenator() {
+ return Some(dic);
+ }
+ None
+}
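Taken together, the safe Rust surface of lib.rs is small: map (or otherwise obtain) a
compiled table, wrap it in a Hyphenator, and query it. A short sketch, assuming a
compiled table such as the hyph_en_US.hyf used by main.rs below is present:

    extern crate mapped_hyph;

    use mapped_hyph::Hyphenator;

    fn main() {
        // load_file is unsafe only because the underlying mmap is.
        let dic = unsafe { mapped_hyph::load_file("hyph_en_US.hyf") }
            .expect("failed to load dictionary");
        let hyph = Hyphenator::new(&*dic);

        // High-level form: insert the given character at every accepted break.
        println!("{}", hyph.hyphenate_word("hyphenation", '-'));

        // Low-level form: odd entries in `values` mark positions after which a
        // hyphen may be inserted (see find_hyphen_values above).
        let word = "hyphenation";
        let mut values = vec![0u8; word.len()];
        let count = hyph.find_hyphen_values(word, &mut values);
        println!("{} break(s): {:?}", count, values);
    }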
diff --git a/third_party/rust/mapped_hyph/src/main.rs b/third_party/rust/mapped_hyph/src/main.rs
new file mode 100644
index 0000000000..acc24babee
--- /dev/null
+++ b/third_party/rust/mapped_hyph/src/main.rs
@@ -0,0 +1,67 @@
+// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
+// file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+extern crate mapped_hyph;
+
+use mapped_hyph::Hyphenator;
+
+fn main() {
+ let dic_path = "hyph_en_US.hyf";
+
+ let dic = match unsafe { mapped_hyph::load_file(dic_path) } {
+ Some(dic) => dic,
+ _ => panic!("failed to load dictionary {}", dic_path),
+ };
+ let hyph = Hyphenator::new(&*dic);
+
+ println!("{}", hyph.hyphenate_word("haha", '-'));
+ println!("{}", hyph.hyphenate_word("hahaha", '-'));
+ println!("{}", hyph.hyphenate_word("photo", '-'));
+ println!("{}", hyph.hyphenate_word("photograph", '-'));
+ println!("{}", hyph.hyphenate_word("photographer", '-'));
+ println!("{}", hyph.hyphenate_word("photographic", '-'));
+ println!("{}", hyph.hyphenate_word("photographical", '-'));
+ println!("{}", hyph.hyphenate_word("photographically", '-'));
+ println!("{}", hyph.hyphenate_word("supercalifragilisticexpialidocious", '-'));
+ println!("{}", hyph.hyphenate_word("o'dwyer", '='));
+ println!("{}", hyph.hyphenate_word("o'callahan", '='));
+ println!("{}", hyph.hyphenate_word("o’dwyer", '='));
+ println!("{}", hyph.hyphenate_word("o’callahan", '='));
+ println!("{}", hyph.hyphenate_word("petti-fogging", '='));
+ println!("{}", hyph.hyphenate_word("e-mailing", '='));
+ println!("{}", hyph.hyphenate_word("-x-mailing", '='));
+ println!("{}", hyph.hyphenate_word("-strikeout-", '='));
+
+ let dic2 = match unsafe { mapped_hyph::load_file("tests/compound.hyf") } {
+ Some(dic) => dic,
+ _ => panic!("failed to load dictionary {}", "tests/compound.hyf"),
+ };
+
+ let h2 = Hyphenator::new(&*dic2);
+ println!("{}", h2.hyphenate_word("motorcycle", '='));
+
+ let dic3 = match unsafe { mapped_hyph::load_file("tests/rhmin.hyf") } {
+ Some(dic) => dic,
+ _ => panic!("failed to load dictionary {}", dic_path),
+ };
+ let h3 = Hyphenator::new(&*dic3);
+ println!("{}", h3.hyphenate_word("övéit", '='));
+ println!("{}", h3.hyphenate_word("అంగడిధర", '='));
+
+ let dic4 = match unsafe { mapped_hyph::load_file("tests/num.hyf") } {
+ Some(dic) => dic,
+ _ => panic!("failed to load dictionary {}", "tests/num.hyf"),
+ };
+ let h4 = Hyphenator::new(&*dic4);
+
+ println!("{}", h4.hyphenate_word("123foobar123", '='));
+ println!("{}", h4.hyphenate_word("123foobarfoobar", '='));
+ println!("{}", h4.hyphenate_word("foobarfoobar123", '='));
+ println!("{}", h4.hyphenate_word("123foobarfoobar123", '='));
+}