diff options
Diffstat (limited to 'third_party/rust/mapped_hyph/src')
-rw-r--r-- | third_party/rust/mapped_hyph/src/bin/hyf_compile.rs | 44 | ||||
-rw-r--r-- | third_party/rust/mapped_hyph/src/builder.rs | 509 | ||||
-rw-r--r-- | third_party/rust/mapped_hyph/src/ffi.rs | 250 | ||||
-rw-r--r-- | third_party/rust/mapped_hyph/src/lib.rs | 642 | ||||
-rw-r--r-- | third_party/rust/mapped_hyph/src/main.rs | 67 |
5 files changed, 1512 insertions, 0 deletions
diff --git a/third_party/rust/mapped_hyph/src/bin/hyf_compile.rs b/third_party/rust/mapped_hyph/src/bin/hyf_compile.rs new file mode 100644 index 0000000000..257c747f54 --- /dev/null +++ b/third_party/rust/mapped_hyph/src/bin/hyf_compile.rs @@ -0,0 +1,44 @@ +// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +extern crate log; +extern crate mapped_hyph; + +use std::env; +use std::fs::File; + +struct Logger {} + +impl log::Log for Logger { + fn enabled(&self, _: &log::Metadata) -> bool { + true + } + + fn log(&self, record: &log::Record) { + eprintln!("{} - {}", record.level(), record.args()); + } + + fn flush(&self) {} +} + +static LOGGER: Logger = Logger {}; + +fn main() -> std::io::Result<()> { + unsafe { log::set_logger_racy(&LOGGER).unwrap() }; + + let args: Vec<String> = env::args().collect(); + if args.len() == 3 { + let in_file = File::open(&args[1])?; + let mut out_file = File::create(&args[2])?; + mapped_hyph::builder::compile(&in_file, &mut out_file, true)?; + } else { + println!("usage: hyf_compile <pattern-file> <output-file>"); + } + Ok(()) +} diff --git a/third_party/rust/mapped_hyph/src/builder.rs b/third_party/rust/mapped_hyph/src/builder.rs new file mode 100644 index 0000000000..e19a0087fd --- /dev/null +++ b/third_party/rust/mapped_hyph/src/builder.rs @@ -0,0 +1,509 @@ +// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. 
+// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/// Functions to compile human-readable patterns into a mapped_hyph +/// flattened representation of the hyphenation state machine. + +use std::io::{Read,BufRead,BufReader,Write,Error,ErrorKind}; +use std::collections::HashMap; +use std::convert::TryInto; +use std::hash::{Hash,Hasher}; + +// Wrap a HashMap so that we can implement the Hash trait. +#[derive(PartialEq, Eq, Clone)] +struct TransitionMap (HashMap<u8,i32>); + +impl TransitionMap { + fn new() -> TransitionMap { + TransitionMap(HashMap::<u8,i32>::new()) + } +} + +impl Hash for TransitionMap { + fn hash<H: Hasher>(&self, state: &mut H) { + // We only look at the values here; that's likely to be enough + // for a reasonable hash. + let mut transitions: Vec<&i32> = self.0.values().collect(); + transitions.sort(); + for t in transitions { + t.hash(state); + } + } +} + +#[derive(PartialEq, Eq, Hash, Clone)] +struct State { + match_string: Option<Vec<u8>>, + #[allow(dead_code)] + repl_string: Option<Vec<u8>>, + #[allow(dead_code)] + repl_index: i32, + #[allow(dead_code)] + repl_cut: i32, + fallback_state: i32, + transitions: TransitionMap, +} + +impl State { + fn new() -> State { + State { + match_string: None, + repl_string: None, + repl_index: -1, + repl_cut: -1, + fallback_state: -1, + transitions: TransitionMap::new(), + } + } +} + +/// Structures returned by the read_dic_file() function; +/// array of these can then be passed to write_hyf_file() +/// to create the flattened output. 
+struct LevelBuilder { + states: Vec<State>, + str_to_state: HashMap<Vec<u8>,i32>, + encoding: Option<String>, + nohyphen: Option<String>, + lh_min: u8, + rh_min: u8, + clh_min: u8, + crh_min: u8, +} + +impl LevelBuilder { + fn new() -> LevelBuilder { + let mut result = LevelBuilder { + states: Vec::<State>::new(), + str_to_state: HashMap::<Vec<u8>,i32>::new(), + encoding: None, + nohyphen: None, + lh_min: 0, + rh_min: 0, + clh_min: 0, + crh_min: 0, + }; + // Initialize the builder with an empty start state. + result.str_to_state.insert(vec![], 0); + result.states.push(State::new()); + result + } + + fn find_state_number_for(&mut self, text: &[u8]) -> i32 { + let count = self.states.len() as i32; + let index = *self.str_to_state.entry(text.to_vec()).or_insert(count); + if index == count { + self.states.push(State::new()); + } + index + } + + fn add_pattern(&mut self, pattern: &str) { + let mut bytes = pattern.as_bytes(); + let mut text = Vec::<u8>::with_capacity(bytes.len()); + let mut digits = Vec::<u8>::with_capacity(bytes.len() + 1); + let mut repl_str = None; + let mut repl_index = 0; + let mut repl_cut = 0; + + // Check for replacement rule (non-standard hyphenation spelling change). + if let Some(slash) = bytes.iter().position(|x| *x == b'/') { + let parts = bytes.split_at(slash); + bytes = parts.0; + let mut it = parts.1[1 ..].split(|x| *x == b','); + if let Some(repl) = it.next() { + repl_str = Some(repl.to_vec()); + } + if let Some(num) = it.next() { + repl_index = std::str::from_utf8(num).unwrap().parse::<i32>().unwrap() - 1; + } + if let Some(num) = it.next() { + repl_cut = std::str::from_utf8(num).unwrap().parse::<i32>().unwrap(); + } + } + + // Separate the input pattern into parallel arrays of text (bytes) and digits. 
+ let mut got_digit = false; + for byte in bytes { + if *byte <= b'9' && *byte >= b'0' { + if got_digit { + warn!("invalid pattern \"{}\": consecutive digits", pattern); + return; + } + digits.push(*byte); + got_digit = true; + } else { + text.push(*byte); + if got_digit { + got_digit = false; + } else { + digits.push(b'0'); + } + } + } + if !got_digit { + digits.push(b'0'); + } + + if repl_str.is_none() { + // Optimize away leading zeroes from the digits array. + while !digits.is_empty() && digits[0] == b'0' { + digits.remove(0); + } + } else { + // Convert repl_index and repl_cut from Unicode char to byte indexing. + let start = if text[0] == b'.' { 1 } else { 0 }; + if start == 1 { + if digits[0] != b'0' { + warn!("invalid pattern \"{}\": unexpected digit before start of word", pattern); + return; + } + digits.remove(0); + } + let word = std::str::from_utf8(&text[start..]).unwrap(); + let mut chars: Vec<_> = word.char_indices().collect(); + chars.push((word.len(), '.')); + repl_cut = chars[(repl_index + repl_cut) as usize].0 as i32 - chars[repl_index as usize].0 as i32; + repl_index = chars[repl_index as usize].0 as i32; + } + + // Create the new state, or add pattern into an existing state + // (which should not already have a match_string). + let mut state_num = self.find_state_number_for(&text); + let mut state = &mut self.states[state_num as usize]; + if state.match_string.is_some() { + warn!("duplicate pattern \"{}\" discarded", pattern); + return; + } + if !digits.is_empty() { + state.match_string = Some(digits); + } + if repl_str.is_some() { + state.repl_string = repl_str; + state.repl_index = repl_index; + state.repl_cut = repl_cut; + } + + // Set up prefix transitions, inserting additional states as needed. 
+ while !text.is_empty() { + let last_state = state_num; + let ch = *text.last().unwrap(); + text.truncate(text.len() - 1); + state_num = self.find_state_number_for(&text); + if let Some(exists) = self.states[state_num as usize].transitions.0.insert(ch, last_state) { + assert_eq!(exists, last_state, "overwriting existing transition at pattern \"{}\"", pattern); + break; + } + } + } + + fn merge_duplicate_states(&mut self) { + // We loop here because when we eliminate a duplicate, and update the transitons + // that referenced it, we may thereby create new duplicates that another pass + // will find and compress further. + loop { + let orig_len = self.states.len(); + // Used to map State records to the (first) index at which they occur. + let mut state_to_index = HashMap::<&State,i32>::new(); + // Mapping of old->new state indexes, and whether each old state is + // a duplicate that should be dropped. + let mut mappings = Vec::<(i32,bool)>::with_capacity(orig_len); + let mut next_new_index: i32 = 0; + for index in 0 .. self.states.len() { + // Find existing index for this state, or allocate the next new index to it. + let new_index = *state_to_index.entry(&self.states[index]).or_insert(next_new_index); + // Record the mapping, and whether the state was a duplicate. + mappings.push((new_index, new_index != next_new_index)); + // If we used next_new_index for this state, increment it. + if new_index == next_new_index { + next_new_index += 1; + } + } + // If we didn't find any duplicates, next_new_index will have kept pace with + // index, so we know we're finished. + if next_new_index as usize == self.states.len() { + break; + } + // Iterate over all the states, either deleting them or updating indexes + // according to the mapping we created; then repeat the search. + for index in (0 .. 
self.states.len()).rev() { + if mappings[index].1 { + self.states.remove(index); + } else { + let state = &mut self.states[index]; + if state.fallback_state != -1 { + state.fallback_state = mappings[state.fallback_state as usize].0; + } + for t in state.transitions.0.iter_mut() { + *t.1 = mappings[*t.1 as usize].0; + } + } + } + } + } + + fn flatten(&self) -> Vec<u8> { + // Calculate total space needed for state data, and build the state_to_offset table. + let mut state_data_size = 0; + let mut state_to_offset = Vec::<usize>::with_capacity(self.states.len()); + for state in &self.states { + state_to_offset.push(state_data_size); + state_data_size += if state.repl_string.is_some() { 12 } else { 8 }; + state_data_size += state.transitions.0.len() * 4; + } + + // Helper to map a state index to its offset in the final data block. + let get_state_offset_for = |state_index: i32| -> u32 { + if state_index < 0 { + return super::INVALID_STATE_OFFSET; + } + state_to_offset[state_index as usize] as u32 + }; + + // Helper to map a byte string to its offset in the final data block, and + // store the bytes into string_data unless using an already-existing string. + let mut string_to_offset = HashMap::<Vec<u8>,usize>::new(); + let mut string_data = Vec::<u8>::new(); + let mut get_string_offset_for = |bytes: &Option<Vec<u8>>| -> u16 { + if bytes.is_none() { + return super::INVALID_STRING_OFFSET; + } + assert!(bytes.as_ref().unwrap().len() < 256); + let new_offset = string_data.len(); + let offset = *string_to_offset.entry(bytes.as_ref().unwrap().clone()).or_insert(new_offset); + if offset == new_offset { + string_data.push(bytes.as_ref().unwrap().len() as u8); + string_data.extend_from_slice(bytes.as_ref().unwrap().as_ref()); + } + offset.try_into().unwrap() + }; + + // Handle nohyphen string list if present, converting comma separators to NULs + // and trimming any surplus whitespace. 
+ let mut nohyphen_string_offset: u16 = super::INVALID_STRING_OFFSET; + let mut nohyphen_count: u16 = 0; + if self.nohyphen.is_some() { + let nohyphen_strings: Vec<_> = self.nohyphen.as_ref().unwrap().split(',').map(|x| x.trim()).collect(); + nohyphen_count = nohyphen_strings.len().try_into().unwrap(); + nohyphen_string_offset = get_string_offset_for(&Some(nohyphen_strings.join("\0").as_bytes().to_vec())); + } + + let mut state_data = Vec::<u8>::with_capacity(state_data_size); + for state in &self.states { + state_data.extend(&get_state_offset_for(state.fallback_state).to_le_bytes()); + state_data.extend(&get_string_offset_for(&state.match_string).to_le_bytes()); + state_data.push(state.transitions.0.len() as u8); + // Determine whether to use an extended state record, and if so add the + // replacement string and index fields. + if state.repl_string.is_none() { + state_data.push(0); + } else { + state_data.push(1); + state_data.extend(&get_string_offset_for(&state.repl_string).to_le_bytes()); + state_data.push(state.repl_index as u8); + state_data.push(state.repl_cut as u8); + } + // Collect transitions into an array so we can sort them. + let mut transitions = vec![]; + for (key, value) in state.transitions.0.iter() { + transitions.push((*key, get_state_offset_for(*value))) + } + transitions.sort(); + for t in transitions { + // New state offset is stored as a 24-bit value, so we do this manually. 
+ state_data.push((t.1 & 0xff) as u8); + state_data.push(((t.1 >> 8) & 0xff) as u8); + state_data.push(((t.1 >> 16) & 0xff) as u8); + state_data.push(t.0); + } + } + assert_eq!(state_data.len(), state_data_size); + + // Pad string data to a 4-byte boundary + while string_data.len() & 3 != 0 { + string_data.push(0); + } + + let total_size = super::LEVEL_HEADER_SIZE as usize + state_data_size + string_data.len(); + let mut result = Vec::<u8>::with_capacity(total_size); + + let state_data_base: u32 = super::LEVEL_HEADER_SIZE as u32; + let string_data_base: u32 = state_data_base + state_data_size as u32; + + result.extend(&state_data_base.to_le_bytes()); + result.extend(&string_data_base.to_le_bytes()); + result.extend(&nohyphen_string_offset.to_le_bytes()); + result.extend(&nohyphen_count.to_le_bytes()); + result.push(self.lh_min); + result.push(self.rh_min); + result.push(self.clh_min); + result.push(self.crh_min); + + result.extend(state_data.iter()); + result.extend(string_data.iter()); + + assert_eq!(result.len(), total_size); + + result + } +} + +/// Read a libhyphen-style pattern file and create the corresponding state +/// machine transitions, etc. +/// The returned Vec can be passed to write_hyf_file() to generate a flattened +/// representation of the state machine in mapped_hyph's binary format. +fn read_dic_file<T: Read>(dic_file: T, compress: bool) -> Result<Vec<LevelBuilder>, &'static str> { + let reader = BufReader::new(dic_file); + + let mut builders = Vec::<LevelBuilder>::new(); + builders.push(LevelBuilder::new()); + let mut builder = &mut builders[0]; + + for (index, line) in reader.lines().enumerate() { + let mut trimmed = line.unwrap().trim().to_string(); + // Strip comments. + if let Some(i) = trimmed.find('%') { + trimmed = trimmed[..i].trim().to_string(); + } + // Ignore empty lines. + if trimmed.is_empty() { + continue; + } + // Uppercase indicates keyword rather than pattern. 
+ if trimmed.as_bytes()[0] >= b'A' && trimmed.as_bytes()[0] <= b'Z' { + // First line is encoding; we only support UTF-8. + if builder.encoding.is_none() { + if trimmed != "UTF-8" { + return Err("Only UTF-8 patterns are accepted!"); + }; + builder.encoding = Some(trimmed); + continue; + } + // Check for valid keyword-value pairs. + if trimmed.contains(' ') { + let parts: Vec<&str> = trimmed.split(' ').collect(); + if parts.len() != 2 { + warn!("unrecognized keyword/values: {}", trimmed); + continue; + } + let keyword = parts[0]; + let value = parts[1]; + match keyword { + "LEFTHYPHENMIN" => builder.lh_min = value.parse::<u8>().unwrap(), + "RIGHTHYPHENMIN" => builder.rh_min = value.parse::<u8>().unwrap(), + "COMPOUNDLEFTHYPHENMIN" => builder.clh_min = value.parse::<u8>().unwrap(), + "COMPOUNDRIGHTHYPHENMIN" => builder.crh_min = value.parse::<u8>().unwrap(), + "NOHYPHEN" => builder.nohyphen = Some(trimmed), + _ => warn!("unknown keyword: {}", trimmed), + } + continue; + } + // Start a new hyphenation level? + if trimmed == "NEXTLEVEL" { + builders.push(LevelBuilder::new()); + builder = builders.last_mut().unwrap(); + continue; + } + warn!("unknown keyword: {}", trimmed); + continue; + } + // Patterns should always be provided in lowercase; complain if not, and discard + // the bad pattern. + if trimmed != trimmed.to_lowercase() { + warn!("pattern \"{}\" not lowercased at line {}", trimmed, index); + continue; + } + builder.add_pattern(&trimmed); + } + + // Create default first (compound-word) level if only one level was provided. + // (Maybe this should be optional? Currently just copying libhyphen behavior.) 
+ if builders.len() == 1 { + let (lh_min, rh_min, clh_min, crh_min) = + (builders[0].lh_min, builders[0].rh_min, builders[0].clh_min, builders[0].crh_min); + builders.insert(0, LevelBuilder::new()); + builder = builders.first_mut().unwrap(); + builder.add_pattern("1-1"); + builder.add_pattern("1'1"); + builder.add_pattern("1\u{2013}1"); // en-dash + builder.add_pattern("1\u{2019}1"); // curly apostrophe + builder.nohyphen = Some("',\u{2013},\u{2019},-".to_string()); + builder.lh_min = lh_min; + builder.rh_min = rh_min; + builder.clh_min = if clh_min > 0 { clh_min } else if lh_min > 0 { lh_min } else { 3 }; + builder.crh_min = if crh_min > 0 { crh_min } else if rh_min > 0 { rh_min } else { 3 }; + } + + // Put in fallback states in each builder. + for builder in &mut builders { + for (key, state_index) in builder.str_to_state.iter() { + if key.is_empty() { + continue; + } + let mut fallback_key = key.clone(); + while !fallback_key.is_empty() { + fallback_key.remove(0); + if builder.str_to_state.contains_key(&fallback_key) { + break; + } + } + builder.states[*state_index as usize].fallback_state = builder.str_to_state[&fallback_key]; + } + } + + if compress { + // Merge duplicate states to reduce size. + for builder in &mut builders { + builder.merge_duplicate_states(); + } + } + + Ok(builders) +} + +/// Write out the state machines representing a set of hyphenation rules +/// to the given output stream. +fn write_hyf_file<T: Write>(hyf_file: &mut T, levels: Vec<LevelBuilder>) -> std::io::Result<()> { + if levels.is_empty() { + return Err(Error::from(ErrorKind::InvalidData)); + } + let mut flattened = vec![]; + for level in levels { + flattened.push(level.flatten()); + } + // Write file header: magic number, count of levels. + hyf_file.write_all(&[b'H', b'y', b'f', b'0'])?; + let level_count: u32 = flattened.len() as u32; + hyf_file.write_all(&level_count.to_le_bytes())?; + // Write array of offsets to each level. 
First level will begin immediately + // after the array of offsets. + let mut offset: u32 = super::FILE_HEADER_SIZE as u32 + 4 * level_count; + for flat in &flattened { + hyf_file.write_all(&offset.to_le_bytes())?; + offset += flat.len() as u32; + } + // Write the flattened data for each level. + for flat in &flattened { + hyf_file.write_all(&flat)?; + } + Ok(()) +} + +/// The public API to the compilation process: reads `dic_file` and writes compiled tables +/// to `hyf_file`. The `compress` param determines whether extra processing to reduce the +/// size of the output is performed. +pub fn compile<T1: Read, T2: Write>(dic_file: T1, hyf_file: &mut T2, compress: bool) -> std::io::Result<()> { + match read_dic_file(dic_file, compress) { + Ok(dic) => write_hyf_file(hyf_file, dic), + Err(e) => { + warn!("parse error: {}", e); + return Err(Error::from(ErrorKind::InvalidData)) + } + } +} diff --git a/third_party/rust/mapped_hyph/src/ffi.rs b/third_party/rust/mapped_hyph/src/ffi.rs new file mode 100644 index 0000000000..1b546e2567 --- /dev/null +++ b/third_party/rust/mapped_hyph/src/ffi.rs @@ -0,0 +1,250 @@ +// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::slice; +use std::str; +use std::ffi::CStr; +use std::fs::File; +use std::io::Read; +use std::os::raw::c_char; +use std::str::Utf8Error; + +use memmap2::Mmap; + +use super::Hyphenator; + +/// Opaque type representing a hyphenation dictionary loaded from a file, +/// for use in FFI function signatures. +pub struct HyphDic; + +/// Opaque type representing a compiled dictionary in a memory buffer. 
+pub struct CompiledData; + +// Helper to convert word and hyphen buffer parameters from raw C pointer/length +// pairs to the Rust types expected by mapped_hyph. +unsafe fn params_from_c<'a>(word: *const c_char, word_len: u32, + hyphens: *mut u8, hyphens_len: u32) -> + (Result<&'a str, Utf8Error>, &'a mut [u8]) { + (str::from_utf8(slice::from_raw_parts(word as *const u8, word_len as usize)), + slice::from_raw_parts_mut(hyphens, hyphens_len as usize)) +} + +/// C-callable function to load a hyphenation dictionary from a file at `path`. +/// +/// Returns null on failure. +/// +/// This does not fully validate that the file contains usable hyphenation +/// data, it only opens the file (read-only) and mmap's it into memory, and +/// does some minimal sanity-checking that it *might* be valid. +/// +/// The returned `HyphDic` must be released with `mapped_hyph_free_dictionary`. +/// +/// # Safety +/// The given `path` must be a valid pointer to a NUL-terminated (C-style) +/// string. +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_load_dictionary(path: *const c_char) -> *const HyphDic { + let path_str = match CStr::from_ptr(path).to_str() { + Ok(str) => str, + Err(_) => return std::ptr::null(), + }; + let hyph = Box::new(match super::load_file(path_str) { + Some(dic) => dic, + _ => return std::ptr::null(), + }); + Box::into_raw(hyph) as *const HyphDic +} + +/// C-callable function to free a hyphenation dictionary +/// that was loaded by `mapped_hyph_load_dictionary`. +/// +/// # Safety +/// The `dic` parameter must be a `HyphDic` pointer obtained from +/// `mapped_hyph_load_dictionary`, and not previously freed. +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_free_dictionary(dic: *mut HyphDic) { + Box::from_raw(dic); +} + +/// C-callable function to find hyphenation values for a given `word`, +/// using a dictionary loaded via `mapped_hyph_load_dictionary`. +/// +/// The `word` must be UTF-8-encoded, and is `word_len` bytes (not characters) +/// long. 
+/// +/// Caller must supply the `hyphens` output buffer for results; its size is +/// given in `hyphens_len`. +/// It should be at least `word_len` elements long. +/// +/// Returns -1 if `word` is not valid UTF-8, or the output `hyphens` buffer is +/// too small. +/// Otherwise returns the number of potential hyphenation positions found. +/// +/// # Panics +/// This function may panic if the given dictionary is not valid. +/// +/// # Safety +/// The `dic` parameter must be a `HyphDic` pointer obtained from +/// `mapped_hyph_load_dictionary`. +/// +/// The `word` and `hyphens` parameter must be valid pointers to memory buffers +/// of at least the respective sizes `word_len` and `hyphens_len`. +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_find_hyphen_values_dic(dic: *const HyphDic, + word: *const c_char, word_len: u32, + hyphens: *mut u8, hyphens_len: u32) -> i32 { + if word_len > hyphens_len { + return -1; + } + let (word_str, hyphen_buf) = params_from_c(word, word_len, hyphens, hyphens_len); + if word_str.is_err() { + return -1; + } + Hyphenator::new(&*(dic as *const Mmap)) + .find_hyphen_values(word_str.unwrap(), hyphen_buf) as i32 +} + +/// C-callable function to find hyphenation values for a given `word`, +/// using a dictionary loaded and owned by the caller. +/// +/// The dictionary is supplied as a raw memory buffer `dic_buf` of size +/// `dic_len`. +/// +/// The `word` must be UTF-8-encoded, and is `word_len` bytes (not characters) +/// long. +/// +/// Caller must supply the `hyphens` output buffer for results; its size is +/// given in `hyphens_len`. +/// It should be at least `word_len` elements long. +/// +/// Returns -1 if `word` is not valid UTF-8, or the output `hyphens` buffer is +/// too small. +/// Otherwise returns the number of potential hyphenation positions found. +/// +/// # Panics +/// This function may panic if the given dictionary is not valid. 
+/// +/// # Safety +/// The `dic_buf` parameter must be a valid pointer to a memory block of size +/// at least `dic_len`. +/// +/// The `word` and `hyphens` parameter must be valid pointers to memory buffers +/// of at least the respective sizes `word_len` and `hyphens_len`. +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_find_hyphen_values_raw(dic_buf: *const u8, dic_len: u32, + word: *const c_char, word_len: u32, + hyphens: *mut u8, hyphens_len: u32) -> i32 { + if word_len > hyphens_len { + return -1; + } + let (word_str, hyphen_buf) = params_from_c(word, word_len, hyphens, hyphens_len); + if word_str.is_err() { + return -1; + } + Hyphenator::new(slice::from_raw_parts(dic_buf, dic_len as usize)) + .find_hyphen_values(word_str.unwrap(), hyphen_buf) as i32 +} + +/// C-callable function to check if a given memory buffer `dic_buf` of size +/// `dic_len` is potentially usable as a hyphenation dictionary. +/// +/// Returns `true` if the given memory buffer looks like it may be a valid +/// hyphenation dictionary, `false` if it is clearly not usable. +/// +/// # Safety +/// The `dic_buf` parameter must be a valid pointer to a memory block of size +/// at least `dic_len`. +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_is_valid_hyphenator(dic_buf: *const u8, dic_len: u32) -> bool { + if dic_buf.is_null() { + return false; + } + let dic = Hyphenator::new(slice::from_raw_parts(dic_buf, dic_len as usize)); + dic.is_valid_hyphenator() +} + +/// C-callable function to free a CompiledData object created by +/// a `mapped_hyph_compile_...` function (below). +/// +/// # Safety +/// The `data` parameter must be a `CompiledData` pointer obtained from +/// a `mapped_hyph_compile_...` function, and not previously freed. +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_free_compiled_data(data: *mut CompiledData) { + Box::from_raw(data); +} + +// Helper for the compilation functions (from either memory buffer or file path). 
+fn compile_and_wrap<T: Read>(input: T, compress: bool) -> *const CompiledData { + let mut compiled: Vec<u8> = vec![]; + if super::builder::compile(input, &mut compiled, compress).is_err() { + return std::ptr::null(); + } + compiled.shrink_to_fit(); + + // Create a persistent heap reference to the compiled data, and return a pointer to it. + Box::into_raw(Box::new(compiled)) as *const CompiledData +} + +/// C-callable function to compile hyphenation patterns from `pattern_buf` and return +/// the compiled data in a memory buffer, suitable to be stored somewhere or passed +/// to `mapped_hyph_find_hyphen_values_raw` to perform hyphenation. +/// +/// The returned `CompiledData` must be released with `mapped_hyph_free_compiled_data`. +/// +/// # Safety +/// The `pattern_buf` parameter must be a valid pointer to a memory block of size +/// at least `pattern_len`. +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_compile_buffer(pattern_buf: *const u8, pattern_len: u32, compress: bool) -> *const CompiledData { + compile_and_wrap(slice::from_raw_parts(pattern_buf, pattern_len as usize), compress) +} + +/// C-callable function to compile hyphenation patterns from a file to a memory buffer. +/// +/// The returned `CompiledData` must be released with `mapped_hyph_free_compiled_data`. +/// +/// # Safety +/// The given `path` must be a valid pointer to a NUL-terminated (C-style) string. +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_compile_file(path: *const c_char, compress: bool) -> *const CompiledData { + // Try to open the file at the given path, returning null on failure. + let path_str = match CStr::from_ptr(path).to_str() { + Ok(str) => str, + Err(_) => return std::ptr::null(), + }; + let in_file = match File::open(path_str) { + Ok(file) => file, + Err(_) => return std::ptr::null(), + }; + compile_and_wrap(&in_file, compress) +} + +/// Get the size of the compiled table buffer in a `CompiledData` object. 
+/// +/// # Safety +/// The `data` parameter must be a `CompiledData` pointer obtained from +/// a `mapped_hyph_compile_...` function, and not previously freed. +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_compiled_data_size(data: *const CompiledData) -> u32 { + (&*(data as *const Vec<u8>)).len() as u32 +} + +/// Get a pointer to the raw data held by a `CompiledData` object. +/// +/// # Safety +/// The `data` parameter must be a `CompiledData` pointer obtained from +/// a `mapped_hyph_compile_...` function, and not previously freed. +/// +/// The returned pointer only remains valid as long as the `CompiledData` has not +/// been released (by passing it to `mapped_hyph_free_compiled_data`). +#[no_mangle] +pub unsafe extern "C" fn mapped_hyph_compiled_data_ptr(data: *const CompiledData) -> *const u8 { + (&*(data as *const Vec<u8>)).as_ptr() +} diff --git a/third_party/rust/mapped_hyph/src/lib.rs b/third_party/rust/mapped_hyph/src/lib.rs new file mode 100644 index 0000000000..848c93d257 --- /dev/null +++ b/third_party/rust/mapped_hyph/src/lib.rs @@ -0,0 +1,642 @@ +// Copyright 2019 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#[macro_use] +extern crate arrayref; +extern crate memmap2; +#[macro_use] +extern crate log; + +use std::slice; +use std::str; +use std::cmp::max; +use std::fs::File; +use std::mem; + +use memmap2::Mmap; + +// Make submodules available publicly. +pub mod builder; +pub mod ffi; + +// 4-byte identification expected at beginning of a compiled dictionary file. +// (This will be updated if an incompatible change to the format is made in +// some future revision.) 
const MAGIC_NUMBER: [u8; 4] = *b"Hyf0";

// Sentinel offsets meaning "no string" / "no state".
const INVALID_STRING_OFFSET: u16 = 0xffff;
const INVALID_STATE_OFFSET: u32 = 0x00ff_ffff;

const FILE_HEADER_SIZE: usize = 8; // 4-byte magic number, 4-byte count of levels
const LEVEL_HEADER_SIZE: usize = 16;

// Transition actually holds a 24-bit new state offset and an 8-bit input byte
// to match. We will be interpreting byte ranges as Transition arrays (in the
// State::transitions() method below), so use repr(C) to ensure we have the
// memory layout we expect.
// Transition records do not depend on any specific alignment.
#[repr(C)]
#[derive(Debug,Copy,Clone)]
struct Transition(u8, u8, u8, u8);

impl Transition {
    // Offset of the target state: the first three bytes form a 24-bit
    // little-endian number.
    fn new_state_offset(&self) -> usize {
        let Transition(low, mid, high, _) = *self;
        u32::from_le_bytes([low, mid, high, 0]) as usize
    }
    // The input byte that triggers this transition (the fourth byte).
    fn match_byte(&self) -> u8 {
        let Transition(_, _, _, byte) = *self;
        byte
    }
}

// State is an area of the Level's data block that begins with a fixed header,
// followed by an array of transitions. The total size of each State's data
// depends on the number of transitions in the state. Only the basic header
// is defined by the struct here; the rest of the state is accessed via
// pointer magic.
// There are two versions of State, a basic version that supports only simple
// hyphenation (no associated spelling change), and an extended version that
// adds the replacement-string fields to support spelling changes at the
// hyphenation point. Check is_extended() to know which version is present.
// State records are NOT necessarily 4-byte aligned, so multi-byte fields
// should be read with care.
+#[derive(Debug,Copy,Clone)] +#[repr(C)] +struct State { + fallback_state: [u8; 4], + match_string_offset: [u8; 2], + num_transitions: u8, + is_extended: u8, +} + +#[repr(C)] +struct StateExtended { + state: State, + repl_string_offset: [u8; 2], + repl_index: i8, + repl_cut: i8, +} + +impl State { + // Accessors for the various State header fields; see file format description. + fn fallback_state(&self) -> usize { + u32::from_le_bytes(self.fallback_state) as usize + } + fn match_string_offset(&self) -> usize { + u16::from_le_bytes(self.match_string_offset) as usize + } + fn num_transitions(&self) -> u8 { + self.num_transitions + } + fn is_extended(&self) -> bool { + self.is_extended != 0 + } + // Accessors that are only valid if is_extended() is true. + // These use `unsafe` to dereference a pointer to the relevant field; + // this is OK because Level::get_state always validates the total state size + // before returning a state reference, so these pointers will be valid for + // any extended state it returns. + #[allow(dead_code)] + fn as_extended(&self) -> &StateExtended { + debug_assert!(self.is_extended()); + unsafe { mem::transmute(self) } + } + #[allow(dead_code)] + fn repl_string_offset(&self) -> usize { + u16::from_le_bytes(self.as_extended().repl_string_offset) as usize + } + #[allow(dead_code)] + fn repl_index(&self) -> i8 { + self.as_extended().repl_index + } + #[allow(dead_code)] + fn repl_cut(&self) -> i8 { + self.as_extended().repl_cut + } + // Return the state's Transitions as a slice reference. 
+ fn transitions(&self) -> &[Transition] { + let count = self.num_transitions() as usize; + if count == 0 { + return &[]; + } + let transition_offset = if self.is_extended() { mem::size_of::<StateExtended>() } else { mem::size_of::<State>() } as isize; + // We know the `offset` here will not look beyond the valid range of memory + // because Level::get_state() checks the state length (accounting for the + // number of transitions) before returning a State reference. + let trans_ptr = unsafe { (self as *const State as *const u8).offset(transition_offset) as *const Transition }; + // Again, because Level::get_state() already checked the state length, we know + // this slice address and count will be valid. + unsafe { slice::from_raw_parts(trans_ptr, count) } + } + // Look up the Transition for a given input byte, or None. + fn transition_for(&self, b: u8) -> Option<Transition> { + // The transitions array is sorted by match_byte() value, but there are + // usually very few entries; benchmarking showed that using binary_search_by + // here gave no benefit (possibly slightly slower). + self.transitions().iter().copied().find(|t| t.match_byte() == b) + } + // Just for debugging use... + #[allow(dead_code)] + fn deep_show(&self, prefix: &str, dic: &Level) { + if self.match_string_offset() != INVALID_STRING_OFFSET as usize { + let match_string = dic.string_at_offset(self.match_string_offset()); + println!("{}match: {}", prefix, str::from_utf8(match_string).unwrap()); + } + for t in self.transitions() { + println!("{}{} ->", prefix, t.match_byte() as char); + let next_prefix = format!("{} ", prefix); + dic.get_state(t.new_state_offset()).unwrap().deep_show(&next_prefix, &dic); + } + } +} + +// We count the presentation-form ligature characters U+FB00..FB06 as multiple +// chars for the purposes of lefthyphenmin/righthyphenmin. 
In UTF-8, all these +// ligature characters are 3-byte sequences beginning with <0xEF, 0xAC>; this +// helper returns the "decomposed length" of the ligature given its trailing +// byte. +fn lig_length(trail_byte: u8) -> usize { + // This is only called on valid UTF-8 where we already know trail_byte + // must be >= 0x80. + // Ligature lengths: ff fi fl ffi ffl long-st st + const LENGTHS: [u8; 7] = [ 2u8, 2u8, 2u8, 3u8, 3u8, 2u8, 2u8 ]; + if trail_byte > 0x86 { + return 1; + } + LENGTHS[trail_byte as usize - 0x80] as usize +} + +fn is_utf8_trail_byte(byte: u8) -> bool { + (byte & 0xC0) == 0x80 +} + +fn is_ascii_digit(byte: u8) -> bool { + byte <= b'9' && byte >= b'0' +} + +fn is_odd(byte: u8) -> bool { + (byte & 0x01) == 0x01 +} + +// A hyphenation Level has a header followed by State records and packed string +// data. The total size of the slice depends on the number and size of the +// States and Strings it contains. +// Note that the data of the Level may not have any specific alignment! +#[derive(Debug,Copy,Clone)] +struct Level<'a> { + data: &'a [u8], + // Header fields cached by the constructor for faster access: + state_data_base_: usize, + string_data_base_: usize, +} + +impl Level<'_> { + // Constructor that initializes our cache variables. + fn new(data: &[u8]) -> Level { + Level { + data, + state_data_base_: u32::from_le_bytes(*array_ref!(data, 0, 4)) as usize, + string_data_base_: u32::from_le_bytes(*array_ref!(data, 4, 4)) as usize, + } + } + + // Accessors for Level header fields; see file format description. 
+ fn state_data_base(&self) -> usize { + self.state_data_base_ // cached by constructor + } + fn string_data_base(&self) -> usize { + self.string_data_base_ // cached by constructor + } + fn nohyphen_string_offset(&self) -> usize { + u16::from_le_bytes(*array_ref!(self.data, 8, 2)) as usize + } + #[allow(dead_code)] + fn nohyphen_count(&self) -> u16 { + u16::from_le_bytes(*array_ref!(self.data, 10, 2)) + } + fn lh_min(&self) -> usize { + max(1, self.data[12] as usize) + } + fn rh_min(&self) -> usize { + max(1, self.data[13] as usize) + } + fn clh_min(&self) -> usize { + max(1, self.data[14] as usize) + } + fn crh_min(&self) -> usize { + max(1, self.data[15] as usize) + } + fn word_boundary_mins(&self) -> (usize, usize, usize, usize) { + (self.lh_min(), self.rh_min(), self.clh_min(), self.crh_min()) + } + // Strings are represented as offsets from the Level's string_data_base. + // This returns a byte slice referencing the string at a given offset, + // or an empty slice if invalid. + fn string_at_offset(&self, offset: usize) -> &'_ [u8] { + if offset == INVALID_STRING_OFFSET as usize { + return &[]; + } + let string_base = self.string_data_base() as usize + offset; + // TODO: move this to the validation function. + debug_assert!(string_base < self.data.len()); + if string_base + 1 > self.data.len() { + return &[]; + } + let len = self.data[string_base] as usize; + // TODO: move this to the validation function. + debug_assert!(string_base + 1 + len <= self.data.len()); + if string_base + 1 + len > self.data.len() { + return &[]; + } + self.data.get(string_base + 1 .. string_base + 1 + len).unwrap() + } + // The nohyphen field actually contains multiple NUL-separated substrings; + // return them as a vector of individual byte slices. 
+ fn nohyphen(&self) -> Vec<&[u8]> { + let string_offset = self.nohyphen_string_offset(); + let nohyph_str = self.string_at_offset(string_offset as usize); + if nohyph_str.is_empty() { + return vec![]; + } + nohyph_str.split(|&b| b == 0).collect() + } + // States are represented as an offset from the Level's state_data_base. + // This returns a reference to the State at a given offset, or None if invalid. + fn get_state(&self, offset: usize) -> Option<&State> { + if offset == INVALID_STATE_OFFSET as usize { + return None; + } + debug_assert_eq!(offset & 3, 0); + let state_base = self.state_data_base() + offset; + // TODO: move this to the validation function. + debug_assert!(state_base + mem::size_of::<State>() <= self.string_data_base()); + if state_base + mem::size_of::<State>() > self.string_data_base() { + return None; + } + let state_ptr = &self.data[state_base] as *const u8 as *const State; + // This is safe because we just checked against self.string_data_base() above. + let state = unsafe { state_ptr.as_ref().unwrap() }; + let length = if state.is_extended() { mem::size_of::<StateExtended>() } else { mem::size_of::<State>() } + + mem::size_of::<Transition>() * state.num_transitions() as usize; + // TODO: move this to the validation function. + debug_assert!(state_base + length <= self.string_data_base()); + if state_base + length > self.string_data_base() { + return None; + } + // This is safe because we checked the full state length against self.string_data_base(). + unsafe { state_ptr.as_ref() } + } + // Sets hyphenation values (odd = potential break, even = no break) in values[], + // and returns the change in the number of odd values present, so the caller can + // keep track of the total number of potential breaks in the word. + fn find_hyphen_values(&self, word: &str, values: &mut [u8], lh_min: usize, rh_min: usize) -> isize { + // Bail out immediately if the word is too short to hyphenate. 
+ if word.len() < lh_min + rh_min { + return 0; + } + let start_state = self.get_state(0); + let mut st = start_state; + let mut hyph_count = 0; + for i in 0 .. word.len() + 2 { + // Loop over the word by bytes, with a virtual '.' added at each end + // to match word-boundary patterns. + let b = if i == 0 || i == word.len() + 1 { b'.' } else { word.as_bytes()[i - 1] }; + loop { + // Loop to repeatedly fall back if we don't find a matching transition. + // Note that this could infinite-loop if there is a state whose fallback + // points to itself (or a cycle of fallbacks), but this would represent + // a table compilation error. + // (A potential validation function could check for fallback cycles.) + if st.is_none() { + st = start_state; + break; + } + let state = st.unwrap(); + if let Some(tr) = state.transition_for(b) { + // Found a transition for the current byte. Look up the new state; + // if it has a match_string, merge its weights into `values`. + st = self.get_state(tr.new_state_offset()); + if let Some(state) = st { + let match_offset = state.match_string_offset(); + if match_offset != INVALID_STRING_OFFSET as usize { + if state.is_extended() { + debug_assert!(false, "extended hyphenation not supported by this function"); + } else { + let match_str = self.string_at_offset(match_offset); + let offset = i + 1 - match_str.len(); + assert!(offset + match_str.len() <= word.len() + 2); + for (j, ch) in match_str.iter().enumerate() { + let index = offset + j; + if index >= lh_min && index <= word.len() - rh_min { + // lh_min and rh_min are guaranteed to be >= 1, + // so this will not try to access outside values[]. 
+ let old_value = values[index - 1]; + let value = ch - b'0'; + if value > old_value { + if is_odd(old_value) != is_odd(value) { + // Adjust hyph_count for the change we're making + hyph_count += if is_odd(value) { 1 } else { -1 }; + } + values[index - 1] = value; + } + } + } + } + } + } + // We have handled the current input byte; leave the fallback loop + // and get next input. + break; + } + // No transition for the current byte; go to fallback state and try again. + st = self.get_state(state.fallback_state()); + } + } + + // If the word was not purely ASCII, or if the word begins/ends with + // digits, the use of lh_min and rh_min above may not have correctly + // excluded enough positions, so we need to fix things up here. + let mut index = 0; + let mut count = 0; + let word_bytes = word.as_bytes(); + let mut clear_hyphen_at = |i| { if is_odd(values[i]) { hyph_count -= 1; } values[i] = 0; }; + // Handle lh_min. + while count < lh_min - 1 && index < word_bytes.len() { + let byte = word_bytes[index]; + clear_hyphen_at(index); + if byte < 0x80 { + index += 1; + if is_ascii_digit(byte) { + continue; // ASCII digits don't count + } + } else if byte == 0xEF && word_bytes[index + 1] == 0xAC { + // Unicode presentation-form ligature characters, which we count as + // multiple chars for the purpose of lh_min/rh_min, all begin with + // 0xEF, 0xAC in UTF-8. + count += lig_length(word_bytes[index + 2]); + clear_hyphen_at(index + 1); + clear_hyphen_at(index + 2); + index += 3; + continue; + } else { + index += 1; + while index < word_bytes.len() && is_utf8_trail_byte(word_bytes[index]) { + clear_hyphen_at(index); + index += 1; + } + } + count += 1; + } + + // Handle rh_min. 
+ count = 0; + index = word.len(); + while count < rh_min && index > 0 { + index -= 1; + let byte = word_bytes[index]; + if index < word.len() - 1 { + clear_hyphen_at(index); + } + if byte < 0x80 { + // Only count if not an ASCII digit + if !is_ascii_digit(byte) { + count += 1; + } + continue; + } + if is_utf8_trail_byte(byte) { + continue; + } + if byte == 0xEF && word_bytes[index + 1] == 0xAC { + // Presentation-form ligatures count as multiple chars. + count += lig_length(word_bytes[index + 2]); + continue; + } + count += 1; + } + + hyph_count + } +} + +/// Hyphenation engine encapsulating a language-specific set of patterns (rules) +/// that identify possible break positions within a word. +pub struct Hyphenator<'a>(&'a [u8]); + +impl Hyphenator<'_> { + /// Return a Hyphenator that wraps the given buffer. + /// This does *not* check that the given buffer is in fact a valid hyphenation table. + /// Use `is_valid_hyphenator()` to determine whether it is usable. + /// (Calling hyphenation methods on a Hyphenator that wraps arbitrary, + /// unvalidated data is not unsafe, but may panic.) + pub fn new(buffer: &[u8]) -> Hyphenator { + Hyphenator(buffer) + } + + // Internal implementation details + fn magic_number(&self) -> &[u8] { + &self.0[0 .. 4] + } + fn num_levels(&self) -> usize { + u32::from_le_bytes(*array_ref!(self.0, 4, 4)) as usize + } + fn level(&self, i: usize) -> Level { + let offset = u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i, 4)) as usize; + let limit = if i == self.num_levels() - 1 { + self.0.len() + } else { + u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i + 4, 4)) as usize + }; + debug_assert!(offset + LEVEL_HEADER_SIZE <= limit && limit <= self.0.len()); + debug_assert_eq!(offset & 3, 0); + debug_assert_eq!(limit & 3, 0); + Level::new(&self.0[offset .. limit]) + } + + /// Identify acceptable hyphenation positions in the given `word`. 
+ /// + /// The caller-supplied `values` must be at least as long as the `word`. + /// + /// On return, any elements with an odd value indicate positions in the word + /// after which a hyphen could be inserted. + /// + /// Returns the number of possible hyphenation positions that were found. + /// + /// # Panics + /// If the given `values` slice is too small to hold the results. + /// + /// If the block of memory represented by `self.0` is not in fact a valid + /// hyphenation dictionary, this function may panic with an overflow or + /// array bounds violation. + pub fn find_hyphen_values(&self, word: &str, values: &mut [u8]) -> isize { + assert!(values.len() >= word.len()); + values.iter_mut().for_each(|x| *x = 0); + let top_level = self.level(0); + let (lh_min, rh_min, clh_min, crh_min) = top_level.word_boundary_mins(); + if word.len() < lh_min + rh_min { + return 0; + } + let mut hyph_count = top_level.find_hyphen_values(word, values, lh_min, rh_min); + let compound = hyph_count > 0; + // Subsequent levels are applied to fragments between potential breaks + // already found: + for l in 1 .. self.num_levels() { + let level = self.level(l); + if hyph_count > 0 { + let mut begin = 0; + let mut lh = lh_min; + // lh_min and rh_min are both guaranteed to be greater than zero, + // so this loop will not reach fully to the end of the word. + for i in lh_min - 1 .. word.len() - rh_min { + if is_odd(values[i]) { + if i > begin { + // We've found a component of a compound; + // clear the corresponding values and apply the new level. + // (These values must be even, so hyph_count is unchanged.) + values[begin .. i].iter_mut().for_each(|x| { + *x = 0; + }); + hyph_count += level.find_hyphen_values(&word[begin ..= i], + &mut values[begin ..= i], + lh, crh_min); + } + begin = i + 1; + lh = clh_min; + } + } + if begin == 0 { + // No compound-word breaks were found, just apply level to the whole word. 
+ hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min); + } else if begin < word.len() { + // Handle trailing component of compound. + hyph_count += level.find_hyphen_values(&word[begin .. word.len()], + &mut values[begin .. word.len()], + clh_min, rh_min); + } + } else { + hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min); + } + } + + // Only need to check nohyphen strings if top-level (compound) breaks were found. + if compound && hyph_count > 0 { + let nohyph = top_level.nohyphen(); + if !nohyph.is_empty() { + for i in lh_min ..= word.len() - rh_min { + if is_odd(values[i - 1]) { + for nh in &nohyph { + if i + nh.len() <= word.len() && *nh == &word.as_bytes()[i .. i + nh.len()] { + values[i - 1] = 0; + hyph_count -= 1; + break; + } + if nh.len() <= i && *nh == &word.as_bytes()[i - nh.len() .. i] { + values[i - 1] = 0; + hyph_count -= 1; + break; + } + } + } + } + } + } + + hyph_count + } + + /// Generate the hyphenated form of a `word` by inserting the given `hyphen_char` + /// at each valid break position. + /// + /// # Panics + /// If the block of memory represented by `self` is not in fact a valid + /// hyphenation dictionary, this function may panic with an overflow or + /// array bounds violation. + /// + /// Also panics if the length of the hyphenated word would overflow `usize`. + pub fn hyphenate_word(&self, word: &str, hyphchar: char) -> String { + let mut values = vec![0u8; word.len()]; + let hyph_count = self.find_hyphen_values(word, &mut values); + if hyph_count <= 0 { + return word.to_string(); + } + // We know how long the result will be, so we can preallocate here. 
+ let result_len = word.len() + hyph_count as usize * hyphchar.len_utf8(); + let mut result = String::with_capacity(result_len); + let mut n = 0; + for ch in word.char_indices() { + if ch.0 > 0 && is_odd(values[ch.0 - 1]) { + result.push(hyphchar); + n += 1; + } + result.push(ch.1); + } + debug_assert_eq!(n, hyph_count); + debug_assert_eq!(result_len, result.len()); + result + } + + /// Check if the block of memory looks like it could be a valid hyphenation + /// table. + pub fn is_valid_hyphenator(&self) -> bool { + // Size must be at least 4 bytes for magic_number + 4 bytes num_levels; + // smaller than this cannot be safely inspected. + if self.0.len() < FILE_HEADER_SIZE { + return false; + } + if self.magic_number() != MAGIC_NUMBER { + return false; + } + // For each level, there's a 4-byte offset in the header, and the level + // has its own 16-byte header, so we can check a minimum size again here. + let num_levels = self.num_levels(); + if self.0.len() < FILE_HEADER_SIZE + LEVEL_HEADER_SIZE * num_levels { + return false; + } + // Check that state_data_base and string_data_base for each hyphenation + // level are within range. + for l in 0 .. num_levels { + let level = self.level(l); + if level.state_data_base() < LEVEL_HEADER_SIZE || + level.state_data_base() > level.string_data_base() || + level.string_data_base() > level.data.len() { + return false; + } + // TODO: consider doing more extensive validation of states and + // strings within the level? + } + // It's still possible the dic is internally broken, but at least it's + // worth trying to use it! + true + } +} + +/// Load the compiled hyphenation file at `dic_path`, if present. +/// +/// Returns `None` if the specified file cannot be opened or mapped, +/// otherwise returns a `memmap2::Mmap` mapping the file. +/// +/// # Safety +/// +/// This is unsafe for the same reason `Mmap::map()` is unsafe: +/// mapped_hyph does not guarantee safety if the mapped file is modified +/// (e.g. 
by another process) while we're using it. +/// +/// This verifies that the file looks superficially like it may be a +/// compiled hyphenation table, but does *not* fully check the validity +/// of the file contents! Calling hyphenation functions with the returned +/// data is not unsafe, but may panic if the data is invalid. +pub unsafe fn load_file(dic_path: &str) -> Option<Mmap> { + let file = File::open(dic_path).ok()?; + let dic = Mmap::map(&file).ok()?; + let hyph = Hyphenator(&*dic); + if hyph.is_valid_hyphenator() { + return Some(dic); + } + None +} diff --git a/third_party/rust/mapped_hyph/src/main.rs b/third_party/rust/mapped_hyph/src/main.rs new file mode 100644 index 0000000000..acc24babee --- /dev/null +++ b/third_party/rust/mapped_hyph/src/main.rs @@ -0,0 +1,67 @@ +// Copyright 2019 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +extern crate mapped_hyph; + +use mapped_hyph::Hyphenator; + +fn main() { + let dic_path = "hyph_en_US.hyf"; + + let dic = match unsafe { mapped_hyph::load_file(dic_path) } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let hyph = Hyphenator::new(&*dic); + + println!("{}", hyph.hyphenate_word("haha", '-')); + println!("{}", hyph.hyphenate_word("hahaha", '-')); + println!("{}", hyph.hyphenate_word("photo", '-')); + println!("{}", hyph.hyphenate_word("photograph", '-')); + println!("{}", hyph.hyphenate_word("photographer", '-')); + println!("{}", hyph.hyphenate_word("photographic", '-')); + println!("{}", hyph.hyphenate_word("photographical", '-')); + println!("{}", hyph.hyphenate_word("photographically", '-')); + println!("{}", hyph.hyphenate_word("supercalifragilisticexpialidocious", '-')); + println!("{}", hyph.hyphenate_word("o'dwyer", '=')); + println!("{}", hyph.hyphenate_word("o'callahan", '=')); + println!("{}", hyph.hyphenate_word("o’dwyer", '=')); + println!("{}", hyph.hyphenate_word("o’callahan", '=')); + println!("{}", hyph.hyphenate_word("petti-fogging", '=')); + println!("{}", hyph.hyphenate_word("e-mailing", '=')); + println!("{}", hyph.hyphenate_word("-x-mailing", '=')); + println!("{}", hyph.hyphenate_word("-strikeout-", '=')); + + let dic2 = match unsafe { mapped_hyph::load_file("tests/compound.hyf") } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", "tests/compound.hyf"), + }; + + let h2 = Hyphenator::new(&*dic2); + println!("{}", h2.hyphenate_word("motorcycle", '=')); + + let dic3 = match unsafe { mapped_hyph::load_file("tests/rhmin.hyf") } { + Some(dic) => dic, + _ => panic!("failed to load dictionary {}", dic_path), + }; + let h3 = Hyphenator::new(&*dic3); + println!("{}", h3.hyphenate_word("övéit", '=')); + println!("{}", h3.hyphenate_word("అంగడిధర", '=')); + + let dic4 = match unsafe { mapped_hyph::load_file("tests/num.hyf") } { + Some(dic) => dic, + _ => panic!("failed to load 
dictionary {}", "tests/num.hyf"), + }; + let h4 = Hyphenator::new(&*dic4); + + println!("{}", h4.hyphenate_word("123foobar123", '=')); + println!("{}", h4.hyphenate_word("123foobarfoobar", '=')); + println!("{}", h4.hyphenate_word("foobarfoobar123", '=')); + println!("{}", h4.hyphenate_word("123foobarfoobar123", '=')); +} |