diff options
Diffstat (limited to 'third_party/rust/mapped_hyph/src/builder.rs')
-rw-r--r-- | third_party/rust/mapped_hyph/src/builder.rs | 509 |
1 files changed, 509 insertions, 0 deletions
diff --git a/third_party/rust/mapped_hyph/src/builder.rs b/third_party/rust/mapped_hyph/src/builder.rs new file mode 100644 index 0000000000..e19a0087fd --- /dev/null +++ b/third_party/rust/mapped_hyph/src/builder.rs @@ -0,0 +1,509 @@ +// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/// Functions to compile human-readable patterns into a mapped_hyph +/// flattened representation of the hyphenation state machine. + +use std::io::{Read,BufRead,BufReader,Write,Error,ErrorKind}; +use std::collections::HashMap; +use std::convert::TryInto; +use std::hash::{Hash,Hasher}; + +// Wrap a HashMap so that we can implement the Hash trait. +#[derive(PartialEq, Eq, Clone)] +struct TransitionMap (HashMap<u8,i32>); + +impl TransitionMap { + fn new() -> TransitionMap { + TransitionMap(HashMap::<u8,i32>::new()) + } +} + +impl Hash for TransitionMap { + fn hash<H: Hasher>(&self, state: &mut H) { + // We only look at the values here; that's likely to be enough + // for a reasonable hash. + let mut transitions: Vec<&i32> = self.0.values().collect(); + transitions.sort(); + for t in transitions { + t.hash(state); + } + } +} + +#[derive(PartialEq, Eq, Hash, Clone)] +struct State { + match_string: Option<Vec<u8>>, + #[allow(dead_code)] + repl_string: Option<Vec<u8>>, + #[allow(dead_code)] + repl_index: i32, + #[allow(dead_code)] + repl_cut: i32, + fallback_state: i32, + transitions: TransitionMap, +} + +impl State { + fn new() -> State { + State { + match_string: None, + repl_string: None, + repl_index: -1, + repl_cut: -1, + fallback_state: -1, + transitions: TransitionMap::new(), + } + } +} + +/// Structures returned by the read_dic_file() function; +/// array of these can then be passed to write_hyf_file() +/// to create the flattened output. +struct LevelBuilder { + states: Vec<State>, + str_to_state: HashMap<Vec<u8>,i32>, + encoding: Option<String>, + nohyphen: Option<String>, + lh_min: u8, + rh_min: u8, + clh_min: u8, + crh_min: u8, +} + +impl LevelBuilder { + fn new() -> LevelBuilder { + let mut result = LevelBuilder { + states: Vec::<State>::new(), + str_to_state: HashMap::<Vec<u8>,i32>::new(), + encoding: None, + nohyphen: None, + lh_min: 0, + rh_min: 0, + clh_min: 0, + crh_min: 0, + }; + // Initialize the builder with an empty start state. + result.str_to_state.insert(vec![], 0); + result.states.push(State::new()); + result + } + + fn find_state_number_for(&mut self, text: &[u8]) -> i32 { + let count = self.states.len() as i32; + let index = *self.str_to_state.entry(text.to_vec()).or_insert(count); + if index == count { + self.states.push(State::new()); + } + index + } + + fn add_pattern(&mut self, pattern: &str) { + let mut bytes = pattern.as_bytes(); + let mut text = Vec::<u8>::with_capacity(bytes.len()); + let mut digits = Vec::<u8>::with_capacity(bytes.len() + 1); + let mut repl_str = None; + let mut repl_index = 0; + let mut repl_cut = 0; + + // Check for replacement rule (non-standard hyphenation spelling change). + if let Some(slash) = bytes.iter().position(|x| *x == b'/') { + let parts = bytes.split_at(slash); + bytes = parts.0; + let mut it = parts.1[1 ..].split(|x| *x == b','); + if let Some(repl) = it.next() { + repl_str = Some(repl.to_vec()); + } + if let Some(num) = it.next() { + repl_index = std::str::from_utf8(num).unwrap().parse::<i32>().unwrap() - 1; + } + if let Some(num) = it.next() { + repl_cut = std::str::from_utf8(num).unwrap().parse::<i32>().unwrap(); + } + } + + // Separate the input pattern into parallel arrays of text (bytes) and digits. + let mut got_digit = false; + for byte in bytes { + if *byte <= b'9' && *byte >= b'0' { + if got_digit { + warn!("invalid pattern \"{}\": consecutive digits", pattern); + return; + } + digits.push(*byte); + got_digit = true; + } else { + text.push(*byte); + if got_digit { + got_digit = false; + } else { + digits.push(b'0'); + } + } + } + if !got_digit { + digits.push(b'0'); + } + + if repl_str.is_none() { + // Optimize away leading zeroes from the digits array. + while !digits.is_empty() && digits[0] == b'0' { + digits.remove(0); + } + } else { + // Convert repl_index and repl_cut from Unicode char to byte indexing. + let start = if text[0] == b'.' { 1 } else { 0 }; + if start == 1 { + if digits[0] != b'0' { + warn!("invalid pattern \"{}\": unexpected digit before start of word", pattern); + return; + } + digits.remove(0); + } + let word = std::str::from_utf8(&text[start..]).unwrap(); + let mut chars: Vec<_> = word.char_indices().collect(); + chars.push((word.len(), '.')); + repl_cut = chars[(repl_index + repl_cut) as usize].0 as i32 - chars[repl_index as usize].0 as i32; + repl_index = chars[repl_index as usize].0 as i32; + } + + // Create the new state, or add pattern into an existing state + // (which should not already have a match_string). + let mut state_num = self.find_state_number_for(&text); + let mut state = &mut self.states[state_num as usize]; + if state.match_string.is_some() { + warn!("duplicate pattern \"{}\" discarded", pattern); + return; + } + if !digits.is_empty() { + state.match_string = Some(digits); + } + if repl_str.is_some() { + state.repl_string = repl_str; + state.repl_index = repl_index; + state.repl_cut = repl_cut; + } + + // Set up prefix transitions, inserting additional states as needed. + while !text.is_empty() { + let last_state = state_num; + let ch = *text.last().unwrap(); + text.truncate(text.len() - 1); + state_num = self.find_state_number_for(&text); + if let Some(exists) = self.states[state_num as usize].transitions.0.insert(ch, last_state) { + assert_eq!(exists, last_state, "overwriting existing transition at pattern \"{}\"", pattern); + break; + } + } + } + + fn merge_duplicate_states(&mut self) { + // We loop here because when we eliminate a duplicate, and update the transitons + // that referenced it, we may thereby create new duplicates that another pass + // will find and compress further. + loop { + let orig_len = self.states.len(); + // Used to map State records to the (first) index at which they occur. + let mut state_to_index = HashMap::<&State,i32>::new(); + // Mapping of old->new state indexes, and whether each old state is + // a duplicate that should be dropped. + let mut mappings = Vec::<(i32,bool)>::with_capacity(orig_len); + let mut next_new_index: i32 = 0; + for index in 0 .. self.states.len() { + // Find existing index for this state, or allocate the next new index to it. + let new_index = *state_to_index.entry(&self.states[index]).or_insert(next_new_index); + // Record the mapping, and whether the state was a duplicate. + mappings.push((new_index, new_index != next_new_index)); + // If we used next_new_index for this state, increment it. + if new_index == next_new_index { + next_new_index += 1; + } + } + // If we didn't find any duplicates, next_new_index will have kept pace with + // index, so we know we're finished. + if next_new_index as usize == self.states.len() { + break; + } + // Iterate over all the states, either deleting them or updating indexes + // according to the mapping we created; then repeat the search. + for index in (0 .. self.states.len()).rev() { + if mappings[index].1 { + self.states.remove(index); + } else { + let state = &mut self.states[index]; + if state.fallback_state != -1 { + state.fallback_state = mappings[state.fallback_state as usize].0; + } + for t in state.transitions.0.iter_mut() { + *t.1 = mappings[*t.1 as usize].0; + } + } + } + } + } + + fn flatten(&self) -> Vec<u8> { + // Calculate total space needed for state data, and build the state_to_offset table. + let mut state_data_size = 0; + let mut state_to_offset = Vec::<usize>::with_capacity(self.states.len()); + for state in &self.states { + state_to_offset.push(state_data_size); + state_data_size += if state.repl_string.is_some() { 12 } else { 8 }; + state_data_size += state.transitions.0.len() * 4; + } + + // Helper to map a state index to its offset in the final data block. + let get_state_offset_for = |state_index: i32| -> u32 { + if state_index < 0 { + return super::INVALID_STATE_OFFSET; + } + state_to_offset[state_index as usize] as u32 + }; + + // Helper to map a byte string to its offset in the final data block, and + // store the bytes into string_data unless using an already-existing string. + let mut string_to_offset = HashMap::<Vec<u8>,usize>::new(); + let mut string_data = Vec::<u8>::new(); + let mut get_string_offset_for = |bytes: &Option<Vec<u8>>| -> u16 { + if bytes.is_none() { + return super::INVALID_STRING_OFFSET; + } + assert!(bytes.as_ref().unwrap().len() < 256); + let new_offset = string_data.len(); + let offset = *string_to_offset.entry(bytes.as_ref().unwrap().clone()).or_insert(new_offset); + if offset == new_offset { + string_data.push(bytes.as_ref().unwrap().len() as u8); + string_data.extend_from_slice(bytes.as_ref().unwrap().as_ref()); + } + offset.try_into().unwrap() + }; + + // Handle nohyphen string list if present, converting comma separators to NULs + // and trimming any surplus whitespace. + let mut nohyphen_string_offset: u16 = super::INVALID_STRING_OFFSET; + let mut nohyphen_count: u16 = 0; + if self.nohyphen.is_some() { + let nohyphen_strings: Vec<_> = self.nohyphen.as_ref().unwrap().split(',').map(|x| x.trim()).collect(); + nohyphen_count = nohyphen_strings.len().try_into().unwrap(); + nohyphen_string_offset = get_string_offset_for(&Some(nohyphen_strings.join("\0").as_bytes().to_vec())); + } + + let mut state_data = Vec::<u8>::with_capacity(state_data_size); + for state in &self.states { + state_data.extend(&get_state_offset_for(state.fallback_state).to_le_bytes()); + state_data.extend(&get_string_offset_for(&state.match_string).to_le_bytes()); + state_data.push(state.transitions.0.len() as u8); + // Determine whether to use an extended state record, and if so add the + // replacement string and index fields. + if state.repl_string.is_none() { + state_data.push(0); + } else { + state_data.push(1); + state_data.extend(&get_string_offset_for(&state.repl_string).to_le_bytes()); + state_data.push(state.repl_index as u8); + state_data.push(state.repl_cut as u8); + } + // Collect transitions into an array so we can sort them. + let mut transitions = vec![]; + for (key, value) in state.transitions.0.iter() { + transitions.push((*key, get_state_offset_for(*value))) + } + transitions.sort(); + for t in transitions { + // New state offset is stored as a 24-bit value, so we do this manually. + state_data.push((t.1 & 0xff) as u8); + state_data.push(((t.1 >> 8) & 0xff) as u8); + state_data.push(((t.1 >> 16) & 0xff) as u8); + state_data.push(t.0); + } + } + assert_eq!(state_data.len(), state_data_size); + + // Pad string data to a 4-byte boundary + while string_data.len() & 3 != 0 { + string_data.push(0); + } + + let total_size = super::LEVEL_HEADER_SIZE as usize + state_data_size + string_data.len(); + let mut result = Vec::<u8>::with_capacity(total_size); + + let state_data_base: u32 = super::LEVEL_HEADER_SIZE as u32; + let string_data_base: u32 = state_data_base + state_data_size as u32; + + result.extend(&state_data_base.to_le_bytes()); + result.extend(&string_data_base.to_le_bytes()); + result.extend(&nohyphen_string_offset.to_le_bytes()); + result.extend(&nohyphen_count.to_le_bytes()); + result.push(self.lh_min); + result.push(self.rh_min); + result.push(self.clh_min); + result.push(self.crh_min); + + result.extend(state_data.iter()); + result.extend(string_data.iter()); + + assert_eq!(result.len(), total_size); + + result + } +} + +/// Read a libhyphen-style pattern file and create the corresponding state +/// machine transitions, etc. +/// The returned Vec can be passed to write_hyf_file() to generate a flattened +/// representation of the state machine in mapped_hyph's binary format. +fn read_dic_file<T: Read>(dic_file: T, compress: bool) -> Result<Vec<LevelBuilder>, &'static str> { + let reader = BufReader::new(dic_file); + + let mut builders = Vec::<LevelBuilder>::new(); + builders.push(LevelBuilder::new()); + let mut builder = &mut builders[0]; + + for (index, line) in reader.lines().enumerate() { + let mut trimmed = line.unwrap().trim().to_string(); + // Strip comments. + if let Some(i) = trimmed.find('%') { + trimmed = trimmed[..i].trim().to_string(); + } + // Ignore empty lines. + if trimmed.is_empty() { + continue; + } + // Uppercase indicates keyword rather than pattern. + if trimmed.as_bytes()[0] >= b'A' && trimmed.as_bytes()[0] <= b'Z' { + // First line is encoding; we only support UTF-8. + if builder.encoding.is_none() { + if trimmed != "UTF-8" { + return Err("Only UTF-8 patterns are accepted!"); + }; + builder.encoding = Some(trimmed); + continue; + } + // Check for valid keyword-value pairs. + if trimmed.contains(' ') { + let parts: Vec<&str> = trimmed.split(' ').collect(); + if parts.len() != 2 { + warn!("unrecognized keyword/values: {}", trimmed); + continue; + } + let keyword = parts[0]; + let value = parts[1]; + match keyword { + "LEFTHYPHENMIN" => builder.lh_min = value.parse::<u8>().unwrap(), + "RIGHTHYPHENMIN" => builder.rh_min = value.parse::<u8>().unwrap(), + "COMPOUNDLEFTHYPHENMIN" => builder.clh_min = value.parse::<u8>().unwrap(), + "COMPOUNDRIGHTHYPHENMIN" => builder.crh_min = value.parse::<u8>().unwrap(), + "NOHYPHEN" => builder.nohyphen = Some(trimmed), + _ => warn!("unknown keyword: {}", trimmed), + } + continue; + } + // Start a new hyphenation level? + if trimmed == "NEXTLEVEL" { + builders.push(LevelBuilder::new()); + builder = builders.last_mut().unwrap(); + continue; + } + warn!("unknown keyword: {}", trimmed); + continue; + } + // Patterns should always be provided in lowercase; complain if not, and discard + // the bad pattern. + if trimmed != trimmed.to_lowercase() { + warn!("pattern \"{}\" not lowercased at line {}", trimmed, index); + continue; + } + builder.add_pattern(&trimmed); + } + + // Create default first (compound-word) level if only one level was provided. + // (Maybe this should be optional? Currently just copying libhyphen behavior.) + if builders.len() == 1 { + let (lh_min, rh_min, clh_min, crh_min) = + (builders[0].lh_min, builders[0].rh_min, builders[0].clh_min, builders[0].crh_min); + builders.insert(0, LevelBuilder::new()); + builder = builders.first_mut().unwrap(); + builder.add_pattern("1-1"); + builder.add_pattern("1'1"); + builder.add_pattern("1\u{2013}1"); // en-dash + builder.add_pattern("1\u{2019}1"); // curly apostrophe + builder.nohyphen = Some("',\u{2013},\u{2019},-".to_string()); + builder.lh_min = lh_min; + builder.rh_min = rh_min; + builder.clh_min = if clh_min > 0 { clh_min } else if lh_min > 0 { lh_min } else { 3 }; + builder.crh_min = if crh_min > 0 { crh_min } else if rh_min > 0 { rh_min } else { 3 }; + } + + // Put in fallback states in each builder. + for builder in &mut builders { + for (key, state_index) in builder.str_to_state.iter() { + if key.is_empty() { + continue; + } + let mut fallback_key = key.clone(); + while !fallback_key.is_empty() { + fallback_key.remove(0); + if builder.str_to_state.contains_key(&fallback_key) { + break; + } + } + builder.states[*state_index as usize].fallback_state = builder.str_to_state[&fallback_key]; + } + } + + if compress { + // Merge duplicate states to reduce size. + for builder in &mut builders { + builder.merge_duplicate_states(); + } + } + + Ok(builders) +} + +/// Write out the state machines representing a set of hyphenation rules +/// to the given output stream. +fn write_hyf_file<T: Write>(hyf_file: &mut T, levels: Vec<LevelBuilder>) -> std::io::Result<()> { + if levels.is_empty() { + return Err(Error::from(ErrorKind::InvalidData)); + } + let mut flattened = vec![]; + for level in levels { + flattened.push(level.flatten()); + } + // Write file header: magic number, count of levels. + hyf_file.write_all(&[b'H', b'y', b'f', b'0'])?; + let level_count: u32 = flattened.len() as u32; + hyf_file.write_all(&level_count.to_le_bytes())?; + // Write array of offsets to each level. First level will begin immediately + // after the array of offsets. + let mut offset: u32 = super::FILE_HEADER_SIZE as u32 + 4 * level_count; + for flat in &flattened { + hyf_file.write_all(&offset.to_le_bytes())?; + offset += flat.len() as u32; + } + // Write the flattened data for each level. + for flat in &flattened { + hyf_file.write_all(&flat)?; + } + Ok(()) +} + +/// The public API to the compilation process: reads `dic_file` and writes compiled tables +/// to `hyf_file`. The `compress` param determines whether extra processing to reduce the +/// size of the output is performed. +pub fn compile<T1: Read, T2: Write>(dic_file: T1, hyf_file: &mut T2, compress: bool) -> std::io::Result<()> { + match read_dic_file(dic_file, compress) { + Ok(dic) => write_hyf_file(hyf_file, dic), + Err(e) => { + warn!("parse error: {}", e); + return Err(Error::from(ErrorKind::InvalidData)) + } + } +} |