// Copyright 2019 Mozilla Foundation. See the COPYRIGHT // file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. #[macro_use] extern crate arrayref; extern crate memmap2; #[macro_use] extern crate log; use std::slice; use std::str; use std::cmp::max; use std::fs::File; use std::mem; use memmap2::Mmap; // Make submodules available publicly. pub mod builder; pub mod ffi; // 4-byte identification expected at beginning of a compiled dictionary file. // (This will be updated if an incompatible change to the format is made in // some future revision.) const MAGIC_NUMBER: [u8; 4] = [b'H', b'y', b'f', b'0']; const INVALID_STRING_OFFSET: u16 = 0xffff; const INVALID_STATE_OFFSET: u32 = 0x00ff_ffff; const FILE_HEADER_SIZE: usize = 8; // 4-byte magic number, 4-byte count of levels const LEVEL_HEADER_SIZE: usize = 16; // Transition actually holds a 24-bit new state offset and an 8-bit input byte // to match. We will be interpreting byte ranges as Transition arrays (in the // State::transitions() method below), so use repr(C) to ensure we have the // memory layout we expect. // Transition records do not depend on any specific alignment. #[repr(C)] #[derive(Debug,Copy,Clone)] struct Transition(u8, u8, u8, u8); impl Transition { fn new_state_offset(&self) -> usize { // Read a 24-bit little-endian number from three bytes. self.0 as usize + ((self.1 as usize) << 8) + ((self.2 as usize) << 16) } fn match_byte(&self) -> u8 { self.3 } } // State is an area of the Level's data block that begins with a fixed header, // followed by an array of transitions. The total size of each State's data // depends on the number of transitions in the state. Only the basic header // is defined by the struct here; the rest of the state is accessed via // pointer magic. // There are two versions of State, a basic version that supports only simple // hyphenation (no associated spelling change), and an extended version that // adds the replacement-string fields to support spelling changes at the // hyphenation point. Check is_extended() to know which version is present. // State records are NOT necessarily 4-byte aligned, so multi-byte fields // should be read with care. #[derive(Debug,Copy,Clone)] #[repr(C)] struct State { fallback_state: [u8; 4], match_string_offset: [u8; 2], num_transitions: u8, is_extended: u8, } #[repr(C)] struct StateExtended { state: State, repl_string_offset: [u8; 2], repl_index: i8, repl_cut: i8, } impl State { // Accessors for the various State header fields; see file format description. fn fallback_state(&self) -> usize { u32::from_le_bytes(self.fallback_state) as usize } fn match_string_offset(&self) -> usize { u16::from_le_bytes(self.match_string_offset) as usize } fn num_transitions(&self) -> u8 { self.num_transitions } fn is_extended(&self) -> bool { self.is_extended != 0 } // Accessors that are only valid if is_extended() is true. // These use `unsafe` to dereference a pointer to the relevant field; // this is OK because Level::get_state always validates the total state size // before returning a state reference, so these pointers will be valid for // any extended state it returns. #[allow(dead_code)] fn as_extended(&self) -> &StateExtended { debug_assert!(self.is_extended()); unsafe { mem::transmute(self) } } #[allow(dead_code)] fn repl_string_offset(&self) -> usize { u16::from_le_bytes(self.as_extended().repl_string_offset) as usize } #[allow(dead_code)] fn repl_index(&self) -> i8 { self.as_extended().repl_index } #[allow(dead_code)] fn repl_cut(&self) -> i8 { self.as_extended().repl_cut } // Return the state's Transitions as a slice reference. fn transitions(&self) -> &[Transition] { let count = self.num_transitions() as usize; if count == 0 { return &[]; } let transition_offset = if self.is_extended() { mem::size_of::() } else { mem::size_of::() } as isize; // We know the `offset` here will not look beyond the valid range of memory // because Level::get_state() checks the state length (accounting for the // number of transitions) before returning a State reference. let trans_ptr = unsafe { (self as *const State as *const u8).offset(transition_offset) as *const Transition }; // Again, because Level::get_state() already checked the state length, we know // this slice address and count will be valid. unsafe { slice::from_raw_parts(trans_ptr, count) } } // Look up the Transition for a given input byte, or None. fn transition_for(&self, b: u8) -> Option { // The transitions array is sorted by match_byte() value, but there are // usually very few entries; benchmarking showed that using binary_search_by // here gave no benefit (possibly slightly slower). self.transitions().iter().copied().find(|t| t.match_byte() == b) } // Just for debugging use... #[allow(dead_code)] fn deep_show(&self, prefix: &str, dic: &Level) { if self.match_string_offset() != INVALID_STRING_OFFSET as usize { let match_string = dic.string_at_offset(self.match_string_offset()); println!("{}match: {}", prefix, str::from_utf8(match_string).unwrap()); } for t in self.transitions() { println!("{}{} ->", prefix, t.match_byte() as char); let next_prefix = format!("{} ", prefix); dic.get_state(t.new_state_offset()).unwrap().deep_show(&next_prefix, &dic); } } } // We count the presentation-form ligature characters U+FB00..FB06 as multiple // chars for the purposes of lefthyphenmin/righthyphenmin. In UTF-8, all these // ligature characters are 3-byte sequences beginning with <0xEF, 0xAC>; this // helper returns the "decomposed length" of the ligature given its trailing // byte. fn lig_length(trail_byte: u8) -> usize { // This is only called on valid UTF-8 where we already know trail_byte // must be >= 0x80. // Ligature lengths: ff fi fl ffi ffl long-st st const LENGTHS: [u8; 7] = [ 2u8, 2u8, 2u8, 3u8, 3u8, 2u8, 2u8 ]; if trail_byte > 0x86 { return 1; } LENGTHS[trail_byte as usize - 0x80] as usize } fn is_utf8_trail_byte(byte: u8) -> bool { (byte & 0xC0) == 0x80 } fn is_ascii_digit(byte: u8) -> bool { byte <= b'9' && byte >= b'0' } fn is_odd(byte: u8) -> bool { (byte & 0x01) == 0x01 } // A hyphenation Level has a header followed by State records and packed string // data. The total size of the slice depends on the number and size of the // States and Strings it contains. // Note that the data of the Level may not have any specific alignment! #[derive(Debug,Copy,Clone)] struct Level<'a> { data: &'a [u8], // Header fields cached by the constructor for faster access: state_data_base_: usize, string_data_base_: usize, } impl Level<'_> { // Constructor that initializes our cache variables. fn new(data: &[u8]) -> Level { Level { data, state_data_base_: u32::from_le_bytes(*array_ref!(data, 0, 4)) as usize, string_data_base_: u32::from_le_bytes(*array_ref!(data, 4, 4)) as usize, } } // Accessors for Level header fields; see file format description. fn state_data_base(&self) -> usize { self.state_data_base_ // cached by constructor } fn string_data_base(&self) -> usize { self.string_data_base_ // cached by constructor } fn nohyphen_string_offset(&self) -> usize { u16::from_le_bytes(*array_ref!(self.data, 8, 2)) as usize } #[allow(dead_code)] fn nohyphen_count(&self) -> u16 { u16::from_le_bytes(*array_ref!(self.data, 10, 2)) } fn lh_min(&self) -> usize { max(1, self.data[12] as usize) } fn rh_min(&self) -> usize { max(1, self.data[13] as usize) } fn clh_min(&self) -> usize { max(1, self.data[14] as usize) } fn crh_min(&self) -> usize { max(1, self.data[15] as usize) } fn word_boundary_mins(&self) -> (usize, usize, usize, usize) { (self.lh_min(), self.rh_min(), self.clh_min(), self.crh_min()) } // Strings are represented as offsets from the Level's string_data_base. // This returns a byte slice referencing the string at a given offset, // or an empty slice if invalid. fn string_at_offset(&self, offset: usize) -> &'_ [u8] { if offset == INVALID_STRING_OFFSET as usize { return &[]; } let string_base = self.string_data_base() as usize + offset; // TODO: move this to the validation function. debug_assert!(string_base < self.data.len()); if string_base + 1 > self.data.len() { return &[]; } let len = self.data[string_base] as usize; // TODO: move this to the validation function. debug_assert!(string_base + 1 + len <= self.data.len()); if string_base + 1 + len > self.data.len() { return &[]; } self.data.get(string_base + 1 .. string_base + 1 + len).unwrap() } // The nohyphen field actually contains multiple NUL-separated substrings; // return them as a vector of individual byte slices. fn nohyphen(&self) -> Vec<&[u8]> { let string_offset = self.nohyphen_string_offset(); let nohyph_str = self.string_at_offset(string_offset as usize); if nohyph_str.is_empty() { return vec![]; } nohyph_str.split(|&b| b == 0).collect() } // States are represented as an offset from the Level's state_data_base. // This returns a reference to the State at a given offset, or None if invalid. fn get_state(&self, offset: usize) -> Option<&State> { if offset == INVALID_STATE_OFFSET as usize { return None; } debug_assert_eq!(offset & 3, 0); let state_base = self.state_data_base() + offset; // TODO: move this to the validation function. debug_assert!(state_base + mem::size_of::() <= self.string_data_base()); if state_base + mem::size_of::() > self.string_data_base() { return None; } let state_ptr = &self.data[state_base] as *const u8 as *const State; // This is safe because we just checked against self.string_data_base() above. let state = unsafe { state_ptr.as_ref().unwrap() }; let length = if state.is_extended() { mem::size_of::() } else { mem::size_of::() } + mem::size_of::() * state.num_transitions() as usize; // TODO: move this to the validation function. debug_assert!(state_base + length <= self.string_data_base()); if state_base + length > self.string_data_base() { return None; } // This is safe because we checked the full state length against self.string_data_base(). unsafe { state_ptr.as_ref() } } // Sets hyphenation values (odd = potential break, even = no break) in values[], // and returns the change in the number of odd values present, so the caller can // keep track of the total number of potential breaks in the word. fn find_hyphen_values(&self, word: &str, values: &mut [u8], lh_min: usize, rh_min: usize) -> isize { // Bail out immediately if the word is too short to hyphenate. if word.len() < lh_min + rh_min { return 0; } let start_state = self.get_state(0); let mut st = start_state; let mut hyph_count = 0; for i in 0 .. word.len() + 2 { // Loop over the word by bytes, with a virtual '.' added at each end // to match word-boundary patterns. let b = if i == 0 || i == word.len() + 1 { b'.' } else { word.as_bytes()[i - 1] }; loop { // Loop to repeatedly fall back if we don't find a matching transition. // Note that this could infinite-loop if there is a state whose fallback // points to itself (or a cycle of fallbacks), but this would represent // a table compilation error. // (A potential validation function could check for fallback cycles.) if st.is_none() { st = start_state; break; } let state = st.unwrap(); if let Some(tr) = state.transition_for(b) { // Found a transition for the current byte. Look up the new state; // if it has a match_string, merge its weights into `values`. st = self.get_state(tr.new_state_offset()); if let Some(state) = st { let match_offset = state.match_string_offset(); if match_offset != INVALID_STRING_OFFSET as usize { if state.is_extended() { debug_assert!(false, "extended hyphenation not supported by this function"); } else { let match_str = self.string_at_offset(match_offset); let offset = i + 1 - match_str.len(); assert!(offset + match_str.len() <= word.len() + 2); for (j, ch) in match_str.iter().enumerate() { let index = offset + j; if index >= lh_min && index <= word.len() - rh_min { // lh_min and rh_min are guaranteed to be >= 1, // so this will not try to access outside values[]. let old_value = values[index - 1]; let value = ch - b'0'; if value > old_value { if is_odd(old_value) != is_odd(value) { // Adjust hyph_count for the change we're making hyph_count += if is_odd(value) { 1 } else { -1 }; } values[index - 1] = value; } } } } } } // We have handled the current input byte; leave the fallback loop // and get next input. break; } // No transition for the current byte; go to fallback state and try again. st = self.get_state(state.fallback_state()); } } // If the word was not purely ASCII, or if the word begins/ends with // digits, the use of lh_min and rh_min above may not have correctly // excluded enough positions, so we need to fix things up here. let mut index = 0; let mut count = 0; let word_bytes = word.as_bytes(); let mut clear_hyphen_at = |i| { if is_odd(values[i]) { hyph_count -= 1; } values[i] = 0; }; // Handle lh_min. while count < lh_min - 1 && index < word_bytes.len() { let byte = word_bytes[index]; clear_hyphen_at(index); if byte < 0x80 { index += 1; if is_ascii_digit(byte) { continue; // ASCII digits don't count } } else if byte == 0xEF && word_bytes[index + 1] == 0xAC { // Unicode presentation-form ligature characters, which we count as // multiple chars for the purpose of lh_min/rh_min, all begin with // 0xEF, 0xAC in UTF-8. count += lig_length(word_bytes[index + 2]); clear_hyphen_at(index + 1); clear_hyphen_at(index + 2); index += 3; continue; } else { index += 1; while index < word_bytes.len() && is_utf8_trail_byte(word_bytes[index]) { clear_hyphen_at(index); index += 1; } } count += 1; } // Handle rh_min. count = 0; index = word.len(); while count < rh_min && index > 0 { index -= 1; let byte = word_bytes[index]; if index < word.len() - 1 { clear_hyphen_at(index); } if byte < 0x80 { // Only count if not an ASCII digit if !is_ascii_digit(byte) { count += 1; } continue; } if is_utf8_trail_byte(byte) { continue; } if byte == 0xEF && word_bytes[index + 1] == 0xAC { // Presentation-form ligatures count as multiple chars. count += lig_length(word_bytes[index + 2]); continue; } count += 1; } hyph_count } } /// Hyphenation engine encapsulating a language-specific set of patterns (rules) /// that identify possible break positions within a word. pub struct Hyphenator<'a>(&'a [u8]); impl Hyphenator<'_> { /// Return a Hyphenator that wraps the given buffer. /// This does *not* check that the given buffer is in fact a valid hyphenation table. /// Use `is_valid_hyphenator()` to determine whether it is usable. /// (Calling hyphenation methods on a Hyphenator that wraps arbitrary, /// unvalidated data is not unsafe, but may panic.) pub fn new(buffer: &[u8]) -> Hyphenator { Hyphenator(buffer) } // Internal implementation details fn magic_number(&self) -> &[u8] { &self.0[0 .. 4] } fn num_levels(&self) -> usize { u32::from_le_bytes(*array_ref!(self.0, 4, 4)) as usize } fn level(&self, i: usize) -> Level { let offset = u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i, 4)) as usize; let limit = if i == self.num_levels() - 1 { self.0.len() } else { u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i + 4, 4)) as usize }; debug_assert!(offset + LEVEL_HEADER_SIZE <= limit && limit <= self.0.len()); debug_assert_eq!(offset & 3, 0); debug_assert_eq!(limit & 3, 0); Level::new(&self.0[offset .. limit]) } /// Identify acceptable hyphenation positions in the given `word`. /// /// The caller-supplied `values` must be at least as long as the `word`. /// /// On return, any elements with an odd value indicate positions in the word /// after which a hyphen could be inserted. /// /// Returns the number of possible hyphenation positions that were found. /// /// # Panics /// If the given `values` slice is too small to hold the results. /// /// If the block of memory represented by `self.0` is not in fact a valid /// hyphenation dictionary, this function may panic with an overflow or /// array bounds violation. pub fn find_hyphen_values(&self, word: &str, values: &mut [u8]) -> isize { assert!(values.len() >= word.len()); values.iter_mut().for_each(|x| *x = 0); let top_level = self.level(0); let (lh_min, rh_min, clh_min, crh_min) = top_level.word_boundary_mins(); if word.len() < lh_min + rh_min { return 0; } let mut hyph_count = top_level.find_hyphen_values(word, values, lh_min, rh_min); let compound = hyph_count > 0; // Subsequent levels are applied to fragments between potential breaks // already found: for l in 1 .. self.num_levels() { let level = self.level(l); if hyph_count > 0 { let mut begin = 0; let mut lh = lh_min; // lh_min and rh_min are both guaranteed to be greater than zero, // so this loop will not reach fully to the end of the word. for i in lh_min - 1 .. word.len() - rh_min { if is_odd(values[i]) { if i > begin { // We've found a component of a compound; // clear the corresponding values and apply the new level. // (These values must be even, so hyph_count is unchanged.) values[begin .. i].iter_mut().for_each(|x| { *x = 0; }); hyph_count += level.find_hyphen_values(&word[begin ..= i], &mut values[begin ..= i], lh, crh_min); } begin = i + 1; lh = clh_min; } } if begin == 0 { // No compound-word breaks were found, just apply level to the whole word. hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min); } else if begin < word.len() { // Handle trailing component of compound. hyph_count += level.find_hyphen_values(&word[begin .. word.len()], &mut values[begin .. word.len()], clh_min, rh_min); } } else { hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min); } } // Only need to check nohyphen strings if top-level (compound) breaks were found. if compound && hyph_count > 0 { let nohyph = top_level.nohyphen(); if !nohyph.is_empty() { for i in lh_min ..= word.len() - rh_min { if is_odd(values[i - 1]) { for nh in &nohyph { if i + nh.len() <= word.len() && *nh == &word.as_bytes()[i .. i + nh.len()] { values[i - 1] = 0; hyph_count -= 1; break; } if nh.len() <= i && *nh == &word.as_bytes()[i - nh.len() .. i] { values[i - 1] = 0; hyph_count -= 1; break; } } } } } } hyph_count } /// Generate the hyphenated form of a `word` by inserting the given `hyphen_char` /// at each valid break position. /// /// # Panics /// If the block of memory represented by `self` is not in fact a valid /// hyphenation dictionary, this function may panic with an overflow or /// array bounds violation. /// /// Also panics if the length of the hyphenated word would overflow `usize`. pub fn hyphenate_word(&self, word: &str, hyphchar: char) -> String { let mut values = vec![0u8; word.len()]; let hyph_count = self.find_hyphen_values(word, &mut values); if hyph_count <= 0 { return word.to_string(); } // We know how long the result will be, so we can preallocate here. let result_len = word.len() + hyph_count as usize * hyphchar.len_utf8(); let mut result = String::with_capacity(result_len); let mut n = 0; for ch in word.char_indices() { if ch.0 > 0 && is_odd(values[ch.0 - 1]) { result.push(hyphchar); n += 1; } result.push(ch.1); } debug_assert_eq!(n, hyph_count); debug_assert_eq!(result_len, result.len()); result } /// Check if the block of memory looks like it could be a valid hyphenation /// table. pub fn is_valid_hyphenator(&self) -> bool { // Size must be at least 4 bytes for magic_number + 4 bytes num_levels; // smaller than this cannot be safely inspected. if self.0.len() < FILE_HEADER_SIZE { return false; } if self.magic_number() != MAGIC_NUMBER { return false; } // For each level, there's a 4-byte offset in the header, and the level // has its own 16-byte header, so we can check a minimum size again here. let num_levels = self.num_levels(); if self.0.len() < FILE_HEADER_SIZE + LEVEL_HEADER_SIZE * num_levels { return false; } // Check that state_data_base and string_data_base for each hyphenation // level are within range. for l in 0 .. num_levels { let level = self.level(l); if level.state_data_base() < LEVEL_HEADER_SIZE || level.state_data_base() > level.string_data_base() || level.string_data_base() > level.data.len() { return false; } // TODO: consider doing more extensive validation of states and // strings within the level? } // It's still possible the dic is internally broken, but at least it's // worth trying to use it! true } } /// Load the compiled hyphenation file at `dic_path`, if present. /// /// Returns `None` if the specified file cannot be opened or mapped, /// otherwise returns a `memmap2::Mmap` mapping the file. /// /// # Safety /// /// This is unsafe for the same reason `Mmap::map()` is unsafe: /// mapped_hyph does not guarantee safety if the mapped file is modified /// (e.g. by another process) while we're using it. /// /// This verifies that the file looks superficially like it may be a /// compiled hyphenation table, but does *not* fully check the validity /// of the file contents! Calling hyphenation functions with the returned /// data is not unsafe, but may panic if the data is invalid. pub unsafe fn load_file(dic_path: &str) -> Option { let file = File::open(dic_path).ok()?; let dic = Mmap::map(&file).ok()?; let hyph = Hyphenator(&*dic); if hyph.is_valid_hyphenator() { return Some(dic); } None }