third_party/rust/mapped_hyph/src/builder.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509

// Copyright 2019-2020 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

/// Functions to compile human-readable patterns into a mapped_hyph
/// flattened representation of the hyphenation state machine.

use std::io::{Read,BufRead,BufReader,Write,Error,ErrorKind};
use std::collections::HashMap;
use std::convert::TryInto;
use std::hash::{Hash,Hasher};

// Wrap a HashMap so that we can implement the Hash trait.
#[derive(PartialEq, Eq, Clone)]
struct TransitionMap (HashMap<u8,i32>);

impl TransitionMap {
    fn new() -> TransitionMap {
        TransitionMap(HashMap::<u8,i32>::new())
    }
}

impl Hash for TransitionMap {
    fn hash<H: Hasher>(&self, state: &mut H) {
        // We only look at the values here; that's likely to be enough
        // for a reasonable hash.
        let mut transitions: Vec<&i32> = self.0.values().collect();
        transitions.sort();
        for t in transitions {
            t.hash(state);
        }
    }
}

#[derive(PartialEq, Eq, Hash, Clone)]
struct State {
    match_string: Option<Vec<u8>>,
    #[allow(dead_code)]
    repl_string: Option<Vec<u8>>,
    #[allow(dead_code)]
    repl_index: i32,
    #[allow(dead_code)]
    repl_cut: i32,
    fallback_state: i32,
    transitions: TransitionMap,
}

impl State {
    fn new() -> State {
        State {
            match_string: None,
            repl_string: None,
            repl_index: -1,
            repl_cut: -1,
            fallback_state: -1,
            transitions: TransitionMap::new(),
        }
    }
}

/// Structures returned by the read_dic_file() function;
/// array of these can then be passed to write_hyf_file()
/// to create the flattened output.
struct LevelBuilder {
    states: Vec<State>,
    str_to_state: HashMap<Vec<u8>,i32>,
    encoding: Option<String>,
    nohyphen: Option<String>,
    lh_min: u8,
    rh_min: u8,
    clh_min: u8,
    crh_min: u8,
}

impl LevelBuilder {
    fn new() -> LevelBuilder {
        let mut result = LevelBuilder {
            states: Vec::<State>::new(),
            str_to_state: HashMap::<Vec<u8>,i32>::new(),
            encoding: None,
            nohyphen: None,
            lh_min: 0,
            rh_min: 0,
            clh_min: 0,
            crh_min: 0,
        };
        // Initialize the builder with an empty start state.
        result.str_to_state.insert(vec![], 0);
        result.states.push(State::new());
        result
    }

    fn find_state_number_for(&mut self, text: &[u8]) -> i32 {
        let count = self.states.len() as i32;
        let index = *self.str_to_state.entry(text.to_vec()).or_insert(count);
        if index == count {
            self.states.push(State::new());
        }
        index
    }

    fn add_pattern(&mut self, pattern: &str) {
        let mut bytes = pattern.as_bytes();
        let mut text = Vec::<u8>::with_capacity(bytes.len());
        let mut digits = Vec::<u8>::with_capacity(bytes.len() + 1);
        let mut repl_str = None;
        let mut repl_index = 0;
        let mut repl_cut = 0;

        // Check for replacement rule (non-standard hyphenation spelling change).
        if let Some(slash) = bytes.iter().position(|x| *x == b'/') {
            let parts = bytes.split_at(slash);
            bytes = parts.0;
            let mut it = parts.1[1 ..].split(|x| *x == b',');
            if let Some(repl) = it.next() {
                repl_str = Some(repl.to_vec());
            }
            if let Some(num) = it.next() {
                repl_index = std::str::from_utf8(num).unwrap().parse::<i32>().unwrap() - 1;
            }
            if let Some(num) = it.next() {
                repl_cut = std::str::from_utf8(num).unwrap().parse::<i32>().unwrap();
            }
        }

        // Separate the input pattern into parallel arrays of text (bytes) and digits.
        let mut got_digit = false;
        for byte in bytes {
            if *byte <= b'9' && *byte >= b'0' {
                if got_digit {
                    warn!("invalid pattern \"{}\": consecutive digits", pattern);
                    return;
                }
                digits.push(*byte);
                got_digit = true;
            } else {
                text.push(*byte);
                if got_digit {
                    got_digit = false;
                } else {
                    digits.push(b'0');
                }
            }
        }
        if !got_digit {
            digits.push(b'0');
        }

        if repl_str.is_none() {
            // Optimize away leading zeroes from the digits array.
            while !digits.is_empty() && digits[0] == b'0' {
                digits.remove(0);
            }
        } else {
            // Convert repl_index and repl_cut from Unicode char to byte indexing.
            let start = if text[0] == b'.' { 1 } else { 0 };
            if start == 1 {
                if digits[0] != b'0' {
                    warn!("invalid pattern \"{}\": unexpected digit before start of word", pattern);
                    return;
                }
                digits.remove(0);
            }
            let word = std::str::from_utf8(&text[start..]).unwrap();
            let mut chars: Vec<_> = word.char_indices().collect();
            chars.push((word.len(), '.'));
            repl_cut = chars[(repl_index + repl_cut) as usize].0 as i32 - chars[repl_index as usize].0 as i32;
            repl_index = chars[repl_index as usize].0 as i32;
        }

        // Create the new state, or add pattern into an existing state
        // (which should not already have a match_string).
        let mut state_num = self.find_state_number_for(&text);
        let mut state = &mut self.states[state_num as usize];
        if state.match_string.is_some() {
            warn!("duplicate pattern \"{}\" discarded", pattern);
            return;
        }
        if !digits.is_empty() {
            state.match_string = Some(digits);
        }
        if repl_str.is_some() {
            state.repl_string = repl_str;
            state.repl_index = repl_index;
            state.repl_cut = repl_cut;
        }

        // Set up prefix transitions, inserting additional states as needed.
        while !text.is_empty() {
            let last_state = state_num;
            let ch = *text.last().unwrap();
            text.truncate(text.len() - 1);
            state_num = self.find_state_number_for(&text);
            if let Some(exists) = self.states[state_num as usize].transitions.0.insert(ch, last_state) {
                assert_eq!(exists, last_state, "overwriting existing transition at pattern \"{}\"", pattern);
                break;
            }
        }
    }

    fn merge_duplicate_states(&mut self) {
        // We loop here because when we eliminate a duplicate, and update the transitons
        // that referenced it, we may thereby create new duplicates that another pass
        // will find and compress further.
        loop {
            let orig_len = self.states.len();
            // Used to map State records to the (first) index at which they occur.
            let mut state_to_index = HashMap::<&State,i32>::new();
            // Mapping of old->new state indexes, and whether each old state is
            // a duplicate that should be dropped.
            let mut mappings = Vec::<(i32,bool)>::with_capacity(orig_len);
            let mut next_new_index: i32 = 0;
            for index in 0 .. self.states.len() {
                // Find existing index for this state, or allocate the next new index to it.
                let new_index = *state_to_index.entry(&self.states[index]).or_insert(next_new_index);
                // Record the mapping, and whether the state was a duplicate.
                mappings.push((new_index, new_index != next_new_index));
                // If we used next_new_index for this state, increment it.
                if new_index == next_new_index {
                    next_new_index += 1;
                }
            }
            // If we didn't find any duplicates, next_new_index will have kept pace with
            // index, so we know we're finished.
            if next_new_index as usize == self.states.len() {
                break;
            }
            // Iterate over all the states, either deleting them or updating indexes
            // according to the mapping we created; then repeat the search.
            for index in (0 .. self.states.len()).rev() {
                if mappings[index].1 {
                    self.states.remove(index);
                } else {
                    let state = &mut self.states[index];
                    if state.fallback_state != -1 {
                        state.fallback_state = mappings[state.fallback_state as usize].0;
                    }
                    for t in state.transitions.0.iter_mut() {
                        *t.1 = mappings[*t.1 as usize].0;
                    }
                }
            }
        }
    }

    fn flatten(&self) -> Vec<u8> {
        // Calculate total space needed for state data, and build the state_to_offset table.
        let mut state_data_size = 0;
        let mut state_to_offset = Vec::<usize>::with_capacity(self.states.len());
        for state in &self.states {
            state_to_offset.push(state_data_size);
            state_data_size += if state.repl_string.is_some() { 12 } else { 8 };
            state_data_size += state.transitions.0.len() * 4;
        }

        // Helper to map a state index to its offset in the final data block.
        let get_state_offset_for = |state_index: i32| -> u32 {
            if state_index < 0 {
                return super::INVALID_STATE_OFFSET;
            }
            state_to_offset[state_index as usize] as u32
        };

        // Helper to map a byte string to its offset in the final data block, and
        // store the bytes into string_data unless using an already-existing string.
        let mut string_to_offset = HashMap::<Vec<u8>,usize>::new();
        let mut string_data = Vec::<u8>::new();
        let mut get_string_offset_for = |bytes: &Option<Vec<u8>>| -> u16 {
            if bytes.is_none() {
                return super::INVALID_STRING_OFFSET;
            }
            assert!(bytes.as_ref().unwrap().len() < 256);
            let new_offset = string_data.len();
            let offset = *string_to_offset.entry(bytes.as_ref().unwrap().clone()).or_insert(new_offset);
            if offset == new_offset {
                string_data.push(bytes.as_ref().unwrap().len() as u8);
                string_data.extend_from_slice(bytes.as_ref().unwrap().as_ref());
            }
            offset.try_into().unwrap()
        };

        // Handle nohyphen string list if present, converting comma separators to NULs
        // and trimming any surplus whitespace.
        let mut nohyphen_string_offset: u16 = super::INVALID_STRING_OFFSET;
        let mut nohyphen_count: u16 = 0;
        if self.nohyphen.is_some() {
            let nohyphen_strings: Vec<_> = self.nohyphen.as_ref().unwrap().split(',').map(|x| x.trim()).collect();
            nohyphen_count = nohyphen_strings.len().try_into().unwrap();
            nohyphen_string_offset = get_string_offset_for(&Some(nohyphen_strings.join("\0").as_bytes().to_vec()));
        }

        let mut state_data = Vec::<u8>::with_capacity(state_data_size);
        for state in &self.states {
            state_data.extend(&get_state_offset_for(state.fallback_state).to_le_bytes());
            state_data.extend(&get_string_offset_for(&state.match_string).to_le_bytes());
            state_data.push(state.transitions.0.len() as u8);
            // Determine whether to use an extended state record, and if so add the
            // replacement string and index fields.
            if state.repl_string.is_none() {
                state_data.push(0);
            } else {
                state_data.push(1);
                state_data.extend(&get_string_offset_for(&state.repl_string).to_le_bytes());
                state_data.push(state.repl_index as u8);
                state_data.push(state.repl_cut as u8);
            }
            // Collect transitions into an array so we can sort them.
            let mut transitions = vec![];
            for (key, value) in state.transitions.0.iter() {
                transitions.push((*key, get_state_offset_for(*value)))
            }
            transitions.sort();
            for t in transitions {
                // New state offset is stored as a 24-bit value, so we do this manually.
                state_data.push((t.1 & 0xff) as u8);
                state_data.push(((t.1 >> 8) & 0xff) as u8);
                state_data.push(((t.1 >> 16) & 0xff) as u8);
                state_data.push(t.0);
            }
        }
        assert_eq!(state_data.len(), state_data_size);

        // Pad string data to a 4-byte boundary
        while string_data.len() & 3 != 0 {
            string_data.push(0);
        }

        let total_size = super::LEVEL_HEADER_SIZE as usize + state_data_size + string_data.len();
        let mut result = Vec::<u8>::with_capacity(total_size);

        let state_data_base: u32 = super::LEVEL_HEADER_SIZE as u32;
        let string_data_base: u32 = state_data_base + state_data_size as u32;

        result.extend(&state_data_base.to_le_bytes());
        result.extend(&string_data_base.to_le_bytes());
        result.extend(&nohyphen_string_offset.to_le_bytes());
        result.extend(&nohyphen_count.to_le_bytes());
        result.push(self.lh_min);
        result.push(self.rh_min);
        result.push(self.clh_min);
        result.push(self.crh_min);

        result.extend(state_data.iter());
        result.extend(string_data.iter());

        assert_eq!(result.len(), total_size);

        result
    }
}

/// Read a libhyphen-style pattern file and create the corresponding state
/// machine transitions, etc.
/// The returned Vec can be passed to write_hyf_file() to generate a flattened
/// representation of the state machine in mapped_hyph's binary format.
fn read_dic_file<T: Read>(dic_file: T, compress: bool) -> Result<Vec<LevelBuilder>, &'static str> {
    let reader = BufReader::new(dic_file);

    let mut builders = Vec::<LevelBuilder>::new();
    builders.push(LevelBuilder::new());
    let mut builder = &mut builders[0];

    for (index, line) in reader.lines().enumerate() {
        let mut trimmed = line.unwrap().trim().to_string();
        // Strip comments.
        if let Some(i) = trimmed.find('%') {
            trimmed = trimmed[..i].trim().to_string();
        }
        // Ignore empty lines.
        if trimmed.is_empty() {
            continue;
        }
        // Uppercase indicates keyword rather than pattern.
        if trimmed.as_bytes()[0] >= b'A' && trimmed.as_bytes()[0] <= b'Z' {
            // First line is encoding; we only support UTF-8.
            if builder.encoding.is_none() {
                if trimmed != "UTF-8" {
                    return Err("Only UTF-8 patterns are accepted!");
                };
                builder.encoding = Some(trimmed);
                continue;
            }
            // Check for valid keyword-value pairs.
            if trimmed.contains(' ') {
                let parts: Vec<&str> = trimmed.split(' ').collect();
                if parts.len() != 2 {
                    warn!("unrecognized keyword/values: {}", trimmed);
                    continue;
                }
                let keyword = parts[0];
                let value = parts[1];
                match keyword {
                    "LEFTHYPHENMIN" => builder.lh_min = value.parse::<u8>().unwrap(),
                    "RIGHTHYPHENMIN" => builder.rh_min = value.parse::<u8>().unwrap(),
                    "COMPOUNDLEFTHYPHENMIN" => builder.clh_min = value.parse::<u8>().unwrap(),
                    "COMPOUNDRIGHTHYPHENMIN" => builder.crh_min = value.parse::<u8>().unwrap(),
                    "NOHYPHEN" => builder.nohyphen = Some(trimmed),
                    _ => warn!("unknown keyword: {}", trimmed),
                }
                continue;
            }
            // Start a new hyphenation level?
            if trimmed == "NEXTLEVEL" {
                builders.push(LevelBuilder::new());
                builder = builders.last_mut().unwrap();
                continue;
            }
            warn!("unknown keyword: {}", trimmed);
            continue;
        }
        // Patterns should always be provided in lowercase; complain if not, and discard
        // the bad pattern.
        if trimmed != trimmed.to_lowercase() {
            warn!("pattern \"{}\" not lowercased at line {}", trimmed, index);
            continue;
        }
        builder.add_pattern(&trimmed);
    }

    // Create default first (compound-word) level if only one level was provided.
    // (Maybe this should be optional? Currently just copying libhyphen behavior.)
    if builders.len() == 1 {
        let (lh_min, rh_min, clh_min, crh_min) =
            (builders[0].lh_min, builders[0].rh_min, builders[0].clh_min, builders[0].crh_min);
        builders.insert(0, LevelBuilder::new());
        builder = builders.first_mut().unwrap();
        builder.add_pattern("1-1");
        builder.add_pattern("1'1");
        builder.add_pattern("1\u{2013}1"); // en-dash
        builder.add_pattern("1\u{2019}1"); // curly apostrophe
        builder.nohyphen = Some("',\u{2013},\u{2019},-".to_string());
        builder.lh_min = lh_min;
        builder.rh_min = rh_min;
        builder.clh_min = if clh_min > 0 { clh_min } else if lh_min > 0 { lh_min } else { 3 };
        builder.crh_min = if crh_min > 0 { crh_min } else if rh_min > 0 { rh_min } else { 3 };
    }

    // Put in fallback states in each builder.
    for builder in &mut builders {
        for (key, state_index) in builder.str_to_state.iter() {
            if key.is_empty() {
                continue;
            }
            let mut fallback_key = key.clone();
            while !fallback_key.is_empty() {
                fallback_key.remove(0);
                if builder.str_to_state.contains_key(&fallback_key) {
                    break;
                }
            }
            builder.states[*state_index as usize].fallback_state = builder.str_to_state[&fallback_key];
        }
    }

    if compress {
        // Merge duplicate states to reduce size.
        for builder in &mut builders {
            builder.merge_duplicate_states();
        }
    }

    Ok(builders)
}

/// Write out the state machines representing a set of hyphenation rules
/// to the given output stream.
fn write_hyf_file<T: Write>(hyf_file: &mut T, levels: Vec<LevelBuilder>) -> std::io::Result<()> {
    if levels.is_empty() {
        return Err(Error::from(ErrorKind::InvalidData));
    }
    let mut flattened = vec![];
    for level in levels {
        flattened.push(level.flatten());
    }
    // Write file header: magic number, count of levels.
    hyf_file.write_all(&[b'H', b'y', b'f', b'0'])?;
    let level_count: u32 = flattened.len() as u32;
    hyf_file.write_all(&level_count.to_le_bytes())?;
    // Write array of offsets to each level. First level will begin immediately
    // after the array of offsets.
    let mut offset: u32 = super::FILE_HEADER_SIZE as u32 + 4 * level_count;
    for flat in &flattened {
        hyf_file.write_all(&offset.to_le_bytes())?;
        offset += flat.len() as u32;
    }
    // Write the flattened data for each level.
    for flat in &flattened {
        hyf_file.write_all(&flat)?;
    }
    Ok(())
}

/// The public API to the compilation process: reads `dic_file` and writes compiled tables
/// to `hyf_file`. The `compress` param determines whether extra processing to reduce the
/// size of the output is performed.
pub fn compile<T1: Read, T2: Write>(dic_file: T1, hyf_file: &mut T2, compress: bool) -> std::io::Result<()> {
    match read_dic_file(dic_file, compress) {
        Ok(dic) => write_hyf_file(hyf_file, dic),
        Err(e) => {
            warn!("parse error: {}", e);
            return Err(Error::from(ErrorKind::InvalidData))
        }
    }
}