summaryrefslogtreecommitdiffstats
path: root/third_party/rust/mapped_hyph/src/lib.rs
blob: 848c93d25790454a8b8db8949735e4681aef310e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#[macro_use]
extern crate arrayref;
extern crate memmap2;
#[macro_use]
extern crate log;

use std::slice;
use std::str;
use std::cmp::max;
use std::fs::File;
use std::mem;

use memmap2::Mmap;

// Make submodules available publicly.
pub mod builder;
pub mod ffi;

// 4-byte identification expected at beginning of a compiled dictionary file.
// (This will be updated if an incompatible change to the format is made in
// some future revision.)
const MAGIC_NUMBER: [u8; 4] = [b'H', b'y', b'f', b'0'];

// Sentinel string offset meaning "no string": Level::string_at_offset()
// returns an empty slice for it, and a State whose match_string_offset has
// this value carries no match string.
const INVALID_STRING_OFFSET: u16 = 0xffff;
// Sentinel state offset meaning "no state": Level::get_state() returns None
// for it. This is the largest value a 24-bit Transition offset can hold.
const INVALID_STATE_OFFSET: u32 = 0x00ff_ffff;

const FILE_HEADER_SIZE: usize = 8; // 4-byte magic number, 4-byte count of levels
// Size of the fixed header at the start of each Level, as read by the Level
// accessors: 4-byte state data base, 4-byte string data base, 2-byte nohyphen
// string offset, 2-byte nohyphen count, then one byte each for lh_min,
// rh_min, clh_min and crh_min.
const LEVEL_HEADER_SIZE: usize = 16;

// A Transition packs a 24-bit offset of the next state together with the
// 8-bit input byte that selects it. State::transitions() reinterprets raw
// byte ranges as arrays of Transition, so the layout is pinned with repr(C).
// Every field is a single byte, so no particular alignment is required.
#[repr(C)]
#[derive(Clone, Copy, Debug)]
struct Transition(u8, u8, u8, u8);

impl Transition {
    // Offset of the target state: the first three bytes, interpreted as a
    // little-endian 24-bit integer.
    fn new_state_offset(&self) -> usize {
        let Transition(lo, mid, hi, _) = *self;
        u32::from_le_bytes([lo, mid, hi, 0]) as usize
    }

    // The input byte this transition matches (the fourth byte of the record).
    fn match_byte(&self) -> u8 {
        let Transition(.., byte) = *self;
        byte
    }
}

// State is an area of the Level's data block that begins with a fixed header,
// followed by an array of transitions. The total size of each State's data
// depends on the number of transitions in the state. Only the basic header
// is defined by the struct here; the rest of the state is accessed via
// pointer magic.
// There are two versions of State, a basic version that supports only simple
// hyphenation (no associated spelling change), and an extended version that
// adds the replacement-string fields to support spelling changes at the
// hyphenation point. Check is_extended() to know which version is present.
// State records are NOT necessarily 4-byte aligned, so multi-byte fields
// should be read with care. (For that reason the multi-byte fields below are
// declared as byte arrays and decoded with from_le_bytes in the accessors.)
#[derive(Debug,Copy,Clone)]
#[repr(C)]
struct State {
    // Little-endian u32: offset of the state to fall back to when no
    // transition matches the current input byte.
    fallback_state: [u8; 4],
    // Little-endian u16: offset of this state's match string, or
    // INVALID_STRING_OFFSET if the state has none.
    match_string_offset: [u8; 2],
    // Number of Transition records that follow the state header.
    num_transitions: u8,
    // Non-zero if this header is really the start of a StateExtended.
    is_extended: u8,
}

// Extended form of a State header, present when State::is_extended() is true.
// It starts with the basic State header (so a pointer to the State is also a
// pointer to the StateExtended) and appends the replacement-string fields.
#[repr(C)]
struct StateExtended {
    // The basic header; must remain the first field.
    state: State,
    // Little-endian u16 offset of the replacement string.
    repl_string_offset: [u8; 2],
    // NOTE(review): presumably the position and length of the span replaced at
    // the hyphenation point; the exact meaning is not visible in this file.
    repl_index: i8,
    repl_cut: i8,
}

impl State {
    // Accessors for the various State header fields; see file format description.
    fn fallback_state(&self) -> usize {
        u32::from_le_bytes(self.fallback_state) as usize
    }
    fn match_string_offset(&self) -> usize {
        u16::from_le_bytes(self.match_string_offset) as usize
    }
    fn num_transitions(&self) -> u8 {
        self.num_transitions
    }
    fn is_extended(&self) -> bool {
        self.is_extended != 0
    }
    // Accessors that are only valid if is_extended() is true.
    // These use `unsafe` to dereference a pointer to the relevant field;
    // this is OK because Level::get_state always validates the total state size
    // before returning a state reference, so these pointers will be valid for
    // any extended state it returns.
    #[allow(dead_code)]
    fn as_extended(&self) -> &StateExtended {
        debug_assert!(self.is_extended());
        unsafe { mem::transmute(self) }
    }
    #[allow(dead_code)]
    fn repl_string_offset(&self) -> usize {
        u16::from_le_bytes(self.as_extended().repl_string_offset) as usize
    }
    #[allow(dead_code)]
    fn repl_index(&self) -> i8 {
        self.as_extended().repl_index
    }
    #[allow(dead_code)]
    fn repl_cut(&self) -> i8 {
        self.as_extended().repl_cut
    }
    // Return the state's Transitions as a slice reference.
    fn transitions(&self) -> &[Transition] {
        let count = self.num_transitions() as usize;
        if count == 0 {
            return &[];
        }
        let transition_offset = if self.is_extended() { mem::size_of::<StateExtended>() } else { mem::size_of::<State>() } as isize;
        // We know the `offset` here will not look beyond the valid range of memory
        // because Level::get_state() checks the state length (accounting for the
        // number of transitions) before returning a State reference.
        let trans_ptr = unsafe { (self as *const State as *const u8).offset(transition_offset) as *const Transition };
        // Again, because Level::get_state() already checked the state length, we know
        // this slice address and count will be valid.
        unsafe { slice::from_raw_parts(trans_ptr, count) }
    }
    // Look up the Transition for a given input byte, or None.
    fn transition_for(&self, b: u8) -> Option<Transition> {
        // The transitions array is sorted by match_byte() value, but there are
        // usually very few entries; benchmarking showed that using binary_search_by
        // here gave no benefit (possibly slightly slower).
        self.transitions().iter().copied().find(|t| t.match_byte() == b)
    }
    // Just for debugging use...
    #[allow(dead_code)]
    fn deep_show(&self, prefix: &str, dic: &Level) {
        if self.match_string_offset() != INVALID_STRING_OFFSET as usize {
            let match_string = dic.string_at_offset(self.match_string_offset());
            println!("{}match: {}", prefix, str::from_utf8(match_string).unwrap());
        }
        for t in self.transitions() {
            println!("{}{} ->", prefix, t.match_byte() as char);
            let next_prefix = format!("{}  ", prefix);
            dic.get_state(t.new_state_offset()).unwrap().deep_show(&next_prefix, &dic);
        }
    }
}

// We count the presentation-form ligature characters U+FB00..FB06 as multiple
// chars for the purposes of lefthyphenmin/righthyphenmin. In UTF-8, all these
// ligature characters are 3-byte sequences beginning with <0xEF, 0xAC>; this
// helper returns the "decomposed length" of the ligature given its trailing
// byte.
//
// Trailing bytes 0x80..=0x86 map to the ligature lengths in the table below;
// any other byte yields 1 (an ordinary single character). Callers only pass
// UTF-8 trail bytes (>= 0x80), but smaller values are handled the same way
// rather than underflowing the table index.
fn lig_length(trail_byte: u8) -> usize {
    // Ligature lengths:       ff   fi   fl   ffi  ffl  long-st  st
    const LENGTHS: [u8; 7] = [ 2u8, 2u8, 2u8, 3u8, 3u8, 2u8,     2u8 ];
    trail_byte
        .checked_sub(0x80)
        .and_then(|index| LENGTHS.get(usize::from(index)))
        .map_or(1, |&len| usize::from(len))
}

// True if `byte` is a UTF-8 continuation byte, i.e. its top two bits are `10`.
fn is_utf8_trail_byte(byte: u8) -> bool {
    byte >> 6 == 0b10
}

// True if `byte` is one of the ASCII digits b'0'..=b'9'.
fn is_ascii_digit(byte: u8) -> bool {
    byte.is_ascii_digit()
}

// True if `byte` is odd. Hyphenation values use odd numbers to mark a
// potential break and even numbers to mark no break.
fn is_odd(byte: u8) -> bool {
    byte % 2 == 1
}

// A hyphenation Level has a header followed by State records and packed string
// data. The total size of the slice depends on the number and size of the
// States and Strings it contains.
// Note that the data of the Level may not have any specific alignment!
#[derive(Debug,Copy,Clone)]
struct Level<'a> {
    // The raw bytes of this level, beginning with its header.
    data: &'a [u8],
    // Header fields cached by the constructor for faster access:
    // Base (within `data`) from which State offsets are measured; read from
    // header bytes 0..4.
    state_data_base_: usize,
    // Base (within `data`) from which string offsets are measured; read from
    // header bytes 4..8. get_state() also uses it as the end of the state data.
    string_data_base_: usize,
}

impl Level<'_> {
    // Constructor that initializes our cache variables from the first 8 bytes
    // of the level header (two little-endian u32 values).
    // `data` must hold at least those 8 bytes; array_ref! is expected to panic
    // otherwise.
    fn new(data: &[u8]) -> Level<'_> {
        Level {
            data,
            state_data_base_: u32::from_le_bytes(*array_ref!(data, 0, 4)) as usize,
            string_data_base_: u32::from_le_bytes(*array_ref!(data, 4, 4)) as usize,
        }
    }

    // Accessors for Level header fields; see file format description.
    fn state_data_base(&self) -> usize {
        self.state_data_base_ // cached by constructor
    }
    fn string_data_base(&self) -> usize {
        self.string_data_base_ // cached by constructor
    }
    // Offset (relative to string_data_base) of the packed nohyphen string;
    // little-endian u16 at header bytes 8..10.
    fn nohyphen_string_offset(&self) -> usize {
        u16::from_le_bytes(*array_ref!(self.data, 8, 2)) as usize
    }
    // Little-endian u16 at header bytes 10..12.
    #[allow(dead_code)]
    fn nohyphen_count(&self) -> u16 {
        u16::from_le_bytes(*array_ref!(self.data, 10, 2))
    }
    // The four word-boundary minimums are single header bytes (12..=15).
    // Each is clamped to at least 1, which the hyphenation code relies on
    // when it computes `index - 1` and `word.len() - rh_min`.
    fn lh_min(&self) -> usize {
        max(1, self.data[12] as usize)
    }
    fn rh_min(&self) -> usize {
        max(1, self.data[13] as usize)
    }
    fn clh_min(&self) -> usize {
        max(1, self.data[14] as usize)
    }
    fn crh_min(&self) -> usize {
        max(1, self.data[15] as usize)
    }
    // All four minimums as (lh_min, rh_min, clh_min, crh_min).
    fn word_boundary_mins(&self) -> (usize, usize, usize, usize) {
        (self.lh_min(), self.rh_min(), self.clh_min(), self.crh_min())
    }
    // Strings are represented as offsets from the Level's string_data_base.
    // Each string is stored as a one-byte length followed by that many bytes.
    // This returns a byte slice referencing the string at a given offset,
    // or an empty slice if invalid.
    fn string_at_offset(&self, offset: usize) -> &'_ [u8] {
        if offset == INVALID_STRING_OFFSET as usize {
            return &[];
        }
        let string_base = self.string_data_base() + offset;
        // TODO: move this to the validation function.
        debug_assert!(string_base < self.data.len());
        if string_base + 1 > self.data.len() {
            return &[];
        }
        let len = self.data[string_base] as usize;
        // TODO: move this to the validation function.
        debug_assert!(string_base + 1 + len <= self.data.len());
        if string_base + 1 + len > self.data.len() {
            return &[];
        }
        // In range: checked just above.
        &self.data[string_base + 1 .. string_base + 1 + len]
    }
    // The nohyphen field actually contains multiple NUL-separated substrings;
    // return them as a vector of individual byte slices (empty if the level
    // has no nohyphen string).
    fn nohyphen(&self) -> Vec<&[u8]> {
        let nohyph_str = self.string_at_offset(self.nohyphen_string_offset());
        if nohyph_str.is_empty() {
            return vec![];
        }
        nohyph_str.split(|&b| b == 0).collect()
    }
    // States are represented as an offset from the Level's state_data_base.
    // This returns a reference to the State at a given offset, or None if invalid.
    // The complete state (header, extended fields if present, and all of its
    // transitions) is verified to lie inside the state data before a reference
    // is returned; State::as_extended() and State::transitions() rely on that.
    fn get_state(&self, offset: usize) -> Option<&State> {
        if offset == INVALID_STATE_OFFSET as usize {
            return None;
        }
        debug_assert_eq!(offset & 3, 0);
        let state_base = self.state_data_base() + offset;
        // State data ends where string data begins. Also clamp to the real
        // length of `data`, so that a header we have not validated can never
        // make the unsafe reads below reach past the end of the buffer.
        let limit = self.string_data_base().min(self.data.len());
        let header_len = mem::size_of::<State>();
        // TODO: move this to the validation function.
        debug_assert!(state_base + header_len <= limit);
        if state_base + header_len > limit {
            return None;
        }
        let state_ptr = &self.data[state_base] as *const u8 as *const State;
        // SAFETY: the whole State header lies within `self.data` (checked
        // against `limit` above), and State consists only of u8 fields and
        // byte arrays, so it has no alignment requirement.
        let state = unsafe { &*state_ptr };
        let length = if state.is_extended() { mem::size_of::<StateExtended>() } else { header_len }
                + mem::size_of::<Transition>() * state.num_transitions() as usize;
        // TODO: move this to the validation function.
        debug_assert!(state_base + length <= limit);
        if state_base + length > limit {
            return None;
        }
        Some(state)
    }
    // Sets hyphenation values (odd = potential break, even = no break) in values[],
    // and returns the change in the number of odd values present, so the caller can
    // keep track of the total number of potential breaks in the word.
    // `values` must be at least as long as `word`; `lh_min` and `rh_min` must
    // both be >= 1.
    fn find_hyphen_values(&self, word: &str, values: &mut [u8], lh_min: usize, rh_min: usize) -> isize {
        // Bail out immediately if the word is too short to hyphenate.
        if word.len() < lh_min + rh_min {
            return 0;
        }
        let start_state = self.get_state(0);
        let mut st = start_state;
        let mut hyph_count = 0;
        for i in 0 .. word.len() + 2 {
            // Loop over the word by bytes, with a virtual '.' added at each end
            // to match word-boundary patterns.
            let b = if i == 0 || i == word.len() + 1 { b'.' } else { word.as_bytes()[i - 1] };
            loop {
                // Loop to repeatedly fall back if we don't find a matching transition.
                // Note that this could infinite-loop if there is a state whose fallback
                // points to itself (or a cycle of fallbacks), but this would represent
                // a table compilation error.
                // (A potential validation function could check for fallback cycles.)
                let state = match st {
                    Some(state) => state,
                    None => {
                        // Ran out of fallbacks: restart from the initial state
                        // and move on to the next input byte.
                        st = start_state;
                        break;
                    }
                };
                if let Some(tr) = state.transition_for(b) {
                    // Found a transition for the current byte. Look up the new state;
                    // if it has a match_string, merge its weights into `values`.
                    st = self.get_state(tr.new_state_offset());
                    if let Some(state) = st {
                        let match_offset = state.match_string_offset();
                        if match_offset != INVALID_STRING_OFFSET as usize {
                            if state.is_extended() {
                                debug_assert!(false, "extended hyphenation not supported by this function");
                            } else {
                                let match_str = self.string_at_offset(match_offset);
                                // The match string ends at the current input position.
                                let offset = i + 1 - match_str.len();
                                assert!(offset + match_str.len() <= word.len() + 2);
                                for (j, ch) in match_str.iter().enumerate() {
                                    let index = offset + j;
                                    if index >= lh_min && index <= word.len() - rh_min {
                                        // lh_min and rh_min are guaranteed to be >= 1,
                                        // so this will not try to access outside values[].
                                        let old_value = values[index - 1];
                                        // Match strings hold ASCII digits; convert to a weight.
                                        let value = ch - b'0';
                                        if value > old_value {
                                            if is_odd(old_value) != is_odd(value) {
                                                // Adjust hyph_count for the change we're making
                                                hyph_count += if is_odd(value) { 1 } else { -1 };
                                            }
                                            values[index - 1] = value;
                                        }
                                    }
                                }
                            }
                        }
                    }
                    // We have handled the current input byte; leave the fallback loop
                    // and get next input.
                    break;
                }
                // No transition for the current byte; go to fallback state and try again.
                st = self.get_state(state.fallback_state());
            }
        }

        // If the word was not purely ASCII, or if the word begins/ends with
        // digits, the use of lh_min and rh_min above may not have correctly
        // excluded enough positions, so we need to fix things up here.
        let mut index = 0;
        let mut count = 0;
        let word_bytes = word.as_bytes();
        let mut clear_hyphen_at = |i| { if is_odd(values[i]) { hyph_count -= 1; } values[i] = 0; };
        // Handle lh_min.
        while count < lh_min - 1 && index < word_bytes.len() {
            let byte = word_bytes[index];
            clear_hyphen_at(index);
            if byte < 0x80 {
                index += 1;
                if is_ascii_digit(byte) {
                    continue; // ASCII digits don't count
                }
            } else if byte == 0xEF && word_bytes[index + 1] == 0xAC {
                // Unicode presentation-form ligature characters, which we count as
                // multiple chars for the purpose of lh_min/rh_min, all begin with
                // 0xEF, 0xAC in UTF-8.
                // (`word` is valid UTF-8, so a 0xEF lead byte is always followed
                // by two trail bytes and these indexes are in range.)
                count += lig_length(word_bytes[index + 2]);
                clear_hyphen_at(index + 1);
                clear_hyphen_at(index + 2);
                index += 3;
                continue;
            } else {
                // Any other multi-byte character: skip its trail bytes.
                index += 1;
                while index < word_bytes.len() && is_utf8_trail_byte(word_bytes[index])  {
                    clear_hyphen_at(index);
                    index += 1;
                }
            }
            count += 1;
        }

        // Handle rh_min.
        count = 0;
        index = word.len();
        while count < rh_min && index > 0 {
            index -= 1;
            let byte = word_bytes[index];
            if index < word.len() - 1 {
                clear_hyphen_at(index);
            }
            if byte < 0x80 {
                // Only count if not an ASCII digit
                if !is_ascii_digit(byte) {
                    count += 1;
                }
                continue;
            }
            if is_utf8_trail_byte(byte) {
                continue;
            }
            if byte == 0xEF && word_bytes[index + 1] == 0xAC {
                // Presentation-form ligatures count as multiple chars.
                count += lig_length(word_bytes[index + 2]);
                continue;
            }
            count += 1;
        }

        hyph_count
    }
}

/// Hyphenation engine encapsulating a language-specific set of patterns (rules)
/// that identify possible break positions within a word.
///
/// This is a thin wrapper around the raw bytes of a compiled hyphenation
/// table; the bytes are borrowed, not copied.
pub struct Hyphenator<'a>(&'a [u8]);

impl Hyphenator<'_> {
    /// Return a Hyphenator that wraps the given buffer.
    /// This does *not* check that the given buffer is in fact a valid hyphenation table.
    /// Use `is_valid_hyphenator()` to determine whether it is usable.
    /// (Calling hyphenation methods on a Hyphenator that wraps arbitrary,
    /// unvalidated data is not unsafe, but may panic.)
    pub fn new(buffer: &[u8]) -> Hyphenator<'_> {
        Hyphenator(buffer)
    }

    // Internal implementation details
    // First four bytes of the buffer; the buffer must hold at least 4 bytes.
    fn magic_number(&self) -> &[u8] {
        &self.0[0 .. 4]
    }
    // Number of levels: little-endian u32 at bytes 4..8 of the file header.
    fn num_levels(&self) -> usize {
        u32::from_le_bytes(*array_ref!(self.0, 4, 4)) as usize
    }
    // Byte range (start, end) of level `i` within the buffer. The start comes
    // from the table of 4-byte little-endian offsets that follows the file
    // header; the end is the start of the next level, or the end of the buffer
    // for the last level. The values are returned as stored, without checking.
    fn level_bounds(&self, i: usize) -> (usize, usize) {
        let offset = u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i, 4)) as usize;
        let limit = if i == self.num_levels() - 1 {
            self.0.len()
        } else {
            u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i + 4, 4)) as usize
        };
        (offset, limit)
    }
    // Return the Level at index `i`.
    // Panics if the recorded byte range is not a valid range of the buffer.
    fn level(&self, i: usize) -> Level<'_> {
        let (offset, limit) = self.level_bounds(i);
        debug_assert!(offset + LEVEL_HEADER_SIZE <= limit && limit <= self.0.len());
        debug_assert_eq!(offset & 3, 0);
        debug_assert_eq!(limit & 3, 0);
        Level::new(&self.0[offset .. limit])
    }

    /// Identify acceptable hyphenation positions in the given `word`.
    ///
    /// The caller-supplied `values` must be at least as long as the `word`.
    ///
    /// On return, any elements with an odd value indicate positions in the word
    /// after which a hyphen could be inserted.
    ///
    /// Returns the number of possible hyphenation positions that were found.
    ///
    /// # Panics
    /// If the given `values` slice is too small to hold the results.
    ///
    /// If the block of memory represented by `self.0` is not in fact a valid
    /// hyphenation dictionary, this function may panic with an overflow or
    /// array bounds violation.
    pub fn find_hyphen_values(&self, word: &str, values: &mut [u8]) -> isize {
        assert!(values.len() >= word.len());
        values.iter_mut().for_each(|x| *x = 0);
        let top_level = self.level(0);
        let (lh_min, rh_min, clh_min, crh_min) = top_level.word_boundary_mins();
        if word.len() < lh_min + rh_min {
            return 0;
        }
        let mut hyph_count = top_level.find_hyphen_values(word, values, lh_min, rh_min);
        // Breaks found by the top level are treated as compound-word boundaries.
        let compound = hyph_count > 0;
        // Subsequent levels are applied to fragments between potential breaks
        // already found:
        for l in 1 .. self.num_levels() {
            let level = self.level(l);
            if hyph_count > 0 {
                let mut begin = 0;
                let mut lh = lh_min;
                // lh_min and rh_min are both guaranteed to be greater than zero,
                // so this loop will not reach fully to the end of the word.
                for i in lh_min - 1 .. word.len() - rh_min {
                    if is_odd(values[i]) {
                        if i > begin {
                            // We've found a component of a compound;
                            // clear the corresponding values and apply the new level.
                            // (These values must be even, so hyph_count is unchanged.)
                            values[begin .. i].iter_mut().for_each(|x| {
                                *x = 0;
                            });
                            hyph_count += level.find_hyphen_values(&word[begin ..= i],
                                                                   &mut values[begin ..= i],
                                                                   lh, crh_min);
                        }
                        begin = i + 1;
                        // Components after the first use the compound left minimum.
                        lh = clh_min;
                    }
                }
                if begin == 0 {
                    // No compound-word breaks were found, just apply level to the whole word.
                    hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min);
                } else if begin < word.len() {
                    // Handle trailing component of compound.
                    hyph_count += level.find_hyphen_values(&word[begin .. word.len()],
                                                           &mut values[begin .. word.len()],
                                                           clh_min, rh_min);
                }
            } else {
                hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min);
            }
        }

        // Only need to check nohyphen strings if top-level (compound) breaks were found.
        if compound && hyph_count > 0 {
            let nohyph = top_level.nohyphen();
            if !nohyph.is_empty() {
                for i in lh_min ..= word.len() - rh_min {
                    if is_odd(values[i - 1]) {
                        // Drop the break if a nohyphen string occurs immediately
                        // after it or immediately before it.
                        for nh in &nohyph {
                            if i + nh.len() <= word.len() && *nh == &word.as_bytes()[i .. i + nh.len()] {
                                values[i - 1] = 0;
                                hyph_count -= 1;
                                break;
                            }
                            if nh.len() <= i && *nh == &word.as_bytes()[i - nh.len() .. i] {
                                values[i - 1] = 0;
                                hyph_count -= 1;
                                break;
                            }
                        }
                    }
                }
            }
        }

        hyph_count
    }

    /// Generate the hyphenated form of a `word` by inserting the given `hyphchar`
    /// at each valid break position.
    ///
    /// If no break positions are found, a copy of `word` is returned unchanged.
    ///
    /// # Panics
    /// If the block of memory represented by `self` is not in fact a valid
    /// hyphenation dictionary, this function may panic with an overflow or
    /// array bounds violation.
    ///
    /// Also panics if the length of the hyphenated word would overflow `usize`.
    pub fn hyphenate_word(&self, word: &str, hyphchar: char) -> String {
        let mut values = vec![0u8; word.len()];
        let hyph_count = self.find_hyphen_values(word, &mut values);
        if hyph_count <= 0 {
            return word.to_string();
        }
        // We know how long the result will be, so we can preallocate here.
        let result_len = word.len() + hyph_count as usize * hyphchar.len_utf8();
        let mut result = String::with_capacity(result_len);
        let mut n = 0;
        for ch in word.char_indices() {
            // values[k] describes the position after byte k, so a break before
            // the character starting at byte ch.0 is recorded at ch.0 - 1.
            if ch.0 > 0 && is_odd(values[ch.0 - 1]) {
                result.push(hyphchar);
                n += 1;
            }
            result.push(ch.1);
        }
        debug_assert_eq!(n, hyph_count);
        debug_assert_eq!(result_len, result.len());
        result
    }

    /// Check if the block of memory looks like it could be a valid hyphenation
    /// table.
    ///
    /// This checks the magic number, that there is at least one level, that
    /// the buffer is large enough for the headers of the declared number of
    /// levels, and that each level's byte range and data-base offsets are in
    /// range. It does not validate the states and strings inside the levels.
    pub fn is_valid_hyphenator(&self) -> bool {
        // Size must be at least 4 bytes for magic_number + 4 bytes num_levels;
        // smaller than this cannot be safely inspected.
        if self.0.len() < FILE_HEADER_SIZE {
            return false;
        }
        if self.magic_number() != MAGIC_NUMBER {
            return false;
        }
        // A table without levels is unusable: find_hyphen_values() always
        // starts from level 0.
        let num_levels = self.num_levels();
        if num_levels == 0 {
            return false;
        }
        // For each level, there's a 4-byte offset in the header, and the level
        // has its own 16-byte header, so we can check a minimum size again here.
        // (Checked arithmetic, so an absurd level count cannot wrap around.)
        let min_len = num_levels
            .checked_mul(4 + LEVEL_HEADER_SIZE)
            .and_then(|n| n.checked_add(FILE_HEADER_SIZE));
        match min_len {
            Some(min_len) if self.0.len() >= min_len => {}
            _ => return false,
        }
        // Check that state_data_base and string_data_base for each hyphenation
        // level are within range.
        for l in 0 .. num_levels {
            // First make sure the level's byte range is a valid range of the
            // buffer that can hold a level header, so that level() will not
            // panic when it slices the buffer. (The size check above ensures
            // the offset table itself is in range. In debug builds level()
            // additionally asserts 4-byte alignment of the range.)
            let (offset, limit) = self.level_bounds(l);
            if limit > self.0.len() || offset > limit || limit - offset < LEVEL_HEADER_SIZE {
                return false;
            }
            let level = self.level(l);
            if level.state_data_base() < LEVEL_HEADER_SIZE ||
                   level.state_data_base() > level.string_data_base() ||
                   level.string_data_base() > level.data.len() {
                return false;
            }
            // TODO: consider doing more extensive validation of states and
            // strings within the level?
        }
        // It's still possible the dic is internally broken, but at least it's
        // worth trying to use it!
        true
    }
}

/// Memory-map the compiled hyphenation file at `dic_path`.
///
/// Returns the `memmap2::Mmap` for the file when it can be opened and mapped
/// and its contents pass `Hyphenator::is_valid_hyphenator()`; returns `None`
/// in every other case.
///
/// # Safety
///
/// This is unsafe for the same reason `Mmap::map()` is unsafe:
/// mapped_hyph does not guarantee safety if the mapped file is modified
/// (e.g. by another process) while we're using it.
///
/// The validity check is only superficial: it confirms that the file looks
/// like it may be a compiled hyphenation table, but does *not* fully verify
/// its contents. Calling hyphenation functions with the returned data is not
/// unsafe, but may panic if the data is invalid.
pub unsafe fn load_file(dic_path: &str) -> Option<Mmap> {
    let file = match File::open(dic_path) {
        Ok(file) => file,
        Err(_) => return None,
    };
    let mapping = match Mmap::map(&file) {
        Ok(mapping) => mapping,
        Err(_) => return None,
    };
    if Hyphenator(&*mapping).is_valid_hyphenator() {
        Some(mapping)
    } else {
        None
    }
}