diff options
Diffstat (limited to 'third_party/rust/unicode-bidi/src/prepare.rs')
-rw-r--r-- | third_party/rust/unicode-bidi/src/prepare.rs | 368 |
1 files changed, 368 insertions, 0 deletions
diff --git a/third_party/rust/unicode-bidi/src/prepare.rs b/third_party/rust/unicode-bidi/src/prepare.rs new file mode 100644 index 0000000000..7b952361a4 --- /dev/null +++ b/third_party/rust/unicode-bidi/src/prepare.rs @@ -0,0 +1,368 @@ +// Copyright 2015 The Servo Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! 3.3.3 Preparations for Implicit Processing +//! +//! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing> + +use alloc::vec::Vec; +use core::cmp::max; +use core::ops::Range; + +use super::level::Level; +use super::BidiClass::{self, *}; + +/// A maximal substring of characters with the same embedding level. +/// +/// Represented as a range of byte indices. +pub type LevelRun = Range<usize>; + +/// Output of `isolating_run_sequences` (steps X9-X10) +#[derive(Debug, PartialEq)] +pub struct IsolatingRunSequence { + pub runs: Vec<LevelRun>, + pub sos: BidiClass, // Start-of-sequence type. + pub eos: BidiClass, // End-of-sequence type. +} + +/// Compute the set of isolating run sequences. +/// +/// An isolating run sequence is a maximal sequence of level runs such that for all level runs +/// except the last one in the sequence, the last character of the run is an isolate initiator +/// whose matching PDI is the first character of the next level run in the sequence. +/// +/// Note: This function does *not* return the sequences in order by their first characters. +#[cfg_attr(feature = "flame_it", flamer::flame)] +pub fn isolating_run_sequences( + para_level: Level, + original_classes: &[BidiClass], + levels: &[Level], +) -> Vec<IsolatingRunSequence> { + let runs = level_runs(levels, original_classes); + + // Compute the set of isolating run sequences. + // <http://www.unicode.org/reports/tr9/#BD13> + let mut sequences = Vec::with_capacity(runs.len()); + + // When we encounter an isolate initiator, we push the current sequence onto the + // stack so we can resume it after the matching PDI. + let mut stack = vec![Vec::new()]; + + for run in runs { + assert!(run.len() > 0); + assert!(!stack.is_empty()); + + let start_class = original_classes[run.start]; + let end_class = original_classes[run.end - 1]; + + let mut sequence = if start_class == PDI && stack.len() > 1 { + // Continue a previous sequence interrupted by an isolate. + stack.pop().unwrap() + } else { + // Start a new sequence. + Vec::new() + }; + + sequence.push(run); + + if let RLI | LRI | FSI = end_class { + // Resume this sequence after the isolate. + stack.push(sequence); + } else { + // This sequence is finished. + sequences.push(sequence); + } + } + // Pop any remaning sequences off the stack. + sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty())); + + // Determine the `sos` and `eos` class for each sequence. + // <http://www.unicode.org/reports/tr9/#X10> + sequences + .into_iter() + .map(|sequence: Vec<LevelRun>| { + assert!(!sequence.is_empty()); + + let start_of_seq = sequence[0].start; + let end_of_seq = sequence[sequence.len() - 1].end; + let seq_level = levels[start_of_seq]; + + #[cfg(test)] + for run in sequence.clone() { + for idx in run { + if not_removed_by_x9(&original_classes[idx]) { + assert_eq!(seq_level, levels[idx]); + } + } + } + + // Get the level of the last non-removed char before the runs. + let pred_level = match original_classes[..start_of_seq] + .iter() + .rposition(not_removed_by_x9) + { + Some(idx) => levels[idx], + None => para_level, + }; + + // Get the level of the next non-removed char after the runs. + let succ_level = if let RLI | LRI | FSI = original_classes[end_of_seq - 1] { + para_level + } else { + match original_classes[end_of_seq..] + .iter() + .position(not_removed_by_x9) + { + Some(idx) => levels[end_of_seq + idx], + None => para_level, + } + }; + + IsolatingRunSequence { + runs: sequence, + sos: max(seq_level, pred_level).bidi_class(), + eos: max(seq_level, succ_level).bidi_class(), + } + }) + .collect() +} + +/// Finds the level runs in a paragraph. +/// +/// <http://www.unicode.org/reports/tr9/#BD7> +fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> { + assert_eq!(levels.len(), original_classes.len()); + + let mut runs = Vec::new(); + if levels.is_empty() { + return runs; + } + + let mut current_run_level = levels[0]; + let mut current_run_start = 0; + for i in 1..levels.len() { + if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level { + // End the last run and start a new one. + runs.push(current_run_start..i); + current_run_level = levels[i]; + current_run_start = i; + } + } + runs.push(current_run_start..levels.len()); + + runs +} + +/// Should this character be ignored in steps after X9? +/// +/// <http://www.unicode.org/reports/tr9/#X9> +pub fn removed_by_x9(class: BidiClass) -> bool { + match class { + RLE | LRE | RLO | LRO | PDF | BN => true, + _ => false, + } +} + +// For use as a predicate for `position` / `rposition` +pub fn not_removed_by_x9(class: &BidiClass) -> bool { + !removed_by_x9(*class) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_level_runs() { + assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]); + assert_eq!( + level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]), + &[0..3, 3..5, 5..6, 6..8] + ); + } + + // From <http://www.unicode.org/reports/tr9/#BD13> + #[rustfmt::skip] + #[test] + fn test_isolating_run_sequences() { + + // == Example 1 == + // text1·RLE·text2·PDF·RLE·text3·PDF·text4 + // index 0 1 2 3 4 5 6 7 + let classes = &[L, RLE, L, PDF, RLE, L, PDF, L]; + let levels = &[0, 1, 1, 1, 1, 1, 1, 0]; + let para_level = Level::ltr(); + let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); + sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); + assert_eq!( + sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(), + vec![vec![0..2], vec![2..7], vec![7..8]] + ); + + // == Example 2 == + // text1·RLI·text2·PDI·RLI·text3·PDI·text4 + // index 0 1 2 3 4 5 6 7 + let classes = &[L, RLI, L, PDI, RLI, L, PDI, L]; + let levels = &[0, 0, 1, 0, 0, 1, 0, 0]; + let para_level = Level::ltr(); + let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); + sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); + assert_eq!( + sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(), + vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]] + ); + + // == Example 3 == + // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7 + // index 0 1 2 3 4 5 6 7 8 9 10 11 12 + let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI, L]; + let levels = &[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0]; + let para_level = Level::ltr(); + let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); + sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); + assert_eq!( + sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(), + vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]] + ); + } + + // From <http://www.unicode.org/reports/tr9/#X10> + #[rustfmt::skip] + #[test] + fn test_isolating_run_sequences_sos_and_eos() { + + // == Example 1 == + // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6 + // index 0 1 2 3 4 5 6 7 8 9 10 11 + let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF, L]; + let levels = &[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0]; + let para_level = Level::ltr(); + let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); + sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); + + // text1 + assert_eq!( + &sequences[0], + &IsolatingRunSequence { + runs: vec![0..2], + sos: L, + eos: R, + } + ); + + // text2 + assert_eq!( + &sequences[1], + &IsolatingRunSequence { + runs: vec![2..4], + sos: R, + eos: L, + } + ); + + // text3 + assert_eq!( + &sequences[2], + &IsolatingRunSequence { + runs: vec![4..6], + sos: L, + eos: L, + } + ); + + // text4 text5 + assert_eq!( + &sequences[3], + &IsolatingRunSequence { + runs: vec![6..11], + sos: L, + eos: R, + } + ); + + // text6 + assert_eq!( + &sequences[4], + &IsolatingRunSequence { + runs: vec![11..12], + sos: R, + eos: L, + } + ); + + // == Example 2 == + // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6 + // index 0 1 2 3 4 5 6 7 8 9 10 11 + let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI, L]; + let levels = &[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0]; + let para_level = Level::ltr(); + let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); + sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); + + // text1·RLI·PDI·RLI·PDI·text6 + assert_eq!( + &sequences[0], + &IsolatingRunSequence { + runs: vec![0..2, 7..9, 10..12], + sos: L, + eos: L, + } + ); + + // text2·LRI·PDI·text4 + assert_eq!( + &sequences[1], + &IsolatingRunSequence { + runs: vec![2..4, 5..7], + sos: R, + eos: R, + } + ); + + // text3 + assert_eq!( + &sequences[2], + &IsolatingRunSequence { + runs: vec![4..5], + sos: L, + eos: L, + } + ); + + // text5 + assert_eq!( + &sequences[3], + &IsolatingRunSequence { + runs: vec![9..10], + sos: R, + eos: R, + } + ); + } + + #[test] + fn test_removed_by_x9() { + let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN]; + let not_classes = &[L, RLI, AL, LRI, PDI]; + for x in rem_classes { + assert_eq!(removed_by_x9(*x), true); + } + for x in not_classes { + assert_eq!(removed_by_x9(*x), false); + } + } + + #[test] + fn test_not_removed_by_x9() { + let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI]; + for x in non_x9_classes { + assert_eq!(not_removed_by_x9(&x), true); + } + } +} |