summaryrefslogtreecommitdiffstats
path: root/third_party/rust/unicode-bidi/src/prepare.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/unicode-bidi/src/prepare.rs')
-rw-r--r--third_party/rust/unicode-bidi/src/prepare.rs451
1 files changed, 451 insertions, 0 deletions
diff --git a/third_party/rust/unicode-bidi/src/prepare.rs b/third_party/rust/unicode-bidi/src/prepare.rs
new file mode 100644
index 0000000000..9234e1aa61
--- /dev/null
+++ b/third_party/rust/unicode-bidi/src/prepare.rs
@@ -0,0 +1,451 @@
+// Copyright 2015 The Servo Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! 3.3.3 Preparations for Implicit Processing
+//!
+//! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing>
+
+use alloc::vec::Vec;
+use core::cmp::max;
+use core::ops::Range;
+
+use super::level::Level;
+use super::BidiClass::{self, *};
+
+/// A maximal substring of characters with the same embedding level.
+///
+/// Represented as a range of byte indices.
+pub type LevelRun = Range<usize>;
+
+/// Output of `isolating_run_sequences` (steps X9-X10)
+#[derive(Debug, PartialEq)]
+pub struct IsolatingRunSequence {
+ pub runs: Vec<LevelRun>,
+ pub sos: BidiClass, // Start-of-sequence type.
+ pub eos: BidiClass, // End-of-sequence type.
+}
+
+/// Compute the set of isolating run sequences.
+///
+/// An isolating run sequence is a maximal sequence of level runs such that for all level runs
+/// except the last one in the sequence, the last character of the run is an isolate initiator
+/// whose matching PDI is the first character of the next level run in the sequence.
+///
+/// Note: This function does *not* return the sequences in order by their first characters.
+#[cfg_attr(feature = "flame_it", flamer::flame)]
+pub fn isolating_run_sequences(
+ para_level: Level,
+ original_classes: &[BidiClass],
+ levels: &[Level],
+) -> Vec<IsolatingRunSequence> {
+ let runs = level_runs(levels, original_classes);
+
+ // Compute the set of isolating run sequences.
+ // <http://www.unicode.org/reports/tr9/#BD13>
+ let mut sequences = Vec::with_capacity(runs.len());
+
+ // When we encounter an isolate initiator, we push the current sequence onto the
+ // stack so we can resume it after the matching PDI.
+ let mut stack = vec![Vec::new()];
+
+ for run in runs {
+ assert!(run.len() > 0);
+ assert!(!stack.is_empty());
+
+ let start_class = original_classes[run.start];
+ // > In rule X10, [..] skip over any BNs when [..].
+ // > Do the same when determining if the last character of the sequence is an isolate initiator.
+ //
+ // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
+ let end_class = original_classes[run.start..run.end]
+ .iter()
+ .copied()
+ .rev()
+ .filter(not_removed_by_x9)
+ .next()
+ .unwrap_or(start_class);
+
+ let mut sequence = if start_class == PDI && stack.len() > 1 {
+ // Continue a previous sequence interrupted by an isolate.
+ stack.pop().unwrap()
+ } else {
+ // Start a new sequence.
+ Vec::new()
+ };
+
+ sequence.push(run);
+
+ if let RLI | LRI | FSI = end_class {
+ // Resume this sequence after the isolate.
+ stack.push(sequence);
+ } else {
+ // This sequence is finished.
+ sequences.push(sequence);
+ }
+ }
+ // Pop any remaning sequences off the stack.
+ sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty()));
+
+ // Determine the `sos` and `eos` class for each sequence.
+ // <http://www.unicode.org/reports/tr9/#X10>
+ sequences
+ .into_iter()
+ .map(|sequence: Vec<LevelRun>| {
+ assert!(!sequence.is_empty());
+
+ let mut result = IsolatingRunSequence {
+ runs: sequence,
+ sos: L,
+ eos: L,
+ };
+
+ let start_of_seq = result.runs[0].start;
+ let runs_len = result.runs.len();
+ let end_of_seq = result.runs[runs_len - 1].end;
+
+ // > (not counting characters removed by X9)
+ let seq_level = result
+ .iter_forwards_from(start_of_seq, 0)
+ .filter(|i| not_removed_by_x9(&original_classes[*i]))
+ .map(|i| levels[i])
+ .next()
+ .unwrap_or(levels[start_of_seq]);
+
+ // XXXManishearth the spec talks of a start and end level,
+ // but for a given IRS the two should be equivalent, yes?
+ let end_level = result
+ .iter_backwards_from(end_of_seq, runs_len - 1)
+ .filter(|i| not_removed_by_x9(&original_classes[*i]))
+ .map(|i| levels[i])
+ .next()
+ .unwrap_or(levels[end_of_seq - 1]);
+
+ #[cfg(test)]
+ for run in result.runs.clone() {
+ for idx in run {
+ if not_removed_by_x9(&original_classes[idx]) {
+ assert_eq!(seq_level, levels[idx]);
+ }
+ }
+ }
+
+ // Get the level of the last non-removed char before the runs.
+ let pred_level = match original_classes[..start_of_seq]
+ .iter()
+ .rposition(not_removed_by_x9)
+ {
+ Some(idx) => levels[idx],
+ None => para_level,
+ };
+
+ // Get the last non-removed character to check if it is an isolate initiator.
+ // The spec calls for an unmatched one, but matched isolate initiators
+ // will never be at the end of a level run (otherwise there would be more to the run).
+ // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check.
+ let last_non_removed = original_classes[..end_of_seq]
+ .iter()
+ .copied()
+ .rev()
+ .find(not_removed_by_x9)
+ .unwrap_or(BN);
+
+ // Get the level of the next non-removed char after the runs.
+ let succ_level = if let RLI | LRI | FSI = last_non_removed {
+ para_level
+ } else {
+ match original_classes[end_of_seq..]
+ .iter()
+ .position(not_removed_by_x9)
+ {
+ Some(idx) => levels[end_of_seq + idx],
+ None => para_level,
+ }
+ };
+
+ result.sos = max(seq_level, pred_level).bidi_class();
+ result.eos = max(end_level, succ_level).bidi_class();
+ result
+ })
+ .collect()
+}
+
+impl IsolatingRunSequence {
+ /// Given a text-relative position `pos` and an index of the level run it is in,
+ /// produce an iterator of all characters after and pos (`pos..`) that are in this
+ /// run sequence
+ pub(crate) fn iter_forwards_from(
+ &self,
+ pos: usize,
+ level_run_index: usize,
+ ) -> impl Iterator<Item = usize> + '_ {
+ let runs = &self.runs[level_run_index..];
+
+ // Check that it is in range
+ // (we can't use contains() since we want an inclusive range)
+ #[cfg(feature = "std")]
+ debug_assert!(runs[0].start <= pos && pos <= runs[0].end);
+
+ (pos..runs[0].end).chain(runs[1..].iter().flat_map(Clone::clone))
+ }
+
+ /// Given a text-relative position `pos` and an index of the level run it is in,
+ /// produce an iterator of all characters before and excludingpos (`..pos`) that are in this
+ /// run sequence
+ pub(crate) fn iter_backwards_from(
+ &self,
+ pos: usize,
+ level_run_index: usize,
+ ) -> impl Iterator<Item = usize> + '_ {
+ let prev_runs = &self.runs[..level_run_index];
+ let current = &self.runs[level_run_index];
+
+ // Check that it is in range
+ // (we can't use contains() since we want an inclusive range)
+ #[cfg(feature = "std")]
+ debug_assert!(current.start <= pos && pos <= current.end);
+
+ (current.start..pos)
+ .rev()
+ .chain(prev_runs.iter().rev().flat_map(Clone::clone))
+ }
+}
+
+/// Finds the level runs in a paragraph.
+///
+/// <http://www.unicode.org/reports/tr9/#BD7>
+fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> {
+ assert_eq!(levels.len(), original_classes.len());
+
+ let mut runs = Vec::new();
+ if levels.is_empty() {
+ return runs;
+ }
+
+ let mut current_run_level = levels[0];
+ let mut current_run_start = 0;
+ for i in 1..levels.len() {
+ if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level {
+ // End the last run and start a new one.
+ runs.push(current_run_start..i);
+ current_run_level = levels[i];
+ current_run_start = i;
+ }
+ }
+ runs.push(current_run_start..levels.len());
+
+ runs
+}
+
+/// Should this character be ignored in steps after X9?
+///
+/// <http://www.unicode.org/reports/tr9/#X9>
+pub fn removed_by_x9(class: BidiClass) -> bool {
+ match class {
+ RLE | LRE | RLO | LRO | PDF | BN => true,
+ _ => false,
+ }
+}
+
+// For use as a predicate for `position` / `rposition`
+pub fn not_removed_by_x9(class: &BidiClass) -> bool {
+ !removed_by_x9(*class)
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_level_runs() {
+ assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]);
+ assert_eq!(
+ level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]),
+ &[0..3, 3..5, 5..6, 6..8]
+ );
+ }
+
+ // From <http://www.unicode.org/reports/tr9/#BD13>
+ #[rustfmt::skip]
+ #[test]
+ fn test_isolating_run_sequences() {
+
+ // == Example 1 ==
+ // text1·RLE·text2·PDF·RLE·text3·PDF·text4
+ // index 0 1 2 3 4 5 6 7
+ let classes = &[L, RLE, L, PDF, RLE, L, PDF, L];
+ let levels = &[0, 1, 1, 1, 1, 1, 1, 0];
+ let para_level = Level::ltr();
+ let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
+ sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
+ assert_eq!(
+ sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
+ vec![vec![0..2], vec![2..7], vec![7..8]]
+ );
+
+ // == Example 2 ==
+ // text1·RLI·text2·PDI·RLI·text3·PDI·text4
+ // index 0 1 2 3 4 5 6 7
+ let classes = &[L, RLI, L, PDI, RLI, L, PDI, L];
+ let levels = &[0, 0, 1, 0, 0, 1, 0, 0];
+ let para_level = Level::ltr();
+ let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
+ sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
+ assert_eq!(
+ sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
+ vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]]
+ );
+
+ // == Example 3 ==
+ // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7
+ // index 0 1 2 3 4 5 6 7 8 9 10 11 12
+ let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI, L];
+ let levels = &[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0];
+ let para_level = Level::ltr();
+ let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
+ sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
+ assert_eq!(
+ sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
+ vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]]
+ );
+ }
+
+ // From <http://www.unicode.org/reports/tr9/#X10>
+ #[rustfmt::skip]
+ #[test]
+ fn test_isolating_run_sequences_sos_and_eos() {
+
+ // == Example 1 ==
+ // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6
+ // index 0 1 2 3 4 5 6 7 8 9 10 11
+ let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF, L];
+ let levels = &[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0];
+ let para_level = Level::ltr();
+ let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
+ sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
+
+ // text1
+ assert_eq!(
+ &sequences[0],
+ &IsolatingRunSequence {
+ runs: vec![0..2],
+ sos: L,
+ eos: R,
+ }
+ );
+
+ // text2
+ assert_eq!(
+ &sequences[1],
+ &IsolatingRunSequence {
+ runs: vec![2..4],
+ sos: R,
+ eos: L,
+ }
+ );
+
+ // text3
+ assert_eq!(
+ &sequences[2],
+ &IsolatingRunSequence {
+ runs: vec![4..6],
+ sos: L,
+ eos: L,
+ }
+ );
+
+ // text4 text5
+ assert_eq!(
+ &sequences[3],
+ &IsolatingRunSequence {
+ runs: vec![6..11],
+ sos: L,
+ eos: R,
+ }
+ );
+
+ // text6
+ assert_eq!(
+ &sequences[4],
+ &IsolatingRunSequence {
+ runs: vec![11..12],
+ sos: R,
+ eos: L,
+ }
+ );
+
+ // == Example 2 ==
+ // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6
+ // index 0 1 2 3 4 5 6 7 8 9 10 11
+ let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI, L];
+ let levels = &[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0];
+ let para_level = Level::ltr();
+ let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
+ sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
+
+ // text1·RLI·PDI·RLI·PDI·text6
+ assert_eq!(
+ &sequences[0],
+ &IsolatingRunSequence {
+ runs: vec![0..2, 7..9, 10..12],
+ sos: L,
+ eos: L,
+ }
+ );
+
+ // text2·LRI·PDI·text4
+ assert_eq!(
+ &sequences[1],
+ &IsolatingRunSequence {
+ runs: vec![2..4, 5..7],
+ sos: R,
+ eos: R,
+ }
+ );
+
+ // text3
+ assert_eq!(
+ &sequences[2],
+ &IsolatingRunSequence {
+ runs: vec![4..5],
+ sos: L,
+ eos: L,
+ }
+ );
+
+ // text5
+ assert_eq!(
+ &sequences[3],
+ &IsolatingRunSequence {
+ runs: vec![9..10],
+ sos: R,
+ eos: R,
+ }
+ );
+ }
+
+ #[test]
+ fn test_removed_by_x9() {
+ let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN];
+ let not_classes = &[L, RLI, AL, LRI, PDI];
+ for x in rem_classes {
+ assert_eq!(removed_by_x9(*x), true);
+ }
+ for x in not_classes {
+ assert_eq!(removed_by_x9(*x), false);
+ }
+ }
+
+ #[test]
+ fn test_not_removed_by_x9() {
+ let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI];
+ for x in non_x9_classes {
+ assert_eq!(not_removed_by_x9(&x), true);
+ }
+ }
+}