diff options
Diffstat (limited to 'vendor/unicode-bidi/src/implicit.rs')
-rw-r--r-- | vendor/unicode-bidi/src/implicit.rs | 521 |
1 files changed, 439 insertions, 82 deletions
diff --git a/vendor/unicode-bidi/src/implicit.rs b/vendor/unicode-bidi/src/implicit.rs index bf37f17e6..294af7cbd 100644 --- a/vendor/unicode-bidi/src/implicit.rs +++ b/vendor/unicode-bidi/src/implicit.rs @@ -14,104 +14,213 @@ use core::cmp::max; use super::char_data::BidiClass::{self, *}; use super::level::Level; -use super::prepare::{not_removed_by_x9, removed_by_x9, IsolatingRunSequence, LevelRun}; +use super::prepare::{not_removed_by_x9, removed_by_x9, IsolatingRunSequence}; +use super::BidiDataSource; /// 3.3.4 Resolving Weak Types /// /// <http://www.unicode.org/reports/tr9/#Resolving_Weak_Types> #[cfg_attr(feature = "flame_it", flamer::flame)] -pub fn resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [BidiClass]) { - // FIXME (#8): This function applies steps W1-W6 in a single pass. This can produce - // incorrect results in cases where a "later" rule changes the value of `prev_class` seen - // by an "earlier" rule. We should either split this into separate passes, or preserve - // extra state so each rule can see the correct previous class. - - // FIXME: Also, this could be the cause of increased failure for using longer-UTF-8 chars in - // conformance tests, like BidiTest:69635 (AL ET EN) +pub fn resolve_weak( + text: &str, + sequence: &IsolatingRunSequence, + processing_classes: &mut [BidiClass], +) { + // Note: The spec treats these steps as individual passes that are applied one after the other + // on the entire IsolatingRunSequence at once. We instead collapse it into a single iteration, + // which is straightforward for rules that are based on the state of the current character, but not + // for rules that care about surrounding characters. To deal with them, we retain additional state + // about previous character classes that may have since been changed by later rules. - let mut prev_class = sequence.sos; + // The previous class for the purposes of rule W4/W6, not tracking changes made after or during W4. + let mut prev_class_before_w4 = sequence.sos; + // The previous class for the purposes of rule W5. + let mut prev_class_before_w5 = sequence.sos; + // The previous class for the purposes of rule W1, not tracking changes from any other rules. + let mut prev_class_before_w1 = sequence.sos; let mut last_strong_is_al = false; let mut et_run_indices = Vec::new(); // for W5 + let mut bn_run_indices = Vec::new(); // for W5 + <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> + + for (run_index, level_run) in sequence.runs.iter().enumerate() { + for i in &mut level_run.clone() { + if processing_classes[i] == BN { + // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> + // Keeps track of bn runs for W5 in case we see an ET. + bn_run_indices.push(i); + // BNs aren't real, skip over them. + continue; + } + + // Store the processing class of all rules before W2/W1. + // Used to keep track of the last strong character for W2. W3 is able to insert new strong + // characters, so we don't want to be misled by it. + let mut w2_processing_class = processing_classes[i]; - // Like sequence.runs.iter().flat_map(Clone::clone), but make indices itself clonable. - fn id(x: LevelRun) -> LevelRun { - x - } - let mut indices = sequence - .runs - .iter() - .cloned() - .flat_map(id as fn(LevelRun) -> LevelRun); - - while let Some(i) = indices.next() { - match processing_classes[i] { // <http://www.unicode.org/reports/tr9/#W1> - NSM => { - processing_classes[i] = match prev_class { + // + + if processing_classes[i] == NSM { + processing_classes[i] = match prev_class_before_w1 { RLI | LRI | FSI | PDI => ON, - _ => prev_class, + _ => prev_class_before_w1, }; + // W1 occurs before W2, update this. + w2_processing_class = processing_classes[i]; } - EN => { - if last_strong_is_al { - // W2. If previous strong char was AL, change EN to AN. - processing_classes[i] = AN; - } else { + + prev_class_before_w1 = processing_classes[i]; + + // <http://www.unicode.org/reports/tr9/#W2> + // <http://www.unicode.org/reports/tr9/#W3> + // + match processing_classes[i] { + EN => { + if last_strong_is_al { + // W2. If previous strong char was AL, change EN to AN. + processing_classes[i] = AN; + } + } + // W3. + AL => processing_classes[i] = R, + _ => {} + } + + // update last_strong_is_al. + match w2_processing_class { + L | R => { + last_strong_is_al = false; + } + AL => { + last_strong_is_al = true; + } + _ => {} + } + + let class_before_w456 = processing_classes[i]; + + // <http://www.unicode.org/reports/tr9/#W4> + // <http://www.unicode.org/reports/tr9/#W5> + // <http://www.unicode.org/reports/tr9/#W6> (separators only) + // (see below for W6 terminator code) + // + match processing_classes[i] { + // <http://www.unicode.org/reports/tr9/#W6> + EN => { // W5. If a run of ETs is adjacent to an EN, change the ETs to EN. for j in &et_run_indices { processing_classes[*j] = EN; } et_run_indices.clear(); } - } - // <http://www.unicode.org/reports/tr9/#W3> - AL => processing_classes[i] = R, - // <http://www.unicode.org/reports/tr9/#W4> - ES | CS => { - let next_class = indices - .clone() - .map(|j| processing_classes[j]) - .find(not_removed_by_x9) - .unwrap_or(sequence.eos); - processing_classes[i] = match (prev_class, processing_classes[i], next_class) { - (EN, ES, EN) | (EN, CS, EN) => EN, - (AN, CS, AN) => AN, - (_, _, _) => ON, + // <http://www.unicode.org/reports/tr9/#W4> + // <http://www.unicode.org/reports/tr9/#W6> + ES | CS => { + // See https://github.com/servo/unicode-bidi/issues/86 for improving this. + // We want to make sure we check the correct next character by skipping past the rest + // of this one. + if let Some(ch) = text.get(i..).and_then(|s| s.chars().next()) { + let mut next_class = sequence + .iter_forwards_from(i + ch.len_utf8(), run_index) + .map(|j| processing_classes[j]) + // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> + .find(not_removed_by_x9) + .unwrap_or(sequence.eos); + if next_class == EN && last_strong_is_al { + // Apply W2 to next_class. We know that last_strong_is_al + // has no chance of changing on this character so we can still assume its value + // will be the same by the time we get to it. + next_class = AN; + } + processing_classes[i] = + match (prev_class_before_w4, processing_classes[i], next_class) { + // W4 + (EN, ES, EN) | (EN, CS, EN) => EN, + // W4 + (AN, CS, AN) => AN, + // W6 (separators only) + (_, _, _) => ON, + }; + + // W6 + <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> + // We have to do this before W5 gets its grubby hands on these characters and thinks + // they're part of an ET run. + // We check for ON to ensure that we had hit the W6 branch above, since this `ES | CS` match + // arm handles both W4 and W6. + if processing_classes[i] == ON { + for idx in sequence.iter_backwards_from(i, run_index) { + let class = &mut processing_classes[idx]; + if *class != BN { + break; + } + *class = ON; + } + for idx in sequence.iter_forwards_from(i + ch.len_utf8(), run_index) { + let class = &mut processing_classes[idx]; + if *class != BN { + break; + } + *class = ON; + } + } + } else { + // We're in the middle of a character, copy over work done for previous bytes + // since it's going to be the same answer. + processing_classes[i] = processing_classes[i - 1]; + } } - } - // <http://www.unicode.org/reports/tr9/#W5> - ET => { - match prev_class { - EN => processing_classes[i] = EN, - _ => et_run_indices.push(i), // In case this is followed by an EN. + // <http://www.unicode.org/reports/tr9/#W5> + ET => { + match prev_class_before_w5 { + EN => processing_classes[i] = EN, + _ => { + // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> + // If there was a BN run before this, that's now a part of this ET run. + et_run_indices.extend(&bn_run_indices); + + // In case this is followed by an EN. + et_run_indices.push(i); + } + } } + _ => {} } - class => { - if removed_by_x9(class) { - continue; + + // Common loop iteration code + // + + // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> + // BN runs would have already continued the loop, clear them before we get to the next one. + bn_run_indices.clear(); + + // W6 above only deals with separators, so it doesn't change anything W5 cares about, + // so we still can update this after running that part of W6. + prev_class_before_w5 = processing_classes[i]; + + // <http://www.unicode.org/reports/tr9/#W6> (terminators only) + // (see above for W6 separator code) + // + if prev_class_before_w5 != ET { + // W6. If we didn't find an adjacent EN, turn any ETs into ON instead. + for j in &et_run_indices { + processing_classes[*j] = ON; } + et_run_indices.clear(); } - } - prev_class = processing_classes[i]; - match prev_class { - L | R => { - last_strong_is_al = false; - } - AL => { - last_strong_is_al = true; - } - _ => {} - } - if prev_class != ET { - // W6. If we didn't find an adjacent EN, turn any ETs into ON instead. - for j in &et_run_indices { - processing_classes[*j] = ON; - } - et_run_indices.clear(); + // We stashed this before W4/5/6 could get their grubby hands on it, and it's not + // used in the W6 terminator code below so we can update it now. + prev_class_before_w4 = class_before_w456; } } + // Rerun this check in case we ended with a sequence of BNs (i.e., we'd never + // hit the end of the for loop above). + // W6. If we didn't find an adjacent EN, turn any ETs into ON instead. + for j in &et_run_indices { + processing_classes[*j] = ON; + } + et_run_indices.clear(); // W7. If the previous strong char was L, change EN to L. let mut last_strong_is_l = sequence.sos == L; @@ -127,6 +236,8 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [B R | AL => { last_strong_is_l = false; } + // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> + // Already scanning past BN here. _ => {} } } @@ -137,22 +248,172 @@ pub fn resolve_weak(sequence: &IsolatingRunSequence, processing_classes: &mut [B /// /// <http://www.unicode.org/reports/tr9/#Resolving_Neutral_Types> #[cfg_attr(feature = "flame_it", flamer::flame)] -pub fn resolve_neutral( +pub fn resolve_neutral<D: BidiDataSource>( + text: &str, + data_source: &D, sequence: &IsolatingRunSequence, levels: &[Level], + original_classes: &[BidiClass], processing_classes: &mut [BidiClass], ) { + // e = embedding direction let e: BidiClass = levels[sequence.runs[0].start].bidi_class(); + let not_e = if e == BidiClass::L { + BidiClass::R + } else { + BidiClass::L + }; + // N0. Process bracket pairs. + + // > Identify the bracket pairs in the current isolating run sequence according to BD16. + // We use processing_classes, not original_classes, due to BD14/BD15 + let bracket_pairs = identify_bracket_pairs(text, data_source, sequence, processing_classes); + + // > For each bracket-pair element in the list of pairs of text positions + // + // Note: Rust ranges are interpreted as [start..end), be careful using `pair` directly + // for indexing as it will include the opening bracket pair but not the closing one. + for pair in bracket_pairs { + #[cfg(feature = "std")] + debug_assert!( + pair.start < processing_classes.len(), + "identify_bracket_pairs returned a range that is out of bounds!" + ); + #[cfg(feature = "std")] + debug_assert!( + pair.end < processing_classes.len(), + "identify_bracket_pairs returned a range that is out of bounds!" + ); + let mut found_e = false; + let mut found_not_e = false; + let mut class_to_set = None; + + let start_len_utf8 = text[pair.start..].chars().next().unwrap().len_utf8(); + // > Inspect the bidirectional types of the characters enclosed within the bracket pair. + // + // `pair` is [start, end) so we will end up processing the opening character but not the closing one. + // + for enclosed_i in sequence.iter_forwards_from(pair.start + start_len_utf8, pair.start_run) { + if enclosed_i >= pair.end { + #[cfg(feature = "std")] + debug_assert!( + enclosed_i == pair.end, + "If we skipped past this, the iterator is broken" + ); + break; + } + let class = processing_classes[enclosed_i]; + if class == e { + found_e = true; + } else if class == not_e { + found_not_e = true; + } else if class == BidiClass::EN || class == BidiClass::AN { + // > Within this scope, bidirectional types EN and AN are treated as R. + if e == BidiClass::L { + found_not_e = true; + } else { + found_e = true; + } + } + + // If we have found a character with the class of the embedding direction + // we can bail early. + if found_e { + break; + } + } + // > If any strong type (either L or R) matching the embedding direction is found + if found_e { + // > .. set the type for both brackets in the pair to match the embedding direction + class_to_set = Some(e); + // > Otherwise, if there is a strong type it must be opposite the embedding direction + } else if found_not_e { + // > Therefore, test for an established context with a preceding strong type by + // > checking backwards before the opening paired bracket + // > until the first strong type (L, R, or sos) is found. + // (see note above about processing_classes and character boundaries) + let mut previous_strong = sequence + .iter_backwards_from(pair.start, pair.start_run) + .map(|i| processing_classes[i]) + .find(|class| { + *class == BidiClass::L + || *class == BidiClass::R + || *class == BidiClass::EN + || *class == BidiClass::AN + }) + .unwrap_or(sequence.sos); + + // > Within this scope, bidirectional types EN and AN are treated as R. + if previous_strong == BidiClass::EN || previous_strong == BidiClass::AN { + previous_strong = BidiClass::R; + } + + // > If the preceding strong type is also opposite the embedding direction, + // > context is established, + // > so set the type for both brackets in the pair to that direction. + // AND + // > Otherwise set the type for both brackets in the pair to the embedding direction. + // > Either way it gets set to previous_strong + // + // Both branches amount to setting the type to the strong type. + class_to_set = Some(previous_strong); + } + + if let Some(class_to_set) = class_to_set { + // Update all processing classes corresponding to the start and end elements, as requested. + // We should include all bytes of the character, not the first one. + let end_len_utf8 = text[pair.end..].chars().next().unwrap().len_utf8(); + for class in &mut processing_classes[pair.start..pair.start + start_len_utf8] { + *class = class_to_set; + } + for class in &mut processing_classes[pair.end..pair.end + end_len_utf8] { + *class = class_to_set; + } + // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> + for idx in sequence.iter_backwards_from(pair.start, pair.start_run) { + let class = &mut processing_classes[idx]; + if *class != BN { + break; + } + *class = class_to_set; + } + // > Any number of characters that had original bidirectional character type NSM prior to the application of + // > W1 that immediately follow a paired bracket which changed to L or R under N0 should change to match the type of their preceding bracket. + + // This rule deals with sequences of NSMs, so we can just update them all at once, we don't need to worry + // about character boundaries. We do need to be careful to skip the full set of bytes for the parentheses characters. + let nsm_start = pair.start + start_len_utf8; + for idx in sequence.iter_forwards_from(nsm_start, pair.start_run) { + let class = original_classes[idx]; + if class == BidiClass::NSM || processing_classes[idx] == BN { + processing_classes[idx] = class_to_set; + } else { + break; + } + } + let nsm_end = pair.end + end_len_utf8; + for idx in sequence.iter_forwards_from(nsm_end, pair.end_run) { + let class = original_classes[idx]; + if class == BidiClass::NSM || processing_classes[idx] == BN { + processing_classes[idx] = class_to_set; + } else { + break; + } + } + } + // > Otherwise, there are no strong types within the bracket pair + // > Therefore, do not set the type for that bracket pair + } + + // N1 and N2. + // Indices of every byte in this isolating run sequence let mut indices = sequence.runs.iter().flat_map(Clone::clone); let mut prev_class = sequence.sos; - while let Some(mut i) = indices.next() { - // N0. Process bracket pairs. - // TODO - // Process sequences of NI characters. let mut ni_run = Vec::new(); - if is_NI(processing_classes[i]) { + // The BN is for <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> + if is_NI(processing_classes[i]) || processing_classes[i] == BN { // Consume a run of consecutive NI characters. ni_run.push(i); let mut next_class; @@ -160,11 +421,9 @@ pub fn resolve_neutral( match indices.next() { Some(j) => { i = j; - if removed_by_x9(processing_classes[i]) { - continue; - } next_class = processing_classes[j]; - if is_NI(next_class) { + // The BN is for <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> + if is_NI(next_class) || next_class == BN { ni_run.push(i); } else { break; @@ -176,7 +435,6 @@ pub fn resolve_neutral( } }; } - // N1-N2. // // <http://www.unicode.org/reports/tr9/#N1> @@ -203,6 +461,105 @@ pub fn resolve_neutral( } } +struct BracketPair { + /// The text-relative index of the opening bracket. + start: usize, + /// The text-relative index of the closing bracket. + end: usize, + /// The index of the run (in the run sequence) that the opening bracket is in. + start_run: usize, + /// The index of the run (in the run sequence) that the closing bracket is in. + end_run: usize, +} +/// 3.1.3 Identifying Bracket Pairs +/// +/// Returns all paired brackets in the source, as indices into the +/// text source. +/// +/// <https://www.unicode.org/reports/tr9/#BD16> +fn identify_bracket_pairs<D: BidiDataSource>( + text: &str, + data_source: &D, + run_sequence: &IsolatingRunSequence, + original_classes: &[BidiClass], +) -> Vec<BracketPair> { + let mut ret = vec![]; + let mut stack = vec![]; + + for (run_index, level_run) in run_sequence.runs.iter().enumerate() { + let slice = if let Some(slice) = text.get(level_run.clone()) { + slice + } else { + #[cfg(feature = "std")] + std::debug_assert!( + false, + "Found broken indices in level run: found indices {}..{} for string of length {}", + level_run.start, + level_run.end, + text.len() + ); + return ret; + }; + + for (i, ch) in slice.char_indices() { + let actual_index = level_run.start + i; + // All paren characters are ON. + // From BidiBrackets.txt: + // > The Unicode property value stability policy guarantees that characters + // > which have bpt=o or bpt=c also have bc=ON and Bidi_M=Y + if original_classes[level_run.start + i] != BidiClass::ON { + continue; + } + + if let Some(matched) = data_source.bidi_matched_opening_bracket(ch) { + if matched.is_open { + // > If an opening paired bracket is found ... + + // > ... and there is no room in the stack, + // > stop processing BD16 for the remainder of the isolating run sequence. + if stack.len() >= 63 { + break; + } + // > ... push its Bidi_Paired_Bracket property value and its text position onto the stack + stack.push((matched.opening, actual_index, run_index)) + } else { + // > If a closing paired bracket is found, do the following + + // > Declare a variable that holds a reference to the current stack element + // > and initialize it with the top element of the stack. + // AND + // > Else, if the current stack element is not at the bottom of the stack + for (stack_index, element) in stack.iter().enumerate().rev() { + // > Compare the closing paired bracket being inspected or its canonical + // > equivalent to the bracket in the current stack element. + if element.0 == matched.opening { + // > If the values match, meaning the two characters form a bracket pair, then + + // > Append the text position in the current stack element together with the + // > text position of the closing paired bracket to the list. + let pair = BracketPair { + start: element.1, + end: actual_index, + start_run: element.2, + end_run: run_index, + }; + ret.push(pair); + + // > Pop the stack through the current stack element inclusively. + stack.truncate(stack_index); + break; + } + } + } + } + } + } + // > Sort the list of pairs of text positions in ascending order based on + // > the text position of the opening paired bracket. + ret.sort_by_key(|r| r.start); + ret +} + /// 3.3.6 Resolving Implicit Levels /// /// Returns the maximum embedding level in the paragraph. @@ -211,7 +568,6 @@ pub fn resolve_neutral( #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn resolve_levels(original_classes: &[BidiClass], levels: &mut [Level]) -> Level { let mut max_level = Level::ltr(); - assert_eq!(original_classes.len(), levels.len()); for i in 0..levels.len() { match (levels[i].is_rtl(), original_classes[i]) { @@ -219,6 +575,7 @@ pub fn resolve_levels(original_classes: &[BidiClass], levels: &mut [Level]) -> L (false, R) | (true, L) | (true, EN) | (true, AN) => { levels[i].raise(1).expect("Level number error") } + // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> handled here (_, _) => {} } max_level = max(max_level, levels[i]); |