diff options
Diffstat (limited to 'vendor/regex-syntax/src/hir/interval.rs')
-rw-r--r-- | vendor/regex-syntax/src/hir/interval.rs | 83 |
1 files changed, 72 insertions, 11 deletions
diff --git a/vendor/regex-syntax/src/hir/interval.rs b/vendor/regex-syntax/src/hir/interval.rs index 56698c53a..e063390a8 100644 --- a/vendor/regex-syntax/src/hir/interval.rs +++ b/vendor/regex-syntax/src/hir/interval.rs @@ -1,8 +1,6 @@ -use std::char; -use std::cmp; -use std::fmt::Debug; -use std::slice; -use std::u8; +use core::{char, cmp, fmt::Debug, slice}; + +use alloc::vec::Vec; use crate::unicode; @@ -32,9 +30,38 @@ use crate::unicode; // // Tests on this are relegated to the public API of HIR in src/hir.rs. -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug)] pub struct IntervalSet<I> { + /// A sorted set of non-overlapping ranges. ranges: Vec<I>, + /// While not required at all for correctness, we keep track of whether an + /// interval set has been case folded or not. This helps us avoid doing + /// redundant work if, for example, a set has already been cased folded. + /// And note that whether a set is folded or not is preserved through + /// all of the pairwise set operations. That is, if both interval sets + /// have been case folded, then any of difference, union, intersection or + /// symmetric difference all produce a case folded set. + /// + /// Note that when this is true, it *must* be the case that the set is case + /// folded. But when it's false, the set *may* be case folded. In other + /// words, we only set this to true when we know it to be case, but we're + /// okay with it being false if it would otherwise be costly to determine + /// whether it should be true. This means code cannot assume that a false + /// value necessarily indicates that the set is not case folded. + /// + /// Bottom line: this is a performance optimization. + folded: bool, +} + +impl<I: Interval> Eq for IntervalSet<I> {} + +// We implement PartialEq manually so that we don't consider the set's internal +// 'folded' property to be part of its identity. The 'folded' property is +// strictly an optimization. +impl<I: Interval> PartialEq for IntervalSet<I> { + fn eq(&self, other: &IntervalSet<I>) -> bool { + self.ranges.eq(&other.ranges) + } } impl<I: Interval> IntervalSet<I> { @@ -44,7 +71,10 @@ impl<I: Interval> IntervalSet<I> { /// The given ranges do not need to be in any specific order, and ranges /// may overlap. pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> { - let mut set = IntervalSet { ranges: intervals.into_iter().collect() }; + let ranges: Vec<I> = intervals.into_iter().collect(); + // An empty set is case folded. + let folded = ranges.is_empty(); + let mut set = IntervalSet { ranges, folded }; set.canonicalize(); set } @@ -55,6 +85,10 @@ impl<I: Interval> IntervalSet<I> { // it preserves canonicalization. self.ranges.push(interval); self.canonicalize(); + // We don't know whether the new interval added here is considered + // case folded, so we conservatively assume that the entire set is + // no longer case folded if it was previously. + self.folded = false; } /// Return an iterator over all intervals in this set. @@ -79,6 +113,9 @@ impl<I: Interval> IntervalSet<I> { /// This returns an error if the necessary case mapping data is not /// available. pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { + if self.folded { + return Ok(()); + } let len = self.ranges.len(); for i in 0..len { let range = self.ranges[i]; @@ -88,14 +125,19 @@ impl<I: Interval> IntervalSet<I> { } } self.canonicalize(); + self.folded = true; Ok(()) } /// Union this set with the given set, in place. pub fn union(&mut self, other: &IntervalSet<I>) { + if other.ranges.is_empty() || self.ranges == other.ranges { + return; + } // This could almost certainly be done more efficiently. self.ranges.extend(&other.ranges); self.canonicalize(); + self.folded = self.folded && other.folded; } /// Intersect this set with the given set, in place. @@ -105,6 +147,8 @@ impl<I: Interval> IntervalSet<I> { } if other.ranges.is_empty() { self.ranges.clear(); + // An empty set is case folded. + self.folded = true; return; } @@ -134,6 +178,7 @@ impl<I: Interval> IntervalSet<I> { } } self.ranges.drain(..drain_end); + self.folded = self.folded && other.folded; } /// Subtract the given set from this set, in place. @@ -226,6 +271,7 @@ impl<I: Interval> IntervalSet<I> { a += 1; } self.ranges.drain(..drain_end); + self.folded = self.folded && other.folded; } /// Compute the symmetric difference of the two sets, in place. @@ -251,6 +297,8 @@ impl<I: Interval> IntervalSet<I> { if self.ranges.is_empty() { let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); self.ranges.push(I::create(min, max)); + // The set containing everything must case folded. + self.folded = true; return; } @@ -276,6 +324,19 @@ impl<I: Interval> IntervalSet<I> { self.ranges.push(I::create(lower, I::Bound::max_value())); } self.ranges.drain(..drain_end); + // We don't need to update whether this set is folded or not, because + // it is conservatively preserved through negation. Namely, if a set + // is not folded, then it is possible that its negation is folded, for + // example, [^☃]. But we're fine with assuming that the set is not + // folded in that case. (`folded` permits false negatives but not false + // positives.) + // + // But what about when a set is folded, is its negation also + // necessarily folded? Yes. Because if a set is folded, then for every + // character in the set, it necessarily included its equivalence class + // of case folded characters. Negating it in turn means that all + // equivalence classes in the set are negated, and any equivalence + // class that was previously not in the set is now entirely in the set. } /// Converts this set into a canonical ordering. @@ -481,7 +542,7 @@ impl Bound for u8 { u8::MAX } fn as_u32(self) -> u32 { - self as u32 + u32::from(self) } fn increment(self) -> Self { self.checked_add(1).unwrap() @@ -499,20 +560,20 @@ impl Bound for char { '\u{10FFFF}' } fn as_u32(self) -> u32 { - self as u32 + u32::from(self) } fn increment(self) -> Self { match self { '\u{D7FF}' => '\u{E000}', - c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(), + c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(), } } fn decrement(self) -> Self { match self { '\u{E000}' => '\u{D7FF}', - c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(), + c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(), } } } |