summaryrefslogtreecommitdiffstats
path: root/vendor/regex-syntax/src/hir/interval.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex-syntax/src/hir/interval.rs')
-rw-r--r--vendor/regex-syntax/src/hir/interval.rs83
1 files changed, 72 insertions, 11 deletions
diff --git a/vendor/regex-syntax/src/hir/interval.rs b/vendor/regex-syntax/src/hir/interval.rs
index 56698c53a..e063390a8 100644
--- a/vendor/regex-syntax/src/hir/interval.rs
+++ b/vendor/regex-syntax/src/hir/interval.rs
@@ -1,8 +1,6 @@
-use std::char;
-use std::cmp;
-use std::fmt::Debug;
-use std::slice;
-use std::u8;
+use core::{char, cmp, fmt::Debug, slice};
+
+use alloc::vec::Vec;
use crate::unicode;
@@ -32,9 +30,38 @@ use crate::unicode;
//
// Tests on this are relegated to the public API of HIR in src/hir.rs.
-#[derive(Clone, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug)]
pub struct IntervalSet<I> {
+ /// A sorted set of non-overlapping ranges.
ranges: Vec<I>,
+ /// While not required at all for correctness, we keep track of whether an
+ /// interval set has been case folded or not. This helps us avoid doing
+ /// redundant work if, for example, a set has already been cased folded.
+ /// And note that whether a set is folded or not is preserved through
+ /// all of the pairwise set operations. That is, if both interval sets
+ /// have been case folded, then any of difference, union, intersection or
+ /// symmetric difference all produce a case folded set.
+ ///
+ /// Note that when this is true, it *must* be the case that the set is case
+ /// folded. But when it's false, the set *may* be case folded. In other
+ /// words, we only set this to true when we know it to be case, but we're
+ /// okay with it being false if it would otherwise be costly to determine
+ /// whether it should be true. This means code cannot assume that a false
+ /// value necessarily indicates that the set is not case folded.
+ ///
+ /// Bottom line: this is a performance optimization.
+ folded: bool,
+}
+
+impl<I: Interval> Eq for IntervalSet<I> {}
+
+// We implement PartialEq manually so that we don't consider the set's internal
+// 'folded' property to be part of its identity. The 'folded' property is
+// strictly an optimization.
+impl<I: Interval> PartialEq for IntervalSet<I> {
+ fn eq(&self, other: &IntervalSet<I>) -> bool {
+ self.ranges.eq(&other.ranges)
+ }
}
impl<I: Interval> IntervalSet<I> {
@@ -44,7 +71,10 @@ impl<I: Interval> IntervalSet<I> {
/// The given ranges do not need to be in any specific order, and ranges
/// may overlap.
pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
- let mut set = IntervalSet { ranges: intervals.into_iter().collect() };
+ let ranges: Vec<I> = intervals.into_iter().collect();
+ // An empty set is case folded.
+ let folded = ranges.is_empty();
+ let mut set = IntervalSet { ranges, folded };
set.canonicalize();
set
}
@@ -55,6 +85,10 @@ impl<I: Interval> IntervalSet<I> {
// it preserves canonicalization.
self.ranges.push(interval);
self.canonicalize();
+ // We don't know whether the new interval added here is considered
+ // case folded, so we conservatively assume that the entire set is
+ // no longer case folded if it was previously.
+ self.folded = false;
}
/// Return an iterator over all intervals in this set.
@@ -79,6 +113,9 @@ impl<I: Interval> IntervalSet<I> {
/// This returns an error if the necessary case mapping data is not
/// available.
pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
+ if self.folded {
+ return Ok(());
+ }
let len = self.ranges.len();
for i in 0..len {
let range = self.ranges[i];
@@ -88,14 +125,19 @@ impl<I: Interval> IntervalSet<I> {
}
}
self.canonicalize();
+ self.folded = true;
Ok(())
}
/// Union this set with the given set, in place.
pub fn union(&mut self, other: &IntervalSet<I>) {
+ if other.ranges.is_empty() || self.ranges == other.ranges {
+ return;
+ }
// This could almost certainly be done more efficiently.
self.ranges.extend(&other.ranges);
self.canonicalize();
+ self.folded = self.folded && other.folded;
}
/// Intersect this set with the given set, in place.
@@ -105,6 +147,8 @@ impl<I: Interval> IntervalSet<I> {
}
if other.ranges.is_empty() {
self.ranges.clear();
+ // An empty set is case folded.
+ self.folded = true;
return;
}
@@ -134,6 +178,7 @@ impl<I: Interval> IntervalSet<I> {
}
}
self.ranges.drain(..drain_end);
+ self.folded = self.folded && other.folded;
}
/// Subtract the given set from this set, in place.
@@ -226,6 +271,7 @@ impl<I: Interval> IntervalSet<I> {
a += 1;
}
self.ranges.drain(..drain_end);
+ self.folded = self.folded && other.folded;
}
/// Compute the symmetric difference of the two sets, in place.
@@ -251,6 +297,8 @@ impl<I: Interval> IntervalSet<I> {
if self.ranges.is_empty() {
let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
self.ranges.push(I::create(min, max));
+ // The set containing everything must case folded.
+ self.folded = true;
return;
}
@@ -276,6 +324,19 @@ impl<I: Interval> IntervalSet<I> {
self.ranges.push(I::create(lower, I::Bound::max_value()));
}
self.ranges.drain(..drain_end);
+ // We don't need to update whether this set is folded or not, because
+ // it is conservatively preserved through negation. Namely, if a set
+ // is not folded, then it is possible that its negation is folded, for
+ // example, [^☃]. But we're fine with assuming that the set is not
+ // folded in that case. (`folded` permits false negatives but not false
+ // positives.)
+ //
+ // But what about when a set is folded, is its negation also
+ // necessarily folded? Yes. Because if a set is folded, then for every
+ // character in the set, it necessarily included its equivalence class
+ // of case folded characters. Negating it in turn means that all
+ // equivalence classes in the set are negated, and any equivalence
+ // class that was previously not in the set is now entirely in the set.
}
/// Converts this set into a canonical ordering.
@@ -481,7 +542,7 @@ impl Bound for u8 {
u8::MAX
}
fn as_u32(self) -> u32 {
- self as u32
+ u32::from(self)
}
fn increment(self) -> Self {
self.checked_add(1).unwrap()
@@ -499,20 +560,20 @@ impl Bound for char {
'\u{10FFFF}'
}
fn as_u32(self) -> u32 {
- self as u32
+ u32::from(self)
}
fn increment(self) -> Self {
match self {
'\u{D7FF}' => '\u{E000}',
- c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(),
+ c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(),
}
}
fn decrement(self) -> Self {
match self {
'\u{E000}' => '\u{D7FF}',
- c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(),
+ c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(),
}
}
}