summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-syntax/src/hir/interval.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex-syntax/src/hir/interval.rs')
-rw-r--r--third_party/rust/regex-syntax/src/hir/interval.rs520
1 files changed, 520 insertions, 0 deletions
diff --git a/third_party/rust/regex-syntax/src/hir/interval.rs b/third_party/rust/regex-syntax/src/hir/interval.rs
new file mode 100644
index 0000000000..56698c53af
--- /dev/null
+++ b/third_party/rust/regex-syntax/src/hir/interval.rs
@@ -0,0 +1,520 @@
+use std::char;
+use std::cmp;
+use std::fmt::Debug;
+use std::slice;
+use std::u8;
+
+use crate::unicode;
+
+// This module contains an *internal* implementation of interval sets.
+//
+// The primary invariant that interval sets guards is canonical ordering. That
+// is, every interval set contains an ordered sequence of intervals where
+// no two intervals are overlapping or adjacent. While this invariant is
+// occasionally broken within the implementation, it should be impossible for
+// callers to observe it.
+//
+// Since case folding (as implemented below) breaks that invariant, we roll
+// that into this API even though it is a little out of place in an otherwise
+// generic interval set. (Hence the reason why the `unicode` module is imported
+// here.)
+//
+// Some of the implementation complexity here is a result of me wanting to
+// preserve the sequential representation without using additional memory.
+// In many cases, we do use linear extra memory, but it is at most 2x and it
+// is amortized. If we relaxed the memory requirements, this implementation
+// could become much simpler. The extra memory is honestly probably OK, but
+// character classes (especially of the Unicode variety) can become quite
+// large, and it would be nice to keep regex compilation snappy even in debug
+// builds. (In the past, I have been careless with this area of code and it has
+// caused slow regex compilations in debug mode, so this isn't entirely
+// unwarranted.)
+//
+// Tests on this are relegated to the public API of HIR in src/hir.rs.
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct IntervalSet<I> {
+ ranges: Vec<I>,
+}
+
+impl<I: Interval> IntervalSet<I> {
+ /// Create a new set from a sequence of intervals. Each interval is
+ /// specified as a pair of bounds, where both bounds are inclusive.
+ ///
+ /// The given ranges do not need to be in any specific order, and ranges
+ /// may overlap.
+ pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
+ let mut set = IntervalSet { ranges: intervals.into_iter().collect() };
+ set.canonicalize();
+ set
+ }
+
+ /// Add a new interval to this set.
+ pub fn push(&mut self, interval: I) {
+ // TODO: This could be faster. e.g., Push the interval such that
+ // it preserves canonicalization.
+ self.ranges.push(interval);
+ self.canonicalize();
+ }
+
+ /// Return an iterator over all intervals in this set.
+ ///
+ /// The iterator yields intervals in ascending order.
+ pub fn iter(&self) -> IntervalSetIter<'_, I> {
+ IntervalSetIter(self.ranges.iter())
+ }
+
+ /// Return an immutable slice of intervals in this set.
+ ///
+ /// The sequence returned is in canonical ordering.
+ pub fn intervals(&self) -> &[I] {
+ &self.ranges
+ }
+
+ /// Expand this interval set such that it contains all case folded
+ /// characters. For example, if this class consists of the range `a-z`,
+ /// then applying case folding will result in the class containing both the
+ /// ranges `a-z` and `A-Z`.
+ ///
+ /// This returns an error if the necessary case mapping data is not
+ /// available.
+ pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
+ let len = self.ranges.len();
+ for i in 0..len {
+ let range = self.ranges[i];
+ if let Err(err) = range.case_fold_simple(&mut self.ranges) {
+ self.canonicalize();
+ return Err(err);
+ }
+ }
+ self.canonicalize();
+ Ok(())
+ }
+
+ /// Union this set with the given set, in place.
+ pub fn union(&mut self, other: &IntervalSet<I>) {
+ // This could almost certainly be done more efficiently.
+ self.ranges.extend(&other.ranges);
+ self.canonicalize();
+ }
+
+ /// Intersect this set with the given set, in place.
+ pub fn intersect(&mut self, other: &IntervalSet<I>) {
+ if self.ranges.is_empty() {
+ return;
+ }
+ if other.ranges.is_empty() {
+ self.ranges.clear();
+ return;
+ }
+
+ // There should be a way to do this in-place with constant memory,
+ // but I couldn't figure out a simple way to do it. So just append
+ // the intersection to the end of this range, and then drain it before
+ // we're done.
+ let drain_end = self.ranges.len();
+
+ let mut ita = 0..drain_end;
+ let mut itb = 0..other.ranges.len();
+ let mut a = ita.next().unwrap();
+ let mut b = itb.next().unwrap();
+ loop {
+ if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
+ self.ranges.push(ab);
+ }
+ let (it, aorb) =
+ if self.ranges[a].upper() < other.ranges[b].upper() {
+ (&mut ita, &mut a)
+ } else {
+ (&mut itb, &mut b)
+ };
+ match it.next() {
+ Some(v) => *aorb = v,
+ None => break,
+ }
+ }
+ self.ranges.drain(..drain_end);
+ }
+
+ /// Subtract the given set from this set, in place.
+ pub fn difference(&mut self, other: &IntervalSet<I>) {
+ if self.ranges.is_empty() || other.ranges.is_empty() {
+ return;
+ }
+
+ // This algorithm is (to me) surprisingly complex. A search of the
+ // interwebs indicate that this is a potentially interesting problem.
+ // Folks seem to suggest interval or segment trees, but I'd like to
+ // avoid the overhead (both runtime and conceptual) of that.
+ //
+ // The following is basically my Shitty First Draft. Therefore, in
+ // order to grok it, you probably need to read each line carefully.
+ // Simplifications are most welcome!
+ //
+ // Remember, we can assume the canonical format invariant here, which
+ // says that all ranges are sorted, not overlapping and not adjacent in
+ // each class.
+ let drain_end = self.ranges.len();
+ let (mut a, mut b) = (0, 0);
+ 'LOOP: while a < drain_end && b < other.ranges.len() {
+ // Basically, the easy cases are when neither range overlaps with
+ // each other. If the `b` range is less than our current `a`
+ // range, then we can skip it and move on.
+ if other.ranges[b].upper() < self.ranges[a].lower() {
+ b += 1;
+ continue;
+ }
+ // ... similarly for the `a` range. If it's less than the smallest
+ // `b` range, then we can add it as-is.
+ if self.ranges[a].upper() < other.ranges[b].lower() {
+ let range = self.ranges[a];
+ self.ranges.push(range);
+ a += 1;
+ continue;
+ }
+ // Otherwise, we have overlapping ranges.
+ assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
+
+ // This part is tricky and was non-obvious to me without looking
+ // at explicit examples (see the tests). The trickiness stems from
+ // two things: 1) subtracting a range from another range could
+ // yield two ranges and 2) after subtracting a range, it's possible
+ // that future ranges can have an impact. The loop below advances
+ // the `b` ranges until they can't possible impact the current
+ // range.
+ //
+ // For example, if our `a` range is `a-t` and our next three `b`
+ // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
+ // subtraction three times before moving on to the next `a` range.
+ let mut range = self.ranges[a];
+ while b < other.ranges.len()
+ && !range.is_intersection_empty(&other.ranges[b])
+ {
+ let old_range = range;
+ range = match range.difference(&other.ranges[b]) {
+ (None, None) => {
+ // We lost the entire range, so move on to the next
+ // without adding this one.
+ a += 1;
+ continue 'LOOP;
+ }
+ (Some(range1), None) | (None, Some(range1)) => range1,
+ (Some(range1), Some(range2)) => {
+ self.ranges.push(range1);
+ range2
+ }
+ };
+ // It's possible that the `b` range has more to contribute
+ // here. In particular, if it is greater than the original
+ // range, then it might impact the next `a` range *and* it
+ // has impacted the current `a` range as much as possible,
+ // so we can quit. We don't bump `b` so that the next `a`
+ // range can apply it.
+ if other.ranges[b].upper() > old_range.upper() {
+ break;
+ }
+ // Otherwise, the next `b` range might apply to the current
+ // `a` range.
+ b += 1;
+ }
+ self.ranges.push(range);
+ a += 1;
+ }
+ while a < drain_end {
+ let range = self.ranges[a];
+ self.ranges.push(range);
+ a += 1;
+ }
+ self.ranges.drain(..drain_end);
+ }
+
+ /// Compute the symmetric difference of the two sets, in place.
+ ///
+ /// This computes the symmetric difference of two interval sets. This
+ /// removes all elements in this set that are also in the given set,
+ /// but also adds all elements from the given set that aren't in this
+ /// set. That is, the set will contain all elements in either set,
+ /// but will not contain any elements that are in both sets.
+ pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
+ // TODO(burntsushi): Fix this so that it amortizes allocation.
+ let mut intersection = self.clone();
+ intersection.intersect(other);
+ self.union(other);
+ self.difference(&intersection);
+ }
+
+ /// Negate this interval set.
+ ///
+ /// For all `x` where `x` is any element, if `x` was in this set, then it
+ /// will not be in this set after negation.
+ pub fn negate(&mut self) {
+ if self.ranges.is_empty() {
+ let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
+ self.ranges.push(I::create(min, max));
+ return;
+ }
+
+ // There should be a way to do this in-place with constant memory,
+ // but I couldn't figure out a simple way to do it. So just append
+ // the negation to the end of this range, and then drain it before
+ // we're done.
+ let drain_end = self.ranges.len();
+
+ // We do checked arithmetic below because of the canonical ordering
+ // invariant.
+ if self.ranges[0].lower() > I::Bound::min_value() {
+ let upper = self.ranges[0].lower().decrement();
+ self.ranges.push(I::create(I::Bound::min_value(), upper));
+ }
+ for i in 1..drain_end {
+ let lower = self.ranges[i - 1].upper().increment();
+ let upper = self.ranges[i].lower().decrement();
+ self.ranges.push(I::create(lower, upper));
+ }
+ if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
+ let lower = self.ranges[drain_end - 1].upper().increment();
+ self.ranges.push(I::create(lower, I::Bound::max_value()));
+ }
+ self.ranges.drain(..drain_end);
+ }
+
+ /// Converts this set into a canonical ordering.
+ fn canonicalize(&mut self) {
+ if self.is_canonical() {
+ return;
+ }
+ self.ranges.sort();
+ assert!(!self.ranges.is_empty());
+
+ // Is there a way to do this in-place with constant memory? I couldn't
+ // figure out a way to do it. So just append the canonicalization to
+ // the end of this range, and then drain it before we're done.
+ let drain_end = self.ranges.len();
+ for oldi in 0..drain_end {
+ // If we've added at least one new range, then check if we can
+ // merge this range in the previously added range.
+ if self.ranges.len() > drain_end {
+ let (last, rest) = self.ranges.split_last_mut().unwrap();
+ if let Some(union) = last.union(&rest[oldi]) {
+ *last = union;
+ continue;
+ }
+ }
+ let range = self.ranges[oldi];
+ self.ranges.push(range);
+ }
+ self.ranges.drain(..drain_end);
+ }
+
+ /// Returns true if and only if this class is in a canonical ordering.
+ fn is_canonical(&self) -> bool {
+ for pair in self.ranges.windows(2) {
+ if pair[0] >= pair[1] {
+ return false;
+ }
+ if pair[0].is_contiguous(&pair[1]) {
+ return false;
+ }
+ }
+ true
+ }
+}
+
+/// An iterator over intervals.
+#[derive(Debug)]
+pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);
+
+impl<'a, I> Iterator for IntervalSetIter<'a, I> {
+ type Item = &'a I;
+
+ fn next(&mut self) -> Option<&'a I> {
+ self.0.next()
+ }
+}
+
+pub trait Interval:
+ Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
+{
+ type Bound: Bound;
+
+ fn lower(&self) -> Self::Bound;
+ fn upper(&self) -> Self::Bound;
+ fn set_lower(&mut self, bound: Self::Bound);
+ fn set_upper(&mut self, bound: Self::Bound);
+ fn case_fold_simple(
+ &self,
+ intervals: &mut Vec<Self>,
+ ) -> Result<(), unicode::CaseFoldError>;
+
+ /// Create a new interval.
+ fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
+ let mut int = Self::default();
+ if lower <= upper {
+ int.set_lower(lower);
+ int.set_upper(upper);
+ } else {
+ int.set_lower(upper);
+ int.set_upper(lower);
+ }
+ int
+ }
+
+ /// Union the given overlapping range into this range.
+ ///
+ /// If the two ranges aren't contiguous, then this returns `None`.
+ fn union(&self, other: &Self) -> Option<Self> {
+ if !self.is_contiguous(other) {
+ return None;
+ }
+ let lower = cmp::min(self.lower(), other.lower());
+ let upper = cmp::max(self.upper(), other.upper());
+ Some(Self::create(lower, upper))
+ }
+
+ /// Intersect this range with the given range and return the result.
+ ///
+ /// If the intersection is empty, then this returns `None`.
+ fn intersect(&self, other: &Self) -> Option<Self> {
+ let lower = cmp::max(self.lower(), other.lower());
+ let upper = cmp::min(self.upper(), other.upper());
+ if lower <= upper {
+ Some(Self::create(lower, upper))
+ } else {
+ None
+ }
+ }
+
+ /// Subtract the given range from this range and return the resulting
+ /// ranges.
+ ///
+ /// If subtraction would result in an empty range, then no ranges are
+ /// returned.
+ fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
+ if self.is_subset(other) {
+ return (None, None);
+ }
+ if self.is_intersection_empty(other) {
+ return (Some(self.clone()), None);
+ }
+ let add_lower = other.lower() > self.lower();
+ let add_upper = other.upper() < self.upper();
+ // We know this because !self.is_subset(other) and the ranges have
+ // a non-empty intersection.
+ assert!(add_lower || add_upper);
+ let mut ret = (None, None);
+ if add_lower {
+ let upper = other.lower().decrement();
+ ret.0 = Some(Self::create(self.lower(), upper));
+ }
+ if add_upper {
+ let lower = other.upper().increment();
+ let range = Self::create(lower, self.upper());
+ if ret.0.is_none() {
+ ret.0 = Some(range);
+ } else {
+ ret.1 = Some(range);
+ }
+ }
+ ret
+ }
+
+ /// Compute the symmetric difference the given range from this range. This
+ /// returns the union of the two ranges minus its intersection.
+ fn symmetric_difference(
+ &self,
+ other: &Self,
+ ) -> (Option<Self>, Option<Self>) {
+ let union = match self.union(other) {
+ None => return (Some(self.clone()), Some(other.clone())),
+ Some(union) => union,
+ };
+ let intersection = match self.intersect(other) {
+ None => return (Some(self.clone()), Some(other.clone())),
+ Some(intersection) => intersection,
+ };
+ union.difference(&intersection)
+ }
+
+ /// Returns true if and only if the two ranges are contiguous. Two ranges
+ /// are contiguous if and only if the ranges are either overlapping or
+ /// adjacent.
+ fn is_contiguous(&self, other: &Self) -> bool {
+ let lower1 = self.lower().as_u32();
+ let upper1 = self.upper().as_u32();
+ let lower2 = other.lower().as_u32();
+ let upper2 = other.upper().as_u32();
+ cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
+ }
+
+ /// Returns true if and only if the intersection of this range and the
+ /// other range is empty.
+ fn is_intersection_empty(&self, other: &Self) -> bool {
+ let (lower1, upper1) = (self.lower(), self.upper());
+ let (lower2, upper2) = (other.lower(), other.upper());
+ cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
+ }
+
+ /// Returns true if and only if this range is a subset of the other range.
+ fn is_subset(&self, other: &Self) -> bool {
+ let (lower1, upper1) = (self.lower(), self.upper());
+ let (lower2, upper2) = (other.lower(), other.upper());
+ (lower2 <= lower1 && lower1 <= upper2)
+ && (lower2 <= upper1 && upper1 <= upper2)
+ }
+}
+
+pub trait Bound:
+ Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
+{
+ fn min_value() -> Self;
+ fn max_value() -> Self;
+ fn as_u32(self) -> u32;
+ fn increment(self) -> Self;
+ fn decrement(self) -> Self;
+}
+
+impl Bound for u8 {
+ fn min_value() -> Self {
+ u8::MIN
+ }
+ fn max_value() -> Self {
+ u8::MAX
+ }
+ fn as_u32(self) -> u32 {
+ self as u32
+ }
+ fn increment(self) -> Self {
+ self.checked_add(1).unwrap()
+ }
+ fn decrement(self) -> Self {
+ self.checked_sub(1).unwrap()
+ }
+}
+
+impl Bound for char {
+ fn min_value() -> Self {
+ '\x00'
+ }
+ fn max_value() -> Self {
+ '\u{10FFFF}'
+ }
+ fn as_u32(self) -> u32 {
+ self as u32
+ }
+
+ fn increment(self) -> Self {
+ match self {
+ '\u{D7FF}' => '\u{E000}',
+ c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(),
+ }
+ }
+
+ fn decrement(self) -> Self {
+ match self {
+ '\u{E000}' => '\u{D7FF}',
+ c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(),
+ }
+ }
+}
+
+// Tests for interval sets are written in src/hir.rs against the public API.