use core::{char, cmp, fmt::Debug, slice}; use alloc::vec::Vec; use crate::unicode; // This module contains an *internal* implementation of interval sets. // // The primary invariant that interval sets guards is canonical ordering. That // is, every interval set contains an ordered sequence of intervals where // no two intervals are overlapping or adjacent. While this invariant is // occasionally broken within the implementation, it should be impossible for // callers to observe it. // // Since case folding (as implemented below) breaks that invariant, we roll // that into this API even though it is a little out of place in an otherwise // generic interval set. (Hence the reason why the `unicode` module is imported // here.) // // Some of the implementation complexity here is a result of me wanting to // preserve the sequential representation without using additional memory. // In many cases, we do use linear extra memory, but it is at most 2x and it // is amortized. If we relaxed the memory requirements, this implementation // could become much simpler. The extra memory is honestly probably OK, but // character classes (especially of the Unicode variety) can become quite // large, and it would be nice to keep regex compilation snappy even in debug // builds. (In the past, I have been careless with this area of code and it has // caused slow regex compilations in debug mode, so this isn't entirely // unwarranted.) // // Tests on this are relegated to the public API of HIR in src/hir.rs. #[derive(Clone, Debug)] pub struct IntervalSet { /// A sorted set of non-overlapping ranges. ranges: Vec, /// While not required at all for correctness, we keep track of whether an /// interval set has been case folded or not. This helps us avoid doing /// redundant work if, for example, a set has already been cased folded. /// And note that whether a set is folded or not is preserved through /// all of the pairwise set operations. That is, if both interval sets /// have been case folded, then any of difference, union, intersection or /// symmetric difference all produce a case folded set. /// /// Note that when this is true, it *must* be the case that the set is case /// folded. But when it's false, the set *may* be case folded. In other /// words, we only set this to true when we know it to be case, but we're /// okay with it being false if it would otherwise be costly to determine /// whether it should be true. This means code cannot assume that a false /// value necessarily indicates that the set is not case folded. /// /// Bottom line: this is a performance optimization. folded: bool, } impl Eq for IntervalSet {} // We implement PartialEq manually so that we don't consider the set's internal // 'folded' property to be part of its identity. The 'folded' property is // strictly an optimization. impl PartialEq for IntervalSet { fn eq(&self, other: &IntervalSet) -> bool { self.ranges.eq(&other.ranges) } } impl IntervalSet { /// Create a new set from a sequence of intervals. Each interval is /// specified as a pair of bounds, where both bounds are inclusive. /// /// The given ranges do not need to be in any specific order, and ranges /// may overlap. pub fn new>(intervals: T) -> IntervalSet { let ranges: Vec = intervals.into_iter().collect(); // An empty set is case folded. let folded = ranges.is_empty(); let mut set = IntervalSet { ranges, folded }; set.canonicalize(); set } /// Add a new interval to this set. pub fn push(&mut self, interval: I) { // TODO: This could be faster. e.g., Push the interval such that // it preserves canonicalization. self.ranges.push(interval); self.canonicalize(); // We don't know whether the new interval added here is considered // case folded, so we conservatively assume that the entire set is // no longer case folded if it was previously. self.folded = false; } /// Return an iterator over all intervals in this set. /// /// The iterator yields intervals in ascending order. pub fn iter(&self) -> IntervalSetIter<'_, I> { IntervalSetIter(self.ranges.iter()) } /// Return an immutable slice of intervals in this set. /// /// The sequence returned is in canonical ordering. pub fn intervals(&self) -> &[I] { &self.ranges } /// Expand this interval set such that it contains all case folded /// characters. For example, if this class consists of the range `a-z`, /// then applying case folding will result in the class containing both the /// ranges `a-z` and `A-Z`. /// /// This returns an error if the necessary case mapping data is not /// available. pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { if self.folded { return Ok(()); } let len = self.ranges.len(); for i in 0..len { let range = self.ranges[i]; if let Err(err) = range.case_fold_simple(&mut self.ranges) { self.canonicalize(); return Err(err); } } self.canonicalize(); self.folded = true; Ok(()) } /// Union this set with the given set, in place. pub fn union(&mut self, other: &IntervalSet) { if other.ranges.is_empty() || self.ranges == other.ranges { return; } // This could almost certainly be done more efficiently. self.ranges.extend(&other.ranges); self.canonicalize(); self.folded = self.folded && other.folded; } /// Intersect this set with the given set, in place. pub fn intersect(&mut self, other: &IntervalSet) { if self.ranges.is_empty() { return; } if other.ranges.is_empty() { self.ranges.clear(); // An empty set is case folded. self.folded = true; return; } // There should be a way to do this in-place with constant memory, // but I couldn't figure out a simple way to do it. So just append // the intersection to the end of this range, and then drain it before // we're done. let drain_end = self.ranges.len(); let mut ita = 0..drain_end; let mut itb = 0..other.ranges.len(); let mut a = ita.next().unwrap(); let mut b = itb.next().unwrap(); loop { if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) { self.ranges.push(ab); } let (it, aorb) = if self.ranges[a].upper() < other.ranges[b].upper() { (&mut ita, &mut a) } else { (&mut itb, &mut b) }; match it.next() { Some(v) => *aorb = v, None => break, } } self.ranges.drain(..drain_end); self.folded = self.folded && other.folded; } /// Subtract the given set from this set, in place. pub fn difference(&mut self, other: &IntervalSet) { if self.ranges.is_empty() || other.ranges.is_empty() { return; } // This algorithm is (to me) surprisingly complex. A search of the // interwebs indicate that this is a potentially interesting problem. // Folks seem to suggest interval or segment trees, but I'd like to // avoid the overhead (both runtime and conceptual) of that. // // The following is basically my Shitty First Draft. Therefore, in // order to grok it, you probably need to read each line carefully. // Simplifications are most welcome! // // Remember, we can assume the canonical format invariant here, which // says that all ranges are sorted, not overlapping and not adjacent in // each class. let drain_end = self.ranges.len(); let (mut a, mut b) = (0, 0); 'LOOP: while a < drain_end && b < other.ranges.len() { // Basically, the easy cases are when neither range overlaps with // each other. If the `b` range is less than our current `a` // range, then we can skip it and move on. if other.ranges[b].upper() < self.ranges[a].lower() { b += 1; continue; } // ... similarly for the `a` range. If it's less than the smallest // `b` range, then we can add it as-is. if self.ranges[a].upper() < other.ranges[b].lower() { let range = self.ranges[a]; self.ranges.push(range); a += 1; continue; } // Otherwise, we have overlapping ranges. assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); // This part is tricky and was non-obvious to me without looking // at explicit examples (see the tests). The trickiness stems from // two things: 1) subtracting a range from another range could // yield two ranges and 2) after subtracting a range, it's possible // that future ranges can have an impact. The loop below advances // the `b` ranges until they can't possible impact the current // range. // // For example, if our `a` range is `a-t` and our next three `b` // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply // subtraction three times before moving on to the next `a` range. let mut range = self.ranges[a]; while b < other.ranges.len() && !range.is_intersection_empty(&other.ranges[b]) { let old_range = range; range = match range.difference(&other.ranges[b]) { (None, None) => { // We lost the entire range, so move on to the next // without adding this one. a += 1; continue 'LOOP; } (Some(range1), None) | (None, Some(range1)) => range1, (Some(range1), Some(range2)) => { self.ranges.push(range1); range2 } }; // It's possible that the `b` range has more to contribute // here. In particular, if it is greater than the original // range, then it might impact the next `a` range *and* it // has impacted the current `a` range as much as possible, // so we can quit. We don't bump `b` so that the next `a` // range can apply it. if other.ranges[b].upper() > old_range.upper() { break; } // Otherwise, the next `b` range might apply to the current // `a` range. b += 1; } self.ranges.push(range); a += 1; } while a < drain_end { let range = self.ranges[a]; self.ranges.push(range); a += 1; } self.ranges.drain(..drain_end); self.folded = self.folded && other.folded; } /// Compute the symmetric difference of the two sets, in place. /// /// This computes the symmetric difference of two interval sets. This /// removes all elements in this set that are also in the given set, /// but also adds all elements from the given set that aren't in this /// set. That is, the set will contain all elements in either set, /// but will not contain any elements that are in both sets. pub fn symmetric_difference(&mut self, other: &IntervalSet) { // TODO(burntsushi): Fix this so that it amortizes allocation. let mut intersection = self.clone(); intersection.intersect(other); self.union(other); self.difference(&intersection); } /// Negate this interval set. /// /// For all `x` where `x` is any element, if `x` was in this set, then it /// will not be in this set after negation. pub fn negate(&mut self) { if self.ranges.is_empty() { let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); self.ranges.push(I::create(min, max)); // The set containing everything must case folded. self.folded = true; return; } // There should be a way to do this in-place with constant memory, // but I couldn't figure out a simple way to do it. So just append // the negation to the end of this range, and then drain it before // we're done. let drain_end = self.ranges.len(); // We do checked arithmetic below because of the canonical ordering // invariant. if self.ranges[0].lower() > I::Bound::min_value() { let upper = self.ranges[0].lower().decrement(); self.ranges.push(I::create(I::Bound::min_value(), upper)); } for i in 1..drain_end { let lower = self.ranges[i - 1].upper().increment(); let upper = self.ranges[i].lower().decrement(); self.ranges.push(I::create(lower, upper)); } if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { let lower = self.ranges[drain_end - 1].upper().increment(); self.ranges.push(I::create(lower, I::Bound::max_value())); } self.ranges.drain(..drain_end); // We don't need to update whether this set is folded or not, because // it is conservatively preserved through negation. Namely, if a set // is not folded, then it is possible that its negation is folded, for // example, [^☃]. But we're fine with assuming that the set is not // folded in that case. (`folded` permits false negatives but not false // positives.) // // But what about when a set is folded, is its negation also // necessarily folded? Yes. Because if a set is folded, then for every // character in the set, it necessarily included its equivalence class // of case folded characters. Negating it in turn means that all // equivalence classes in the set are negated, and any equivalence // class that was previously not in the set is now entirely in the set. } /// Converts this set into a canonical ordering. fn canonicalize(&mut self) { if self.is_canonical() { return; } self.ranges.sort(); assert!(!self.ranges.is_empty()); // Is there a way to do this in-place with constant memory? I couldn't // figure out a way to do it. So just append the canonicalization to // the end of this range, and then drain it before we're done. let drain_end = self.ranges.len(); for oldi in 0..drain_end { // If we've added at least one new range, then check if we can // merge this range in the previously added range. if self.ranges.len() > drain_end { let (last, rest) = self.ranges.split_last_mut().unwrap(); if let Some(union) = last.union(&rest[oldi]) { *last = union; continue; } } let range = self.ranges[oldi]; self.ranges.push(range); } self.ranges.drain(..drain_end); } /// Returns true if and only if this class is in a canonical ordering. fn is_canonical(&self) -> bool { for pair in self.ranges.windows(2) { if pair[0] >= pair[1] { return false; } if pair[0].is_contiguous(&pair[1]) { return false; } } true } } /// An iterator over intervals. #[derive(Debug)] pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>); impl<'a, I> Iterator for IntervalSetIter<'a, I> { type Item = &'a I; fn next(&mut self) -> Option<&'a I> { self.0.next() } } pub trait Interval: Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord { type Bound: Bound; fn lower(&self) -> Self::Bound; fn upper(&self) -> Self::Bound; fn set_lower(&mut self, bound: Self::Bound); fn set_upper(&mut self, bound: Self::Bound); fn case_fold_simple( &self, intervals: &mut Vec, ) -> Result<(), unicode::CaseFoldError>; /// Create a new interval. fn create(lower: Self::Bound, upper: Self::Bound) -> Self { let mut int = Self::default(); if lower <= upper { int.set_lower(lower); int.set_upper(upper); } else { int.set_lower(upper); int.set_upper(lower); } int } /// Union the given overlapping range into this range. /// /// If the two ranges aren't contiguous, then this returns `None`. fn union(&self, other: &Self) -> Option { if !self.is_contiguous(other) { return None; } let lower = cmp::min(self.lower(), other.lower()); let upper = cmp::max(self.upper(), other.upper()); Some(Self::create(lower, upper)) } /// Intersect this range with the given range and return the result. /// /// If the intersection is empty, then this returns `None`. fn intersect(&self, other: &Self) -> Option { let lower = cmp::max(self.lower(), other.lower()); let upper = cmp::min(self.upper(), other.upper()); if lower <= upper { Some(Self::create(lower, upper)) } else { None } } /// Subtract the given range from this range and return the resulting /// ranges. /// /// If subtraction would result in an empty range, then no ranges are /// returned. fn difference(&self, other: &Self) -> (Option, Option) { if self.is_subset(other) { return (None, None); } if self.is_intersection_empty(other) { return (Some(self.clone()), None); } let add_lower = other.lower() > self.lower(); let add_upper = other.upper() < self.upper(); // We know this because !self.is_subset(other) and the ranges have // a non-empty intersection. assert!(add_lower || add_upper); let mut ret = (None, None); if add_lower { let upper = other.lower().decrement(); ret.0 = Some(Self::create(self.lower(), upper)); } if add_upper { let lower = other.upper().increment(); let range = Self::create(lower, self.upper()); if ret.0.is_none() { ret.0 = Some(range); } else { ret.1 = Some(range); } } ret } /// Compute the symmetric difference the given range from this range. This /// returns the union of the two ranges minus its intersection. fn symmetric_difference( &self, other: &Self, ) -> (Option, Option) { let union = match self.union(other) { None => return (Some(self.clone()), Some(other.clone())), Some(union) => union, }; let intersection = match self.intersect(other) { None => return (Some(self.clone()), Some(other.clone())), Some(intersection) => intersection, }; union.difference(&intersection) } /// Returns true if and only if the two ranges are contiguous. Two ranges /// are contiguous if and only if the ranges are either overlapping or /// adjacent. fn is_contiguous(&self, other: &Self) -> bool { let lower1 = self.lower().as_u32(); let upper1 = self.upper().as_u32(); let lower2 = other.lower().as_u32(); let upper2 = other.upper().as_u32(); cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1) } /// Returns true if and only if the intersection of this range and the /// other range is empty. fn is_intersection_empty(&self, other: &Self) -> bool { let (lower1, upper1) = (self.lower(), self.upper()); let (lower2, upper2) = (other.lower(), other.upper()); cmp::max(lower1, lower2) > cmp::min(upper1, upper2) } /// Returns true if and only if this range is a subset of the other range. fn is_subset(&self, other: &Self) -> bool { let (lower1, upper1) = (self.lower(), self.upper()); let (lower2, upper2) = (other.lower(), other.upper()); (lower2 <= lower1 && lower1 <= upper2) && (lower2 <= upper1 && upper1 <= upper2) } } pub trait Bound: Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord { fn min_value() -> Self; fn max_value() -> Self; fn as_u32(self) -> u32; fn increment(self) -> Self; fn decrement(self) -> Self; } impl Bound for u8 { fn min_value() -> Self { u8::MIN } fn max_value() -> Self { u8::MAX } fn as_u32(self) -> u32 { u32::from(self) } fn increment(self) -> Self { self.checked_add(1).unwrap() } fn decrement(self) -> Self { self.checked_sub(1).unwrap() } } impl Bound for char { fn min_value() -> Self { '\x00' } fn max_value() -> Self { '\u{10FFFF}' } fn as_u32(self) -> u32 { u32::from(self) } fn increment(self) -> Self { match self { '\u{D7FF}' => '\u{E000}', c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(), } } fn decrement(self) -> Self { match self { '\u{E000}' => '\u{D7FF}', c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(), } } } // Tests for interval sets are written in src/hir.rs against the public API.