/// Slot is a single saved capture location. Note that there are two slots for /// every capture in a regular expression (one slot each for the start and end /// of the capture). pub type Slot = Option; /// Locations represents the offsets of each capturing group in a regex for /// a single match. /// /// Unlike `Captures`, a `Locations` value only stores offsets. #[doc(hidden)] #[derive(Clone, Debug)] pub struct Locations(Vec); impl Locations { /// Returns the start and end positions of the Nth capture group. Returns /// `None` if `i` is not a valid capture group or if the capture group did /// not match anything. The positions returned are *always* byte indices /// with respect to the original string matched. pub fn pos(&self, i: usize) -> Option<(usize, usize)> { let (s, e) = (i * 2, i * 2 + 1); match (self.0.get(s), self.0.get(e)) { (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), _ => None, } } /// Creates an iterator of all the capture group positions in order of /// appearance in the regular expression. Positions are byte indices /// in terms of the original string matched. pub fn iter(&self) -> SubCapturesPosIter { SubCapturesPosIter { idx: 0, locs: self } } /// Returns the total number of capturing groups. /// /// This is always at least `1` since every regex has at least `1` /// capturing group that corresponds to the entire match. pub fn len(&self) -> usize { self.0.len() / 2 } /// Return the individual slots as a slice. pub(crate) fn as_slots(&mut self) -> &mut [Slot] { &mut self.0 } } /// An iterator over capture group positions for a particular match of a /// regular expression. /// /// Positions are byte indices in terms of the original string matched. /// /// `'c` is the lifetime of the captures. pub struct SubCapturesPosIter<'c> { idx: usize, locs: &'c Locations, } impl<'c> Iterator for SubCapturesPosIter<'c> { type Item = Option<(usize, usize)>; fn next(&mut self) -> Option> { if self.idx >= self.locs.len() { return None; } let x = match self.locs.pos(self.idx) { None => Some(None), Some((s, e)) => Some(Some((s, e))), }; self.idx += 1; x } } /// `RegularExpression` describes types that can implement regex searching. /// /// This trait is my attempt at reducing code duplication and to standardize /// the internal API. Specific duplication that is avoided are the `find` /// and `capture` iterators, which are slightly tricky. /// /// It's not clear whether this trait is worth it, and it also isn't /// clear whether it's useful as a public trait or not. Methods like /// `next_after_empty` reak of bad design, but the rest of the methods seem /// somewhat reasonable. One particular thing this trait would expose would be /// the ability to start the search of a regex anywhere in a haystack, which /// isn't possible in the current public API. pub trait RegularExpression: Sized { /// The type of the haystack. type Text: ?Sized; /// The number of capture slots in the compiled regular expression. This is /// always two times the number of capture groups (two slots per group). fn slots_len(&self) -> usize; /// Allocates fresh space for all capturing groups in this regex. fn locations(&self) -> Locations { Locations(vec![None; self.slots_len()]) } /// Returns the position of the next character after `i`. /// /// For example, a haystack with type `&[u8]` probably returns `i+1`, /// whereas a haystack with type `&str` probably returns `i` plus the /// length of the next UTF-8 sequence. fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize; /// Returns the location of the shortest match. fn shortest_match_at( &self, text: &Self::Text, start: usize, ) -> Option; /// Returns whether the regex matches the text given. fn is_match_at(&self, text: &Self::Text, start: usize) -> bool; /// Returns the leftmost-first match location if one exists. fn find_at( &self, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>; /// Returns the leftmost-first match location if one exists, and also /// fills in any matching capture slot locations. fn captures_read_at( &self, locs: &mut Locations, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>; /// Returns an iterator over all non-overlapping successive leftmost-first /// matches. fn find_iter(self, text: &Self::Text) -> Matches { Matches { re: self, text: text, last_end: 0, last_match: None } } /// Returns an iterator over all non-overlapping successive leftmost-first /// matches with captures. fn captures_iter(self, text: &Self::Text) -> CaptureMatches { CaptureMatches(self.find_iter(text)) } } /// An iterator over all non-overlapping successive leftmost-first matches. pub struct Matches<'t, R> where R: RegularExpression, R::Text: 't, { re: R, text: &'t R::Text, last_end: usize, last_match: Option, } impl<'t, R> Matches<'t, R> where R: RegularExpression, R::Text: 't, { /// Return the text being searched. pub fn text(&self) -> &'t R::Text { self.text } /// Return the underlying regex. pub fn regex(&self) -> &R { &self.re } } impl<'t, R> Iterator for Matches<'t, R> where R: RegularExpression, R::Text: 't + AsRef<[u8]>, { type Item = (usize, usize); fn next(&mut self) -> Option<(usize, usize)> { if self.last_end > self.text.as_ref().len() { return None; } let (s, e) = match self.re.find_at(self.text, self.last_end) { None => return None, Some((s, e)) => (s, e), }; if s == e { // This is an empty match. To ensure we make progress, start // the next search at the smallest possible starting position // of the next match following this one. self.last_end = self.re.next_after_empty(self.text, e); // Don't accept empty matches immediately following a match. // Just move on to the next match. if Some(e) == self.last_match { return self.next(); } } else { self.last_end = e; } self.last_match = Some(e); Some((s, e)) } } /// An iterator over all non-overlapping successive leftmost-first matches with /// captures. pub struct CaptureMatches<'t, R>(Matches<'t, R>) where R: RegularExpression, R::Text: 't; impl<'t, R> CaptureMatches<'t, R> where R: RegularExpression, R::Text: 't, { /// Return the text being searched. pub fn text(&self) -> &'t R::Text { self.0.text() } /// Return the underlying regex. pub fn regex(&self) -> &R { self.0.regex() } } impl<'t, R> Iterator for CaptureMatches<'t, R> where R: RegularExpression, R::Text: 't + AsRef<[u8]>, { type Item = Locations; fn next(&mut self) -> Option { if self.0.last_end > self.0.text.as_ref().len() { return None; } let mut locs = self.0.re.locations(); let (s, e) = match self.0.re.captures_read_at( &mut locs, self.0.text, self.0.last_end, ) { None => return None, Some((s, e)) => (s, e), }; if s == e { self.0.last_end = self.0.re.next_after_empty(self.0.text, e); if Some(e) == self.0.last_match { return self.next(); } } else { self.0.last_end = e; } self.0.last_match = Some(e); Some(locs) } }