diff options
Diffstat (limited to 'vendor/fst/src/automaton/mod.rs')
-rw-r--r-- | vendor/fst/src/automaton/mod.rs | 492 |
1 files changed, 492 insertions, 0 deletions
diff --git a/vendor/fst/src/automaton/mod.rs b/vendor/fst/src/automaton/mod.rs new file mode 100644 index 000000000..fe503ed6c --- /dev/null +++ b/vendor/fst/src/automaton/mod.rs @@ -0,0 +1,492 @@ +#[cfg(feature = "levenshtein")] +pub use self::levenshtein::{Levenshtein, LevenshteinError}; + +#[cfg(feature = "levenshtein")] +mod levenshtein; + +/// Automaton describes types that behave as a finite automaton. +/// +/// All implementors of this trait are represented by *byte based* automata. +/// Stated differently, all transitions in the automata correspond to a single +/// byte in the input. +/// +/// This implementation choice is important for a couple reasons: +/// +/// 1. The set of possible transitions in each node is small, which may make +/// efficient memory usage easier. +/// 2. The finite state transducers in this crate are all byte based, so any +/// automata used on them must also be byte based. +/// +/// In practice, this does present somewhat of a problem, for example, if +/// you're storing UTF-8 encoded strings in a finite state transducer. Consider +/// using a `Levenshtein` automaton, which accepts a query string and an edit +/// distance. The edit distance should apply to some notion of *character*, +/// which could be represented by at least 1-4 bytes in a UTF-8 encoding (for +/// some definition of "character"). Therefore, the automaton must have UTF-8 +/// decoding built into it. This can be tricky to implement, so you may find +/// the [`utf8-ranges`](https://crates.io/crates/utf8-ranges) crate useful. +pub trait Automaton { + /// The type of the state used in the automaton. + type State; + + /// Returns a single start state for this automaton. + /// + /// This method should always return the same value for each + /// implementation. + fn start(&self) -> Self::State; + + /// Returns true if and only if `state` is a match state. + fn is_match(&self, state: &Self::State) -> bool; + + /// Returns true if and only if `state` can lead to a match in zero or more + /// steps. + /// + /// If this returns `false`, then no sequence of inputs from this state + /// should ever produce a match. If this does not follow, then those match + /// states may never be reached. In other words, behavior may be incorrect. + /// + /// If this returns `true` even when no match is possible, then behavior + /// will be correct, but callers may be forced to do additional work. + fn can_match(&self, _state: &Self::State) -> bool { + true + } + + /// Returns true if and only if `state` matches and must match no matter + /// what steps are taken. + /// + /// If this returns `true`, then every sequence of inputs from this state + /// produces a match. If this does not follow, then those match states may + /// never be reached. In other words, behavior may be incorrect. + /// + /// If this returns `false` even when every sequence of inputs will lead to + /// a match, then behavior will be correct, but callers may be forced to do + /// additional work. + fn will_always_match(&self, _state: &Self::State) -> bool { + false + } + + /// Return the next state given `state` and an input. + fn accept(&self, state: &Self::State, byte: u8) -> Self::State; + + /// If applicable, return the next state when the end of a key is seen. + fn accept_eof(&self, _: &Self::State) -> Option<Self::State> { + None + } + + /// Returns an automaton that matches the strings that start with something + /// this automaton matches. + fn starts_with(self) -> StartsWith<Self> + where + Self: Sized, + { + StartsWith(self) + } + + /// Returns an automaton that matches the strings matched by either this or + /// the other automaton. + fn union<Rhs: Automaton>(self, rhs: Rhs) -> Union<Self, Rhs> + where + Self: Sized, + { + Union(self, rhs) + } + + /// Returns an automaton that matches the strings matched by both this and + /// the other automaton. + fn intersection<Rhs: Automaton>(self, rhs: Rhs) -> Intersection<Self, Rhs> + where + Self: Sized, + { + Intersection(self, rhs) + } + + /// Returns an automaton that matches the strings not matched by this + /// automaton. + fn complement(self) -> Complement<Self> + where + Self: Sized, + { + Complement(self) + } +} + +impl<'a, T: Automaton> Automaton for &'a T { + type State = T::State; + + fn start(&self) -> T::State { + (*self).start() + } + + fn is_match(&self, state: &T::State) -> bool { + (*self).is_match(state) + } + + fn can_match(&self, state: &T::State) -> bool { + (*self).can_match(state) + } + + fn will_always_match(&self, state: &T::State) -> bool { + (*self).will_always_match(state) + } + + fn accept(&self, state: &T::State, byte: u8) -> T::State { + (*self).accept(state, byte) + } + + fn accept_eof(&self, state: &Self::State) -> Option<Self::State> { + (*self).accept_eof(state) + } +} + +/// An automaton that matches if the input equals to a specific string. +/// +/// It can be used in combination with [`StartsWith`] to search strings +/// starting with a given prefix. +/// +/// ```rust +/// extern crate fst; +/// +/// use fst::{Automaton, IntoStreamer, Streamer, Set}; +/// use fst::automaton::Str; +/// +/// # fn main() { example().unwrap(); } +/// fn example() -> Result<(), Box<dyn std::error::Error>> { +/// let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"]; +/// let set = Set::from_iter(paths)?; +/// +/// // Build our prefix query. +/// let prefix = Str::new("/home").starts_with(); +/// +/// // Apply our query to the set we built. +/// let mut stream = set.search(prefix).into_stream(); +/// +/// let matches = stream.into_strs()?; +/// assert_eq!(matches, vec!["/home/projects/bar", "/home/projects/foo"]); +/// Ok(()) +/// } +/// ``` +#[derive(Clone, Debug)] +pub struct Str<'a> { + string: &'a [u8], +} + +impl<'a> Str<'a> { + /// Constructs automaton that matches an exact string. + #[inline] + pub fn new(string: &'a str) -> Str<'a> { + Str { string: string.as_bytes() } + } +} + +impl<'a> Automaton for Str<'a> { + type State = Option<usize>; + + #[inline] + fn start(&self) -> Option<usize> { + Some(0) + } + + #[inline] + fn is_match(&self, pos: &Option<usize>) -> bool { + *pos == Some(self.string.len()) + } + + #[inline] + fn can_match(&self, pos: &Option<usize>) -> bool { + pos.is_some() + } + + #[inline] + fn accept(&self, pos: &Option<usize>, byte: u8) -> Option<usize> { + // if we aren't already past the end... + if let Some(pos) = *pos { + // and there is still a matching byte at the current position... + if self.string.get(pos).cloned() == Some(byte) { + // then move forward + return Some(pos + 1); + } + } + // otherwise we're either past the end or didn't match the byte + None + } +} + +/// An automaton that matches if the input contains a specific subsequence. +/// +/// It can be used to build a simple fuzzy-finder. +/// +/// ```rust +/// extern crate fst; +/// +/// use fst::{IntoStreamer, Streamer, Set}; +/// use fst::automaton::Subsequence; +/// +/// # fn main() { example().unwrap(); } +/// fn example() -> Result<(), Box<dyn std::error::Error>> { +/// let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"]; +/// let set = Set::from_iter(paths)?; +/// +/// // Build our fuzzy query. +/// let subseq = Subsequence::new("hpf"); +/// +/// // Apply our fuzzy query to the set we built. +/// let mut stream = set.search(subseq).into_stream(); +/// +/// let matches = stream.into_strs()?; +/// assert_eq!(matches, vec!["/home/projects/foo"]); +/// Ok(()) +/// } +/// ``` +#[derive(Clone, Debug)] +pub struct Subsequence<'a> { + subseq: &'a [u8], +} + +impl<'a> Subsequence<'a> { + /// Constructs automaton that matches input containing the + /// specified subsequence. + #[inline] + pub fn new(subsequence: &'a str) -> Subsequence<'a> { + Subsequence { subseq: subsequence.as_bytes() } + } +} + +impl<'a> Automaton for Subsequence<'a> { + type State = usize; + + #[inline] + fn start(&self) -> usize { + 0 + } + + #[inline] + fn is_match(&self, &state: &usize) -> bool { + state == self.subseq.len() + } + + #[inline] + fn can_match(&self, _: &usize) -> bool { + true + } + + #[inline] + fn will_always_match(&self, &state: &usize) -> bool { + state == self.subseq.len() + } + + #[inline] + fn accept(&self, &state: &usize, byte: u8) -> usize { + if state == self.subseq.len() { + return state; + } + state + (byte == self.subseq[state]) as usize + } +} + +/// An automaton that always matches. +/// +/// This is useful in a generic context as a way to express that no automaton +/// should be used. +#[derive(Clone, Debug)] +pub struct AlwaysMatch; + +impl Automaton for AlwaysMatch { + type State = (); + + #[inline] + fn start(&self) -> () { + () + } + #[inline] + fn is_match(&self, _: &()) -> bool { + true + } + #[inline] + fn can_match(&self, _: &()) -> bool { + true + } + #[inline] + fn will_always_match(&self, _: &()) -> bool { + true + } + #[inline] + fn accept(&self, _: &(), _: u8) -> () { + () + } +} + +/// An automaton that matches a string that begins with something that the +/// wrapped automaton matches. +#[derive(Clone, Debug)] +pub struct StartsWith<A>(A); + +/// The `Automaton` state for `StartsWith<A>`. +pub struct StartsWithState<A: Automaton>(StartsWithStateKind<A>); + +enum StartsWithStateKind<A: Automaton> { + Done, + Running(A::State), +} + +impl<A: Automaton> Automaton for StartsWith<A> { + type State = StartsWithState<A>; + + fn start(&self) -> StartsWithState<A> { + StartsWithState({ + let inner = self.0.start(); + if self.0.is_match(&inner) { + StartsWithStateKind::Done + } else { + StartsWithStateKind::Running(inner) + } + }) + } + + fn is_match(&self, state: &StartsWithState<A>) -> bool { + match state.0 { + StartsWithStateKind::Done => true, + StartsWithStateKind::Running(_) => false, + } + } + + fn can_match(&self, state: &StartsWithState<A>) -> bool { + match state.0 { + StartsWithStateKind::Done => true, + StartsWithStateKind::Running(ref inner) => self.0.can_match(inner), + } + } + + fn will_always_match(&self, state: &StartsWithState<A>) -> bool { + match state.0 { + StartsWithStateKind::Done => true, + StartsWithStateKind::Running(_) => false, + } + } + + fn accept( + &self, + state: &StartsWithState<A>, + byte: u8, + ) -> StartsWithState<A> { + StartsWithState(match state.0 { + StartsWithStateKind::Done => StartsWithStateKind::Done, + StartsWithStateKind::Running(ref inner) => { + let next_inner = self.0.accept(inner, byte); + if self.0.is_match(&next_inner) { + StartsWithStateKind::Done + } else { + StartsWithStateKind::Running(next_inner) + } + } + }) + } +} + +/// An automaton that matches when one of its component automata match. +#[derive(Clone, Debug)] +pub struct Union<A, B>(A, B); + +/// The `Automaton` state for `Union<A, B>`. +pub struct UnionState<A: Automaton, B: Automaton>(A::State, B::State); + +impl<A: Automaton, B: Automaton> Automaton for Union<A, B> { + type State = UnionState<A, B>; + + fn start(&self) -> UnionState<A, B> { + UnionState(self.0.start(), self.1.start()) + } + + fn is_match(&self, state: &UnionState<A, B>) -> bool { + self.0.is_match(&state.0) || self.1.is_match(&state.1) + } + + fn can_match(&self, state: &UnionState<A, B>) -> bool { + self.0.can_match(&state.0) || self.1.can_match(&state.1) + } + + fn will_always_match(&self, state: &UnionState<A, B>) -> bool { + self.0.will_always_match(&state.0) + || self.1.will_always_match(&state.1) + } + + fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> { + UnionState( + self.0.accept(&state.0, byte), + self.1.accept(&state.1, byte), + ) + } +} + +/// An automaton that matches when both of its component automata match. +#[derive(Clone, Debug)] +pub struct Intersection<A, B>(A, B); + +/// The `Automaton` state for `Intersection<A, B>`. +pub struct IntersectionState<A: Automaton, B: Automaton>(A::State, B::State); + +impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> { + type State = IntersectionState<A, B>; + + fn start(&self) -> IntersectionState<A, B> { + IntersectionState(self.0.start(), self.1.start()) + } + + fn is_match(&self, state: &IntersectionState<A, B>) -> bool { + self.0.is_match(&state.0) && self.1.is_match(&state.1) + } + + fn can_match(&self, state: &IntersectionState<A, B>) -> bool { + self.0.can_match(&state.0) && self.1.can_match(&state.1) + } + + fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool { + self.0.will_always_match(&state.0) + && self.1.will_always_match(&state.1) + } + + fn accept( + &self, + state: &IntersectionState<A, B>, + byte: u8, + ) -> IntersectionState<A, B> { + IntersectionState( + self.0.accept(&state.0, byte), + self.1.accept(&state.1, byte), + ) + } +} + +/// An automaton that matches exactly when the automaton it wraps does not. +#[derive(Clone, Debug)] +pub struct Complement<A>(A); + +/// The `Automaton` state for `Complement<A>`. +pub struct ComplementState<A: Automaton>(A::State); + +impl<A: Automaton> Automaton for Complement<A> { + type State = ComplementState<A>; + + fn start(&self) -> ComplementState<A> { + ComplementState(self.0.start()) + } + + fn is_match(&self, state: &ComplementState<A>) -> bool { + !self.0.is_match(&state.0) + } + + fn can_match(&self, state: &ComplementState<A>) -> bool { + !self.0.will_always_match(&state.0) + } + + fn will_always_match(&self, state: &ComplementState<A>) -> bool { + !self.0.can_match(&state.0) + } + + fn accept( + &self, + state: &ComplementState<A>, + byte: u8, + ) -> ComplementState<A> { + ComplementState(self.0.accept(&state.0, byte)) + } +} |