diff options
Diffstat (limited to 'vendor/datafrog/src/lib.rs')
-rw-r--r-- | vendor/datafrog/src/lib.rs | 567 |
1 files changed, 567 insertions, 0 deletions
diff --git a/vendor/datafrog/src/lib.rs b/vendor/datafrog/src/lib.rs new file mode 100644 index 000000000..d2f93239e --- /dev/null +++ b/vendor/datafrog/src/lib.rs @@ -0,0 +1,567 @@ +//! A lightweight Datalog engine in Rust +//! +//! The intended design is that one has static `Relation` types that are sets +//! of tuples, and `Variable` types that represent monotonically increasing +//! sets of tuples. +//! +//! The types are mostly wrappers around `Vec<Tuple>` indicating sorted-ness, +//! and the intent is that this code can be dropped in the middle of an otherwise +//! normal Rust program, run to completion, and then the results extracted as +//! vectors again. + +#![forbid(missing_docs)] + +use std::cell::RefCell; +use std::cmp::Ordering; +use std::iter::FromIterator; +use std::rc::Rc; + +mod join; +mod map; +mod test; +mod treefrog; +pub use crate::join::JoinInput; +pub use crate::treefrog::{ + extend_anti::ExtendAnti, + extend_with::ExtendWith, + filter_anti::FilterAnti, + filter_with::FilterWith, + filters::{PrefixFilter, ValueFilter}, + Leaper, Leapers, RelationLeaper, +}; + +/// A static, ordered list of key-value pairs. +/// +/// A relation represents a fixed set of key-value pairs. In many places in a +/// Datalog computation we want to be sure that certain relations are not able +/// to vary (for example, in antijoins). +#[derive(Clone)] +pub struct Relation<Tuple: Ord> { + /// Sorted list of distinct tuples. + pub elements: Vec<Tuple>, +} + +impl<Tuple: Ord> Relation<Tuple> { + /// Merges two relations into their union. + pub fn merge(self, other: Self) -> Self { + let Relation { + elements: mut elements1, + } = self; + let Relation { + elements: mut elements2, + } = other; + + // If one of the element lists is zero-length, we don't need to do any work + if elements1.is_empty() { + return Relation { + elements: elements2, + }; + } + + if elements2.is_empty() { + return Relation { + elements: elements1, + }; + } + + // Make sure that elements1 starts with the lower element + // Will not panic since both collections must have at least 1 element at this point + if elements1[0] > elements2[0] { + std::mem::swap(&mut elements1, &mut elements2); + } + + // Fast path for when all the new elements are after the exiting ones + if elements1[elements1.len() - 1] < elements2[0] { + elements1.extend(elements2.into_iter()); + // println!("fast path"); + return Relation { + elements: elements1, + }; + } + + let mut elements = Vec::with_capacity(elements1.len() + elements2.len()); + let mut elements1 = elements1.drain(..); + let mut elements2 = elements2.drain(..).peekable(); + + elements.push(elements1.next().unwrap()); + if elements.first() == elements2.peek() { + elements2.next(); + } + + for elem in elements1 { + while elements2.peek().map(|x| x.cmp(&elem)) == Some(Ordering::Less) { + elements.push(elements2.next().unwrap()); + } + if elements2.peek().map(|x| x.cmp(&elem)) == Some(Ordering::Equal) { + elements2.next(); + } + elements.push(elem); + } + + // Finish draining second list + elements.extend(elements2); + + Relation { elements } + } + + /// Creates a `Relation` from the elements of the `iterator`. + /// + /// Same as the `from_iter` method from `std::iter::FromIterator` trait. + pub fn from_iter<I>(iterator: I) -> Self + where + I: IntoIterator<Item = Tuple>, + { + iterator.into_iter().collect() + } + + /// Creates a `Relation` using the `leapjoin` logic; + /// see [`Variable::from_leapjoin`] + pub fn from_leapjoin<'leap, SourceTuple: Ord, Val: Ord + 'leap>( + source: &Relation<SourceTuple>, + leapers: impl Leapers<'leap, SourceTuple, Val>, + logic: impl FnMut(&SourceTuple, &Val) -> Tuple, + ) -> Self { + treefrog::leapjoin(&source.elements, leapers, logic) + } + + /// Creates a `Relation` by joining the values from `input1` and + /// `input2` and then applying `logic`. Like + /// [`Variable::from_join`] except for use where the inputs are + /// not varying across iterations. + pub fn from_join<Key: Ord, Val1: Ord, Val2: Ord>( + input1: &Relation<(Key, Val1)>, + input2: &Relation<(Key, Val2)>, + logic: impl FnMut(&Key, &Val1, &Val2) -> Tuple, + ) -> Self { + join::join_into_relation(input1, input2, logic) + } + + /// Creates a `Relation` by removing all values from `input1` that + /// share a key with `input2`, and then transforming the resulting + /// tuples with the `logic` closure. Like + /// [`Variable::from_antijoin`] except for use where the inputs + /// are not varying across iterations. + pub fn from_antijoin<Key: Ord, Val1: Ord>( + input1: &Relation<(Key, Val1)>, + input2: &Relation<Key>, + logic: impl FnMut(&Key, &Val1) -> Tuple, + ) -> Self { + join::antijoin(input1, input2, logic) + } + + /// Construct a new relation by mapping another one. Equivalent to + /// creating an iterator but perhaps more convenient. Analogous to + /// `Variable::from_map`. + pub fn from_map<T2: Ord>(input: &Relation<T2>, logic: impl FnMut(&T2) -> Tuple) -> Self { + input.iter().map(logic).collect() + } + + /// Creates a `Relation` from a vector of tuples. + pub fn from_vec(mut elements: Vec<Tuple>) -> Self { + elements.sort(); + elements.dedup(); + Relation { elements } + } +} + +impl<Tuple: Ord> From<Vec<Tuple>> for Relation<Tuple> { + fn from(iterator: Vec<Tuple>) -> Self { + Self::from_vec(iterator) + } +} + +impl<Tuple: Ord> FromIterator<Tuple> for Relation<Tuple> { + fn from_iter<I>(iterator: I) -> Self + where + I: IntoIterator<Item = Tuple>, + { + Relation::from_vec(iterator.into_iter().collect()) + } +} + +impl<'tuple, Tuple: 'tuple + Copy + Ord> FromIterator<&'tuple Tuple> for Relation<Tuple> { + fn from_iter<I>(iterator: I) -> Self + where + I: IntoIterator<Item = &'tuple Tuple>, + { + Relation::from_vec(iterator.into_iter().cloned().collect()) + } +} + +impl<Tuple: Ord> std::ops::Deref for Relation<Tuple> { + type Target = [Tuple]; + fn deref(&self) -> &Self::Target { + &self.elements[..] + } +} + +/// An iterative context for recursive evaluation. +/// +/// An `Iteration` tracks monotonic variables, and monitors their progress. +/// It can inform the user if they have ceased changing, at which point the +/// computation should be done. +pub struct Iteration { + variables: Vec<Box<dyn VariableTrait>>, +} + +impl Iteration { + /// Create a new iterative context. + pub fn new() -> Self { + Iteration { + variables: Vec::new(), + } + } + /// Reports whether any of the monitored variables have changed since + /// the most recent call. + pub fn changed(&mut self) -> bool { + let mut result = false; + for variable in self.variables.iter_mut() { + if variable.changed() { + result = true; + } + } + result + } + /// Creates a new named variable associated with the iterative context. + pub fn variable<Tuple: Ord + 'static>(&mut self, name: &str) -> Variable<Tuple> { + let variable = Variable::new(name); + self.variables.push(Box::new(variable.clone())); + variable + } + /// Creates a new named variable associated with the iterative context. + /// + /// This variable will not be maintained distinctly, and may advertise tuples as + /// recent multiple times (perhaps unboundedly many times). + pub fn variable_indistinct<Tuple: Ord + 'static>(&mut self, name: &str) -> Variable<Tuple> { + let mut variable = Variable::new(name); + variable.distinct = false; + self.variables.push(Box::new(variable.clone())); + variable + } +} + +/// A type that can report on whether it has changed. +trait VariableTrait { + /// Reports whether the variable has changed since it was last asked. + fn changed(&mut self) -> bool; +} + +/// An monotonically increasing set of `Tuple`s. +/// +/// There are three stages in the lifecycle of a tuple: +/// +/// 1. A tuple is added to `self.to_add`, but is not yet visible externally. +/// 2. Newly added tuples are then promoted to `self.recent` for one iteration. +/// 3. After one iteration, recent tuples are moved to `self.tuples` for posterity. +/// +/// Each time `self.changed()` is called, the `recent` relation is folded into `tuples`, +/// and the `to_add` relations are merged, potentially deduplicated against `tuples`, and +/// then made `recent`. This way, across calls to `changed()` all added tuples are in +/// `recent` at least once and eventually all are in `tuples`. +/// +/// A `Variable` may optionally be instructed not to de-duplicate its tuples, for reasons +/// of performance. Such a variable cannot be relied on to terminate iterative computation, +/// and it is important that any cycle of derivations have at least one de-duplicating +/// variable on it. +pub struct Variable<Tuple: Ord> { + /// Should the variable be maintained distinctly. + distinct: bool, + /// A useful name for the variable. + name: String, + /// A list of relations whose union are the accepted tuples. + pub stable: Rc<RefCell<Vec<Relation<Tuple>>>>, + /// A list of recent tuples, still to be processed. + pub recent: Rc<RefCell<Relation<Tuple>>>, + /// A list of future tuples, to be introduced. + to_add: Rc<RefCell<Vec<Relation<Tuple>>>>, +} + +// Operator implementations. +impl<Tuple: Ord> Variable<Tuple> { + /// Adds tuples that result from joining `input1` and `input2` -- + /// each of the inputs must be a set of (Key, Value) tuples. Both + /// `input1` and `input2` must have the same type of key (`K`) but + /// they can have distinct value types (`V1` and `V2` + /// respectively). The `logic` closure will be invoked for each + /// key that appears in both inputs; it is also given the two + /// values, and from those it should construct the resulting + /// value. + /// + /// Note that `input1` must be a variable, but `input2` can be a + /// relation or a variable. Therefore, you cannot join two + /// relations with this method. This is not because the result + /// would be wrong, but because it would be inefficient: the + /// result from such a join cannot vary across iterations (as + /// relations are fixed), so you should prefer to invoke `insert` + /// on a relation created by `Relation::from_join` instead. + /// + /// # Examples + /// + /// This example starts a collection with the pairs (x, x+1) and (x+1, x) for x in 0 .. 10. + /// It then adds pairs (y, z) for which (x, y) and (x, z) are present. Because the initial + /// pairs are symmetric, this should result in all pairs (x, y) for x and y in 0 .. 11. + /// + /// ``` + /// use datafrog::{Iteration, Relation}; + /// + /// let mut iteration = Iteration::new(); + /// let variable = iteration.variable::<(usize, usize)>("source"); + /// variable.extend((0 .. 10).map(|x| (x, x + 1))); + /// variable.extend((0 .. 10).map(|x| (x + 1, x))); + /// + /// while iteration.changed() { + /// variable.from_join(&variable, &variable, |&key, &val1, &val2| (val1, val2)); + /// } + /// + /// let result = variable.complete(); + /// assert_eq!(result.len(), 121); + /// ``` + pub fn from_join<'me, K: Ord, V1: Ord, V2: Ord>( + &self, + input1: &'me Variable<(K, V1)>, + input2: impl JoinInput<'me, (K, V2)>, + logic: impl FnMut(&K, &V1, &V2) -> Tuple, + ) { + join::join_into(input1, input2, self, logic) + } + + /// Adds tuples from `input1` whose key is not present in `input2`. + /// + /// Note that `input1` must be a variable: if you have a relation + /// instead, you can use `Relation::from_antijoin` and then + /// `Variable::insert`. Note that the result will not vary during + /// the iteration. + /// + /// # Examples + /// + /// This example starts a collection with the pairs (x, x+1) for x in 0 .. 10. It then + /// adds any pairs (x+1,x) for which x is not a multiple of three. That excludes four + /// pairs (for 0, 3, 6, and 9) which should leave us with 16 total pairs. + /// + /// ``` + /// use datafrog::{Iteration, Relation}; + /// + /// let mut iteration = Iteration::new(); + /// let variable = iteration.variable::<(usize, usize)>("source"); + /// variable.extend((0 .. 10).map(|x| (x, x + 1))); + /// + /// let relation: Relation<_> = (0 .. 10).filter(|x| x % 3 == 0).collect(); + /// + /// while iteration.changed() { + /// variable.from_antijoin(&variable, &relation, |&key, &val| (val, key)); + /// } + /// + /// let result = variable.complete(); + /// assert_eq!(result.len(), 16); + /// ``` + pub fn from_antijoin<K: Ord, V: Ord>( + &self, + input1: &Variable<(K, V)>, + input2: &Relation<K>, + logic: impl FnMut(&K, &V) -> Tuple, + ) { + self.insert(join::antijoin(input1, input2, logic)) + } + + /// Adds tuples that result from mapping `input`. + /// + /// # Examples + /// + /// This example starts a collection with the pairs (x, x) for x in 0 .. 10. It then + /// repeatedly adds any pairs (x, z) for (x, y) in the collection, where z is the Collatz + /// step for y: it is y/2 if y is even, and 3*y + 1 if y is odd. This produces all of the + /// pairs (x, y) where x visits y as part of its Collatz journey. + /// + /// ``` + /// use datafrog::{Iteration, Relation}; + /// + /// let mut iteration = Iteration::new(); + /// let variable = iteration.variable::<(usize, usize)>("source"); + /// variable.extend((0 .. 10).map(|x| (x, x))); + /// + /// while iteration.changed() { + /// variable.from_map(&variable, |&(key, val)| + /// if val % 2 == 0 { + /// (key, val/2) + /// } + /// else { + /// (key, 3*val + 1) + /// }); + /// } + /// + /// let result = variable.complete(); + /// assert_eq!(result.len(), 74); + /// ``` + pub fn from_map<T2: Ord>(&self, input: &Variable<T2>, logic: impl FnMut(&T2) -> Tuple) { + map::map_into(input, self, logic) + } + + /// Adds tuples that result from combining `source` with the + /// relations given in `leapers`. This operation is very flexible + /// and can be used to do a combination of joins and anti-joins. + /// The main limitation is that the things being combined must + /// consist of one dynamic variable (`source`) and then several + /// fixed relations (`leapers`). + /// + /// The idea is as follows: + /// + /// - You will be inserting new tuples that result from joining (and anti-joining) + /// some dynamic variable `source` of source tuples (`SourceTuple`) + /// with some set of values (of type `Val`). + /// - You provide these values by combining `source` with a set of leapers + /// `leapers`, each of which is derived from a fixed relation. The `leapers` + /// should be either a single leaper (of suitable type) or else a tuple of leapers. + /// You can create a leaper in one of two ways: + /// - Extension: In this case, you have a relation of type `(K, Val)` for some + /// type `K`. You provide a closure that maps from `SourceTuple` to the key + /// `K`. If you use `relation.extend_with`, then any `Val` values the + /// relation provides will be added to the set of values; if you use + /// `extend_anti`, then the `Val` values will be removed. + /// - Filtering: In this case, you have a relation of type `K` for some + /// type `K` and you provide a closure that maps from `SourceTuple` to + /// the key `K`. Filters don't provide values but they remove source + /// tuples. + /// - Finally, you get a callback `logic` that accepts each `(SourceTuple, Val)` + /// that was successfully joined (and not filtered) and which maps to the + /// type of this variable. + pub fn from_leapjoin<'leap, SourceTuple: Ord, Val: Ord + 'leap>( + &self, + source: &Variable<SourceTuple>, + leapers: impl Leapers<'leap, SourceTuple, Val>, + logic: impl FnMut(&SourceTuple, &Val) -> Tuple, + ) { + self.insert(treefrog::leapjoin(&source.recent.borrow(), leapers, logic)); + } +} + +impl<Tuple: Ord> Clone for Variable<Tuple> { + fn clone(&self) -> Self { + Variable { + distinct: self.distinct, + name: self.name.clone(), + stable: self.stable.clone(), + recent: self.recent.clone(), + to_add: self.to_add.clone(), + } + } +} + +impl<Tuple: Ord> Variable<Tuple> { + fn new(name: &str) -> Self { + Variable { + distinct: true, + name: name.to_string(), + stable: Rc::new(RefCell::new(Vec::new())), + recent: Rc::new(RefCell::new(Vec::new().into())), + to_add: Rc::new(RefCell::new(Vec::new())), + } + } + + /// Inserts a relation into the variable. + /// + /// This is most commonly used to load initial values into a variable. + /// it is not obvious that it should be commonly used otherwise, but + /// it should not be harmful. + pub fn insert(&self, relation: Relation<Tuple>) { + if !relation.is_empty() { + self.to_add.borrow_mut().push(relation); + } + } + + /// Extend the variable with values from the iterator. + /// + /// This is most commonly used to load initial values into a variable. + /// it is not obvious that it should be commonly used otherwise, but + /// it should not be harmful. + pub fn extend<T>(&self, iterator: impl IntoIterator<Item = T>) + where + Relation<Tuple>: FromIterator<T>, + { + self.insert(iterator.into_iter().collect()); + } + + /// Consumes the variable and returns a relation. + /// + /// This method removes the ability for the variable to develop, and + /// flattens all internal tuples down to one relation. The method + /// asserts that iteration has completed, in that `self.recent` and + /// `self.to_add` should both be empty. + pub fn complete(self) -> Relation<Tuple> { + assert!(self.recent.borrow().is_empty()); + assert!(self.to_add.borrow().is_empty()); + let mut result: Relation<Tuple> = Vec::new().into(); + while let Some(batch) = self.stable.borrow_mut().pop() { + result = result.merge(batch); + } + result + } +} + +impl<Tuple: Ord> VariableTrait for Variable<Tuple> { + fn changed(&mut self) -> bool { + // 1. Merge self.recent into self.stable. + if !self.recent.borrow().is_empty() { + let mut recent = + ::std::mem::replace(&mut (*self.recent.borrow_mut()), Vec::new().into()); + while self + .stable + .borrow() + .last() + .map(|x| x.len() <= 2 * recent.len()) + == Some(true) + { + let last = self.stable.borrow_mut().pop().unwrap(); + recent = recent.merge(last); + } + self.stable.borrow_mut().push(recent); + } + + // 2. Move self.to_add into self.recent. + let to_add = self.to_add.borrow_mut().pop(); + if let Some(mut to_add) = to_add { + while let Some(to_add_more) = self.to_add.borrow_mut().pop() { + to_add = to_add.merge(to_add_more); + } + // 2b. Restrict `to_add` to tuples not in `self.stable`. + if self.distinct { + for batch in self.stable.borrow().iter() { + let mut slice = &batch[..]; + // Only gallop if the slice is relatively large. + if slice.len() > 4 * to_add.elements.len() { + to_add.elements.retain(|x| { + slice = join::gallop(slice, |y| y < x); + slice.is_empty() || &slice[0] != x + }); + } else { + to_add.elements.retain(|x| { + while !slice.is_empty() && &slice[0] < x { + slice = &slice[1..]; + } + slice.is_empty() || &slice[0] != x + }); + } + } + } + *self.recent.borrow_mut() = to_add; + } + + // let mut total = 0; + // for tuple in self.stable.borrow().iter() { + // total += tuple.len(); + // } + + // println!("Variable\t{}\t{}\t{}", self.name, total, self.recent.borrow().len()); + + !self.recent.borrow().is_empty() + } +} + +// impl<Tuple: Ord> Drop for Variable<Tuple> { +// fn drop(&mut self) { +// let mut total = 0; +// for batch in self.stable.borrow().iter() { +// total += batch.len(); +// } +// println!("FINAL: {:?}\t{:?}", self.name, total); +// } +// } |