diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-19 09:26:03 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-19 09:26:03 +0000 |
commit | 9918693037dce8aa4bb6f08741b6812923486c18 (patch) | |
tree | 21d2b40bec7e6a7ea664acee056eb3d08e15a1cf /vendor/fst/src/raw | |
parent | Releasing progress-linux version 1.75.0+dfsg1-5~progress7.99u1. (diff) | |
download | rustc-9918693037dce8aa4bb6f08741b6812923486c18.tar.xz rustc-9918693037dce8aa4bb6f08741b6812923486c18.zip |
Merging upstream version 1.76.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/fst/src/raw')
-rw-r--r-- | vendor/fst/src/raw/build.rs | 474 | ||||
-rw-r--r-- | vendor/fst/src/raw/common_inputs.rs | 289 | ||||
-rw-r--r-- | vendor/fst/src/raw/counting_writer.rs | 70 | ||||
-rw-r--r-- | vendor/fst/src/raw/crc32.rs | 59 | ||||
-rw-r--r-- | vendor/fst/src/raw/crc32_table.rs | 2 | ||||
-rw-r--r-- | vendor/fst/src/raw/error.rs | 171 | ||||
-rw-r--r-- | vendor/fst/src/raw/mod.rs | 1382 | ||||
-rw-r--r-- | vendor/fst/src/raw/node.rs | 1012 | ||||
-rw-r--r-- | vendor/fst/src/raw/ops.rs | 643 | ||||
-rw-r--r-- | vendor/fst/src/raw/registry.rs | 264 | ||||
-rw-r--r-- | vendor/fst/src/raw/registry_minimal.rs | 53 | ||||
-rw-r--r-- | vendor/fst/src/raw/tests.rs | 589 |
12 files changed, 5008 insertions, 0 deletions
diff --git a/vendor/fst/src/raw/build.rs b/vendor/fst/src/raw/build.rs new file mode 100644 index 000000000..e93626b27 --- /dev/null +++ b/vendor/fst/src/raw/build.rs @@ -0,0 +1,474 @@ +use std::io; + +use crate::bytes; +use crate::error::Result; +use crate::raw::counting_writer::CountingWriter; +use crate::raw::error::Error; +use crate::raw::registry::{Registry, RegistryEntry}; +use crate::raw::{ + CompiledAddr, Fst, FstType, Output, Transition, EMPTY_ADDRESS, + NONE_ADDRESS, VERSION, +}; +// use raw::registry_minimal::{Registry, RegistryEntry}; +use crate::stream::{IntoStreamer, Streamer}; + +/// A builder for creating a finite state transducer. +/// +/// This is not your average everyday builder. It has two important qualities +/// that make it a bit unique from what you might expect: +/// +/// 1. All keys must be added in lexicographic order. Adding a key out of order +/// will result in an error. Additionally, adding a duplicate key with an +/// output value will also result in an error. That is, once a key is +/// associated with a value, that association can never be modified or +/// deleted. +/// 2. The representation of an fst is streamed to *any* `io::Write` as it is +/// built. For an in memory representation, this can be a `Vec<u8>`. +/// +/// Point (2) is especially important because it means that an fst can be +/// constructed *without storing the entire fst in memory*. Namely, since it +/// works with any `io::Write`, it can be streamed directly to a file. +/// +/// With that said, the builder does use memory, but **memory usage is bounded +/// to a constant size**. The amount of memory used trades off with the +/// compression ratio. Currently, the implementation hard codes this trade off +/// which can result in about 5-20MB of heap usage during construction. (N.B. +/// Guaranteeing a maximal compression ratio requires memory proportional to +/// the size of the fst, which defeats some of the benefit of streaming +/// it to disk. 
In practice, a small bounded amount of memory achieves +/// close-to-minimal compression ratios.) +/// +/// The algorithmic complexity of fst construction is `O(n)` where `n` is the +/// number of elements added to the fst. +pub struct Builder<W> { + /// The FST raw data is written directly to `wtr`. + /// + /// No internal buffering is done. + wtr: CountingWriter<W>, + /// The stack of unfinished nodes. + /// + /// An unfinished node is a node that could potentially have a new + /// transition added to it when a new word is added to the dictionary. + unfinished: UnfinishedNodes, + /// A map of finished nodes. + /// + /// A finished node is one that has been compiled and written to `wtr`. + /// After this point, the node is considered immutable and will never + /// change. + registry: Registry, + /// The last word added. + /// + /// This is used to enforce the invariant that words are added in sorted + /// order. + last: Option<Vec<u8>>, + /// The address of the last compiled node. + /// + /// This is used to optimize states with one transition that point + /// to the previously compiled node. (The previously compiled node in + /// this case actually corresponds to the next state for the transition, + /// since states are compiled in reverse.) + last_addr: CompiledAddr, + /// The number of keys added. + len: usize, +} + +#[derive(Debug)] +struct UnfinishedNodes { + stack: Vec<BuilderNodeUnfinished>, +} + +#[derive(Debug)] +struct BuilderNodeUnfinished { + node: BuilderNode, + last: Option<LastTransition>, +} + +#[derive(Debug, Hash, Eq, PartialEq)] +pub struct BuilderNode { + pub is_final: bool, + pub final_output: Output, + pub trans: Vec<Transition>, +} + +#[derive(Debug)] +struct LastTransition { + inp: u8, + out: Output, +} + +impl Builder<Vec<u8>> { + /// Create a builder that builds an fst in memory. 
+ #[inline] + pub fn memory() -> Builder<Vec<u8>> { + Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap() + } + + /// Finishes construction of the FST and returns it. + #[inline] + pub fn into_fst(self) -> Fst<Vec<u8>> { + self.into_inner().and_then(Fst::new).unwrap() + } +} + +impl<W: io::Write> Builder<W> { + /// Create a builder that builds an fst by writing it to `wtr` in a + /// streaming fashion. + pub fn new(wtr: W) -> Result<Builder<W>> { + Builder::new_type(wtr, 0) + } + + /// The same as `new`, except it sets the type of the fst to the type + /// given. + pub fn new_type(wtr: W, ty: FstType) -> Result<Builder<W>> { + let mut wtr = CountingWriter::new(wtr); + // Don't allow any nodes to have address 0-7. We use these to encode + // the API version. We also use addresses `0` and `1` as special + // sentinel values, so they should never correspond to a real node. + bytes::io_write_u64_le(VERSION, &mut wtr)?; + // Similarly for 8-15 for the fst type. + bytes::io_write_u64_le(ty, &mut wtr)?; + Ok(Builder { + wtr, + unfinished: UnfinishedNodes::new(), + registry: Registry::new(10_000, 2), + last: None, + last_addr: NONE_ADDRESS, + len: 0, + }) + } + + /// Adds a byte string to this FST with a zero output value. + pub fn add<B>(&mut self, bs: B) -> Result<()> + where + B: AsRef<[u8]>, + { + self.check_last_key(bs.as_ref(), false)?; + self.insert_output(bs, None) + } + + /// Insert a new key-value pair into the fst. + /// + /// Keys must be convertible to byte strings. Values must be a `u64`, which + /// is a restriction of the current implementation of finite state + /// transducers. (Values may one day be expanded to other types.) + /// + /// If a key is inserted that is less than or equal to any previous key + /// added, then an error is returned. Similarly, if there was a problem + /// writing to the underlying writer, an error is returned. 
+ pub fn insert<B>(&mut self, bs: B, val: u64) -> Result<()> + where + B: AsRef<[u8]>, + { + self.check_last_key(bs.as_ref(), true)?; + self.insert_output(bs, Some(Output::new(val))) + } + + /// Calls insert on each item in the iterator. + /// + /// If an error occurred while adding an element, processing is stopped + /// and the error is returned. + /// + /// If a key is inserted that is less than or equal to any previous key + /// added, then an error is returned. Similarly, if there was a problem + /// writing to the underlying writer, an error is returned. + pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()> + where + T: AsRef<[u8]>, + I: IntoIterator<Item = (T, Output)>, + { + for (key, out) in iter { + self.insert(key, out.value())?; + } + Ok(()) + } + + /// Calls insert on each item in the stream. + /// + /// Note that unlike `extend_iter`, this is not generic on the items in + /// the stream. + /// + /// If a key is inserted that is less than or equal to any previous key + /// added, then an error is returned. Similarly, if there was a problem + /// writing to the underlying writer, an error is returned. + pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()> + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, + { + let mut stream = stream.into_stream(); + while let Some((key, out)) = stream.next() { + self.insert(key, out.value())?; + } + Ok(()) + } + + /// Finishes the construction of the fst and flushes the underlying + /// writer. After completion, the data written to `W` may be read using + /// one of `Fst`'s constructor methods. + pub fn finish(self) -> Result<()> { + self.into_inner()?; + Ok(()) + } + + /// Just like `finish`, except it returns the underlying writer after + /// flushing it. 
+ pub fn into_inner(mut self) -> Result<W> { + self.compile_from(0)?; + let root_node = self.unfinished.pop_root(); + let root_addr = self.compile(&root_node)?; + bytes::io_write_u64_le(self.len as u64, &mut self.wtr)?; + bytes::io_write_u64_le(root_addr as u64, &mut self.wtr)?; + + let sum = self.wtr.masked_checksum(); + let mut wtr = self.wtr.into_inner(); + bytes::io_write_u32_le(sum, &mut wtr)?; + wtr.flush()?; + Ok(wtr) + } + + fn insert_output<B>(&mut self, bs: B, out: Option<Output>) -> Result<()> + where + B: AsRef<[u8]>, + { + let bs = bs.as_ref(); + if bs.is_empty() { + self.len = 1; // must be first key, so length is always 1 + self.unfinished.set_root_output(out.unwrap_or(Output::zero())); + return Ok(()); + } + let (prefix_len, out) = if let Some(out) = out { + self.unfinished.find_common_prefix_and_set_output(bs, out) + } else { + (self.unfinished.find_common_prefix(bs), Output::zero()) + }; + if prefix_len == bs.len() { + // If the prefix found consumes the entire set of bytes, then + // the prefix *equals* the bytes given. This means it is a + // duplicate value with no output. So we can give up here. + // + // If the below assert fails, then that means we let a duplicate + // value through even when inserting outputs. 
+ assert!(out.is_zero()); + return Ok(()); + } + self.len += 1; + self.compile_from(prefix_len)?; + self.unfinished.add_suffix(&bs[prefix_len..], out); + Ok(()) + } + + fn compile_from(&mut self, istate: usize) -> Result<()> { + let mut addr = NONE_ADDRESS; + while istate + 1 < self.unfinished.len() { + let node = if addr == NONE_ADDRESS { + self.unfinished.pop_empty() + } else { + self.unfinished.pop_freeze(addr) + }; + addr = self.compile(&node)?; + assert!(addr != NONE_ADDRESS); + } + self.unfinished.top_last_freeze(addr); + Ok(()) + } + + fn compile(&mut self, node: &BuilderNode) -> Result<CompiledAddr> { + if node.is_final + && node.trans.is_empty() + && node.final_output.is_zero() + { + return Ok(EMPTY_ADDRESS); + } + let entry = self.registry.entry(&node); + if let RegistryEntry::Found(ref addr) = entry { + return Ok(*addr); + } + let start_addr = self.wtr.count() as CompiledAddr; + node.compile_to(&mut self.wtr, self.last_addr, start_addr)?; + self.last_addr = self.wtr.count() as CompiledAddr - 1; + if let RegistryEntry::NotFound(cell) = entry { + cell.insert(self.last_addr); + } + Ok(self.last_addr) + } + + fn check_last_key(&mut self, bs: &[u8], check_dupe: bool) -> Result<()> { + if let Some(ref mut last) = self.last { + if check_dupe && bs == &**last { + return Err(Error::DuplicateKey { got: bs.to_vec() }.into()); + } + if bs < &**last { + return Err(Error::OutOfOrder { + previous: last.to_vec(), + got: bs.to_vec(), + } + .into()); + } + last.clear(); + for &b in bs { + last.push(b); + } + } else { + self.last = Some(bs.to_vec()); + } + Ok(()) + } + + /// Gets a reference to the underlying writer. 
+ pub fn get_ref(&self) -> &W { + self.wtr.get_ref() + } + + /// Returns the number of bytes written to the underlying writer + pub fn bytes_written(&self) -> u64 { + self.wtr.count() + } +} + +impl UnfinishedNodes { + fn new() -> UnfinishedNodes { + let mut unfinished = UnfinishedNodes { stack: Vec::with_capacity(64) }; + unfinished.push_empty(false); + unfinished + } + + fn len(&self) -> usize { + self.stack.len() + } + + fn push_empty(&mut self, is_final: bool) { + self.stack.push(BuilderNodeUnfinished { + node: BuilderNode { is_final, ..BuilderNode::default() }, + last: None, + }); + } + + fn pop_root(&mut self) -> BuilderNode { + assert!(self.stack.len() == 1); + assert!(self.stack[0].last.is_none()); + self.stack.pop().unwrap().node + } + + fn pop_freeze(&mut self, addr: CompiledAddr) -> BuilderNode { + let mut unfinished = self.stack.pop().unwrap(); + unfinished.last_compiled(addr); + unfinished.node + } + + fn pop_empty(&mut self) -> BuilderNode { + let unfinished = self.stack.pop().unwrap(); + assert!(unfinished.last.is_none()); + unfinished.node + } + + fn set_root_output(&mut self, out: Output) { + self.stack[0].node.is_final = true; + self.stack[0].node.final_output = out; + } + + fn top_last_freeze(&mut self, addr: CompiledAddr) { + let last = self.stack.len().checked_sub(1).unwrap(); + self.stack[last].last_compiled(addr); + } + + fn add_suffix(&mut self, bs: &[u8], out: Output) { + if bs.is_empty() { + return; + } + let last = self.stack.len().checked_sub(1).unwrap(); + assert!(self.stack[last].last.is_none()); + self.stack[last].last = Some(LastTransition { inp: bs[0], out }); + for &b in &bs[1..] 
{ + self.stack.push(BuilderNodeUnfinished { + node: BuilderNode::default(), + last: Some(LastTransition { inp: b, out: Output::zero() }), + }); + } + self.push_empty(true); + } + + fn find_common_prefix(&mut self, bs: &[u8]) -> usize { + bs.iter() + .zip(&self.stack) + .take_while(|&(&b, ref node)| { + node.last.as_ref().map(|t| t.inp == b).unwrap_or(false) + }) + .count() + } + + fn find_common_prefix_and_set_output( + &mut self, + bs: &[u8], + mut out: Output, + ) -> (usize, Output) { + let mut i = 0; + while i < bs.len() { + let add_prefix = match self.stack[i].last.as_mut() { + Some(ref mut t) if t.inp == bs[i] => { + i += 1; + let common_pre = t.out.prefix(out); + let add_prefix = t.out.sub(common_pre); + out = out.sub(common_pre); + t.out = common_pre; + add_prefix + } + _ => break, + }; + if !add_prefix.is_zero() { + self.stack[i].add_output_prefix(add_prefix); + } + } + (i, out) + } +} + +impl BuilderNodeUnfinished { + fn last_compiled(&mut self, addr: CompiledAddr) { + if let Some(trans) = self.last.take() { + self.node.trans.push(Transition { + inp: trans.inp, + out: trans.out, + addr, + }); + } + } + + fn add_output_prefix(&mut self, prefix: Output) { + if self.node.is_final { + self.node.final_output = prefix.cat(self.node.final_output); + } + for t in &mut self.node.trans { + t.out = prefix.cat(t.out); + } + if let Some(ref mut t) = self.last { + t.out = prefix.cat(t.out); + } + } +} + +impl Clone for BuilderNode { + fn clone(&self) -> BuilderNode { + BuilderNode { + is_final: self.is_final, + final_output: self.final_output, + trans: self.trans.clone(), + } + } + + fn clone_from(&mut self, source: &BuilderNode) { + self.is_final = source.is_final; + self.final_output = source.final_output; + self.trans.clear(); + self.trans.extend(source.trans.iter()); + } +} + +impl Default for BuilderNode { + fn default() -> BuilderNode { + BuilderNode { + is_final: false, + final_output: Output::zero(), + trans: vec![], + } + } +} diff --git 
a/vendor/fst/src/raw/common_inputs.rs b/vendor/fst/src/raw/common_inputs.rs new file mode 100644 index 000000000..f7d7d23b1 --- /dev/null +++ b/vendor/fst/src/raw/common_inputs.rs @@ -0,0 +1,289 @@ +pub const COMMON_INPUTS: [u8; 256] = [ + 84, // '\x00' + 85, // '\x01' + 86, // '\x02' + 87, // '\x03' + 88, // '\x04' + 89, // '\x05' + 90, // '\x06' + 91, // '\x07' + 92, // '\x08' + 93, // '\t' + 94, // '\n' + 95, // '\x0b' + 96, // '\x0c' + 97, // '\r' + 98, // '\x0e' + 99, // '\x0f' + 100, // '\x10' + 101, // '\x11' + 102, // '\x12' + 103, // '\x13' + 104, // '\x14' + 105, // '\x15' + 106, // '\x16' + 107, // '\x17' + 108, // '\x18' + 109, // '\x19' + 110, // '\x1a' + 111, // '\x1b' + 112, // '\x1c' + 113, // '\x1d' + 114, // '\x1e' + 115, // '\x1f' + 116, // ' ' + 80, // '!' + 117, // '"' + 118, // '#' + 79, // '$' + 39, // '%' + 30, // '&' + 81, // "'" + 75, // '(' + 74, // ')' + 82, // '*' + 57, // '+' + 66, // ',' + 16, // '-' + 12, // '.' + 2, // '/' + 19, // '0' + 20, // '1' + 21, // '2' + 27, // '3' + 32, // '4' + 29, // '5' + 35, // '6' + 36, // '7' + 37, // '8' + 34, // '9' + 24, // ':' + 73, // ';' + 119, // '<' + 23, // '=' + 120, // '>' + 40, // '?' 
+ 83, // '@' + 44, // 'A' + 48, // 'B' + 42, // 'C' + 43, // 'D' + 49, // 'E' + 46, // 'F' + 62, // 'G' + 61, // 'H' + 47, // 'I' + 69, // 'J' + 68, // 'K' + 58, // 'L' + 56, // 'M' + 55, // 'N' + 59, // 'O' + 51, // 'P' + 72, // 'Q' + 54, // 'R' + 45, // 'S' + 52, // 'T' + 64, // 'U' + 65, // 'V' + 63, // 'W' + 71, // 'X' + 67, // 'Y' + 70, // 'Z' + 77, // '[' + 121, // '\\' + 78, // ']' + 122, // '^' + 31, // '_' + 123, // '`' + 4, // 'a' + 25, // 'b' + 9, // 'c' + 17, // 'd' + 1, // 'e' + 26, // 'f' + 22, // 'g' + 13, // 'h' + 7, // 'i' + 50, // 'j' + 38, // 'k' + 14, // 'l' + 15, // 'm' + 10, // 'n' + 3, // 'o' + 8, // 'p' + 60, // 'q' + 6, // 'r' + 5, // 's' + 0, // 't' + 18, // 'u' + 33, // 'v' + 11, // 'w' + 41, // 'x' + 28, // 'y' + 53, // 'z' + 124, // '{' + 125, // '|' + 126, // '}' + 76, // '~' + 127, // '\x7f' + 128, // '\x80' + 129, // '\x81' + 130, // '\x82' + 131, // '\x83' + 132, // '\x84' + 133, // '\x85' + 134, // '\x86' + 135, // '\x87' + 136, // '\x88' + 137, // '\x89' + 138, // '\x8a' + 139, // '\x8b' + 140, // '\x8c' + 141, // '\x8d' + 142, // '\x8e' + 143, // '\x8f' + 144, // '\x90' + 145, // '\x91' + 146, // '\x92' + 147, // '\x93' + 148, // '\x94' + 149, // '\x95' + 150, // '\x96' + 151, // '\x97' + 152, // '\x98' + 153, // '\x99' + 154, // '\x9a' + 155, // '\x9b' + 156, // '\x9c' + 157, // '\x9d' + 158, // '\x9e' + 159, // '\x9f' + 160, // '\xa0' + 161, // '¡' + 162, // '¢' + 163, // '£' + 164, // '¤' + 165, // '¥' + 166, // '¦' + 167, // '§' + 168, // '¨' + 169, // '©' + 170, // 'ª' + 171, // '«' + 172, // '¬' + 173, // '\xad' + 174, // '®' + 175, // '¯' + 176, // '°' + 177, // '±' + 178, // '²' + 179, // '³' + 180, // '´' + 181, // 'µ' + 182, // '¶' + 183, // '·' + 184, // '¸' + 185, // '¹' + 186, // 'º' + 187, // '»' + 188, // '¼' + 189, // '½' + 190, // '¾' + 191, // '¿' + 192, // 'À' + 193, // 'Á' + 194, // 'Â' + 195, // 'Ã' + 196, // 'Ä' + 197, // 'Å' + 198, // 'Æ' + 199, // 'Ç' + 200, // 'È' + 201, // 'É' + 202, // 'Ê' + 203, // 'Ë' 
+ 204, // 'Ì' + 205, // 'Í' + 206, // 'Î' + 207, // 'Ï' + 208, // 'Ð' + 209, // 'Ñ' + 210, // 'Ò' + 211, // 'Ó' + 212, // 'Ô' + 213, // 'Õ' + 214, // 'Ö' + 215, // '×' + 216, // 'Ø' + 217, // 'Ù' + 218, // 'Ú' + 219, // 'Û' + 220, // 'Ü' + 221, // 'Ý' + 222, // 'Þ' + 223, // 'ß' + 224, // 'à' + 225, // 'á' + 226, // 'â' + 227, // 'ã' + 228, // 'ä' + 229, // 'å' + 230, // 'æ' + 231, // 'ç' + 232, // 'è' + 233, // 'é' + 234, // 'ê' + 235, // 'ë' + 236, // 'ì' + 237, // 'í' + 238, // 'î' + 239, // 'ï' + 240, // 'ð' + 241, // 'ñ' + 242, // 'ò' + 243, // 'ó' + 244, // 'ô' + 245, // 'õ' + 246, // 'ö' + 247, // '÷' + 248, // 'ø' + 249, // 'ù' + 250, // 'ú' + 251, // 'û' + 252, // 'ü' + 253, // 'ý' + 254, // 'þ' + 255, // 'ÿ' +]; + +pub const COMMON_INPUTS_INV: [u8; 256] = [ + b't', b'e', b'/', b'o', b'a', b's', b'r', b'i', b'p', b'c', b'n', b'w', + b'.', b'h', b'l', b'm', b'-', b'd', b'u', b'0', b'1', b'2', b'g', b'=', + b':', b'b', b'f', b'3', b'y', b'5', b'&', b'_', b'4', b'v', b'9', b'6', + b'7', b'8', b'k', b'%', b'?', b'x', b'C', b'D', b'A', b'S', b'F', b'I', + b'B', b'E', b'j', b'P', b'T', b'z', b'R', b'N', b'M', b'+', b'L', b'O', + b'q', b'H', b'G', b'W', b'U', b'V', b',', b'Y', b'K', b'J', b'Z', b'X', + b'Q', b';', b')', b'(', b'~', b'[', b']', b'$', b'!', b'\'', b'*', b'@', + b'\x00', b'\x01', b'\x02', b'\x03', b'\x04', b'\x05', b'\x06', b'\x07', + b'\x08', b'\t', b'\n', b'\x0b', b'\x0c', b'\r', b'\x0e', b'\x0f', b'\x10', + b'\x11', b'\x12', b'\x13', b'\x14', b'\x15', b'\x16', b'\x17', b'\x18', + b'\x19', b'\x1a', b'\x1b', b'\x1c', b'\x1d', b'\x1e', b'\x1f', b' ', b'"', + b'#', b'<', b'>', b'\\', b'^', b'`', b'{', b'|', b'}', b'\x7f', b'\x80', + b'\x81', b'\x82', b'\x83', b'\x84', b'\x85', b'\x86', b'\x87', b'\x88', + b'\x89', b'\x8a', b'\x8b', b'\x8c', b'\x8d', b'\x8e', b'\x8f', b'\x90', + b'\x91', b'\x92', b'\x93', b'\x94', b'\x95', b'\x96', b'\x97', b'\x98', + b'\x99', b'\x9a', b'\x9b', b'\x9c', b'\x9d', b'\x9e', b'\x9f', b'\xa0', + b'\xa1', b'\xa2', b'\xa3', 
b'\xa4', b'\xa5', b'\xa6', b'\xa7', b'\xa8', + b'\xa9', b'\xaa', b'\xab', b'\xac', b'\xad', b'\xae', b'\xaf', b'\xb0', + b'\xb1', b'\xb2', b'\xb3', b'\xb4', b'\xb5', b'\xb6', b'\xb7', b'\xb8', + b'\xb9', b'\xba', b'\xbb', b'\xbc', b'\xbd', b'\xbe', b'\xbf', b'\xc0', + b'\xc1', b'\xc2', b'\xc3', b'\xc4', b'\xc5', b'\xc6', b'\xc7', b'\xc8', + b'\xc9', b'\xca', b'\xcb', b'\xcc', b'\xcd', b'\xce', b'\xcf', b'\xd0', + b'\xd1', b'\xd2', b'\xd3', b'\xd4', b'\xd5', b'\xd6', b'\xd7', b'\xd8', + b'\xd9', b'\xda', b'\xdb', b'\xdc', b'\xdd', b'\xde', b'\xdf', b'\xe0', + b'\xe1', b'\xe2', b'\xe3', b'\xe4', b'\xe5', b'\xe6', b'\xe7', b'\xe8', + b'\xe9', b'\xea', b'\xeb', b'\xec', b'\xed', b'\xee', b'\xef', b'\xf0', + b'\xf1', b'\xf2', b'\xf3', b'\xf4', b'\xf5', b'\xf6', b'\xf7', b'\xf8', + b'\xf9', b'\xfa', b'\xfb', b'\xfc', b'\xfd', b'\xfe', b'\xff', +]; diff --git a/vendor/fst/src/raw/counting_writer.rs b/vendor/fst/src/raw/counting_writer.rs new file mode 100644 index 000000000..70d9315ff --- /dev/null +++ b/vendor/fst/src/raw/counting_writer.rs @@ -0,0 +1,70 @@ +use std::io; + +use crate::raw::crc32::CheckSummer; + +/// Wraps any writer that counts and checksums bytes written. +pub struct CountingWriter<W> { + wtr: W, + cnt: u64, + summer: CheckSummer, +} + +impl<W: io::Write> CountingWriter<W> { + /// Wrap the given writer with a counter. + pub fn new(wtr: W) -> CountingWriter<W> { + CountingWriter { wtr, cnt: 0, summer: CheckSummer::new() } + } + + /// Return the total number of bytes written to the underlying writer. + /// + /// The count returned is the sum of all counts resulting from a call + /// to `write`. + pub fn count(&self) -> u64 { + self.cnt + } + + /// Returns the masked CRC32C checksum of the bytes written so far. + /// + /// This "masked" checksum is the same one used by the Snappy frame format. + /// Masking is supposed to make the checksum robust with respect to data + /// that contains the checksum itself. 
+ pub fn masked_checksum(&self) -> u32 { + self.summer.masked() + } + + /// Unwrap the counting writer and return the inner writer. + pub fn into_inner(self) -> W { + self.wtr + } + + /// Gets a reference to the underlying writer. + pub fn get_ref(&self) -> &W { + &self.wtr + } +} + +impl<W: io::Write> io::Write for CountingWriter<W> { + fn write(&mut self, buf: &[u8]) -> io::Result<usize> { + self.summer.update(buf); + let n = self.wtr.write(buf)?; + self.cnt += n as u64; + Ok(n) + } + + fn flush(&mut self) -> io::Result<()> { + self.wtr.flush() + } +} + +#[cfg(test)] +mod tests { + use super::CountingWriter; + use std::io::Write; + + #[test] + fn counts_bytes() { + let mut wtr = CountingWriter::new(vec![]); + wtr.write_all(b"foobar").unwrap(); + assert_eq!(wtr.count(), 6); + } +} diff --git a/vendor/fst/src/raw/crc32.rs b/vendor/fst/src/raw/crc32.rs new file mode 100644 index 000000000..06b444205 --- /dev/null +++ b/vendor/fst/src/raw/crc32.rs @@ -0,0 +1,59 @@ +use crate::bytes; +use crate::raw::crc32_table::{TABLE, TABLE16}; + +/// Provides a simple API to perform a rolling CRC32C checksum. +#[derive(Clone, Copy, Debug)] +pub struct CheckSummer { + sum: u32, +} + +impl CheckSummer { + /// Create a new checksummer that can compute CRC32C checksums on arbitrary + /// bytes. + pub fn new() -> CheckSummer { + CheckSummer { sum: 0 } + } + + /// Returns the "masked" CRC32 checksum of the data so far using the + /// Castagnoli polynomial. This "masked" checksum is the same one used + /// by the Snappy frame format. Masking is supposed to make the checksum + /// robust with respect to data that contains the checksum itself. + pub fn masked(&self) -> u32 { + let sum = self.sum; + (sum.wrapping_shr(15) | sum.wrapping_shl(17)).wrapping_add(0xA282EAD8) + } + + /// Update the current checksum with the checksum for the given bytes. 
+ pub fn update(&mut self, buf: &[u8]) { + self.sum = crc32c_slice16(self.sum, buf); + } +} + +/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial. +fn crc32c_slice16(prev: u32, mut buf: &[u8]) -> u32 { + let mut crc: u32 = !prev; + while buf.len() >= 16 { + crc ^= bytes::read_u32_le(buf); + crc = TABLE16[0][buf[15] as usize] + ^ TABLE16[1][buf[14] as usize] + ^ TABLE16[2][buf[13] as usize] + ^ TABLE16[3][buf[12] as usize] + ^ TABLE16[4][buf[11] as usize] + ^ TABLE16[5][buf[10] as usize] + ^ TABLE16[6][buf[9] as usize] + ^ TABLE16[7][buf[8] as usize] + ^ TABLE16[8][buf[7] as usize] + ^ TABLE16[9][buf[6] as usize] + ^ TABLE16[10][buf[5] as usize] + ^ TABLE16[11][buf[4] as usize] + ^ TABLE16[12][(crc >> 24) as u8 as usize] + ^ TABLE16[13][(crc >> 16) as u8 as usize] + ^ TABLE16[14][(crc >> 8) as u8 as usize] + ^ TABLE16[15][(crc) as u8 as usize]; + buf = &buf[16..]; + } + for &b in buf { + crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8); + } + !crc +} diff --git a/vendor/fst/src/raw/crc32_table.rs b/vendor/fst/src/raw/crc32_table.rs new file mode 100644 index 000000000..7821b5d86 --- /dev/null +++ b/vendor/fst/src/raw/crc32_table.rs @@ -0,0 +1,2 @@ +// Generated by build.rs. +include!(concat!(env!("OUT_DIR"), "/crc32_table.rs")); diff --git a/vendor/fst/src/raw/error.rs b/vendor/fst/src/raw/error.rs new file mode 100644 index 000000000..b28377179 --- /dev/null +++ b/vendor/fst/src/raw/error.rs @@ -0,0 +1,171 @@ +use std::fmt; +use std::str; +use std::string::FromUtf8Error; + +use crate::raw::FstType; + +/// An error that occurred while using a finite state transducer. +/// +/// This enum is non-exhaustive. New variants may be added to it in +/// compatible releases. +pub enum Error { + /// A version mismatch occurred while reading a finite state transducer. + /// + /// This occurs when the API version (of the crate) does not match the + /// version encoded in the finite state transducer. 
+ /// + /// When this error is encountered, there are only two ways to fix it: + /// + /// 1. Change the version of the library to one that is compatible with + /// the given finite state transducer. + /// 2. Rebuild the finite state transducer. + Version { + /// The expected version, which is hard-coded into the current version + /// of this crate. + expected: u64, + /// The version read from the finite state transducer. + got: u64, + }, + /// An unexpected error occurred while reading a finite state transducer. + /// Usually this occurs because the data is corrupted or is not actually + /// a finite state transducer serialized by this library. + Format { + /// The number of bytes given to the FST constructor. + size: usize, + }, + /// An error that is returned if verification of an FST fails because of a + /// checksum mismatch. + ChecksumMismatch { + /// The checksum that was expected. + expected: u32, + /// The checksum that was actually computed. + got: u32, + }, + /// An error that is returned if the caller attempts to verify an FST + /// that does not have a checksum, as is the case for all FSTs generated + /// by this crate before version `0.4`. + ChecksumMissing, + /// A duplicate key was inserted into a finite state transducer, which is + /// not allowed. + DuplicateKey { + /// The duplicate key. + got: Vec<u8>, + }, + /// A key was inserted out of order into a finite state transducer. + /// + /// Keys must always be inserted in lexicographic order. + OutOfOrder { + /// The last key successfully inserted. + previous: Vec<u8>, + /// The key that caused this error to occur. + got: Vec<u8>, + }, + /// A finite state transducer with an unexpected type was found. + /// + /// This is not currently used in this crate, but callers may wish to + /// employ its use for alternative data structures implemented on top of + /// finite state transducers. + WrongType { + /// The expected finite state transducer type. 
+ expected: FstType, + /// The type read from a finite state transducer. + got: FstType, + }, + /// An error that occurred when trying to decode a UTF-8 byte key. + FromUtf8(FromUtf8Error), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Error::FromUtf8(ref err) => err.fmt(f), + Error::Version { expected, got } => write!( + f, + "\ +Error opening FST: expected API version {}, got API version {}. \ +It looks like the FST you're trying to open is either not an FST file or it \ +was generated with a different version of the 'fst' crate. You'll either need \ +to change the version of the 'fst' crate you're using, or re-generate the +FST.", + expected, got + ), + Error::Format { size } => write!( + f, + "\ +Error opening FST with size {} bytes: An unknown error occurred. This \ +usually means you're trying to read data that isn't actually an encoded FST.", + size + ), + Error::ChecksumMismatch { expected, got } => write!( + f, + "FST verification failed: expected checksum of {} but got {}", + expected, got, + ), + Error::ChecksumMissing => write!( + f, + "FST verification failed: FST does not contain a checksum", + ), + Error::DuplicateKey { ref got } => write!( + f, + "Error inserting duplicate key: '{}'.", + format_bytes(&*got) + ), + Error::OutOfOrder { ref previous, ref got } => write!( + f, + "\ +Error inserting out-of-order key: '{}'. (Previous key was '{}'.) 
Keys must be \ +inserted in lexicographic order.", + format_bytes(&*got), + format_bytes(&*previous) + ), + Error::WrongType { expected, got } => write!( + f, + "\ +Error opening FST: expected type '{}', got type '{}'.", + expected, got + ), + Error::__Nonexhaustive => unreachable!(), + } + } +} + +impl fmt::Debug for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match *self { + Error::FromUtf8(ref err) => Some(err), + _ => None, + } + } +} + +impl From<FromUtf8Error> for Error { + #[inline] + fn from(err: FromUtf8Error) -> Error { + Error::FromUtf8(err) + } +} + +/// Attempt to convert an arbitrary byte string to a more convenient display +/// form. +/// +/// Essentially, try to decode the bytes as UTF-8 and show that. Failing that, +/// just show the sequence of bytes. +fn format_bytes(bytes: &[u8]) -> String { + match str::from_utf8(bytes) { + Ok(s) => s.to_owned(), + Err(_) => format!("{:?}", bytes), + } +} diff --git a/vendor/fst/src/raw/mod.rs b/vendor/fst/src/raw/mod.rs new file mode 100644 index 000000000..6cc0eeb2c --- /dev/null +++ b/vendor/fst/src/raw/mod.rs @@ -0,0 +1,1382 @@ +/*! +Operations on raw finite state transducers. + +This sub-module exposes the guts of a finite state transducer. Many parts of +it, such as construction and traversal, are mirrored in the `set` and `map` +sub-modules. Other parts of it, such as direct access to nodes and transitions +in the transducer, do not have any analog. + +# Overview of types + +`Fst` is a read only interface to pre-constructed finite state transducers. +`Node` is a read only interface to a single node in a transducer. `Builder` is +used to create new finite state transducers. (Once a transducer is created, it +can never be modified.) `Stream` is a stream of all inputs and outputs in a +transducer. `StreamBuilder` builds range queries. 
`OpBuilder` collects streams +and executes set operations like `union` or `intersection` on them with the +option of specifying a merge strategy for output values. + +Most of the rest of the types are streams from set operations. +*/ +use std::cmp; +use std::fmt; + +use crate::automaton::{AlwaysMatch, Automaton}; +use crate::bytes; +use crate::error::Result; +use crate::stream::{IntoStreamer, Streamer}; + +pub use crate::raw::build::Builder; +pub use crate::raw::error::Error; +pub use crate::raw::node::{Node, Transitions}; +pub use crate::raw::ops::{ + Difference, IndexedValue, Intersection, OpBuilder, SymmetricDifference, + Union, +}; + +mod build; +mod common_inputs; +mod counting_writer; +mod crc32; +mod crc32_table; +mod error; +mod node; +mod ops; +mod registry; +mod registry_minimal; +#[cfg(test)] +mod tests; + +/// The API version of this crate. +/// +/// This version number is written to every finite state transducer created by +/// this crate. When a finite state transducer is read, its version number is +/// checked against this value. +/// +/// Currently, any version mismatch results in an error. Fixing this requires +/// regenerating the finite state transducer or switching to a version of this +/// crate that is compatible with the serialized transducer. This particular +/// behavior may be relaxed in future versions. +pub const VERSION: u64 = 3; + +/// A sentinel value used to indicate an empty final state. +const EMPTY_ADDRESS: CompiledAddr = 0; + +/// A sentinel value used to indicate an invalid state. +/// +/// This is never the address of a node in a serialized transducer. +const NONE_ADDRESS: CompiledAddr = 1; + +/// FstType is a convention used to indicate the type of the underlying +/// transducer. +/// +/// This crate reserves the range 0-255 (inclusive) but currently leaves the +/// meaning of 0-255 unspecified. +pub type FstType = u64; + +/// CompiledAddr is the type used to address nodes in a finite state +/// transducer. 
+/// +/// It is most useful as a pointer to nodes. It can be used in the `Fst::node` +/// method to resolve the pointer. +pub type CompiledAddr = usize; + +/// An acyclic deterministic finite state transducer. +/// +/// # How does it work? +/// +/// The short answer: it's just like a prefix trie, which compresses keys +/// based only on their prefixes, except that a automaton/transducer also +/// compresses suffixes. +/// +/// The longer answer is that keys in an automaton are stored only in the +/// transitions from one state to another. A key can be acquired by tracing +/// a path from the root of the automaton to any match state. The inputs along +/// each transition are concatenated. Once a match state is reached, the +/// concatenation of inputs up until that point corresponds to a single key. +/// +/// But why is it called a transducer instead of an automaton? A finite state +/// transducer is just like a finite state automaton, except that it has output +/// transitions in addition to input transitions. Namely, the value associated +/// with any particular key is determined by summing the outputs along every +/// input transition that leads to the key's corresponding match state. +/// +/// This is best demonstrated with a couple images. First, let's ignore the +/// "transducer" aspect and focus on a plain automaton. +/// +/// Consider that your keys are abbreviations of some of the months in the +/// Gregorian calendar: +/// +/// ```plain +/// jan +/// feb +/// mar +/// may +/// jun +/// jul +/// ``` +/// +/// The corresponding automaton that stores all of these as keys looks like +/// this: +/// +/// ![finite state automaton](https://burntsushi.net/stuff/months-set.png) +/// +/// Notice here how the prefix and suffix of `jan` and `jun` are shared. +/// Similarly, the prefixes of `jun` and `jul` are shared and the prefixes +/// of `mar` and `may` are shared. 
+/// +/// All of the keys from this automaton can be enumerated in lexicographic +/// order by following every transition from each node in lexicographic +/// order. Since it is acyclic, the procedure will terminate. +/// +/// A key can be found by tracing it through the transitions in the automaton. +/// For example, the key `aug` is known not to be in the automaton by only +/// visiting the root state (because there is no `a` transition). For another +/// example, the key `jax` is known not to be in the set only after moving +/// through the transitions for `j` and `a`. Namely, after those transitions +/// are followed, there are no transitions for `x`. +/// +/// Notice here that looking up a key is proportional the length of the key +/// itself. Namely, lookup time is not affected by the number of keys in the +/// automaton! +/// +/// Additionally, notice that the automaton exploits the fact that many keys +/// share common prefixes and suffixes. For example, `jun` and `jul` are +/// represented with no more states than would be required to represent either +/// one on its own. Instead, the only change is a single extra transition. This +/// is a form of compression and is key to how the automatons produced by this +/// crate are so small. +/// +/// Let's move on to finite state transducers. Consider the same set of keys +/// as above, but let's assign their numeric month values: +/// +/// ```plain +/// jan,1 +/// feb,2 +/// mar,3 +/// may,5 +/// jun,6 +/// jul,7 +/// ``` +/// +/// The corresponding transducer looks very similar to the automaton above, +/// except outputs have been added to some of the transitions: +/// +/// ![finite state transducer](https://burntsushi.net/stuff/months-map.png) +/// +/// All of the operations with a transducer are the same as described above +/// for automatons. Additionally, the same compression techniques are used: +/// common prefixes and suffixes in keys are exploited. 
+///
+/// The key difference is that some transitions have been given an output.
+/// As one follows input transitions, one must sum the outputs as they
+/// are seen. (A transition with no output represents the additive identity,
+/// or `0` in this case.) For example, when looking up `feb`, the transition
+/// `f` has output `2`, the transition `e` has output `0`, and the transition
+/// `b` also has output `0`. The sum of these is `2`, which is exactly the
+/// value we associated with `feb`.
+///
+/// For another more interesting example, consider `jul`. The `j` transition
+/// has output `1`, the `u` transition has output `5` and the `l` transition
+/// has output `1`. Summing these together gets us `7`, which is again the
+/// correct value associated with `jul`. Notice that if we instead looked up
+/// the `jun` key, then the `n` transition would be followed instead of the
+/// `l` transition, which has no output. Therefore, the `jun` key equals
+/// `1+5+0=6`.
+///
+/// The trick to transducers is that there exists a unique path through the
+/// transducer for every key, and its outputs are stored appropriately along
+/// this path such that the correct value is returned when they are all summed
+/// together. This process also enables the data that makes up each value to be
+/// shared across many values in the transducer in exactly the same way that
+/// keys are shared. This is yet another form of compression!
+///
+/// # Bonus: a billion strings
+///
+/// The amount of compression one can get from automata can be absolutely
+/// ridiculous. Consider the particular case of storing all billion strings
+/// in the range `0000000001-1000000000`, e.g.,
+///
+/// ```plain
+/// 0000000001
+/// 0000000002
+/// ...
+/// 0000000100
+/// 0000000101
+/// ...
+/// 0999999999
+/// 1000000000
+/// ```
+///
+/// The corresponding automaton looks like this:
+///
+/// ![finite state automaton - one billion strings](https://burntsushi.net/stuff/one-billion.png)
+///
+/// Indeed, the on disk size of this automaton is a mere **251 bytes**.
+///
+/// Of course, this is a bit of a pathological best case, but it does serve
+/// to show how good compression can be in the optimal case.
+///
+/// Also, check out the
+/// [corresponding transducer](https://burntsushi.net/stuff/one-billion-map.svg)
+/// that maps each string to its integer value. It's a bit bigger, but still
+/// only takes up **896 bytes** of space on disk. This demonstrates that
+/// output values are also compressible.
+///
+/// # Does this crate produce minimal transducers?
+///
+/// For any non-trivial sized set of keys, it is unlikely that this crate will
+/// produce a minimal transducer. As far as this author knows, guaranteeing a
+/// minimal transducer requires working memory proportional to the number of
+/// states. This can be quite costly and is anathema to the main design goal of
+/// this crate: provide the ability to work with gigantic sets of strings with
+/// constant memory overhead.
+///
+/// Instead, construction of a finite state transducer uses a cache of
+/// states. More frequently used states are cached and reused, which provides
+/// reasonably good compression ratios. (No comprehensive benchmarks exist to
+/// back up this claim.)
+///
+/// It is possible that this crate may expose a way to guarantee minimal
+/// construction of transducers at the expense of exorbitant memory
+/// requirements.
+///
+/// # Bibliography
+///
+/// I initially got the idea to use finite state transducers to represent
+/// ordered sets/maps from
+/// [Michael
+/// McCandless'](http://blog.mikemccandless.com/2010/12/using-finite-state-transducers-in.html)
+/// work on incorporating transducers in Lucene.
+/// +/// However, my work would also not have been possible without the hard work +/// of many academics, especially +/// [Jan Daciuk](http://galaxy.eti.pg.gda.pl/katedry/kiw/pracownicy/Jan.Daciuk/personal/). +/// +/// * [Incremental construction of minimal acyclic finite-state automata](https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601) +/// (Section 3 provides a decent overview of the algorithm used to construct +/// transducers in this crate, assuming all outputs are `0`.) +/// * [Direct Construction of Minimal Acyclic Subsequential Transducers](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.3698&rep=rep1&type=pdf) +/// (The whole thing. The proof is dense but illuminating. The algorithm at +/// the end is the money shot, namely, it incorporates output values.) +/// * [Experiments with Automata Compression](https://www.researchgate.net/profile/Jiri-Dvorsky/publication/221568039_Word_Random_Access_Compression/links/0c96052c095630d5b3000000/Word-Random-Access-Compression.pdf#page=116), [Smaller Representation of Finite State Automata](https://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf) +/// (various compression techniques for representing states/transitions) +/// * [Jan Daciuk's dissertation](http://www.pg.gda.pl/~jandac/thesis.ps.gz) +/// (excellent for in depth overview) +/// * [Comparison of Construction Algorithms for Minimal, Acyclic, Deterministic, Finite-State Automata from Sets of Strings](https://www.cs.mun.ca/~harold/Courses/Old/CS4750/Diary/q3p2qx4lv71m5vew.pdf) +/// (excellent for surface level overview) +#[derive(Clone)] +pub struct Fst<D> { + meta: Meta, + data: D, +} + +#[derive(Debug, Clone)] +struct Meta { + version: u64, + root_addr: CompiledAddr, + ty: FstType, + len: usize, + /// A checksum is missing when the FST version is <= 2. (Checksums were + /// added in version 3.) 
+ checksum: Option<u32>, +} + +impl Fst<Vec<u8>> { + /// Create a new FST from an iterator of lexicographically ordered byte + /// strings. Every key's value is set to `0`. + /// + /// If the iterator does not yield values in lexicographic order, then an + /// error is returned. + /// + /// Note that this is a convenience function to build an FST in memory. + /// To build an FST that streams to an arbitrary `io::Write`, use + /// `raw::Builder`. + pub fn from_iter_set<K, I>(iter: I) -> Result<Fst<Vec<u8>>> + where + K: AsRef<[u8]>, + I: IntoIterator<Item = K>, + { + let mut builder = Builder::memory(); + for k in iter { + builder.add(k)?; + } + Ok(builder.into_fst()) + } + + /// Create a new FST from an iterator of lexicographically ordered byte + /// strings. The iterator should consist of tuples, where the first element + /// is the byte string and the second element is its corresponding value. + /// + /// If the iterator does not yield unique keys in lexicographic order, then + /// an error is returned. + /// + /// Note that this is a convenience function to build an FST in memory. + /// To build an FST that streams to an arbitrary `io::Write`, use + /// `raw::Builder`. + pub fn from_iter_map<K, I>(iter: I) -> Result<Fst<Vec<u8>>> + where + K: AsRef<[u8]>, + I: IntoIterator<Item = (K, u64)>, + { + let mut builder = Builder::memory(); + for (k, v) in iter { + builder.insert(k, v)?; + } + Ok(builder.into_fst()) + } +} + +impl<D: AsRef<[u8]>> Fst<D> { + /// Creates a transducer from its representation as a raw byte sequence. + /// + /// This operation is intentionally very cheap (no allocations and no + /// copies). In particular, no verification on the integrity of the + /// FST is performed. Callers may opt into integrity checks via the + /// [`Fst::verify`](struct.Fst.html#method.verify) method. + /// + /// The fst must have been written with a compatible finite state + /// transducer builder (`Builder` qualifies). 
If the format is invalid or + /// if there is a mismatch between the API version of this library and the + /// fst, then an error is returned. + #[inline] + pub fn new(data: D) -> Result<Fst<D>> { + let bytes = data.as_ref(); + if bytes.len() < 36 { + return Err(Error::Format { size: bytes.len() }.into()); + } + // The read_u64 unwraps below are OK because they can never fail. + // They can only fail when there is an IO error or if there is an + // unexpected EOF. However, we are reading from a byte slice (no + // IO errors possible) and we've confirmed the byte slice is at least + // N bytes (no unexpected EOF). + let version = bytes::read_u64_le(&bytes); + if version == 0 || version > VERSION { + return Err( + Error::Version { expected: VERSION, got: version }.into() + ); + } + let ty = bytes::read_u64_le(&bytes[8..]); + + let (end, checksum) = if version <= 2 { + (bytes.len(), None) + } else { + let checksum = bytes::read_u32_le(&bytes[bytes.len() - 4..]); + (bytes.len() - 4, Some(checksum)) + }; + let root_addr = { + let last = &bytes[end - 8..]; + u64_to_usize(bytes::read_u64_le(last)) + }; + let len = { + let last2 = &bytes[end - 16..]; + u64_to_usize(bytes::read_u64_le(last2)) + }; + // The root node is always the last node written, so its address should + // be near the end. After the root node is written, we still have to + // write the root *address* and the number of keys in the FST, along + // with the checksum. That's 20 bytes. The extra byte used below (21 + // and not 20) comes from the fact that the root address points to + // the last byte in the root node, rather than the byte immediately + // following the root node. + // + // If this check passes, it is still possible that the FST is invalid + // but probably unlikely. If this check reports a false positive, then + // the program will probably panic. In the worst case, the FST will + // operate but be subtly wrong. 
(This would require the bytes to be in + // a format expected by an FST, which is incredibly unlikely.) + // + // The special check for EMPTY_ADDRESS is needed since an empty FST + // has a root node that is empty and final, which means it has the + // special address `0`. In that case, the FST is the smallest it can + // be: the version, type, root address and number of nodes. That's + // 36 bytes (8 byte u64 each). + // + // And finally, our calculation changes somewhat based on version. + // If the FST version is less than 3, then it does not have a checksum. + let (empty_total, addr_offset) = + if version <= 2 { (32, 17) } else { (36, 21) }; + if (root_addr == EMPTY_ADDRESS && bytes.len() != empty_total) + && root_addr + addr_offset != bytes.len() + { + return Err(Error::Format { size: bytes.len() }.into()); + } + let meta = Meta { version, root_addr, ty, len, checksum }; + Ok(Fst { meta, data }) + } + + /// Retrieves the value associated with a key. + /// + /// If the key does not exist, then `None` is returned. + #[inline] + pub fn get<B: AsRef<[u8]>>(&self, key: B) -> Option<Output> { + self.as_ref().get(key.as_ref()) + } + + /// Returns true if and only if the given key is in this FST. + #[inline] + pub fn contains_key<B: AsRef<[u8]>>(&self, key: B) -> bool { + self.as_ref().contains_key(key.as_ref()) + } + + /// Retrieves the key associated with the given value. + /// + /// This is like `get_key_into`, but will return the key itself without + /// allowing the caller to reuse an allocation. + /// + /// If the given value does not exist, then `None` is returned. + /// + /// The values in this FST are not monotonically increasing when sorted + /// lexicographically by key, then this routine has unspecified behavior. + #[inline] + pub fn get_key(&self, value: u64) -> Option<Vec<u8>> { + let mut key = vec![]; + if self.get_key_into(value, &mut key) { + Some(key) + } else { + None + } + } + + /// Retrieves the key associated with the given value. 
+    ///
+    /// If the given value does not exist, then `false` is returned. In this
+    /// case, the contents of `key` are unspecified.
+    ///
+    /// The given buffer is not cleared before the key is written to it.
+    ///
+    /// If the values in this FST are not monotonically increasing when sorted
+    /// lexicographically by key, then this routine has unspecified behavior.
+    #[inline]
+    pub fn get_key_into(&self, value: u64, key: &mut Vec<u8>) -> bool {
+        self.as_ref().get_key_into(value, key)
+    }
+
+    /// Return a lexicographically ordered stream of all key-value pairs in
+    /// this fst.
+    #[inline]
+    pub fn stream(&self) -> Stream<'_> {
+        StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream()
+    }
+
+    /// Return a builder for range queries.
+    ///
+    /// A range query returns a subset of key-value pairs in this fst in a
+    /// range given in lexicographic order.
+    #[inline]
+    pub fn range(&self) -> StreamBuilder<'_> {
+        StreamBuilder::new(self.as_ref(), AlwaysMatch)
+    }
+
+    /// Executes an automaton on the keys of this FST.
+    #[inline]
+    pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<'_, A> {
+        StreamBuilder::new(self.as_ref(), aut)
+    }
+
+    /// Executes an automaton on the keys of this FST and yields matching
+    /// keys along with the corresponding matching states in the given
+    /// automaton.
+    #[inline]
+    pub fn search_with_state<A: Automaton>(
+        &self,
+        aut: A,
+    ) -> StreamWithStateBuilder<'_, A> {
+        StreamWithStateBuilder::new(self.as_ref(), aut)
+    }
+
+    /// Returns the number of keys in this fst.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.as_ref().len()
+    }
+
+    /// Returns true if and only if this fst has no keys.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.as_ref().is_empty()
+    }
+
+    /// Returns the number of bytes used by this fst.
+    #[inline]
+    pub fn size(&self) -> usize {
+        self.as_ref().size()
+    }
+
+    /// Attempts to verify this FST by computing its checksum.
+ /// + /// This will scan over all of the bytes in the underlying FST, so this + /// may be an expensive operation depending on the size of the FST. + /// + /// This returns an error in two cases: + /// + /// 1. When a checksum does not exist, which is the case for FSTs that were + /// produced by the `fst` crate before version `0.4`. + /// 2. When the checksum in the FST does not match the computed checksum + /// performed by this procedure. + #[inline] + pub fn verify(&self) -> Result<()> { + use crate::raw::crc32::CheckSummer; + + let expected = match self.as_ref().meta.checksum { + None => return Err(Error::ChecksumMissing.into()), + Some(expected) => expected, + }; + let mut summer = CheckSummer::new(); + summer.update(&self.as_bytes()[..self.as_bytes().len() - 4]); + let got = summer.masked(); + if expected == got { + return Ok(()); + } + Err(Error::ChecksumMismatch { expected, got }.into()) + } + + /// Creates a new fst operation with this fst added to it. + /// + /// The `OpBuilder` type can be used to add additional fst streams + /// and perform set operations like union, intersection, difference and + /// symmetric difference on the keys of the fst. These set operations also + /// allow one to specify how conflicting values are merged in the stream. + #[inline] + pub fn op(&self) -> OpBuilder<'_> { + OpBuilder::new().add(self) + } + + /// Returns true if and only if the `self` fst is disjoint with the fst + /// `stream`. + /// + /// `stream` must be a lexicographically ordered sequence of byte strings + /// with associated values. + #[inline] + pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, + { + self.op().add(stream).intersection().next().is_none() + } + + /// Returns true if and only if the `self` fst is a subset of the fst + /// `stream`. 
+ /// + /// `stream` must be a lexicographically ordered sequence of byte strings + /// with associated values. + #[inline] + pub fn is_subset<'f, I, S>(&self, stream: I) -> bool + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, + { + let mut op = self.op().add(stream).intersection(); + let mut count = 0; + while let Some(_) = op.next() { + count += 1; + } + count == self.len() + } + + /// Returns true if and only if the `self` fst is a superset of the fst + /// `stream`. + /// + /// `stream` must be a lexicographically ordered sequence of byte strings + /// with associated values. + #[inline] + pub fn is_superset<'f, I, S>(&self, stream: I) -> bool + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, + { + let mut op = self.op().add(stream).union(); + let mut count = 0; + while let Some(_) = op.next() { + count += 1; + } + count == self.len() + } + + /// Returns the underlying type of this fst. + /// + /// FstType is a convention used to indicate the type of the underlying + /// transducer. + /// + /// This crate reserves the range 0-255 (inclusive) but currently leaves + /// the meaning of 0-255 unspecified. + #[inline] + pub fn fst_type(&self) -> FstType { + self.as_ref().fst_type() + } + + /// Returns the root node of this fst. + #[inline] + pub fn root(&self) -> Node<'_> { + self.as_ref().root() + } + + /// Returns the node at the given address. + /// + /// Node addresses can be obtained by reading transitions on `Node` values. + #[inline] + pub fn node(&self, addr: CompiledAddr) -> Node<'_> { + self.as_ref().node(addr) + } + + /// Returns a copy of the binary contents of this FST. + #[inline] + pub fn to_vec(&self) -> Vec<u8> { + self.as_ref().to_vec() + } + + /// Returns the binary contents of this FST. 
+ #[inline] + pub fn as_bytes(&self) -> &[u8] { + self.as_ref().as_bytes() + } + + #[inline] + fn as_ref(&self) -> FstRef { + FstRef { meta: &self.meta, data: self.data.as_ref() } + } +} + +impl<D> Fst<D> { + /// Returns the underlying data which constitutes the FST itself. + #[inline] + pub fn into_inner(self) -> D { + self.data + } + + /// Returns a borrow to the underlying data which constitutes the FST itself. + #[inline] + pub fn as_inner(&self) -> &D { + &self.data + } + + /// Maps the underlying data of the fst to another data type. + #[inline] + pub fn map_data<F, T>(self, mut f: F) -> Result<Fst<T>> + where + F: FnMut(D) -> T, + T: AsRef<[u8]>, + { + Fst::new(f(self.into_inner())) + } +} + +impl<'a, 'f, D: AsRef<[u8]>> IntoStreamer<'a> for &'f Fst<D> { + type Item = (&'a [u8], Output); + type Into = Stream<'f>; + + #[inline] + fn into_stream(self) -> Stream<'f> { + StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream() + } +} + +struct FstRef<'f> { + meta: &'f Meta, + data: &'f [u8], +} + +impl<'f> FstRef<'f> { + #[inline] + fn get(&self, key: &[u8]) -> Option<Output> { + let mut node = self.root(); + let mut out = Output::zero(); + for &b in key { + node = match node.find_input(b) { + None => return None, + Some(i) => { + let t = node.transition(i); + out = out.cat(t.out); + self.node(t.addr) + } + } + } + if !node.is_final() { + None + } else { + Some(out.cat(node.final_output())) + } + } + + #[inline] + fn contains_key(&self, key: &[u8]) -> bool { + let mut node = self.root(); + for &b in key { + node = match node.find_input(b) { + None => return false, + Some(i) => self.node(node.transition_addr(i)), + } + } + node.is_final() + } + + #[inline] + fn get_key_into(&self, mut value: u64, key: &mut Vec<u8>) -> bool { + let mut node = self.root(); + while value != 0 || !node.is_final() { + let trans = node + .transitions() + .take_while(|t| t.out.value() <= value) + .last(); + node = match trans { + None => return false, + Some(t) => { + value -= 
t.out.value(); + key.push(t.inp); + self.node(t.addr) + } + }; + } + true + } + + #[inline] + fn len(&self) -> usize { + self.meta.len + } + + #[inline] + fn is_empty(&self) -> bool { + self.meta.len == 0 + } + + #[inline] + fn size(&self) -> usize { + self.as_bytes().len() + } + + #[inline] + fn fst_type(&self) -> FstType { + self.meta.ty + } + + #[inline] + fn root_addr(&self) -> CompiledAddr { + self.meta.root_addr + } + + #[inline] + fn root(&self) -> Node<'f> { + self.node(self.root_addr()) + } + + #[inline] + fn node(&self, addr: CompiledAddr) -> Node<'f> { + Node::new(self.meta.version, addr, self.as_bytes()) + } + + #[inline] + fn to_vec(&self) -> Vec<u8> { + self.as_bytes().to_vec() + } + + #[inline] + fn as_bytes(&self) -> &'f [u8] { + self.data + } + + #[inline] + fn empty_final_output(&self) -> Option<Output> { + let root = self.root(); + if root.is_final() { + Some(root.final_output()) + } else { + None + } + } +} + +/// A builder for constructing range queries on streams. +/// +/// Once all bounds are set, one should call `into_stream` to get a +/// `Stream`. +/// +/// Bounds are not additive. That is, if `ge` is called twice on the same +/// builder, then the second setting wins. +/// +/// The `A` type parameter corresponds to an optional automaton to filter +/// the stream. By default, no filtering is done. +/// +/// The `'f` lifetime parameter refers to the lifetime of the underlying fst. +pub struct StreamBuilder<'f, A = AlwaysMatch> { + fst: FstRef<'f>, + aut: A, + min: Bound, + max: Bound, +} + +impl<'f, A: Automaton> StreamBuilder<'f, A> { + fn new(fst: FstRef<'f>, aut: A) -> StreamBuilder<'f, A> { + StreamBuilder { + fst, + aut, + min: Bound::Unbounded, + max: Bound::Unbounded, + } + } + + /// Specify a greater-than-or-equal-to bound. + pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> { + self.min = Bound::Included(bound.as_ref().to_owned()); + self + } + + /// Specify a greater-than bound. 
+ pub fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> { + self.min = Bound::Excluded(bound.as_ref().to_owned()); + self + } + + /// Specify a less-than-or-equal-to bound. + pub fn le<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> { + self.max = Bound::Included(bound.as_ref().to_owned()); + self + } + + /// Specify a less-than bound. + pub fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> { + self.max = Bound::Excluded(bound.as_ref().to_owned()); + self + } +} + +impl<'a, 'f, A: Automaton> IntoStreamer<'a> for StreamBuilder<'f, A> { + type Item = (&'a [u8], Output); + type Into = Stream<'f, A>; + + fn into_stream(self) -> Stream<'f, A> { + Stream::new(self.fst, self.aut, self.min, self.max) + } +} + +/// A builder for constructing range queries on streams that include automaton +/// states. +/// +/// In general, one should use `StreamBuilder` unless you have a specific need +/// for accessing the states of the underlying automaton that is being used to +/// filter this stream. +/// +/// Once all bounds are set, one should call `into_stream` to get a +/// `Stream`. +/// +/// Bounds are not additive. That is, if `ge` is called twice on the same +/// builder, then the second setting wins. +/// +/// The `A` type parameter corresponds to an optional automaton to filter +/// the stream. By default, no filtering is done. +/// +/// The `'f` lifetime parameter refers to the lifetime of the underlying fst. +pub struct StreamWithStateBuilder<'f, A = AlwaysMatch> { + fst: FstRef<'f>, + aut: A, + min: Bound, + max: Bound, +} + +impl<'f, A: Automaton> StreamWithStateBuilder<'f, A> { + fn new(fst: FstRef<'f>, aut: A) -> StreamWithStateBuilder<'f, A> { + StreamWithStateBuilder { + fst, + aut, + min: Bound::Unbounded, + max: Bound::Unbounded, + } + } + + /// Specify a greater-than-or-equal-to bound. 
+ pub fn ge<T: AsRef<[u8]>>( + mut self, + bound: T, + ) -> StreamWithStateBuilder<'f, A> { + self.min = Bound::Included(bound.as_ref().to_owned()); + self + } + + /// Specify a greater-than bound. + pub fn gt<T: AsRef<[u8]>>( + mut self, + bound: T, + ) -> StreamWithStateBuilder<'f, A> { + self.min = Bound::Excluded(bound.as_ref().to_owned()); + self + } + + /// Specify a less-than-or-equal-to bound. + pub fn le<T: AsRef<[u8]>>( + mut self, + bound: T, + ) -> StreamWithStateBuilder<'f, A> { + self.max = Bound::Included(bound.as_ref().to_owned()); + self + } + + /// Specify a less-than bound. + pub fn lt<T: AsRef<[u8]>>( + mut self, + bound: T, + ) -> StreamWithStateBuilder<'f, A> { + self.max = Bound::Excluded(bound.as_ref().to_owned()); + self + } +} + +impl<'a, 'f, A: 'a + Automaton> IntoStreamer<'a> + for StreamWithStateBuilder<'f, A> +where + A::State: Clone, +{ + type Item = (&'a [u8], Output, A::State); + type Into = StreamWithState<'f, A>; + + fn into_stream(self) -> StreamWithState<'f, A> { + StreamWithState::new(self.fst, self.aut, self.min, self.max) + } +} + +#[derive(Debug)] +enum Bound { + Included(Vec<u8>), + Excluded(Vec<u8>), + Unbounded, +} + +impl Bound { + #[inline] + fn exceeded_by(&self, inp: &[u8]) -> bool { + match *self { + Bound::Included(ref v) => inp > v, + Bound::Excluded(ref v) => inp >= v, + Bound::Unbounded => false, + } + } + + #[inline] + fn is_empty(&self) -> bool { + match *self { + Bound::Included(ref v) => v.is_empty(), + Bound::Excluded(ref v) => v.is_empty(), + Bound::Unbounded => true, + } + } + + #[inline] + fn is_inclusive(&self) -> bool { + match *self { + Bound::Excluded(_) => false, + _ => true, + } + } +} + +/// A lexicographically ordered stream of key-value pairs from an fst. +/// +/// The `A` type parameter corresponds to an optional automaton to filter +/// the stream. By default, no filtering is done. +/// +/// The `'f` lifetime parameter refers to the lifetime of the underlying fst. 
+pub struct Stream<'f, A: Automaton = AlwaysMatch>(StreamWithState<'f, A>); + +impl<'f, A: Automaton> Stream<'f, A> { + fn new(fst: FstRef<'f>, aut: A, min: Bound, max: Bound) -> Stream<'f, A> { + Stream(StreamWithState::new(fst, aut, min, max)) + } + + /// Convert this stream into a vector of byte strings and outputs. + /// + /// Note that this creates a new allocation for every key in the stream. + pub fn into_byte_vec(mut self) -> Vec<(Vec<u8>, u64)> { + let mut vs = vec![]; + while let Some((k, v)) = self.next() { + vs.push((k.to_vec(), v.value())); + } + vs + } + + /// Convert this stream into a vector of Unicode strings and outputs. + /// + /// If any key is not valid UTF-8, then iteration on the stream is stopped + /// and a UTF-8 decoding error is returned. + /// + /// Note that this creates a new allocation for every key in the stream. + pub fn into_str_vec(mut self) -> Result<Vec<(String, u64)>> { + let mut vs = vec![]; + while let Some((k, v)) = self.next() { + let k = String::from_utf8(k.to_vec()).map_err(Error::from)?; + vs.push((k, v.value())); + } + Ok(vs) + } + + /// Convert this stream into a vector of byte strings. + /// + /// Note that this creates a new allocation for every key in the stream. + pub fn into_byte_keys(mut self) -> Vec<Vec<u8>> { + let mut vs = vec![]; + while let Some((k, _)) = self.next() { + vs.push(k.to_vec()); + } + vs + } + + /// Convert this stream into a vector of Unicode strings. + /// + /// If any key is not valid UTF-8, then iteration on the stream is stopped + /// and a UTF-8 decoding error is returned. + /// + /// Note that this creates a new allocation for every key in the stream. + pub fn into_str_keys(mut self) -> Result<Vec<String>> { + let mut vs = vec![]; + while let Some((k, _)) = self.next() { + let k = String::from_utf8(k.to_vec()).map_err(Error::from)?; + vs.push(k); + } + Ok(vs) + } + + /// Convert this stream into a vector of outputs. 
+ pub fn into_values(mut self) -> Vec<u64> { + let mut vs = vec![]; + while let Some((_, v)) = self.next() { + vs.push(v.value()); + } + vs + } +} + +impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { + type Item = (&'a [u8], Output); + + fn next(&'a mut self) -> Option<(&'a [u8], Output)> { + self.0.next_with(|_| ()).map(|(key, out, _)| (key, out)) + } +} + +/// A lexicographically ordered stream of key-value-state triples from an fst +/// and an automaton. +/// +/// The key-values are from the underyling FSTP while the states are from the +/// automaton. +/// +/// The `A` type parameter corresponds to an optional automaton to filter +/// the stream. By default, no filtering is done. +/// +/// The `'m` lifetime parameter refers to the lifetime of the underlying map. +pub struct StreamWithState<'f, A = AlwaysMatch> +where + A: Automaton, +{ + fst: FstRef<'f>, + aut: A, + inp: Vec<u8>, + empty_output: Option<Output>, + stack: Vec<StreamState<'f, A::State>>, + end_at: Bound, +} + +#[derive(Clone, Debug)] +struct StreamState<'f, S> { + node: Node<'f>, + trans: usize, + out: Output, + aut_state: S, +} + +impl<'f, A: Automaton> StreamWithState<'f, A> { + fn new( + fst: FstRef<'f>, + aut: A, + min: Bound, + max: Bound, + ) -> StreamWithState<'f, A> { + let mut rdr = StreamWithState { + fst, + aut, + inp: Vec::with_capacity(16), + empty_output: None, + stack: vec![], + end_at: max, + }; + rdr.seek_min(min); + rdr + } + + /// Seeks the underlying stream such that the next key to be read is the + /// smallest key in the underlying fst that satisfies the given minimum + /// bound. + /// + /// This theoretically should be straight-forward, but we need to make + /// sure our stack is correct, which includes accounting for automaton + /// states. 
+ fn seek_min(&mut self, min: Bound) { + if min.is_empty() { + if min.is_inclusive() { + self.empty_output = self.fst.empty_final_output(); + } + self.stack = vec![StreamState { + node: self.fst.root(), + trans: 0, + out: Output::zero(), + aut_state: self.aut.start(), + }]; + return; + } + let (key, inclusive) = match min { + Bound::Excluded(ref min) => (min, false), + Bound::Included(ref min) => (min, true), + Bound::Unbounded => unreachable!(), + }; + // At this point, we need to find the starting location of `min` in + // the FST. However, as we search, we need to maintain a stack of + // reader states so that the reader can pick up where we left off. + // N.B. We do not necessarily need to stop in a final state, unlike + // the one-off `find` method. For the example, the given bound might + // not actually exist in the FST. + let mut node = self.fst.root(); + let mut out = Output::zero(); + let mut aut_state = self.aut.start(); + for &b in key { + match node.find_input(b) { + Some(i) => { + let t = node.transition(i); + let prev_state = aut_state; + aut_state = self.aut.accept(&prev_state, b); + self.inp.push(b); + self.stack.push(StreamState { + node, + trans: i + 1, + out, + aut_state: prev_state, + }); + out = out.cat(t.out); + node = self.fst.node(t.addr); + } + None => { + // This is a little tricky. We're in this case if the + // given bound is not a prefix of any key in the FST. + // Since this is a minimum bound, we need to find the + // first transition in this node that proceeds the current + // input byte. 
+ self.stack.push(StreamState { + node, + trans: node + .transitions() + .position(|t| t.inp > b) + .unwrap_or(node.len()), + out, + aut_state, + }); + return; + } + } + } + if !self.stack.is_empty() { + let last = self.stack.len() - 1; + if inclusive { + self.stack[last].trans -= 1; + self.inp.pop(); + } else { + let node = self.stack[last].node; + let trans = self.stack[last].trans; + self.stack.push(StreamState { + node: self.fst.node(node.transition(trans - 1).addr), + trans: 0, + out, + aut_state, + }); + } + } + } + + fn next_with<T>( + &mut self, + mut map: impl FnMut(&A::State) -> T, + ) -> Option<(&[u8], Output, T)> { + if let Some(out) = self.empty_output.take() { + if self.end_at.exceeded_by(&[]) { + self.stack.clear(); + return None; + } + + let start = self.aut.start(); + if self.aut.is_match(&start) { + return Some((&[], out, map(&start))); + } + } + while let Some(state) = self.stack.pop() { + if state.trans >= state.node.len() + || !self.aut.can_match(&state.aut_state) + { + if state.node.addr() != self.fst.root_addr() { + self.inp.pop().unwrap(); + } + continue; + } + let trans = state.node.transition(state.trans); + let out = state.out.cat(trans.out); + let next_state = self.aut.accept(&state.aut_state, trans.inp); + let t = map(&next_state); + let mut is_match = self.aut.is_match(&next_state); + let next_node = self.fst.node(trans.addr); + self.inp.push(trans.inp); + if next_node.is_final() { + if let Some(eof_state) = self.aut.accept_eof(&next_state) { + is_match = self.aut.is_match(&eof_state); + } + } + self.stack.push(StreamState { trans: state.trans + 1, ..state }); + self.stack.push(StreamState { + node: next_node, + trans: 0, + out, + aut_state: next_state, + }); + if self.end_at.exceeded_by(&self.inp) { + // We are done, forever. 
+ self.stack.clear(); + return None; + } + if next_node.is_final() && is_match { + return Some(( + &self.inp, + out.cat(next_node.final_output()), + t, + )); + } + } + None + } +} + +impl<'a, 'f, A: 'a + Automaton> Streamer<'a> for StreamWithState<'f, A> +where + A::State: Clone, +{ + type Item = (&'a [u8], Output, A::State); + + fn next(&'a mut self) -> Option<(&'a [u8], Output, A::State)> { + self.next_with(|state| state.clone()) + } +} + +/// An output is a value that is associated with a key in a finite state +/// transducer. +/// +/// Note that outputs must satisfy an algebra. Namely, it must have an additive +/// identity and the following binary operations defined: `prefix`, +/// `concatenation` and `subtraction`. `prefix` and `concatenation` are +/// commutative while `subtraction` is not. `subtraction` is only defined on +/// pairs of operands where the first operand is greater than or equal to the +/// second operand. +/// +/// Currently, output values must be `u64`. However, in theory, an output value +/// can be anything that satisfies the above algebra. Future versions of this +/// crate may make outputs generic on this algebra. +#[derive(Copy, Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)] +pub struct Output(u64); + +impl Output { + /// Create a new output from a `u64`. + #[inline] + pub fn new(v: u64) -> Output { + Output(v) + } + + /// Create a zero output. + #[inline] + pub fn zero() -> Output { + Output(0) + } + + /// Retrieve the value inside this output. + #[inline] + pub fn value(self) -> u64 { + self.0 + } + + /// Returns true if this is a zero output. + #[inline] + pub fn is_zero(self) -> bool { + self.0 == 0 + } + + /// Returns the prefix of this output and `o`. + #[inline] + pub fn prefix(self, o: Output) -> Output { + Output(cmp::min(self.0, o.0)) + } + + /// Returns the concatenation of this output and `o`. 
+ #[inline] + pub fn cat(self, o: Output) -> Output { + Output(self.0 + o.0) + } + + /// Returns the subtraction of `o` from this output. + /// + /// This function panics if `self < o`. + #[inline] + pub fn sub(self, o: Output) -> Output { + Output( + self.0 + .checked_sub(o.0) + .expect("BUG: underflow subtraction not allowed"), + ) + } +} + +/// A transition from one note to another. +#[derive(Copy, Clone, Hash, Eq, PartialEq)] +pub struct Transition { + /// The byte input associated with this transition. + pub inp: u8, + /// The output associated with this transition. + pub out: Output, + /// The address of the node that this transition points to. + pub addr: CompiledAddr, +} + +impl Default for Transition { + #[inline] + fn default() -> Transition { + Transition { inp: 0, out: Output::zero(), addr: NONE_ADDRESS } + } +} + +impl fmt::Debug for Transition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.out.is_zero() { + write!(f, "{} -> {}", self.inp as char, self.addr) + } else { + write!( + f, + "({}, {}) -> {}", + self.inp as char, + self.out.value(), + self.addr + ) + } + } +} + +#[inline] +#[cfg(target_pointer_width = "64")] +fn u64_to_usize(n: u64) -> usize { + n as usize +} + +#[inline] +#[cfg(not(target_pointer_width = "64"))] +fn u64_to_usize(n: u64) -> usize { + if n > std::usize::MAX as u64 { + panic!( + "\ +Cannot convert node address {} to a pointer sized variable. 
If this FST +is very large and was generated on a system with a larger pointer size +than this system, then it is not possible to read this FST on this +system.", + n + ); + } + n as usize +} diff --git a/vendor/fst/src/raw/node.rs b/vendor/fst/src/raw/node.rs new file mode 100644 index 000000000..820b29eac --- /dev/null +++ b/vendor/fst/src/raw/node.rs @@ -0,0 +1,1012 @@ +use std::cmp; +use std::fmt; +use std::io; +use std::ops::Range; + +use crate::bytes; +use crate::raw::build::BuilderNode; +use crate::raw::common_inputs::{COMMON_INPUTS, COMMON_INPUTS_INV}; +use crate::raw::{ + u64_to_usize, CompiledAddr, Output, Transition, EMPTY_ADDRESS, +}; + +/// The threshold (in number of transitions) at which an index is created for +/// a node's transitions. This speeds up lookup time at the expense of FST +/// size. +const TRANS_INDEX_THRESHOLD: usize = 32; + +/// Node represents a single state in a finite state transducer. +/// +/// Nodes are very cheap to construct. Notably, they satisfy the `Copy` trait. +#[derive(Clone, Copy)] +pub struct Node<'f> { + data: &'f [u8], + version: u64, + state: State, + start: CompiledAddr, + end: usize, + is_final: bool, + ntrans: usize, + sizes: PackSizes, + final_output: Output, +} + +impl<'f> fmt::Debug for Node<'f> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "NODE@{}", self.start)?; + writeln!(f, " end_addr: {}", self.end)?; + writeln!(f, " size: {} bytes", self.as_slice().len())?; + writeln!(f, " state: {:?}", self.state)?; + writeln!(f, " is_final: {}", self.is_final())?; + writeln!(f, " final_output: {:?}", self.final_output())?; + writeln!(f, " # transitions: {}", self.len())?; + writeln!(f, " transitions:")?; + for t in self.transitions() { + writeln!(f, " {:?}", t)?; + } + Ok(()) + } +} + +impl<'f> Node<'f> { + /// Creates a new note at the address given. + /// + /// `data` should be a slice to an entire FST. 
+ pub(crate) fn new( + version: u64, + addr: CompiledAddr, + data: &[u8], + ) -> Node<'_> { + let state = State::new(data, addr); + match state { + State::EmptyFinal => Node { + data: &[], + version, + state: State::EmptyFinal, + start: EMPTY_ADDRESS, + end: EMPTY_ADDRESS, + is_final: true, + ntrans: 0, + sizes: PackSizes::new(), + final_output: Output::zero(), + }, + State::OneTransNext(s) => { + let data = &data[..addr + 1]; + Node { + data, + version, + state, + start: addr, + end: s.end_addr(data), + is_final: false, + sizes: PackSizes::new(), + ntrans: 1, + final_output: Output::zero(), + } + } + State::OneTrans(s) => { + let data = &data[..addr + 1]; + let sizes = s.sizes(data); + Node { + data, + version, + state, + start: addr, + end: s.end_addr(data, sizes), + is_final: false, + ntrans: 1, + sizes, + final_output: Output::zero(), + } + } + State::AnyTrans(s) => { + let data = &data[..addr + 1]; + let sizes = s.sizes(data); + let ntrans = s.ntrans(data); + Node { + data, + version, + state, + start: addr, + end: s.end_addr(version, data, sizes, ntrans), + is_final: s.is_final_state(), + ntrans, + sizes, + final_output: s.final_output(version, data, sizes, ntrans), + } + } + } + } + /// Returns an iterator over all transitions in this node in lexicographic + /// order. + #[inline] + pub fn transitions<'n>(&'n self) -> Transitions<'f, 'n> { + Transitions { node: self, range: 0..self.len() } + } + + /// Returns the transition at index `i`. + #[inline(always)] + pub fn transition(&self, i: usize) -> Transition { + // The `inline(always)` annotation on this function appears to + // dramatically speed up FST traversal. In particular, measuring the + // time it takes to run `fst range something-big.fst` shows almost a 2x + // improvement with this single `inline(always)` annotation. 
+ + match self.state { + State::OneTransNext(s) => { + assert!(i == 0); + Transition { + inp: s.input(self), + out: Output::zero(), + addr: s.trans_addr(self), + } + } + State::OneTrans(s) => { + assert!(i == 0); + Transition { + inp: s.input(self), + out: s.output(self), + addr: s.trans_addr(self), + } + } + State::AnyTrans(s) => Transition { + inp: s.input(self, i), + out: s.output(self, i), + addr: s.trans_addr(self, i), + }, + State::EmptyFinal => panic!("out of bounds"), + } + } + + /// Returns the transition address of the `i`th transition. + #[inline] + pub fn transition_addr(&self, i: usize) -> CompiledAddr { + match self.state { + State::OneTransNext(s) => { + assert!(i == 0); + s.trans_addr(self) + } + State::OneTrans(s) => { + assert!(i == 0); + s.trans_addr(self) + } + State::AnyTrans(s) => s.trans_addr(self, i), + State::EmptyFinal => panic!("out of bounds"), + } + } + + /// Finds the `i`th transition corresponding to the given input byte. + /// + /// If no transition for this byte exists, then `None` is returned. + #[inline] + pub fn find_input(&self, b: u8) -> Option<usize> { + match self.state { + State::OneTransNext(s) if s.input(self) == b => Some(0), + State::OneTransNext(_) => None, + State::OneTrans(s) if s.input(self) == b => Some(0), + State::OneTrans(_) => None, + State::AnyTrans(s) => s.find_input(self, b), + State::EmptyFinal => None, + } + } + + /// If this node is final and has a terminal output value, then it is + /// returned. Otherwise, a zero output is returned. + #[inline] + pub fn final_output(&self) -> Output { + self.final_output + } + + /// Returns true if and only if this node corresponds to a final or "match" + /// state in the finite state transducer. + #[inline] + pub fn is_final(&self) -> bool { + self.is_final + } + + /// Returns the number of transitions in this node. + /// + /// The maximum number of transitions is 256. 
+ #[inline] + pub fn len(&self) -> usize { + self.ntrans + } + + /// Returns true if and only if this node has zero transitions. + #[inline] + pub fn is_empty(&self) -> bool { + self.ntrans == 0 + } + + /// Return the address of this node. + #[inline] + pub fn addr(&self) -> CompiledAddr { + self.start + } + + #[doc(hidden)] + #[inline] + pub fn as_slice(&self) -> &[u8] { + &self.data[self.end..] + } + + #[doc(hidden)] + #[inline] + pub fn state(&self) -> &'static str { + match self.state { + State::OneTransNext(_) => "OTN", + State::OneTrans(_) => "OT", + State::AnyTrans(_) => "AT", + State::EmptyFinal => "EF", + } + } + + fn compile<W: io::Write>( + wtr: W, + last_addr: CompiledAddr, + addr: CompiledAddr, + node: &BuilderNode, + ) -> io::Result<()> { + assert!(node.trans.len() <= 256); + if node.trans.is_empty() + && node.is_final + && node.final_output.is_zero() + { + return Ok(()); + } else if node.trans.len() != 1 || node.is_final { + StateAnyTrans::compile(wtr, addr, node) + } else { + if node.trans[0].addr == last_addr && node.trans[0].out.is_zero() { + StateOneTransNext::compile(wtr, addr, node.trans[0].inp) + } else { + StateOneTrans::compile(wtr, addr, node.trans[0]) + } + } + } +} + +impl BuilderNode { + pub fn compile_to<W: io::Write>( + &self, + wtr: W, + last_addr: CompiledAddr, + addr: CompiledAddr, + ) -> io::Result<()> { + Node::compile(wtr, last_addr, addr, self) + } +} + +#[derive(Clone, Copy, Debug)] +enum State { + OneTransNext(StateOneTransNext), + OneTrans(StateOneTrans), + AnyTrans(StateAnyTrans), + EmptyFinal, +} + +// one trans flag (1), next flag (1), common input +#[derive(Clone, Copy, Debug)] +struct StateOneTransNext(u8); +// one trans flag (1), next flag (0), common input +#[derive(Clone, Copy, Debug)] +struct StateOneTrans(u8); +// one trans flag (0), final flag, # transitions +#[derive(Clone, Copy, Debug)] +struct StateAnyTrans(u8); + +impl State { + fn new(data: &[u8], addr: CompiledAddr) -> State { + if addr == EMPTY_ADDRESS { + 
return State::EmptyFinal; + } + let v = data[addr]; + match (v & 0b11_000000) >> 6 { + 0b11 => State::OneTransNext(StateOneTransNext(v)), + 0b10 => State::OneTrans(StateOneTrans(v)), + _ => State::AnyTrans(StateAnyTrans(v)), + } + } +} + +impl StateOneTransNext { + fn compile<W: io::Write>( + mut wtr: W, + _: CompiledAddr, + input: u8, + ) -> io::Result<()> { + let mut state = StateOneTransNext::new(); + state.set_common_input(input); + if state.common_input().is_none() { + wtr.write_all(&[input])?; + } + wtr.write_all(&[state.0])?; + Ok(()) + } + + #[inline] + fn new() -> StateOneTransNext { + StateOneTransNext(0b11_000000) + } + + #[inline] + fn set_common_input(&mut self, input: u8) { + self.0 = (self.0 & 0b11_000000) | common_idx(input, 0b111111); + } + + #[inline] + fn common_input(&self) -> Option<u8> { + common_input(self.0 & 0b00_111111) + } + + #[inline] + fn input_len(&self) -> usize { + if self.common_input().is_none() { + 1 + } else { + 0 + } + } + + #[inline] + fn end_addr(&self, data: &[u8]) -> usize { + data.len() - 1 - self.input_len() + } + + #[inline] + fn input(&self, node: &Node<'_>) -> u8 { + if let Some(inp) = self.common_input() { + inp + } else { + node.data[node.start - 1] + } + } + + #[inline] + fn trans_addr(&self, node: &Node<'_>) -> CompiledAddr { + node.end as CompiledAddr - 1 + } +} + +impl StateOneTrans { + fn compile<W: io::Write>( + mut wtr: W, + addr: CompiledAddr, + trans: Transition, + ) -> io::Result<()> { + let out = trans.out.value(); + let output_pack_size = + if out == 0 { 0 } else { bytes::pack_uint(&mut wtr, out)? 
}; + let trans_pack_size = pack_delta(&mut wtr, addr, trans.addr)?; + + let mut pack_sizes = PackSizes::new(); + pack_sizes.set_output_pack_size(output_pack_size); + pack_sizes.set_transition_pack_size(trans_pack_size); + wtr.write_all(&[pack_sizes.encode()])?; + + let mut state = StateOneTrans::new(); + state.set_common_input(trans.inp); + if state.common_input().is_none() { + wtr.write_all(&[trans.inp])?; + } + wtr.write_all(&[state.0])?; + Ok(()) + } + + #[inline] + fn new() -> StateOneTrans { + StateOneTrans(0b10_000000) + } + + #[inline] + fn set_common_input(&mut self, input: u8) { + self.0 = (self.0 & 0b10_000000) | common_idx(input, 0b111111); + } + + #[inline] + fn common_input(&self) -> Option<u8> { + common_input(self.0 & 0b00_111111) + } + + #[inline] + fn input_len(&self) -> usize { + if self.common_input().is_none() { + 1 + } else { + 0 + } + } + + #[inline] + fn sizes(&self, data: &[u8]) -> PackSizes { + let i = data.len() - 1 - self.input_len() - 1; + PackSizes::decode(data[i]) + } + + #[inline] + fn end_addr(&self, data: &[u8], sizes: PackSizes) -> usize { + data.len() - 1 + - self.input_len() + - 1 // pack size + - sizes.transition_pack_size() + - sizes.output_pack_size() + } + + #[inline] + fn input(&self, node: &Node<'_>) -> u8 { + if let Some(inp) = self.common_input() { + inp + } else { + node.data[node.start - 1] + } + } + + #[inline] + fn output(&self, node: &Node<'_>) -> Output { + let osize = node.sizes.output_pack_size(); + if osize == 0 { + return Output::zero(); + } + let tsize = node.sizes.transition_pack_size(); + let i = node.start + - self.input_len() + - 1 // pack size + - tsize - osize; + Output::new(bytes::unpack_uint(&node.data[i..], osize as u8)) + } + + #[inline] + fn trans_addr(&self, node: &Node<'_>) -> CompiledAddr { + let tsize = node.sizes.transition_pack_size(); + let i = node.start + - self.input_len() + - 1 // pack size + - tsize; + unpack_delta(&node.data[i..], tsize, node.end) + } +} + +impl StateAnyTrans { + fn 
compile<W: io::Write>( + mut wtr: W, + addr: CompiledAddr, + node: &BuilderNode, + ) -> io::Result<()> { + assert!(node.trans.len() <= 256); + + let mut tsize = 0; + let mut osize = bytes::pack_size(node.final_output.value()); + let mut any_outs = !node.final_output.is_zero(); + for t in &node.trans { + tsize = cmp::max(tsize, pack_delta_size(addr, t.addr)); + osize = cmp::max(osize, bytes::pack_size(t.out.value())); + any_outs = any_outs || !t.out.is_zero(); + } + + let mut pack_sizes = PackSizes::new(); + if any_outs { + pack_sizes.set_output_pack_size(osize); + } else { + pack_sizes.set_output_pack_size(0); + } + pack_sizes.set_transition_pack_size(tsize); + + let mut state = StateAnyTrans::new(); + state.set_final_state(node.is_final); + state.set_state_ntrans(node.trans.len() as u8); + + if any_outs { + if node.is_final { + bytes::pack_uint_in( + &mut wtr, + node.final_output.value(), + osize, + )?; + } + for t in node.trans.iter().rev() { + bytes::pack_uint_in(&mut wtr, t.out.value(), osize)?; + } + } + for t in node.trans.iter().rev() { + pack_delta_in(&mut wtr, addr, t.addr, tsize)?; + } + for t in node.trans.iter().rev() { + wtr.write_all(&[t.inp])?; + } + if node.trans.len() > TRANS_INDEX_THRESHOLD { + // A value of 255 indicates that no transition exists for the byte + // at that index. (Except when there are 256 transitions.) Namely, + // any value greater than or equal to the number of transitions in + // this node indicates an absent transition. + let mut index = [255u8; 256]; + for (i, t) in node.trans.iter().enumerate() { + index[t.inp as usize] = i as u8; + } + wtr.write_all(&index)?; + } + + wtr.write_all(&[pack_sizes.encode()])?; + if state.state_ntrans().is_none() { + if node.trans.len() == 256 { + // 256 can't be represented in a u8, so we abuse the fact that + // the # of transitions can never be 1 here, since 1 is always + // encoded in the state byte. 
+ wtr.write_all(&[1])?; + } else { + wtr.write_all(&[node.trans.len() as u8])?; + } + } + wtr.write_all(&[state.0])?; + Ok(()) + } + + #[inline] + fn new() -> StateAnyTrans { + StateAnyTrans(0b00_000000) + } + + #[inline] + fn set_final_state(&mut self, yes: bool) { + if yes { + self.0 |= 0b01_000000; + } + } + + #[inline] + fn is_final_state(&self) -> bool { + self.0 & 0b01_000000 == 0b01_000000 + } + + #[inline] + fn set_state_ntrans(&mut self, n: u8) { + if n <= 0b00_111111 { + self.0 = (self.0 & 0b11_000000) | n; + } + } + + #[inline] + fn state_ntrans(&self) -> Option<u8> { + let n = self.0 & 0b00_111111; + if n == 0 { + None + } else { + Some(n) + } + } + + #[inline] + fn sizes(&self, data: &[u8]) -> PackSizes { + let i = data.len() - 1 - self.ntrans_len() - 1; + PackSizes::decode(data[i]) + } + + #[inline] + fn total_trans_size( + &self, + version: u64, + sizes: PackSizes, + ntrans: usize, + ) -> usize { + let index_size = self.trans_index_size(version, ntrans); + ntrans + (ntrans * sizes.transition_pack_size()) + index_size + } + + #[inline] + fn trans_index_size(&self, version: u64, ntrans: usize) -> usize { + if version >= 2 && ntrans > TRANS_INDEX_THRESHOLD { + 256 + } else { + 0 + } + } + + #[inline] + fn ntrans_len(&self) -> usize { + if self.state_ntrans().is_none() { + 1 + } else { + 0 + } + } + + #[inline] + fn ntrans(&self, data: &[u8]) -> usize { + if let Some(n) = self.state_ntrans() { + n as usize + } else { + let n = data[data.len() - 2] as usize; + if n == 1 { + // "1" is never a normal legal value here, because if there + // is only 1 transition, then it is encoded in the state byte. 
+ 256 + } else { + n + } + } + } + + #[inline] + fn final_output( + &self, + version: u64, + data: &[u8], + sizes: PackSizes, + ntrans: usize, + ) -> Output { + let osize = sizes.output_pack_size(); + if osize == 0 || !self.is_final_state() { + return Output::zero(); + } + let at = data.len() - 1 + - self.ntrans_len() + - 1 // pack size + - self.total_trans_size(version, sizes, ntrans) + - (ntrans * osize) // output values + - osize; // the desired output value + Output::new(bytes::unpack_uint(&data[at..], osize as u8)) + } + + #[inline] + fn end_addr( + &self, + version: u64, + data: &[u8], + sizes: PackSizes, + ntrans: usize, + ) -> usize { + let osize = sizes.output_pack_size(); + let final_osize = if !self.is_final_state() { 0 } else { osize }; + data.len() - 1 + - self.ntrans_len() + - 1 // pack size + - self.total_trans_size(version, sizes, ntrans) + - (ntrans * osize) // output values + - final_osize // final output + } + + #[inline] + fn trans_addr(&self, node: &Node<'_>, i: usize) -> CompiledAddr { + assert!(i < node.ntrans); + let tsize = node.sizes.transition_pack_size(); + let at = node.start + - self.ntrans_len() + - 1 // pack size + - self.trans_index_size(node.version, node.ntrans) + - node.ntrans // inputs + - (i * tsize) // the previous transition addresses + - tsize; // the desired transition address + unpack_delta(&node.data[at..], tsize, node.end) + } + + #[inline] + fn input(&self, node: &Node<'_>, i: usize) -> u8 { + let at = node.start + - self.ntrans_len() + - 1 // pack size + - self.trans_index_size(node.version, node.ntrans) + - i + - 1; // the input byte + node.data[at] + } + + #[inline] + fn find_input(&self, node: &Node<'_>, b: u8) -> Option<usize> { + if node.version >= 2 && node.ntrans > TRANS_INDEX_THRESHOLD { + let start = node.start + - self.ntrans_len() + - 1 // pack size + - self.trans_index_size(node.version, node.ntrans); + let i = node.data[start + b as usize] as usize; + if i >= node.ntrans { + None + } else { + Some(i) + } + 
} else { + let start = node.start + - self.ntrans_len() + - 1 // pack size + - node.ntrans; // inputs + let end = start + node.ntrans; + let inputs = &node.data[start..end]; + inputs.iter().position(|&b2| b == b2).map(|i| node.ntrans - i - 1) + } + } + + #[inline] + fn output(&self, node: &Node<'_>, i: usize) -> Output { + let osize = node.sizes.output_pack_size(); + if osize == 0 { + return Output::zero(); + } + let at = node.start + - self.ntrans_len() + - 1 // pack size + - self.total_trans_size(node.version, node.sizes, node.ntrans) + - (i * osize) // the previous outputs + - osize; // the desired output value + Output::new(bytes::unpack_uint(&node.data[at..], osize as u8)) + } +} + +// high 4 bits is transition address packed size. +// low 4 bits is output value packed size. +// +// `0` is a legal value which means there are no transitions/outputs. +#[derive(Clone, Copy, Debug)] +struct PackSizes(u8); + +impl PackSizes { + #[inline] + fn new() -> PackSizes { + PackSizes(0) + } + + #[inline] + fn decode(v: u8) -> PackSizes { + PackSizes(v) + } + + #[inline] + fn encode(&self) -> u8 { + self.0 + } + + #[inline] + fn set_transition_pack_size(&mut self, size: u8) { + assert!(size <= 8); + self.0 = (self.0 & 0b0000_1111) | (size << 4); + } + + #[inline] + fn transition_pack_size(&self) -> usize { + ((self.0 & 0b1111_0000) >> 4) as usize + } + + #[inline] + fn set_output_pack_size(&mut self, size: u8) { + assert!(size <= 8); + self.0 = (self.0 & 0b1111_0000) | size; + } + + #[inline] + fn output_pack_size(&self) -> usize { + (self.0 & 0b0000_1111) as usize + } +} + +/// An iterator over all transitions in a node. +/// +/// `'f` is the lifetime of the underlying fst and `'n` is the lifetime of +/// the underlying `Node`. 
+pub struct Transitions<'f, 'n> { + node: &'n Node<'f>, + range: Range<usize>, +} + +impl<'f, 'n> Iterator for Transitions<'f, 'n> { + type Item = Transition; + + #[inline] + fn next(&mut self) -> Option<Transition> { + self.range.next().map(|i| self.node.transition(i)) + } +} + +/// common_idx translate a byte to an index in the COMMON_INPUTS_INV array. +/// +/// I wonder if it would be prudent to store this mapping in the FST itself. +/// The advantage of doing so would mean that common inputs would reflect the +/// specific data in the FST. The problem of course is that this table has to +/// be computed up front, which is pretty much at odds with the streaming +/// nature of the builder. +/// +/// Nevertheless, the *caller* may have a priori knowledge that could be +/// supplied to the builder manually, which could then be embedded in the FST. +#[inline] +fn common_idx(input: u8, max: u8) -> u8 { + let val = ((COMMON_INPUTS[input as usize] as u32 + 1) % 256) as u8; + if val > max { + 0 + } else { + val + } +} + +/// common_input translates a common input index stored in a serialized FST +/// to the corresponding byte. 
+#[inline] +fn common_input(idx: u8) -> Option<u8> { + if idx == 0 { + None + } else { + Some(COMMON_INPUTS_INV[(idx - 1) as usize]) + } +} + +#[inline] +fn pack_delta<W: io::Write>( + wtr: W, + node_addr: CompiledAddr, + trans_addr: CompiledAddr, +) -> io::Result<u8> { + let nbytes = pack_delta_size(node_addr, trans_addr); + pack_delta_in(wtr, node_addr, trans_addr, nbytes)?; + Ok(nbytes) +} + +#[inline] +fn pack_delta_in<W: io::Write>( + wtr: W, + node_addr: CompiledAddr, + trans_addr: CompiledAddr, + nbytes: u8, +) -> io::Result<()> { + let delta_addr = if trans_addr == EMPTY_ADDRESS { + EMPTY_ADDRESS + } else { + node_addr - trans_addr + }; + bytes::pack_uint_in(wtr, delta_addr as u64, nbytes) +} + +#[inline] +fn pack_delta_size(node_addr: CompiledAddr, trans_addr: CompiledAddr) -> u8 { + let delta_addr = if trans_addr == EMPTY_ADDRESS { + EMPTY_ADDRESS + } else { + node_addr - trans_addr + }; + bytes::pack_size(delta_addr as u64) +} + +#[inline] +fn unpack_delta( + slice: &[u8], + trans_pack_size: usize, + node_addr: usize, +) -> CompiledAddr { + let delta = bytes::unpack_uint(slice, trans_pack_size as u8); + let delta_addr = u64_to_usize(delta); + if delta_addr == EMPTY_ADDRESS { + EMPTY_ADDRESS + } else { + node_addr - delta_addr + } +} + +#[cfg(test)] +mod tests { + use quickcheck::{quickcheck, TestResult}; + + use crate::raw::build::BuilderNode; + use crate::raw::node::Node; + use crate::raw::{Builder, CompiledAddr, Output, Transition, VERSION}; + use crate::stream::Streamer; + + const NEVER_LAST: CompiledAddr = std::u64::MAX as CompiledAddr; + + #[test] + fn prop_emits_inputs() { + fn p(mut bs: Vec<Vec<u8>>) -> TestResult { + bs.sort(); + bs.dedup(); + + let mut bfst = Builder::memory(); + for word in &bs { + bfst.add(word).unwrap(); + } + let fst = bfst.into_fst(); + let mut rdr = fst.stream(); + let mut words = vec![]; + while let Some(w) = rdr.next() { + words.push(w.0.to_owned()); + } + TestResult::from_bool(bs == words) + } + quickcheck(p as 
fn(Vec<Vec<u8>>) -> TestResult) + } + + fn nodes_equal(compiled: &Node, uncompiled: &BuilderNode) -> bool { + println!("{:?}", compiled); + assert_eq!(compiled.is_final(), uncompiled.is_final); + assert_eq!(compiled.len(), uncompiled.trans.len()); + assert_eq!(compiled.final_output(), uncompiled.final_output); + for (ct, ut) in + compiled.transitions().zip(uncompiled.trans.iter().cloned()) + { + assert_eq!(ct.inp, ut.inp); + assert_eq!(ct.out, ut.out); + assert_eq!(ct.addr, ut.addr); + } + true + } + + fn compile(node: &BuilderNode) -> (CompiledAddr, Vec<u8>) { + let mut buf = vec![0; 24]; + node.compile_to(&mut buf, NEVER_LAST, 24).unwrap(); + (buf.len() as CompiledAddr - 1, buf) + } + + fn roundtrip(bnode: &BuilderNode) -> bool { + let (addr, bytes) = compile(bnode); + let node = Node::new(VERSION, addr, &bytes); + nodes_equal(&node, &bnode) + } + + fn trans(addr: CompiledAddr, inp: u8) -> Transition { + Transition { inp, out: Output::zero(), addr } + } + + #[test] + fn bin_no_trans() { + let bnode = BuilderNode { + is_final: false, + final_output: Output::zero(), + trans: vec![], + }; + let (addr, buf) = compile(&bnode); + let node = Node::new(VERSION, addr, &buf); + assert_eq!(node.as_slice().len(), 3); + roundtrip(&bnode); + } + + #[test] + fn bin_one_trans_common() { + let bnode = BuilderNode { + is_final: false, + final_output: Output::zero(), + trans: vec![trans(20, b'a')], + }; + let (addr, buf) = compile(&bnode); + let node = Node::new(VERSION, addr, &buf); + assert_eq!(node.as_slice().len(), 3); + roundtrip(&bnode); + } + + #[test] + fn bin_one_trans_not_common() { + let bnode = BuilderNode { + is_final: false, + final_output: Output::zero(), + trans: vec![trans(2, b'\xff')], + }; + let (addr, buf) = compile(&bnode); + let node = Node::new(VERSION, addr, &buf); + assert_eq!(node.as_slice().len(), 4); + roundtrip(&bnode); + } + + #[test] + fn bin_many_trans() { + let bnode = BuilderNode { + is_final: false, + final_output: Output::zero(), + trans: vec![ + 
trans(2, b'a'), + trans(3, b'b'), + trans(4, b'c'), + trans(5, b'd'), + trans(6, b'e'), + trans(7, b'f'), + ], + }; + let (addr, buf) = compile(&bnode); + let node = Node::new(VERSION, addr, &buf); + assert_eq!(node.as_slice().len(), 14); + roundtrip(&bnode); + } + + #[test] + fn node_max_trans() { + let bnode = BuilderNode { + is_final: false, + final_output: Output::zero(), + trans: (0..256).map(|i| trans(0, i as u8)).collect(), + }; + let (addr, buf) = compile(&bnode); + let node = Node::new(VERSION, addr, &buf); + assert_eq!(node.transitions().count(), 256); + assert_eq!(node.len(), node.transitions().count()); + roundtrip(&bnode); + } +} diff --git a/vendor/fst/src/raw/ops.rs b/vendor/fst/src/raw/ops.rs new file mode 100644 index 000000000..9baf85540 --- /dev/null +++ b/vendor/fst/src/raw/ops.rs @@ -0,0 +1,643 @@ +use std::cmp; +use std::collections::BinaryHeap; +use std::iter::FromIterator; + +use crate::raw::Output; +use crate::stream::{IntoStreamer, Streamer}; + +/// Permits stream operations to be hetergeneous with respect to streams. +type BoxedStream<'f> = + Box<dyn for<'a> Streamer<'a, Item = (&'a [u8], Output)> + 'f>; + +/// A value indexed by a stream. +/// +/// Indexed values are used to indicate the presence of a key in multiple +/// streams during a set operation. Namely, the index corresponds to the stream +/// (by the order in which it was added to the operation, starting at `0`) +/// and the value corresponds to the value associated with a particular key +/// in that stream. +#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct IndexedValue { + /// The index of the stream that produced this value (starting at `0`). + pub index: usize, + /// The value. + pub value: u64, +} + +/// A builder for collecting fst streams on which to perform set operations +/// on the keys of fsts. +/// +/// Set operations include intersection, union, difference and symmetric +/// difference. 
The result of each set operation is itself a stream that emits +/// pairs of keys and a sequence of each occurrence of that key in the +/// participating streams. This information allows one to perform set +/// operations on fsts and customize how conflicting output values are handled. +/// +/// All set operations work efficiently on an arbitrary number of +/// streams with memory proportional to the number of streams. +/// +/// The algorithmic complexity of all set operations is `O(n1 + n2 + n3 + ...)` +/// where `n1, n2, n3, ...` correspond to the number of elements in each +/// stream. +/// +/// The `'f` lifetime parameter refers to the lifetime of the underlying set. +pub struct OpBuilder<'f> { + streams: Vec<BoxedStream<'f>>, +} + +impl<'f> OpBuilder<'f> { + /// Create a new set operation builder. + #[inline] + pub fn new() -> OpBuilder<'f> { + OpBuilder { streams: vec![] } + } + + /// Add a stream to this set operation. + /// + /// This is useful for a chaining style pattern, e.g., + /// `builder.add(stream1).add(stream2).union()`. + /// + /// The stream must emit a lexicographically ordered sequence of key-value + /// pairs. + pub fn add<I, S>(mut self, stream: I) -> OpBuilder<'f> + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, + { + self.push(stream); + self + } + + /// Add a stream to this set operation. + /// + /// The stream must emit a lexicographically ordered sequence of key-value + /// pairs. + pub fn push<I, S>(&mut self, stream: I) + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, + { + self.streams.push(Box::new(stream.into_stream())); + } + + /// Performs a union operation on all streams that have been added. + /// + /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The + /// first element of the tuple is the byte string key. 
The second element + /// of the tuple is a list of all occurrences of that key in participating + /// streams. The `IndexedValue` contains an index and the value associated + /// with that key in that stream. The index uniquely identifies each + /// stream, which is an integer that is auto-incremented when a stream + /// is added to this operation (starting at `0`). + #[inline] + pub fn union(self) -> Union<'f> { + Union { + heap: StreamHeap::new(self.streams), + outs: vec![], + cur_slot: None, + } + } + + /// Performs an intersection operation on all streams that have been added. + /// + /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The + /// first element of the tuple is the byte string key. The second element + /// of the tuple is a list of all occurrences of that key in participating + /// streams. The `IndexedValue` contains an index and the value associated + /// with that key in that stream. The index uniquely identifies each + /// stream, which is an integer that is auto-incremented when a stream + /// is added to this operation (starting at `0`). + #[inline] + pub fn intersection(self) -> Intersection<'f> { + Intersection { + heap: StreamHeap::new(self.streams), + outs: vec![], + cur_slot: None, + } + } + + /// Performs a difference operation with respect to the first stream added. + /// That is, this returns a stream of all elements in the first stream + /// that don't exist in any other stream that has been added. + /// + /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The + /// first element of the tuple is the byte string key. The second element + /// of the tuple is a list of all occurrences of that key in participating + /// streams. The `IndexedValue` contains an index and the value associated + /// with that key in that stream. The index uniquely identifies each + /// stream, which is an integer that is auto-incremented when a stream + /// is added to this operation (starting at `0`). 
+ /// + /// The interface is the same for all the operations, but due to the nature + /// of `difference`, each yielded key contains exactly one `IndexValue` with + /// `index` set to 0. + #[inline] + pub fn difference(mut self) -> Difference<'f> { + let first = self.streams.swap_remove(0); + Difference { + set: first, + key: vec![], + heap: StreamHeap::new(self.streams), + outs: vec![], + } + } + + /// Performs a symmetric difference operation on all of the streams that + /// have been added. + /// + /// When there are only two streams, then the keys returned correspond to + /// keys that are in either stream but *not* in both streams. + /// + /// More generally, for any number of streams, keys that occur in an odd + /// number of streams are returned. + /// + /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The + /// first element of the tuple is the byte string key. The second element + /// of the tuple is a list of all occurrences of that key in participating + /// streams. The `IndexedValue` contains an index and the value associated + /// with that key in that stream. The index uniquely identifies each + /// stream, which is an integer that is auto-incremented when a stream + /// is added to this operation (starting at `0`). 
+ #[inline] + pub fn symmetric_difference(self) -> SymmetricDifference<'f> { + SymmetricDifference { + heap: StreamHeap::new(self.streams), + outs: vec![], + cur_slot: None, + } + } +} + +impl<'f, I, S> Extend<I> for OpBuilder<'f> +where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, +{ + fn extend<T>(&mut self, it: T) + where + T: IntoIterator<Item = I>, + { + for stream in it { + self.push(stream); + } + } +} + +impl<'f, I, S> FromIterator<I> for OpBuilder<'f> +where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, +{ + fn from_iter<T>(it: T) -> OpBuilder<'f> + where + T: IntoIterator<Item = I>, + { + let mut op = OpBuilder::new(); + op.extend(it); + op + } +} + +/// A stream of set union over multiple fst streams in lexicographic order. +/// +/// The `'f` lifetime parameter refers to the lifetime of the underlying map. +pub struct Union<'f> { + heap: StreamHeap<'f>, + outs: Vec<IndexedValue>, + cur_slot: Option<Slot>, +} + +impl<'a, 'f> Streamer<'a> for Union<'f> { + type Item = (&'a [u8], &'a [IndexedValue]); + + fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { + if let Some(slot) = self.cur_slot.take() { + self.heap.refill(slot); + } + let slot = match self.heap.pop() { + None => return None, + Some(slot) => { + self.cur_slot = Some(slot); + self.cur_slot.as_ref().unwrap() + } + }; + self.outs.clear(); + self.outs.push(slot.indexed_value()); + while let Some(slot2) = self.heap.pop_if_equal(slot.input()) { + self.outs.push(slot2.indexed_value()); + self.heap.refill(slot2); + } + Some((slot.input(), &self.outs)) + } +} + +/// A stream of set intersection over multiple fst streams in lexicographic +/// order. +/// +/// The `'f` lifetime parameter refers to the lifetime of the underlying fst. 
+pub struct Intersection<'f> {
+    heap: StreamHeap<'f>,
+    outs: Vec<IndexedValue>,
+    cur_slot: Option<Slot>,
+}
+
+impl<'a, 'f> Streamer<'a> for Intersection<'f> {
+    type Item = (&'a [u8], &'a [IndexedValue]);
+
+    fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> {
+        // The slot that produced the previously yielded key was held back
+        // (the caller borrowed its key buffer); return it to the heap now.
+        if let Some(slot) = self.cur_slot.take() {
+            self.heap.refill(slot);
+        }
+        loop {
+            let slot = match self.heap.pop() {
+                None => return None,
+                Some(slot) => slot,
+            };
+            self.outs.clear();
+            self.outs.push(slot.indexed_value());
+            // Drain every other stream currently positioned at the same key,
+            // counting how many streams produced it.
+            let mut popped: usize = 1;
+            while let Some(slot2) = self.heap.pop_if_equal(slot.input()) {
+                self.outs.push(slot2.indexed_value());
+                self.heap.refill(slot2);
+                popped += 1;
+            }
+            if popped < self.heap.num_slots() {
+                // The key was missing from at least one stream, so it is not
+                // in the intersection; advance and keep scanning.
+                self.heap.refill(slot);
+            } else {
+                self.cur_slot = Some(slot);
+                let key = self.cur_slot.as_ref().unwrap().input();
+                return Some((key, &self.outs));
+            }
+        }
+    }
+}
+
+/// A stream of set difference over multiple fst streams in lexicographic
+/// order.
+///
+/// The difference operation is taken with respect to the first stream and the
+/// rest of the streams. i.e., All elements in the first stream that do not
+/// appear in any other streams.
+///
+/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
+pub struct Difference<'f> { + set: BoxedStream<'f>, + key: Vec<u8>, + heap: StreamHeap<'f>, + outs: Vec<IndexedValue>, +} + +impl<'a, 'f> Streamer<'a> for Difference<'f> { + type Item = (&'a [u8], &'a [IndexedValue]); + + fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { + loop { + match self.set.next() { + None => return None, + Some((key, out)) => { + self.key.clear(); + self.key.extend(key); + self.outs.clear(); + self.outs + .push(IndexedValue { index: 0, value: out.value() }); + } + }; + let mut unique = true; + while let Some(slot) = self.heap.pop_if_le(&self.key) { + if slot.input() == &*self.key { + unique = false; + } + self.heap.refill(slot); + } + if unique { + return Some((&self.key, &self.outs)); + } + } + } +} + +/// A stream of set symmetric difference over multiple fst streams in +/// lexicographic order. +/// +/// The `'f` lifetime parameter refers to the lifetime of the underlying fst. +pub struct SymmetricDifference<'f> { + heap: StreamHeap<'f>, + outs: Vec<IndexedValue>, + cur_slot: Option<Slot>, +} + +impl<'a, 'f> Streamer<'a> for SymmetricDifference<'f> { + type Item = (&'a [u8], &'a [IndexedValue]); + + fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { + if let Some(slot) = self.cur_slot.take() { + self.heap.refill(slot); + } + loop { + let slot = match self.heap.pop() { + None => return None, + Some(slot) => slot, + }; + self.outs.clear(); + self.outs.push(slot.indexed_value()); + let mut popped: usize = 1; + while let Some(slot2) = self.heap.pop_if_equal(slot.input()) { + self.outs.push(slot2.indexed_value()); + self.heap.refill(slot2); + popped += 1; + } + // This key is in the symmetric difference if and only if it + // appears in an odd number of sets. 
+            if popped % 2 == 0 {
+                self.heap.refill(slot);
+            } else {
+                self.cur_slot = Some(slot);
+                let key = self.cur_slot.as_ref().unwrap().input();
+                return Some((key, &self.outs));
+            }
+        }
+    }
+}
+
+// A collection of streams kept in a binary heap, ordered by each stream's
+// current key so the smallest key is always popped first. (Slot's ordering
+// is reversed below, turning std's max-heap into a min-heap.)
+struct StreamHeap<'f> {
+    rdrs: Vec<BoxedStream<'f>>,
+    heap: BinaryHeap<Slot>,
+}
+
+impl<'f> StreamHeap<'f> {
+    fn new(streams: Vec<BoxedStream<'f>>) -> StreamHeap<'f> {
+        let mut u = StreamHeap { rdrs: streams, heap: BinaryHeap::new() };
+        // Prime the heap with the first item of every stream.
+        for i in 0..u.rdrs.len() {
+            u.refill(Slot::new(i));
+        }
+        u
+    }
+
+    // Remove and return the slot holding the smallest current key, if any.
+    fn pop(&mut self) -> Option<Slot> {
+        self.heap.pop()
+    }
+
+    // True if the next slot to be popped holds exactly `key`.
+    fn peek_is_duplicate(&self, key: &[u8]) -> bool {
+        self.heap.peek().map(|s| s.input() == key).unwrap_or(false)
+    }
+
+    // Pop the next slot only when its key equals `key`.
+    fn pop_if_equal(&mut self, key: &[u8]) -> Option<Slot> {
+        if self.peek_is_duplicate(key) {
+            self.pop()
+        } else {
+            None
+        }
+    }
+
+    // Pop the next slot only when its key is <= `key` (used by Difference).
+    fn pop_if_le(&mut self, key: &[u8]) -> Option<Slot> {
+        if self.heap.peek().map(|s| s.input() <= key).unwrap_or(false) {
+            self.pop()
+        } else {
+            None
+        }
+    }
+
+    // Total number of streams added, including already exhausted ones.
+    fn num_slots(&self) -> usize {
+        self.rdrs.len()
+    }
+
+    // Advance the slot's stream by one item and push the slot back onto the
+    // heap. A slot whose stream is exhausted is simply dropped, removing
+    // that stream from further consideration.
+    fn refill(&mut self, mut slot: Slot) {
+        if let Some((input, output)) = self.rdrs[slot.idx].next() {
+            slot.set_input(input);
+            slot.set_output(output);
+            self.heap.push(slot);
+        }
+    }
+}
+
+// A cursor over one stream: the stream's index plus the key/output it is
+// currently positioned at. The key buffer is reused across refills.
+#[derive(Debug, Eq, PartialEq)]
+struct Slot {
+    idx: usize,
+    input: Vec<u8>,
+    output: Output,
+}
+
+impl Slot {
+    fn new(rdr_idx: usize) -> Slot {
+        Slot {
+            idx: rdr_idx,
+            input: Vec::with_capacity(64),
+            output: Output::zero(),
+        }
+    }
+
+    fn indexed_value(&self) -> IndexedValue {
+        IndexedValue { index: self.idx, value: self.output.value() }
+    }
+
+    fn input(&self) -> &[u8] {
+        &self.input
+    }
+
+    fn set_input(&mut self, input: &[u8]) {
+        self.input.clear();
+        self.input.extend(input);
+    }
+
+    fn set_output(&mut self, output: Output) {
+        self.output = output;
+    }
+}
+
+impl PartialOrd for Slot {
+    // Compare by (key, output) and reverse, so BinaryHeap pops the smallest.
+    fn partial_cmp(&self, other: &Slot) -> Option<cmp::Ordering> {
+        (&self.input, self.output)
+            .partial_cmp(&(&other.input, other.output))
+            .map(|ord|
ord.reverse()) + } +} + +impl Ord for Slot { + fn cmp(&self, other: &Slot) -> cmp::Ordering { + self.partial_cmp(other).unwrap() + } +} + +#[cfg(test)] +mod tests { + use crate::raw::tests::{fst_map, fst_set}; + use crate::raw::Fst; + use crate::stream::{IntoStreamer, Streamer}; + + use super::OpBuilder; + + fn s(string: &str) -> String { + string.to_owned() + } + + macro_rules! create_set_op { + ($name:ident, $op:ident) => { + fn $name(sets: Vec<Vec<&str>>) -> Vec<String> { + let fsts: Vec<Fst<_>> = + sets.into_iter().map(fst_set).collect(); + let op: OpBuilder = fsts.iter().collect(); + let mut stream = op.$op().into_stream(); + let mut keys = vec![]; + while let Some((key, _)) = stream.next() { + keys.push(String::from_utf8(key.to_vec()).unwrap()); + } + keys + } + }; + } + + macro_rules! create_map_op { + ($name:ident, $op:ident) => { + fn $name(sets: Vec<Vec<(&str, u64)>>) -> Vec<(String, u64)> { + let fsts: Vec<Fst<_>> = + sets.into_iter().map(fst_map).collect(); + let op: OpBuilder = fsts.iter().collect(); + let mut stream = op.$op().into_stream(); + let mut keys = vec![]; + while let Some((key, outs)) = stream.next() { + let merged = outs.iter().fold(0, |a, b| a + b.value); + let s = String::from_utf8(key.to_vec()).unwrap(); + keys.push((s, merged)); + } + keys + } + }; + } + + create_set_op!(fst_union, union); + create_set_op!(fst_intersection, intersection); + create_set_op!(fst_symmetric_difference, symmetric_difference); + create_set_op!(fst_difference, difference); + create_map_op!(fst_union_map, union); + create_map_op!(fst_intersection_map, intersection); + create_map_op!(fst_symmetric_difference_map, symmetric_difference); + create_map_op!(fst_difference_map, difference); + + #[test] + fn union_set() { + let v = fst_union(vec![vec!["a", "b", "c"], vec!["x", "y", "z"]]); + assert_eq!(v, vec!["a", "b", "c", "x", "y", "z"]); + } + + #[test] + fn union_set_dupes() { + let v = fst_union(vec![vec!["aa", "b", "cc"], vec!["b", "cc", "z"]]); + assert_eq!(v, 
vec!["aa", "b", "cc", "z"]); + } + + #[test] + fn union_map() { + let v = fst_union_map(vec![ + vec![("a", 1), ("b", 2), ("c", 3)], + vec![("x", 1), ("y", 2), ("z", 3)], + ]); + assert_eq!( + v, + vec![ + (s("a"), 1), + (s("b"), 2), + (s("c"), 3), + (s("x"), 1), + (s("y"), 2), + (s("z"), 3), + ] + ); + } + + #[test] + fn union_map_dupes() { + let v = fst_union_map(vec![ + vec![("aa", 1), ("b", 2), ("cc", 3)], + vec![("b", 1), ("cc", 2), ("z", 3)], + vec![("b", 1)], + ]); + assert_eq!( + v, + vec![(s("aa"), 1), (s("b"), 4), (s("cc"), 5), (s("z"), 3),] + ); + } + + #[test] + fn intersect_set() { + let v = + fst_intersection(vec![vec!["a", "b", "c"], vec!["x", "y", "z"]]); + assert_eq!(v, Vec::<String>::new()); + } + + #[test] + fn intersect_set_dupes() { + let v = fst_intersection(vec![ + vec!["aa", "b", "cc"], + vec!["b", "cc", "z"], + ]); + assert_eq!(v, vec!["b", "cc"]); + } + + #[test] + fn intersect_map() { + let v = fst_intersection_map(vec![ + vec![("a", 1), ("b", 2), ("c", 3)], + vec![("x", 1), ("y", 2), ("z", 3)], + ]); + assert_eq!(v, Vec::<(String, u64)>::new()); + } + + #[test] + fn intersect_map_dupes() { + let v = fst_intersection_map(vec![ + vec![("aa", 1), ("b", 2), ("cc", 3)], + vec![("b", 1), ("cc", 2), ("z", 3)], + vec![("b", 1)], + ]); + assert_eq!(v, vec![(s("b"), 4)]); + } + + #[test] + fn symmetric_difference() { + let v = fst_symmetric_difference(vec![ + vec!["a", "b", "c"], + vec!["a", "b"], + vec!["a"], + ]); + assert_eq!(v, vec!["a", "c"]); + } + + #[test] + fn symmetric_difference_map() { + let v = fst_symmetric_difference_map(vec![ + vec![("a", 1), ("b", 2), ("c", 3)], + vec![("a", 1), ("b", 2)], + vec![("a", 1)], + ]); + assert_eq!(v, vec![(s("a"), 3), (s("c"), 3)]); + } + + #[test] + fn difference() { + let v = fst_difference(vec![ + vec!["a", "b", "c"], + vec!["a", "b"], + vec!["a"], + ]); + assert_eq!(v, vec!["c"]); + } + + #[test] + fn difference2() { + // Regression test: https://github.com/BurntSushi/fst/issues/19 + let v = 
fst_difference(vec![vec!["a", "c"], vec!["b", "c"]]); + assert_eq!(v, vec!["a"]); + let v = fst_difference(vec![vec!["bar", "foo"], vec!["baz", "foo"]]); + assert_eq!(v, vec!["bar"]); + } + + #[test] + fn difference_map() { + let v = fst_difference_map(vec![ + vec![("a", 1), ("b", 2), ("c", 3)], + vec![("a", 1), ("b", 2)], + vec![("a", 1)], + ]); + assert_eq!(v, vec![(s("c"), 3)]); + } +} diff --git a/vendor/fst/src/raw/registry.rs b/vendor/fst/src/raw/registry.rs new file mode 100644 index 000000000..3b00a5b0c --- /dev/null +++ b/vendor/fst/src/raw/registry.rs @@ -0,0 +1,264 @@ +use crate::raw::build::BuilderNode; +use crate::raw::{CompiledAddr, NONE_ADDRESS}; + +#[derive(Debug)] +pub struct Registry { + table: Vec<RegistryCell>, + table_size: usize, // number of rows + mru_size: usize, // number of columns +} + +#[derive(Debug)] +struct RegistryCache<'a> { + cells: &'a mut [RegistryCell], +} + +#[derive(Clone, Debug)] +pub struct RegistryCell { + addr: CompiledAddr, + node: BuilderNode, +} + +#[derive(Debug)] +pub enum RegistryEntry<'a> { + Found(CompiledAddr), + NotFound(&'a mut RegistryCell), + Rejected, +} + +impl Registry { + pub fn new(table_size: usize, mru_size: usize) -> Registry { + let empty_cell = RegistryCell::none(); + let ncells = table_size.checked_mul(mru_size).unwrap(); + Registry { table: vec![empty_cell; ncells], table_size, mru_size } + } + + pub fn entry<'a>(&'a mut self, node: &BuilderNode) -> RegistryEntry<'a> { + if self.table.is_empty() { + return RegistryEntry::Rejected; + } + let bucket = self.hash(node); + let start = self.mru_size * bucket; + let end = start + self.mru_size; + RegistryCache { cells: &mut self.table[start..end] }.entry(node) + } + + fn hash(&self, node: &BuilderNode) -> usize { + // Basic FNV-1a hash as described: + // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function + // + // In unscientific experiments, this provides the same compression + // as `std::hash::SipHasher` but is much much 
faster.
+        const FNV_PRIME: u64 = 1099511628211;
+        let mut h = 14695981039346656037;
+        // Fold in everything that distinguishes one node from another:
+        // finality, final output, and each transition's (input, output, addr).
+        h = (h ^ (node.is_final as u64)).wrapping_mul(FNV_PRIME);
+        h = (h ^ node.final_output.value()).wrapping_mul(FNV_PRIME);
+        for t in &node.trans {
+            h = (h ^ (t.inp as u64)).wrapping_mul(FNV_PRIME);
+            h = (h ^ t.out.value()).wrapping_mul(FNV_PRIME);
+            h = (h ^ (t.addr as u64)).wrapping_mul(FNV_PRIME);
+        }
+        (h as usize) % self.table_size
+    }
+}
+
+impl<'a> RegistryCache<'a> {
+    // Look up `node` in this bucket's MRU row. On a hit the matching cell
+    // is moved to the front; on a miss the least recently used cell is
+    // overwritten with `node` and handed back for the caller to fill in.
+    fn entry(mut self, node: &BuilderNode) -> RegistryEntry<'a> {
+        if self.cells.len() == 1 {
+            // Single-cell row: a direct compare, no LRU bookkeeping.
+            let cell = &mut self.cells[0];
+            if !cell.is_none() && &cell.node == node {
+                RegistryEntry::Found(cell.addr)
+            } else {
+                cell.node.clone_from(node);
+                RegistryEntry::NotFound(cell)
+            }
+        } else if self.cells.len() == 2 {
+            // Two-cell row: hand-unrolled variant of the general LRU case.
+            let cell1 = &mut self.cells[0];
+            if !cell1.is_none() && &cell1.node == node {
+                return RegistryEntry::Found(cell1.addr);
+            }
+
+            let cell2 = &mut self.cells[1];
+            if !cell2.is_none() && &cell2.node == node {
+                let addr = cell2.addr;
+                self.cells.swap(0, 1);
+                return RegistryEntry::Found(addr);
+            }
+
+            self.cells[1].node.clone_from(node);
+            self.cells.swap(0, 1);
+            RegistryEntry::NotFound(&mut self.cells[0])
+        } else {
+            let find = |c: &RegistryCell| !c.is_none() && &c.node == node;
+            if let Some(i) = self.cells.iter().position(find) {
+                let addr = self.cells[i].addr;
+                self.promote(i); // most recently used
+                RegistryEntry::Found(addr)
+            } else {
+                let last = self.cells.len() - 1;
+                self.cells[last].node.clone_from(node); // discard LRU
+                self.promote(last);
+                RegistryEntry::NotFound(&mut self.cells[0])
+            }
+        }
+    }
+
+    // Move cell `i` to the front of the row, shifting cells 0..i right by
+    // one position via adjacent swaps.
+    fn promote(&mut self, mut i: usize) {
+        assert!(i < self.cells.len());
+        while i > 0 {
+            self.cells.swap(i - 1, i);
+            i -= 1;
+        }
+    }
+}
+
+impl RegistryCell {
+    // An empty cell, marked by the NONE_ADDRESS sentinel address.
+    fn none() -> RegistryCell {
+        RegistryCell { addr: NONE_ADDRESS, node: BuilderNode::default() }
+    }
+
+    fn is_none(&self) -> bool {
+        self.addr == NONE_ADDRESS
+    }
+
+    // Record the compiled address for the node stored in this cell.
+    pub fn insert(&mut self, addr: CompiledAddr) {
self.addr = addr; + } +} + +#[cfg(test)] +mod tests { + use super::{Registry, RegistryCache, RegistryCell, RegistryEntry}; + use crate::raw::build::BuilderNode; + use crate::raw::{Output, Transition}; + + fn assert_rejected(entry: RegistryEntry) { + match entry { + RegistryEntry::Rejected => {} + entry => panic!("expected rejected entry, got: {:?}", entry), + } + } + + fn assert_not_found(entry: RegistryEntry) { + match entry { + RegistryEntry::NotFound(_) => {} + entry => panic!("expected nout found entry, got: {:?}", entry), + } + } + + fn assert_insert_and_found(reg: &mut Registry, bnode: &BuilderNode) { + match reg.entry(&bnode) { + RegistryEntry::NotFound(cell) => cell.insert(1234), + entry => panic!("unexpected not found entry, got: {:?}", entry), + } + match reg.entry(&bnode) { + RegistryEntry::Found(addr) => assert_eq!(addr, 1234), + entry => panic!("unexpected found entry, got: {:?}", entry), + } + } + + #[test] + fn empty_is_ok() { + let mut reg = Registry::new(0, 0); + let bnode = BuilderNode { + is_final: false, + final_output: Output::zero(), + trans: vec![], + }; + assert_rejected(reg.entry(&bnode)); + } + + #[test] + fn one_final_is_ok() { + let mut reg = Registry::new(1, 1); + let bnode = BuilderNode { + is_final: true, + final_output: Output::zero(), + trans: vec![], + }; + assert_insert_and_found(&mut reg, &bnode); + } + + #[test] + fn one_with_trans_is_ok() { + let mut reg = Registry::new(1, 1); + let bnode = BuilderNode { + is_final: false, + final_output: Output::zero(), + trans: vec![Transition { + addr: 0, + inp: b'a', + out: Output::zero(), + }], + }; + assert_insert_and_found(&mut reg, &bnode); + assert_not_found( + reg.entry(&BuilderNode { is_final: true, ..bnode.clone() }), + ); + assert_not_found(reg.entry(&BuilderNode { + trans: vec![Transition { + addr: 0, + inp: b'b', + out: Output::zero(), + }], + ..bnode.clone() + })); + assert_not_found(reg.entry(&BuilderNode { + trans: vec![Transition { + addr: 0, + inp: b'a', + out: 
Output::new(1), + }], + ..bnode.clone() + })); + } + + #[test] + fn cache_works() { + let mut reg = Registry::new(1, 1); + + let bnode1 = BuilderNode { is_final: true, ..BuilderNode::default() }; + assert_insert_and_found(&mut reg, &bnode1); + + let bnode2 = + BuilderNode { final_output: Output::new(1), ..bnode1.clone() }; + assert_insert_and_found(&mut reg, &bnode2); + assert_not_found(reg.entry(&bnode1)); + } + + #[test] + fn promote() { + let bn = BuilderNode::default(); + let mut bnodes = vec![ + RegistryCell { addr: 1, node: bn.clone() }, + RegistryCell { addr: 2, node: bn.clone() }, + RegistryCell { addr: 3, node: bn.clone() }, + RegistryCell { addr: 4, node: bn.clone() }, + ]; + let mut cache = RegistryCache { cells: &mut bnodes }; + + cache.promote(0); + assert_eq!(cache.cells[0].addr, 1); + assert_eq!(cache.cells[1].addr, 2); + assert_eq!(cache.cells[2].addr, 3); + assert_eq!(cache.cells[3].addr, 4); + + cache.promote(1); + assert_eq!(cache.cells[0].addr, 2); + assert_eq!(cache.cells[1].addr, 1); + assert_eq!(cache.cells[2].addr, 3); + assert_eq!(cache.cells[3].addr, 4); + + cache.promote(3); + assert_eq!(cache.cells[0].addr, 4); + assert_eq!(cache.cells[1].addr, 2); + assert_eq!(cache.cells[2].addr, 1); + assert_eq!(cache.cells[3].addr, 3); + + cache.promote(2); + assert_eq!(cache.cells[0].addr, 1); + assert_eq!(cache.cells[1].addr, 4); + assert_eq!(cache.cells[2].addr, 2); + assert_eq!(cache.cells[3].addr, 3); + } +} diff --git a/vendor/fst/src/raw/registry_minimal.rs b/vendor/fst/src/raw/registry_minimal.rs new file mode 100644 index 000000000..663b0123a --- /dev/null +++ b/vendor/fst/src/raw/registry_minimal.rs @@ -0,0 +1,53 @@ +// This module is a drop-in but inefficient replacement of the LRU registry. +// In particular, this registry will never forget a node. In other words, if +// this registry is used during construction, then you're guaranteed a minimal +// FST. +// +// This is really only meant to be used for debugging and experiments. 
It is +// a memory/CPU hog. +// +// One "easy" improvement here is to use an FNV hash instead of the super +// expensive SipHasher. + +#![allow(dead_code)] + +use std::collections::hash_map::{Entry, HashMap}; + +use crate::raw::build::BuilderNode; +use crate::raw::CompiledAddr; + +#[derive(Debug)] +pub struct Registry { + table: HashMap<BuilderNode, RegistryCell>, +} + +#[derive(Debug)] +pub enum RegistryEntry<'a> { + Found(CompiledAddr), + NotFound(&'a mut RegistryCell), + Rejected, +} + +#[derive(Clone, Copy, Debug)] +pub struct RegistryCell(CompiledAddr); + +impl Registry { + pub fn new(table_size: usize, _lru_size: usize) -> Registry { + Registry { table: HashMap::with_capacity(table_size) } + } + + pub fn entry<'a>(&'a mut self, bnode: &BuilderNode) -> RegistryEntry<'a> { + match self.table.entry(bnode.clone()) { + Entry::Occupied(v) => RegistryEntry::Found(v.get().0), + Entry::Vacant(v) => { + RegistryEntry::NotFound(v.insert(RegistryCell(0))) + } + } + } +} + +impl RegistryCell { + pub fn insert(&mut self, addr: CompiledAddr) { + self.0 = addr; + } +} diff --git a/vendor/fst/src/raw/tests.rs b/vendor/fst/src/raw/tests.rs new file mode 100644 index 000000000..c9e77a5f7 --- /dev/null +++ b/vendor/fst/src/raw/tests.rs @@ -0,0 +1,589 @@ +use crate::automaton::AlwaysMatch; +use crate::error::Error; +use crate::raw::{self, Bound, Builder, Fst, Output, Stream, VERSION}; +use crate::stream::Streamer; + +const TEXT: &'static str = include_str!("./../../data/words-100000"); + +pub fn fst_set<I, S>(ss: I) -> Fst<Vec<u8>> +where + I: IntoIterator<Item = S>, + S: AsRef<[u8]>, +{ + let mut bfst = Builder::memory(); + let mut ss: Vec<Vec<u8>> = + ss.into_iter().map(|s| s.as_ref().to_vec()).collect(); + ss.sort(); + ss.dedup(); + for s in ss.iter().into_iter() { + bfst.add(s).unwrap(); + } + let fst = bfst.into_fst(); + assert_eq!(fst.len(), ss.len()); + fst +} + +pub fn fst_map<I, S>(ss: I) -> Fst<Vec<u8>> +where + I: IntoIterator<Item = (S, u64)>, + S: AsRef<[u8]>, +{ + 
let mut bfst = Builder::memory(); + let mut ss: Vec<(Vec<u8>, u64)> = + ss.into_iter().map(|(s, o)| (s.as_ref().to_vec(), o)).collect(); + ss.sort(); + ss.dedup(); + for (s, o) in ss.into_iter() { + bfst.insert(s, o).unwrap(); + } + bfst.into_fst() +} + +pub fn fst_inputs<D: AsRef<[u8]>>(fst: &Fst<D>) -> Vec<Vec<u8>> { + let mut words = vec![]; + let mut rdr = fst.stream(); + while let Some((word, _)) = rdr.next() { + words.push(word.to_vec()); + } + words +} + +pub fn fst_inputs_outputs<D: AsRef<[u8]>>( + fst: &Fst<D>, +) -> Vec<(Vec<u8>, u64)> { + let mut words = vec![]; + let mut rdr = fst.stream(); + while let Some((word, out)) = rdr.next() { + words.push((word.to_vec(), out.value())); + } + words +} + +macro_rules! test_set { + ($name:ident, $($s:expr),+) => { + #[test] + fn $name() { + let mut items = vec![$($s),*]; + let fst = fst_set(&items); + let mut rdr = fst.stream(); + items.sort(); + items.dedup(); + for item in &items { + assert_eq!(rdr.next().unwrap().0, item.as_bytes()); + } + assert_eq!(rdr.next(), None); + for item in &items { + assert!(fst.get(item).is_some()); + } + } + } +} + +macro_rules! 
test_set_fail { + ($name:ident, $($s:expr),+) => { + #[test] + #[should_panic] + fn $name() { + let mut bfst = Builder::memory(); + $(bfst.add($s).unwrap();)* + } + } +} + +test_set!(fst_set_only_empty, ""); +test_set!(fst_set_one, "a"); +test_set!(fst_set_dupe_empty, "", ""); +test_set!(fst_set_dupe1, "a", "a"); +test_set!(fst_set_dupe2, "a", "b", "b"); +test_set!(fst_set_two1, "a", "b"); +test_set!(fst_set_two2, "a", "ab"); +test_set!(fst_set_jan, "jam", "jbm", "jcm", "jdm", "jem", "jfm", "jgm"); + +test_set_fail!(fst_set_order1, "b", "a"); +test_set_fail!(fst_set_order2, "a", "b", "c", "a"); + +#[test] +fn fst_set_100000() { + let words: Vec<Vec<u8>> = + TEXT.lines().map(|s| s.as_bytes().to_vec()).collect(); + let fst = fst_set(words.clone()); + assert_eq!(words, fst_inputs(&fst)); + for word in &words { + assert!( + fst.get(word).is_some(), + "failed to find word: {}", + std::str::from_utf8(word).unwrap() + ); + } +} + +macro_rules! test_map { + ($name:ident, $($s:expr, $o:expr),+) => { + #[test] + fn $name() { + let fst = fst_map(vec![$(($s, $o)),*]); + let mut rdr = fst.stream(); + $({ + let (s, o) = rdr.next().unwrap(); + assert_eq!((s, o.value()), ($s.as_bytes(), $o)); + })* + assert_eq!(rdr.next(), None); + $({ + assert_eq!(fst.get($s.as_bytes()), Some(Output::new($o))); + })* + } + } +} + +macro_rules! 
test_map_fail { + ($name:ident, $($s:expr, $o:expr),+) => { + #[test] + #[should_panic] + fn $name() { + let mut bfst = Builder::memory(); + $(bfst.insert($s, $o).unwrap();)* + } + } +} + +test_map!(fst_map_only_empty1, "", 0); +test_map!(fst_map_only_empty2, "", 100); +test_map!(fst_map_only_empty3, "", 9999999999); +test_map!(fst_map_one1, "a", 0); +test_map!(fst_map_one2, "a", 100); +test_map!(fst_map_one3, "a", 999999999); +test_map!(fst_map_two, "a", 1, "b", 2); +test_map!(fst_map_many1, "a", 34786, "ab", 26); +test_map!( + fst_map_many2, + "a", + 34786, + "ab", + 26, + "abc", + 58976, + "abcd", + 25, + "z", + 58, + "zabc", + 6798 +); +test_map!(fst_map_many3, "a", 1, "ab", 0, "abc", 0); + +test_map_fail!(fst_map_dupe_empty, "", 0, "", 0); +test_map_fail!(fst_map_dupe1, "a", 0, "a", 0); +test_map_fail!(fst_map_dupe2, "a", 0, "b", 0, "b", 0); +test_map_fail!(fst_map_order1, "b", 0, "a", 0); +test_map_fail!(fst_map_order2, "a", 0, "b", 0, "c", 0, "a", 0); + +#[test] +fn fst_map_100000_increments() { + let words: Vec<(Vec<u8>, u64)> = TEXT + .lines() + .enumerate() + .map(|(i, s)| (s.as_bytes().to_vec(), i as u64)) + .collect(); + let fst = fst_map(words.clone()); + assert_eq!(words, fst_inputs_outputs(&fst)); + for &(ref word, out) in &words { + assert_eq!(fst.get(word), Some(Output::new(out))); + } +} + +#[test] +fn fst_map_100000_lengths() { + let words: Vec<(Vec<u8>, u64)> = TEXT + .lines() + .map(|s| (s.as_bytes().to_vec(), s.len() as u64)) + .collect(); + let fst = fst_map(words.clone()); + assert_eq!(words, fst_inputs_outputs(&fst)); + for &(ref word, out) in &words { + assert_eq!(fst.get(word), Some(Output::new(out))); + } +} + +#[test] +fn invalid_version() { + match Fst::new(vec![0; 36]) { + Err(Error::Fst(raw::Error::Version { got, .. 
})) => assert_eq!(got, 0), + Err(err) => panic!("expected version error, got {:?}", err), + Ok(_) => panic!("expected version error, got FST"), + } +} + +#[test] +fn invalid_version_crate_too_old() { + let mut buf = vec![0; 36]; + crate::bytes::write_u64_le(VERSION + 1, &mut buf); + match Fst::new(buf) { + Err(Error::Fst(raw::Error::Version { got, .. })) => { + assert_eq!(got, VERSION + 1); + } + Err(err) => panic!("expected version error, got {:?}", err), + Ok(_) => panic!("expected version error, got FST"), + } +} + +#[test] +fn invalid_format() { + match Fst::new(vec![0; 0]) { + Err(Error::Fst(raw::Error::Format { .. })) => {} + Err(err) => panic!("expected format error, got {:?}", err), + Ok(_) => panic!("expected format error, got FST"), + } +} + +#[test] +fn fst_set_zero() { + let fst = fst_set::<_, String>(vec![]); + let mut rdr = fst.stream(); + assert_eq!(rdr.next(), None); +} + +macro_rules! test_range { + ( + $name:ident, + min: $min:expr, + max: $max:expr, + imin: $imin:expr, + imax: $imax:expr, + $($s:expr),* + ) => { + #[test] + fn $name() { + let items: Vec<&'static str> = vec![$($s),*]; + let items: Vec<_> = items + .into_iter() + .enumerate() + .map(|(i, k)| (k, i as u64)) + .collect(); + let fst = fst_map(items.clone()); + let mut rdr = Stream::new(fst.as_ref(), AlwaysMatch, $min, $max); + for i in $imin..$imax { + assert_eq!( + rdr.next().unwrap(), + (items[i].0.as_bytes(), Output::new(items[i].1)), + ); + } + assert_eq!(rdr.next(), None); + } + } +} + +test_range! { + fst_range_empty_1, + min: Bound::Unbounded, max: Bound::Unbounded, + imin: 0, imax: 0, +} + +test_range! { + fst_range_empty_2, + min: Bound::Unbounded, max: Bound::Unbounded, + imin: 0, imax: 1, + "" +} + +test_range! { + fst_range_empty_3, + min: Bound::Included(vec![]), max: Bound::Unbounded, + imin: 0, imax: 1, + "" +} + +test_range! { + fst_range_empty_4, + min: Bound::Excluded(vec![]), max: Bound::Unbounded, + imin: 0, imax: 0, + "" +} + +test_range! 
{ + fst_range_empty_5, + min: Bound::Included(vec![]), max: Bound::Unbounded, + imin: 0, imax: 2, + "", "a" +} + +test_range! { + fst_range_empty_6, + min: Bound::Excluded(vec![]), max: Bound::Unbounded, + imin: 1, imax: 2, + "", "a" +} + +test_range! { + fst_range_empty_7, + min: Bound::Unbounded, max: Bound::Unbounded, + imin: 0, imax: 2, + "", "a" +} + +test_range! { + fst_range_empty_8, + min: Bound::Unbounded, max: Bound::Included(vec![]), + imin: 0, imax: 1, + "" +} + +test_range! { + fst_range_empty_9, + min: Bound::Unbounded, max: Bound::Excluded(vec![]), + imin: 0, imax: 0, + "" +} + +test_range! { + fst_range_empty_10, + min: Bound::Unbounded, max: Bound::Included(vec![]), + imin: 0, imax: 1, + "", "a" +} + +test_range! { + fst_range_empty_11, + min: Bound::Included(vec![]), max: Bound::Included(vec![]), + imin: 0, imax: 1, + "" +} + +test_range! { + fst_range_1, + min: Bound::Included(vec![b'a']), max: Bound::Included(vec![b'z']), + imin: 0, imax: 4, + "a", "b", "y", "z" +} + +test_range! { + fst_range_2, + min: Bound::Excluded(vec![b'a']), max: Bound::Included(vec![b'y']), + imin: 1, imax: 3, + "a", "b", "y", "z" +} + +test_range! { + fst_range_3, + min: Bound::Excluded(vec![b'a']), max: Bound::Excluded(vec![b'y']), + imin: 1, imax: 2, + "a", "b", "y", "z" +} + +test_range! { + fst_range_4, + min: Bound::Unbounded, max: Bound::Unbounded, + imin: 0, imax: 4, + "a", "b", "y", "z" +} + +test_range! { + fst_range_5, + min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded, + imin: 0, imax: 0, + "a", "ab", "abc", "abcd", "abcde" +} + +test_range! { + fst_range_6, + min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded, + imin: 5, imax: 6, + "a", "ab", "abc", "abcd", "abcde", "abe" +} + +test_range! { + fst_range_7, + min: Bound::Excluded(b"abd".to_vec()), max: Bound::Unbounded, + imin: 5, imax: 6, + "a", "ab", "abc", "abcd", "abcde", "abe" +} + +test_range! 
{ + fst_range_8, + min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded, + imin: 5, imax: 6, + "a", "ab", "abc", "abcd", "abcde", "xyz" +} + +test_range! { + fst_range_9, + min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()), + imin: 0, imax: 5, + "a", "ab", "abc", "abcd", "abcde", "abe" +} + +test_range! { + fst_range_10, + min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()), + imin: 0, imax: 6, + "a", "ab", "abc", "abcd", "abcde", "abd" +} + +test_range! { + fst_range_11, + min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()), + imin: 0, imax: 6, + "a", "ab", "abc", "abcd", "abcde", "abd", "abdx" +} + +test_range! { + fst_range_12, + min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()), + imin: 0, imax: 5, + "a", "ab", "abc", "abcd", "abcde", "abe" +} + +test_range! { + fst_range_13, + min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()), + imin: 0, imax: 5, + "a", "ab", "abc", "abcd", "abcde", "abd" +} + +test_range! { + fst_range_14, + min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()), + imin: 0, imax: 5, + "a", "ab", "abc", "abcd", "abcde", "abd", "abdx" +} + +test_range! { + fst_range_15, + min: Bound::Included(vec![b'd']), max: Bound::Included(vec![b'c']), + imin: 0, imax: 0, + "a", "b", "c", "d", "e", "f" +} + +test_range! { + fst_range_16, + min: Bound::Included(vec![b'c']), max: Bound::Included(vec![b'c']), + imin: 2, imax: 3, + "a", "b", "c", "d", "e", "f" +} + +test_range! { + fst_range_17, + min: Bound::Excluded(vec![b'c']), max: Bound::Excluded(vec![b'c']), + imin: 0, imax: 0, + "a", "b", "c", "d", "e", "f" +} + +test_range! { + fst_range_18, + min: Bound::Included(vec![b'c']), max: Bound::Excluded(vec![b'c']), + imin: 0, imax: 0, + "a", "b", "c", "d", "e", "f" +} + +test_range! 
{ + fst_range_19, + min: Bound::Included(vec![b'c']), max: Bound::Excluded(vec![b'd']), + imin: 2, imax: 3, + "a", "b", "c", "d", "e", "f" +} + +#[test] +fn one_vec_multiple_fsts() { + let mut bfst1 = Builder::memory(); + bfst1.add(b"bar").unwrap(); + bfst1.add(b"baz").unwrap(); + let bytes = bfst1.into_inner().unwrap(); + let fst1_len = bytes.len(); + + let mut bfst2 = Builder::new(bytes).unwrap(); + bfst2.add(b"bar").unwrap(); + bfst2.add(b"foo").unwrap(); + + let bytes = bfst2.into_inner().unwrap(); + let slice1 = &bytes[0..fst1_len]; + let slice2 = &bytes[fst1_len..bytes.len()]; + + let fst1 = Fst::new(slice1).unwrap(); + let fst2 = Fst::new(slice2).unwrap(); + + assert_eq!(fst_inputs(&fst1), vec![b"bar".to_vec(), b"baz".to_vec()]); + assert_eq!(fst_inputs(&fst2), vec![b"bar".to_vec(), b"foo".to_vec()]); +} + +#[test] +fn bytes_written() { + let mut bfst1 = Builder::memory(); + bfst1.add(b"bar").unwrap(); + bfst1.add(b"baz").unwrap(); + let counted_len = bfst1.bytes_written(); + let bytes = bfst1.into_inner().unwrap(); + let fst1_len = bytes.len() as u64; + let footer_size = 28; + assert_eq!(counted_len + footer_size, fst1_len); +} + +#[test] +fn get_key_simple() { + let map = fst_map(vec![("abc", 2), ("xyz", 3)]); + assert_eq!(map.get_key(0), None); + assert_eq!(map.get_key(1), None); + assert_eq!(map.get_key(2), Some(b"abc".to_vec())); + assert_eq!(map.get_key(3), Some(b"xyz".to_vec())); + assert_eq!(map.get_key(4), None); +} + +#[test] +fn get_key_words() { + let words: Vec<(Vec<u8>, u64)> = TEXT + .lines() + .enumerate() + .map(|(i, line)| (line.as_bytes().to_vec(), i as u64)) + .collect(); + let map = fst_map(words.clone()); + for (key, value) in words { + assert_eq!(map.get_key(value), Some(key)); + } +} + +#[test] +fn get_key_words_discontiguous() { + let words: Vec<(Vec<u8>, u64)> = TEXT + .lines() + .enumerate() + .map(|(i, line)| (line.as_bytes().to_vec(), i as u64 * 2)) + .collect(); + let map = fst_map(words.clone()); + for (key, value) in words { + 
assert_eq!(map.get_key(value), Some(key)); + } +} + +#[test] +fn verify_ok_nonempty() { + let words: Vec<(Vec<u8>, u64)> = TEXT + .lines() + .enumerate() + .map(|(i, line)| (line.as_bytes().to_vec(), i as u64 * 2)) + .collect(); + let map = fst_map(words.clone()); + assert!(map.verify().is_ok()); +} + +#[test] +fn verify_ok_empty() { + let map = fst_map(Vec::<(&str, u64)>::new()); + assert!(map.verify().is_ok()); +} + +#[test] +fn verify_err() { + let mut b = Builder::memory(); + b.add(b"bar").unwrap(); + b.add(b"baz").unwrap(); + let mut bytes = b.into_inner().unwrap(); + + { + let fst = Fst::new(&bytes).unwrap(); + assert!(fst.verify().is_ok()); + } + + bytes[17] = b'\xFF'; + { + let fst = Fst::new(&bytes).unwrap(); + assert!(fst.verify().is_err()); + } +} |