summaryrefslogtreecommitdiffstats
path: root/vendor/fst/src/raw
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-19 09:26:03 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-19 09:26:03 +0000
commit9918693037dce8aa4bb6f08741b6812923486c18 (patch)
tree21d2b40bec7e6a7ea664acee056eb3d08e15a1cf /vendor/fst/src/raw
parentReleasing progress-linux version 1.75.0+dfsg1-5~progress7.99u1. (diff)
downloadrustc-9918693037dce8aa4bb6f08741b6812923486c18.tar.xz
rustc-9918693037dce8aa4bb6f08741b6812923486c18.zip
Merging upstream version 1.76.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/fst/src/raw')
-rw-r--r--vendor/fst/src/raw/build.rs474
-rw-r--r--vendor/fst/src/raw/common_inputs.rs289
-rw-r--r--vendor/fst/src/raw/counting_writer.rs70
-rw-r--r--vendor/fst/src/raw/crc32.rs59
-rw-r--r--vendor/fst/src/raw/crc32_table.rs2
-rw-r--r--vendor/fst/src/raw/error.rs171
-rw-r--r--vendor/fst/src/raw/mod.rs1382
-rw-r--r--vendor/fst/src/raw/node.rs1012
-rw-r--r--vendor/fst/src/raw/ops.rs643
-rw-r--r--vendor/fst/src/raw/registry.rs264
-rw-r--r--vendor/fst/src/raw/registry_minimal.rs53
-rw-r--r--vendor/fst/src/raw/tests.rs589
12 files changed, 5008 insertions, 0 deletions
diff --git a/vendor/fst/src/raw/build.rs b/vendor/fst/src/raw/build.rs
new file mode 100644
index 000000000..e93626b27
--- /dev/null
+++ b/vendor/fst/src/raw/build.rs
@@ -0,0 +1,474 @@
+use std::io;
+
+use crate::bytes;
+use crate::error::Result;
+use crate::raw::counting_writer::CountingWriter;
+use crate::raw::error::Error;
+use crate::raw::registry::{Registry, RegistryEntry};
+use crate::raw::{
+ CompiledAddr, Fst, FstType, Output, Transition, EMPTY_ADDRESS,
+ NONE_ADDRESS, VERSION,
+};
+// use raw::registry_minimal::{Registry, RegistryEntry};
+use crate::stream::{IntoStreamer, Streamer};
+
+/// A builder for creating a finite state transducer.
+///
+/// This is not your average everyday builder. It has two important qualities
+/// that make it a bit unique from what you might expect:
+///
+/// 1. All keys must be added in lexicographic order. Adding a key out of order
+/// will result in an error. Additionally, adding a duplicate key with an
+/// output value will also result in an error. That is, once a key is
+/// associated with a value, that association can never be modified or
+/// deleted.
+/// 2. The representation of an fst is streamed to *any* `io::Write` as it is
+/// built. For an in memory representation, this can be a `Vec<u8>`.
+///
+/// Point (2) is especially important because it means that an fst can be
+/// constructed *without storing the entire fst in memory*. Namely, since it
+/// works with any `io::Write`, it can be streamed directly to a file.
+///
+/// With that said, the builder does use memory, but **memory usage is bounded
+/// to a constant size**. The amount of memory used trades off with the
+/// compression ratio. Currently, the implementation hard codes this trade off
+/// which can result in about 5-20MB of heap usage during construction. (N.B.
+/// Guaranteeing a maximal compression ratio requires memory proportional to
+/// the size of the fst, which defeats some of the benefit of streaming
+/// it to disk. In practice, a small bounded amount of memory achieves
+/// close-to-minimal compression ratios.)
+///
+/// The algorithmic complexity of fst construction is `O(n)` where `n` is the
+/// number of elements added to the fst.
+pub struct Builder<W> {
+ /// The FST raw data is written directly to `wtr`.
+ ///
+ /// No internal buffering is done.
+ wtr: CountingWriter<W>,
+ /// The stack of unfinished nodes.
+ ///
+ /// An unfinished node is a node that could potentially have a new
+ /// transition added to it when a new word is added to the dictionary.
+ unfinished: UnfinishedNodes,
+ /// A map of finished nodes.
+ ///
+ /// A finished node is one that has been compiled and written to `wtr`.
+ /// After this point, the node is considered immutable and will never
+ /// change.
+ registry: Registry,
+ /// The last word added.
+ ///
+ /// This is used to enforce the invariant that words are added in sorted
+ /// order.
+ last: Option<Vec<u8>>,
+ /// The address of the last compiled node.
+ ///
+ /// This is used to optimize states with one transition that point
+ /// to the previously compiled node. (The previously compiled node in
+ /// this case actually corresponds to the next state for the transition,
+ /// since states are compiled in reverse.)
+ last_addr: CompiledAddr,
+ /// The number of keys added.
+ len: usize,
+}
+
+#[derive(Debug)]
+struct UnfinishedNodes {
+ stack: Vec<BuilderNodeUnfinished>,
+}
+
+#[derive(Debug)]
+struct BuilderNodeUnfinished {
+ node: BuilderNode,
+ last: Option<LastTransition>,
+}
+
+#[derive(Debug, Hash, Eq, PartialEq)]
+pub struct BuilderNode {
+ pub is_final: bool,
+ pub final_output: Output,
+ pub trans: Vec<Transition>,
+}
+
+#[derive(Debug)]
+struct LastTransition {
+ inp: u8,
+ out: Output,
+}
+
+impl Builder<Vec<u8>> {
+ /// Create a builder that builds an fst in memory.
+ #[inline]
+ pub fn memory() -> Builder<Vec<u8>> {
+ Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap()
+ }
+
+ /// Finishes construction of the FST and returns it.
+ #[inline]
+ pub fn into_fst(self) -> Fst<Vec<u8>> {
+ self.into_inner().and_then(Fst::new).unwrap()
+ }
+}
+
+impl<W: io::Write> Builder<W> {
+ /// Create a builder that builds an fst by writing it to `wtr` in a
+ /// streaming fashion.
+ pub fn new(wtr: W) -> Result<Builder<W>> {
+ Builder::new_type(wtr, 0)
+ }
+
+ /// The same as `new`, except it sets the type of the fst to the type
+ /// given.
+ pub fn new_type(wtr: W, ty: FstType) -> Result<Builder<W>> {
+ let mut wtr = CountingWriter::new(wtr);
+ // Don't allow any nodes to have address 0-7. We use these to encode
+ // the API version. We also use addresses `0` and `1` as special
+ // sentinel values, so they should never correspond to a real node.
+ bytes::io_write_u64_le(VERSION, &mut wtr)?;
+ // Similarly for 8-15 for the fst type.
+ bytes::io_write_u64_le(ty, &mut wtr)?;
+ Ok(Builder {
+ wtr,
+ unfinished: UnfinishedNodes::new(),
+ registry: Registry::new(10_000, 2),
+ last: None,
+ last_addr: NONE_ADDRESS,
+ len: 0,
+ })
+ }
+
+ /// Adds a byte string to this FST with a zero output value.
+ pub fn add<B>(&mut self, bs: B) -> Result<()>
+ where
+ B: AsRef<[u8]>,
+ {
+ self.check_last_key(bs.as_ref(), false)?;
+ self.insert_output(bs, None)
+ }
+
+ /// Insert a new key-value pair into the fst.
+ ///
+ /// Keys must be convertible to byte strings. Values must be a `u64`, which
+ /// is a restriction of the current implementation of finite state
+ /// transducers. (Values may one day be expanded to other types.)
+ ///
+ /// If a key is inserted that is less than or equal to any previous key
+ /// added, then an error is returned. Similarly, if there was a problem
+ /// writing to the underlying writer, an error is returned.
+ pub fn insert<B>(&mut self, bs: B, val: u64) -> Result<()>
+ where
+ B: AsRef<[u8]>,
+ {
+ self.check_last_key(bs.as_ref(), true)?;
+ self.insert_output(bs, Some(Output::new(val)))
+ }
+
+ /// Calls insert on each item in the iterator.
+ ///
+ /// If an error occurred while adding an element, processing is stopped
+ /// and the error is returned.
+ ///
+ /// If a key is inserted that is less than or equal to any previous key
+ /// added, then an error is returned. Similarly, if there was a problem
+ /// writing to the underlying writer, an error is returned.
+ pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()>
+ where
+ T: AsRef<[u8]>,
+ I: IntoIterator<Item = (T, Output)>,
+ {
+ for (key, out) in iter {
+ self.insert(key, out.value())?;
+ }
+ Ok(())
+ }
+
+ /// Calls insert on each item in the stream.
+ ///
+ /// Note that unlike `extend_iter`, this is not generic on the items in
+ /// the stream.
+ ///
+ /// If a key is inserted that is less than or equal to any previous key
+ /// added, then an error is returned. Similarly, if there was a problem
+ /// writing to the underlying writer, an error is returned.
+ pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
+ where
+ I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
+ S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
+ {
+ let mut stream = stream.into_stream();
+ while let Some((key, out)) = stream.next() {
+ self.insert(key, out.value())?;
+ }
+ Ok(())
+ }
+
+ /// Finishes the construction of the fst and flushes the underlying
+ /// writer. After completion, the data written to `W` may be read using
+ /// one of `Fst`'s constructor methods.
+ pub fn finish(self) -> Result<()> {
+ self.into_inner()?;
+ Ok(())
+ }
+
+ /// Just like `finish`, except it returns the underlying writer after
+ /// flushing it.
+ pub fn into_inner(mut self) -> Result<W> {
+ self.compile_from(0)?;
+ let root_node = self.unfinished.pop_root();
+ let root_addr = self.compile(&root_node)?;
+ bytes::io_write_u64_le(self.len as u64, &mut self.wtr)?;
+ bytes::io_write_u64_le(root_addr as u64, &mut self.wtr)?;
+
+ let sum = self.wtr.masked_checksum();
+ let mut wtr = self.wtr.into_inner();
+ bytes::io_write_u32_le(sum, &mut wtr)?;
+ wtr.flush()?;
+ Ok(wtr)
+ }
+
+ fn insert_output<B>(&mut self, bs: B, out: Option<Output>) -> Result<()>
+ where
+ B: AsRef<[u8]>,
+ {
+ let bs = bs.as_ref();
+ if bs.is_empty() {
+ self.len = 1; // must be first key, so length is always 1
+ self.unfinished.set_root_output(out.unwrap_or(Output::zero()));
+ return Ok(());
+ }
+ let (prefix_len, out) = if let Some(out) = out {
+ self.unfinished.find_common_prefix_and_set_output(bs, out)
+ } else {
+ (self.unfinished.find_common_prefix(bs), Output::zero())
+ };
+ if prefix_len == bs.len() {
+ // If the prefix found consumes the entire set of bytes, then
+ // the prefix *equals* the bytes given. This means it is a
+ // duplicate value with no output. So we can give up here.
+ //
+ // If the below assert fails, then that means we let a duplicate
+ // value through even when inserting outputs.
+ assert!(out.is_zero());
+ return Ok(());
+ }
+ self.len += 1;
+ self.compile_from(prefix_len)?;
+ self.unfinished.add_suffix(&bs[prefix_len..], out);
+ Ok(())
+ }
+
+ fn compile_from(&mut self, istate: usize) -> Result<()> {
+ let mut addr = NONE_ADDRESS;
+ while istate + 1 < self.unfinished.len() {
+ let node = if addr == NONE_ADDRESS {
+ self.unfinished.pop_empty()
+ } else {
+ self.unfinished.pop_freeze(addr)
+ };
+ addr = self.compile(&node)?;
+ assert!(addr != NONE_ADDRESS);
+ }
+ self.unfinished.top_last_freeze(addr);
+ Ok(())
+ }
+
+ fn compile(&mut self, node: &BuilderNode) -> Result<CompiledAddr> {
+ if node.is_final
+ && node.trans.is_empty()
+ && node.final_output.is_zero()
+ {
+ return Ok(EMPTY_ADDRESS);
+ }
+ let entry = self.registry.entry(&node);
+ if let RegistryEntry::Found(ref addr) = entry {
+ return Ok(*addr);
+ }
+ let start_addr = self.wtr.count() as CompiledAddr;
+ node.compile_to(&mut self.wtr, self.last_addr, start_addr)?;
+ self.last_addr = self.wtr.count() as CompiledAddr - 1;
+ if let RegistryEntry::NotFound(cell) = entry {
+ cell.insert(self.last_addr);
+ }
+ Ok(self.last_addr)
+ }
+
+ fn check_last_key(&mut self, bs: &[u8], check_dupe: bool) -> Result<()> {
+ if let Some(ref mut last) = self.last {
+ if check_dupe && bs == &**last {
+ return Err(Error::DuplicateKey { got: bs.to_vec() }.into());
+ }
+ if bs < &**last {
+ return Err(Error::OutOfOrder {
+ previous: last.to_vec(),
+ got: bs.to_vec(),
+ }
+ .into());
+ }
+ last.clear();
+ for &b in bs {
+ last.push(b);
+ }
+ } else {
+ self.last = Some(bs.to_vec());
+ }
+ Ok(())
+ }
+
+ /// Gets a reference to the underlying writer.
+ pub fn get_ref(&self) -> &W {
+ self.wtr.get_ref()
+ }
+
+ /// Returns the number of bytes written to the underlying writer
+ pub fn bytes_written(&self) -> u64 {
+ self.wtr.count()
+ }
+}
+
+impl UnfinishedNodes {
+ fn new() -> UnfinishedNodes {
+ let mut unfinished = UnfinishedNodes { stack: Vec::with_capacity(64) };
+ unfinished.push_empty(false);
+ unfinished
+ }
+
+ fn len(&self) -> usize {
+ self.stack.len()
+ }
+
+ fn push_empty(&mut self, is_final: bool) {
+ self.stack.push(BuilderNodeUnfinished {
+ node: BuilderNode { is_final, ..BuilderNode::default() },
+ last: None,
+ });
+ }
+
+ fn pop_root(&mut self) -> BuilderNode {
+ assert!(self.stack.len() == 1);
+ assert!(self.stack[0].last.is_none());
+ self.stack.pop().unwrap().node
+ }
+
+ fn pop_freeze(&mut self, addr: CompiledAddr) -> BuilderNode {
+ let mut unfinished = self.stack.pop().unwrap();
+ unfinished.last_compiled(addr);
+ unfinished.node
+ }
+
+ fn pop_empty(&mut self) -> BuilderNode {
+ let unfinished = self.stack.pop().unwrap();
+ assert!(unfinished.last.is_none());
+ unfinished.node
+ }
+
+ fn set_root_output(&mut self, out: Output) {
+ self.stack[0].node.is_final = true;
+ self.stack[0].node.final_output = out;
+ }
+
+ fn top_last_freeze(&mut self, addr: CompiledAddr) {
+ let last = self.stack.len().checked_sub(1).unwrap();
+ self.stack[last].last_compiled(addr);
+ }
+
+ fn add_suffix(&mut self, bs: &[u8], out: Output) {
+ if bs.is_empty() {
+ return;
+ }
+ let last = self.stack.len().checked_sub(1).unwrap();
+ assert!(self.stack[last].last.is_none());
+ self.stack[last].last = Some(LastTransition { inp: bs[0], out });
+ for &b in &bs[1..] {
+ self.stack.push(BuilderNodeUnfinished {
+ node: BuilderNode::default(),
+ last: Some(LastTransition { inp: b, out: Output::zero() }),
+ });
+ }
+ self.push_empty(true);
+ }
+
+ fn find_common_prefix(&mut self, bs: &[u8]) -> usize {
+ bs.iter()
+ .zip(&self.stack)
+ .take_while(|&(&b, ref node)| {
+ node.last.as_ref().map(|t| t.inp == b).unwrap_or(false)
+ })
+ .count()
+ }
+
+ fn find_common_prefix_and_set_output(
+ &mut self,
+ bs: &[u8],
+ mut out: Output,
+ ) -> (usize, Output) {
+ let mut i = 0;
+ while i < bs.len() {
+ let add_prefix = match self.stack[i].last.as_mut() {
+ Some(ref mut t) if t.inp == bs[i] => {
+ i += 1;
+ let common_pre = t.out.prefix(out);
+ let add_prefix = t.out.sub(common_pre);
+ out = out.sub(common_pre);
+ t.out = common_pre;
+ add_prefix
+ }
+ _ => break,
+ };
+ if !add_prefix.is_zero() {
+ self.stack[i].add_output_prefix(add_prefix);
+ }
+ }
+ (i, out)
+ }
+}
+
+impl BuilderNodeUnfinished {
+ fn last_compiled(&mut self, addr: CompiledAddr) {
+ if let Some(trans) = self.last.take() {
+ self.node.trans.push(Transition {
+ inp: trans.inp,
+ out: trans.out,
+ addr,
+ });
+ }
+ }
+
+ fn add_output_prefix(&mut self, prefix: Output) {
+ if self.node.is_final {
+ self.node.final_output = prefix.cat(self.node.final_output);
+ }
+ for t in &mut self.node.trans {
+ t.out = prefix.cat(t.out);
+ }
+ if let Some(ref mut t) = self.last {
+ t.out = prefix.cat(t.out);
+ }
+ }
+}
+
+impl Clone for BuilderNode {
+ fn clone(&self) -> BuilderNode {
+ BuilderNode {
+ is_final: self.is_final,
+ final_output: self.final_output,
+ trans: self.trans.clone(),
+ }
+ }
+
+ fn clone_from(&mut self, source: &BuilderNode) {
+ self.is_final = source.is_final;
+ self.final_output = source.final_output;
+ self.trans.clear();
+ self.trans.extend(source.trans.iter());
+ }
+}
+
+impl Default for BuilderNode {
+ fn default() -> BuilderNode {
+ BuilderNode {
+ is_final: false,
+ final_output: Output::zero(),
+ trans: vec![],
+ }
+ }
+}
diff --git a/vendor/fst/src/raw/common_inputs.rs b/vendor/fst/src/raw/common_inputs.rs
new file mode 100644
index 000000000..f7d7d23b1
--- /dev/null
+++ b/vendor/fst/src/raw/common_inputs.rs
@@ -0,0 +1,289 @@
+pub const COMMON_INPUTS: [u8; 256] = [
+ 84, // '\x00'
+ 85, // '\x01'
+ 86, // '\x02'
+ 87, // '\x03'
+ 88, // '\x04'
+ 89, // '\x05'
+ 90, // '\x06'
+ 91, // '\x07'
+ 92, // '\x08'
+ 93, // '\t'
+ 94, // '\n'
+ 95, // '\x0b'
+ 96, // '\x0c'
+ 97, // '\r'
+ 98, // '\x0e'
+ 99, // '\x0f'
+ 100, // '\x10'
+ 101, // '\x11'
+ 102, // '\x12'
+ 103, // '\x13'
+ 104, // '\x14'
+ 105, // '\x15'
+ 106, // '\x16'
+ 107, // '\x17'
+ 108, // '\x18'
+ 109, // '\x19'
+ 110, // '\x1a'
+ 111, // '\x1b'
+ 112, // '\x1c'
+ 113, // '\x1d'
+ 114, // '\x1e'
+ 115, // '\x1f'
+ 116, // ' '
+ 80, // '!'
+ 117, // '"'
+ 118, // '#'
+ 79, // '$'
+ 39, // '%'
+ 30, // '&'
+ 81, // "'"
+ 75, // '('
+ 74, // ')'
+ 82, // '*'
+ 57, // '+'
+ 66, // ','
+ 16, // '-'
+ 12, // '.'
+ 2, // '/'
+ 19, // '0'
+ 20, // '1'
+ 21, // '2'
+ 27, // '3'
+ 32, // '4'
+ 29, // '5'
+ 35, // '6'
+ 36, // '7'
+ 37, // '8'
+ 34, // '9'
+ 24, // ':'
+ 73, // ';'
+ 119, // '<'
+ 23, // '='
+ 120, // '>'
+ 40, // '?'
+ 83, // '@'
+ 44, // 'A'
+ 48, // 'B'
+ 42, // 'C'
+ 43, // 'D'
+ 49, // 'E'
+ 46, // 'F'
+ 62, // 'G'
+ 61, // 'H'
+ 47, // 'I'
+ 69, // 'J'
+ 68, // 'K'
+ 58, // 'L'
+ 56, // 'M'
+ 55, // 'N'
+ 59, // 'O'
+ 51, // 'P'
+ 72, // 'Q'
+ 54, // 'R'
+ 45, // 'S'
+ 52, // 'T'
+ 64, // 'U'
+ 65, // 'V'
+ 63, // 'W'
+ 71, // 'X'
+ 67, // 'Y'
+ 70, // 'Z'
+ 77, // '['
+ 121, // '\\'
+ 78, // ']'
+ 122, // '^'
+ 31, // '_'
+ 123, // '`'
+ 4, // 'a'
+ 25, // 'b'
+ 9, // 'c'
+ 17, // 'd'
+ 1, // 'e'
+ 26, // 'f'
+ 22, // 'g'
+ 13, // 'h'
+ 7, // 'i'
+ 50, // 'j'
+ 38, // 'k'
+ 14, // 'l'
+ 15, // 'm'
+ 10, // 'n'
+ 3, // 'o'
+ 8, // 'p'
+ 60, // 'q'
+ 6, // 'r'
+ 5, // 's'
+ 0, // 't'
+ 18, // 'u'
+ 33, // 'v'
+ 11, // 'w'
+ 41, // 'x'
+ 28, // 'y'
+ 53, // 'z'
+ 124, // '{'
+ 125, // '|'
+ 126, // '}'
+ 76, // '~'
+ 127, // '\x7f'
+ 128, // '\x80'
+ 129, // '\x81'
+ 130, // '\x82'
+ 131, // '\x83'
+ 132, // '\x84'
+ 133, // '\x85'
+ 134, // '\x86'
+ 135, // '\x87'
+ 136, // '\x88'
+ 137, // '\x89'
+ 138, // '\x8a'
+ 139, // '\x8b'
+ 140, // '\x8c'
+ 141, // '\x8d'
+ 142, // '\x8e'
+ 143, // '\x8f'
+ 144, // '\x90'
+ 145, // '\x91'
+ 146, // '\x92'
+ 147, // '\x93'
+ 148, // '\x94'
+ 149, // '\x95'
+ 150, // '\x96'
+ 151, // '\x97'
+ 152, // '\x98'
+ 153, // '\x99'
+ 154, // '\x9a'
+ 155, // '\x9b'
+ 156, // '\x9c'
+ 157, // '\x9d'
+ 158, // '\x9e'
+ 159, // '\x9f'
+ 160, // '\xa0'
+ 161, // '¡'
+ 162, // '¢'
+ 163, // '£'
+ 164, // '¤'
+ 165, // '¥'
+ 166, // '¦'
+ 167, // '§'
+ 168, // '¨'
+ 169, // '©'
+ 170, // 'ª'
+ 171, // '«'
+ 172, // '¬'
+ 173, // '\xad'
+ 174, // '®'
+ 175, // '¯'
+ 176, // '°'
+ 177, // '±'
+ 178, // '²'
+ 179, // '³'
+ 180, // '´'
+ 181, // 'µ'
+ 182, // '¶'
+ 183, // '·'
+ 184, // '¸'
+ 185, // '¹'
+ 186, // 'º'
+ 187, // '»'
+ 188, // '¼'
+ 189, // '½'
+ 190, // '¾'
+ 191, // '¿'
+ 192, // 'À'
+ 193, // 'Á'
+ 194, // 'Â'
+ 195, // 'Ã'
+ 196, // 'Ä'
+ 197, // 'Å'
+ 198, // 'Æ'
+ 199, // 'Ç'
+ 200, // 'È'
+ 201, // 'É'
+ 202, // 'Ê'
+ 203, // 'Ë'
+ 204, // 'Ì'
+ 205, // 'Í'
+ 206, // 'Î'
+ 207, // 'Ï'
+ 208, // 'Ð'
+ 209, // 'Ñ'
+ 210, // 'Ò'
+ 211, // 'Ó'
+ 212, // 'Ô'
+ 213, // 'Õ'
+ 214, // 'Ö'
+ 215, // '×'
+ 216, // 'Ø'
+ 217, // 'Ù'
+ 218, // 'Ú'
+ 219, // 'Û'
+ 220, // 'Ü'
+ 221, // 'Ý'
+ 222, // 'Þ'
+ 223, // 'ß'
+ 224, // 'à'
+ 225, // 'á'
+ 226, // 'â'
+ 227, // 'ã'
+ 228, // 'ä'
+ 229, // 'å'
+ 230, // 'æ'
+ 231, // 'ç'
+ 232, // 'è'
+ 233, // 'é'
+ 234, // 'ê'
+ 235, // 'ë'
+ 236, // 'ì'
+ 237, // 'í'
+ 238, // 'î'
+ 239, // 'ï'
+ 240, // 'ð'
+ 241, // 'ñ'
+ 242, // 'ò'
+ 243, // 'ó'
+ 244, // 'ô'
+ 245, // 'õ'
+ 246, // 'ö'
+ 247, // '÷'
+ 248, // 'ø'
+ 249, // 'ù'
+ 250, // 'ú'
+ 251, // 'û'
+ 252, // 'ü'
+ 253, // 'ý'
+ 254, // 'þ'
+ 255, // 'ÿ'
+];
+
+pub const COMMON_INPUTS_INV: [u8; 256] = [
+ b't', b'e', b'/', b'o', b'a', b's', b'r', b'i', b'p', b'c', b'n', b'w',
+ b'.', b'h', b'l', b'm', b'-', b'd', b'u', b'0', b'1', b'2', b'g', b'=',
+ b':', b'b', b'f', b'3', b'y', b'5', b'&', b'_', b'4', b'v', b'9', b'6',
+ b'7', b'8', b'k', b'%', b'?', b'x', b'C', b'D', b'A', b'S', b'F', b'I',
+ b'B', b'E', b'j', b'P', b'T', b'z', b'R', b'N', b'M', b'+', b'L', b'O',
+ b'q', b'H', b'G', b'W', b'U', b'V', b',', b'Y', b'K', b'J', b'Z', b'X',
+ b'Q', b';', b')', b'(', b'~', b'[', b']', b'$', b'!', b'\'', b'*', b'@',
+ b'\x00', b'\x01', b'\x02', b'\x03', b'\x04', b'\x05', b'\x06', b'\x07',
+ b'\x08', b'\t', b'\n', b'\x0b', b'\x0c', b'\r', b'\x0e', b'\x0f', b'\x10',
+ b'\x11', b'\x12', b'\x13', b'\x14', b'\x15', b'\x16', b'\x17', b'\x18',
+ b'\x19', b'\x1a', b'\x1b', b'\x1c', b'\x1d', b'\x1e', b'\x1f', b' ', b'"',
+ b'#', b'<', b'>', b'\\', b'^', b'`', b'{', b'|', b'}', b'\x7f', b'\x80',
+ b'\x81', b'\x82', b'\x83', b'\x84', b'\x85', b'\x86', b'\x87', b'\x88',
+ b'\x89', b'\x8a', b'\x8b', b'\x8c', b'\x8d', b'\x8e', b'\x8f', b'\x90',
+ b'\x91', b'\x92', b'\x93', b'\x94', b'\x95', b'\x96', b'\x97', b'\x98',
+ b'\x99', b'\x9a', b'\x9b', b'\x9c', b'\x9d', b'\x9e', b'\x9f', b'\xa0',
+ b'\xa1', b'\xa2', b'\xa3', b'\xa4', b'\xa5', b'\xa6', b'\xa7', b'\xa8',
+ b'\xa9', b'\xaa', b'\xab', b'\xac', b'\xad', b'\xae', b'\xaf', b'\xb0',
+ b'\xb1', b'\xb2', b'\xb3', b'\xb4', b'\xb5', b'\xb6', b'\xb7', b'\xb8',
+ b'\xb9', b'\xba', b'\xbb', b'\xbc', b'\xbd', b'\xbe', b'\xbf', b'\xc0',
+ b'\xc1', b'\xc2', b'\xc3', b'\xc4', b'\xc5', b'\xc6', b'\xc7', b'\xc8',
+ b'\xc9', b'\xca', b'\xcb', b'\xcc', b'\xcd', b'\xce', b'\xcf', b'\xd0',
+ b'\xd1', b'\xd2', b'\xd3', b'\xd4', b'\xd5', b'\xd6', b'\xd7', b'\xd8',
+ b'\xd9', b'\xda', b'\xdb', b'\xdc', b'\xdd', b'\xde', b'\xdf', b'\xe0',
+ b'\xe1', b'\xe2', b'\xe3', b'\xe4', b'\xe5', b'\xe6', b'\xe7', b'\xe8',
+ b'\xe9', b'\xea', b'\xeb', b'\xec', b'\xed', b'\xee', b'\xef', b'\xf0',
+ b'\xf1', b'\xf2', b'\xf3', b'\xf4', b'\xf5', b'\xf6', b'\xf7', b'\xf8',
+ b'\xf9', b'\xfa', b'\xfb', b'\xfc', b'\xfd', b'\xfe', b'\xff',
+];
diff --git a/vendor/fst/src/raw/counting_writer.rs b/vendor/fst/src/raw/counting_writer.rs
new file mode 100644
index 000000000..70d9315ff
--- /dev/null
+++ b/vendor/fst/src/raw/counting_writer.rs
@@ -0,0 +1,70 @@
+use std::io;
+
+use crate::raw::crc32::CheckSummer;
+
+/// Wraps any writer that counts and checksums bytes written.
+pub struct CountingWriter<W> {
+ wtr: W,
+ cnt: u64,
+ summer: CheckSummer,
+}
+
+impl<W: io::Write> CountingWriter<W> {
+ /// Wrap the given writer with a counter.
+ pub fn new(wtr: W) -> CountingWriter<W> {
+ CountingWriter { wtr, cnt: 0, summer: CheckSummer::new() }
+ }
+
+ /// Return the total number of bytes written to the underlying writer.
+ ///
+ /// The count returned is the sum of all counts resulting from a call
+ /// to `write`.
+ pub fn count(&self) -> u64 {
+ self.cnt
+ }
+
+ /// Returns the masked CRC32C checksum of the bytes written so far.
+ ///
+ /// This "masked" checksum is the same one used by the Snappy frame format.
+ /// Masking is supposed to make the checksum robust with respect to data
+ /// that contains the checksum itself.
+ pub fn masked_checksum(&self) -> u32 {
+ self.summer.masked()
+ }
+
+ /// Unwrap the counting writer and return the inner writer.
+ pub fn into_inner(self) -> W {
+ self.wtr
+ }
+
+ /// Gets a reference to the underlying writer.
+ pub fn get_ref(&self) -> &W {
+ &self.wtr
+ }
+}
+
+impl<W: io::Write> io::Write for CountingWriter<W> {
+ fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+ self.summer.update(buf);
+ let n = self.wtr.write(buf)?;
+ self.cnt += n as u64;
+ Ok(n)
+ }
+
+ fn flush(&mut self) -> io::Result<()> {
+ self.wtr.flush()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::CountingWriter;
+ use std::io::Write;
+
+ #[test]
+ fn counts_bytes() {
+ let mut wtr = CountingWriter::new(vec![]);
+ wtr.write_all(b"foobar").unwrap();
+ assert_eq!(wtr.count(), 6);
+ }
+}
diff --git a/vendor/fst/src/raw/crc32.rs b/vendor/fst/src/raw/crc32.rs
new file mode 100644
index 000000000..06b444205
--- /dev/null
+++ b/vendor/fst/src/raw/crc32.rs
@@ -0,0 +1,59 @@
+use crate::bytes;
+use crate::raw::crc32_table::{TABLE, TABLE16};
+
+/// Provides a simple API to perform a rolling CRC32C checksum.
+#[derive(Clone, Copy, Debug)]
+pub struct CheckSummer {
+ sum: u32,
+}
+
+impl CheckSummer {
+ /// Create a new checksummer that can compute CRC32C checksums on arbitrary
+ /// bytes.
+ pub fn new() -> CheckSummer {
+ CheckSummer { sum: 0 }
+ }
+
+ /// Returns the "masked" CRC32 checksum of the data so far using the
+ /// Castagnoli polynomial. This "masked" checksum is the same one used
+ /// by the Snappy frame format. Masking is supposed to make the checksum
+ /// robust with respect to data that contains the checksum itself.
+ pub fn masked(&self) -> u32 {
+ let sum = self.sum;
+ (sum.wrapping_shr(15) | sum.wrapping_shl(17)).wrapping_add(0xA282EAD8)
+ }
+
+ /// Update the current checksum with the checksum for the given bytes.
+ pub fn update(&mut self, buf: &[u8]) {
+ self.sum = crc32c_slice16(self.sum, buf);
+ }
+}
+
+/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial.
+fn crc32c_slice16(prev: u32, mut buf: &[u8]) -> u32 {
+ let mut crc: u32 = !prev;
+ while buf.len() >= 16 {
+ crc ^= bytes::read_u32_le(buf);
+ crc = TABLE16[0][buf[15] as usize]
+ ^ TABLE16[1][buf[14] as usize]
+ ^ TABLE16[2][buf[13] as usize]
+ ^ TABLE16[3][buf[12] as usize]
+ ^ TABLE16[4][buf[11] as usize]
+ ^ TABLE16[5][buf[10] as usize]
+ ^ TABLE16[6][buf[9] as usize]
+ ^ TABLE16[7][buf[8] as usize]
+ ^ TABLE16[8][buf[7] as usize]
+ ^ TABLE16[9][buf[6] as usize]
+ ^ TABLE16[10][buf[5] as usize]
+ ^ TABLE16[11][buf[4] as usize]
+ ^ TABLE16[12][(crc >> 24) as u8 as usize]
+ ^ TABLE16[13][(crc >> 16) as u8 as usize]
+ ^ TABLE16[14][(crc >> 8) as u8 as usize]
+ ^ TABLE16[15][(crc) as u8 as usize];
+ buf = &buf[16..];
+ }
+ for &b in buf {
+ crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8);
+ }
+ !crc
+}
diff --git a/vendor/fst/src/raw/crc32_table.rs b/vendor/fst/src/raw/crc32_table.rs
new file mode 100644
index 000000000..7821b5d86
--- /dev/null
+++ b/vendor/fst/src/raw/crc32_table.rs
@@ -0,0 +1,2 @@
+// Generated by build.rs.
+include!(concat!(env!("OUT_DIR"), "/crc32_table.rs"));
diff --git a/vendor/fst/src/raw/error.rs b/vendor/fst/src/raw/error.rs
new file mode 100644
index 000000000..b28377179
--- /dev/null
+++ b/vendor/fst/src/raw/error.rs
@@ -0,0 +1,171 @@
+use std::fmt;
+use std::str;
+use std::string::FromUtf8Error;
+
+use crate::raw::FstType;
+
+/// An error that occurred while using a finite state transducer.
+///
+/// This enum is non-exhaustive. New variants may be added to it in
+/// compatible releases.
+pub enum Error {
+ /// A version mismatch occurred while reading a finite state transducer.
+ ///
+ /// This occurs when the API version (of the crate) does not match the
+ /// version encoded in the finite state transducer.
+ ///
+ /// When this error is encountered, there are only two ways to fix it:
+ ///
+ /// 1. Change the version of the library to one that is compatible with
+ /// the given finite state transducer.
+ /// 2. Rebuild the finite state transducer.
+ Version {
+ /// The expected version, which is hard-coded into the current version
+ /// of this crate.
+ expected: u64,
+ /// The version read from the finite state transducer.
+ got: u64,
+ },
+ /// An unexpected error occurred while reading a finite state transducer.
+ /// Usually this occurs because the data is corrupted or is not actually
+ /// a finite state transducer serialized by this library.
+ Format {
+ /// The number of bytes given to the FST constructor.
+ size: usize,
+ },
+ /// An error that is returned if verification of an FST fails because of a
+ /// checksum mismatch.
+ ChecksumMismatch {
+ /// The checksum that was expected.
+ expected: u32,
+ /// The checksum that was actually computed.
+ got: u32,
+ },
+ /// An error that is returned if the caller attempts to verify an FST
+ /// that does not have a checksum, as is the case for all FSTs generated
+ /// by this crate before version `0.4`.
+ ChecksumMissing,
+ /// A duplicate key was inserted into a finite state transducer, which is
+ /// not allowed.
+ DuplicateKey {
+ /// The duplicate key.
+ got: Vec<u8>,
+ },
+ /// A key was inserted out of order into a finite state transducer.
+ ///
+ /// Keys must always be inserted in lexicographic order.
+ OutOfOrder {
+ /// The last key successfully inserted.
+ previous: Vec<u8>,
+ /// The key that caused this error to occur.
+ got: Vec<u8>,
+ },
+ /// A finite state transducer with an unexpected type was found.
+ ///
+ /// This is not currently used in this crate, but callers may wish to
+ /// employ its use for alternative data structures implemented on top of
+ /// finite state transducers.
+ WrongType {
+ /// The expected finite state transducer type.
+ expected: FstType,
+ /// The type read from a finite state transducer.
+ got: FstType,
+ },
+ /// An error that occurred when trying to decode a UTF-8 byte key.
+ FromUtf8(FromUtf8Error),
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+impl fmt::Display for Error {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match *self {
+ Error::FromUtf8(ref err) => err.fmt(f),
+ Error::Version { expected, got } => write!(
+ f,
+ "\
+Error opening FST: expected API version {}, got API version {}. \
+It looks like the FST you're trying to open is either not an FST file or it \
+was generated with a different version of the 'fst' crate. You'll either need \
+to change the version of the 'fst' crate you're using, or re-generate the
+FST.",
+ expected, got
+ ),
+ Error::Format { size } => write!(
+ f,
+ "\
+Error opening FST with size {} bytes: An unknown error occurred. This \
+usually means you're trying to read data that isn't actually an encoded FST.",
+ size
+ ),
+ Error::ChecksumMismatch { expected, got } => write!(
+ f,
+ "FST verification failed: expected checksum of {} but got {}",
+ expected, got,
+ ),
+ Error::ChecksumMissing => write!(
+ f,
+ "FST verification failed: FST does not contain a checksum",
+ ),
+ Error::DuplicateKey { ref got } => write!(
+ f,
+ "Error inserting duplicate key: '{}'.",
+ format_bytes(&*got)
+ ),
+ Error::OutOfOrder { ref previous, ref got } => write!(
+ f,
+ "\
+Error inserting out-of-order key: '{}'. (Previous key was '{}'.) Keys must be \
+inserted in lexicographic order.",
+ format_bytes(&*got),
+ format_bytes(&*previous)
+ ),
+ Error::WrongType { expected, got } => write!(
+ f,
+ "\
+Error opening FST: expected type '{}', got type '{}'.",
+ expected, got
+ ),
+ Error::__Nonexhaustive => unreachable!(),
+ }
+ }
+}
+
+impl fmt::Debug for Error {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ fmt::Display::fmt(self, f)
+ }
+}
+
+impl std::error::Error for Error {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ match *self {
+ Error::FromUtf8(ref err) => Some(err),
+ _ => None,
+ }
+ }
+}
+
+impl From<FromUtf8Error> for Error {
+ #[inline]
+ fn from(err: FromUtf8Error) -> Error {
+ Error::FromUtf8(err)
+ }
+}
+
+/// Attempt to convert an arbitrary byte string to a more convenient display
+/// form.
+///
+/// Essentially, try to decode the bytes as UTF-8 and show that. Failing that,
+/// just show the sequence of bytes.
+fn format_bytes(bytes: &[u8]) -> String {
+ match str::from_utf8(bytes) {
+ Ok(s) => s.to_owned(),
+ Err(_) => format!("{:?}", bytes),
+ }
+}
diff --git a/vendor/fst/src/raw/mod.rs b/vendor/fst/src/raw/mod.rs
new file mode 100644
index 000000000..6cc0eeb2c
--- /dev/null
+++ b/vendor/fst/src/raw/mod.rs
@@ -0,0 +1,1382 @@
+/*!
+Operations on raw finite state transducers.
+
+This sub-module exposes the guts of a finite state transducer. Many parts of
+it, such as construction and traversal, are mirrored in the `set` and `map`
+sub-modules. Other parts of it, such as direct access to nodes and transitions
+in the transducer, do not have any analog.
+
+# Overview of types
+
+`Fst` is a read only interface to pre-constructed finite state transducers.
+`Node` is a read only interface to a single node in a transducer. `Builder` is
+used to create new finite state transducers. (Once a transducer is created, it
+can never be modified.) `Stream` is a stream of all inputs and outputs in a
+transducer. `StreamBuilder` builds range queries. `OpBuilder` collects streams
+and executes set operations like `union` or `intersection` on them with the
+option of specifying a merge strategy for output values.
+
+Most of the rest of the types are streams from set operations.
+*/
+use std::cmp;
+use std::fmt;
+
+use crate::automaton::{AlwaysMatch, Automaton};
+use crate::bytes;
+use crate::error::Result;
+use crate::stream::{IntoStreamer, Streamer};
+
+pub use crate::raw::build::Builder;
+pub use crate::raw::error::Error;
+pub use crate::raw::node::{Node, Transitions};
+pub use crate::raw::ops::{
+ Difference, IndexedValue, Intersection, OpBuilder, SymmetricDifference,
+ Union,
+};
+
+mod build;
+mod common_inputs;
+mod counting_writer;
+mod crc32;
+mod crc32_table;
+mod error;
+mod node;
+mod ops;
+mod registry;
+mod registry_minimal;
+#[cfg(test)]
+mod tests;
+
+/// The API version of this crate.
+///
+/// This version number is written to every finite state transducer created by
+/// this crate. When a finite state transducer is read, its version number is
+/// checked against this value.
+///
+/// Currently, any version mismatch results in an error. Fixing this requires
+/// regenerating the finite state transducer or switching to a version of this
+/// crate that is compatible with the serialized transducer. This particular
+/// behavior may be relaxed in future versions.
+pub const VERSION: u64 = 3;
+
+/// A sentinel value used to indicate an empty final state.
+const EMPTY_ADDRESS: CompiledAddr = 0;
+
+/// A sentinel value used to indicate an invalid state.
+///
+/// This is never the address of a node in a serialized transducer.
+const NONE_ADDRESS: CompiledAddr = 1;
+
+/// FstType is a convention used to indicate the type of the underlying
+/// transducer.
+///
+/// This crate reserves the range 0-255 (inclusive) but currently leaves the
+/// meaning of 0-255 unspecified.
+pub type FstType = u64;
+
+/// CompiledAddr is the type used to address nodes in a finite state
+/// transducer.
+///
+/// It is most useful as a pointer to nodes. It can be used in the `Fst::node`
+/// method to resolve the pointer.
+pub type CompiledAddr = usize;
+
+/// An acyclic deterministic finite state transducer.
+///
+/// # How does it work?
+///
+/// The short answer: it's just like a prefix trie, which compresses keys
+/// based only on their prefixes, except that a automaton/transducer also
+/// compresses suffixes.
+///
+/// The longer answer is that keys in an automaton are stored only in the
+/// transitions from one state to another. A key can be acquired by tracing
+/// a path from the root of the automaton to any match state. The inputs along
+/// each transition are concatenated. Once a match state is reached, the
+/// concatenation of inputs up until that point corresponds to a single key.
+///
+/// But why is it called a transducer instead of an automaton? A finite state
+/// transducer is just like a finite state automaton, except that it has output
+/// transitions in addition to input transitions. Namely, the value associated
+/// with any particular key is determined by summing the outputs along every
+/// input transition that leads to the key's corresponding match state.
+///
+/// This is best demonstrated with a couple images. First, let's ignore the
+/// "transducer" aspect and focus on a plain automaton.
+///
+/// Consider that your keys are abbreviations of some of the months in the
+/// Gregorian calendar:
+///
+/// ```plain
+/// jan
+/// feb
+/// mar
+/// may
+/// jun
+/// jul
+/// ```
+///
+/// The corresponding automaton that stores all of these as keys looks like
+/// this:
+///
+/// ![finite state automaton](https://burntsushi.net/stuff/months-set.png)
+///
+/// Notice here how the prefix and suffix of `jan` and `jun` are shared.
+/// Similarly, the prefixes of `jun` and `jul` are shared and the prefixes
+/// of `mar` and `may` are shared.
+///
+/// All of the keys from this automaton can be enumerated in lexicographic
+/// order by following every transition from each node in lexicographic
+/// order. Since it is acyclic, the procedure will terminate.
+///
+/// A key can be found by tracing it through the transitions in the automaton.
+/// For example, the key `aug` is known not to be in the automaton by only
+/// visiting the root state (because there is no `a` transition). For another
+/// example, the key `jax` is known not to be in the set only after moving
+/// through the transitions for `j` and `a`. Namely, after those transitions
+/// are followed, there are no transitions for `x`.
+///
+/// Notice here that looking up a key is proportional to the length of the key
+/// itself. Namely, lookup time is not affected by the number of keys in the
+/// automaton!
+///
+/// Additionally, notice that the automaton exploits the fact that many keys
+/// share common prefixes and suffixes. For example, `jun` and `jul` are
+/// represented with no more states than would be required to represent either
+/// one on its own. Instead, the only change is a single extra transition. This
+/// is a form of compression and is key to how the automatons produced by this
+/// crate are so small.
+///
+/// Let's move on to finite state transducers. Consider the same set of keys
+/// as above, but let's assign their numeric month values:
+///
+/// ```plain
+/// jan,1
+/// feb,2
+/// mar,3
+/// may,5
+/// jun,6
+/// jul,7
+/// ```
+///
+/// The corresponding transducer looks very similar to the automaton above,
+/// except outputs have been added to some of the transitions:
+///
+/// ![finite state transducer](https://burntsushi.net/stuff/months-map.png)
+///
+/// All of the operations with a transducer are the same as described above
+/// for automatons. Additionally, the same compression techniques are used:
+/// common prefixes and suffixes in keys are exploited.
+///
+/// The key difference is that some transitions have been given an output.
+/// As one follows input transitions, one must sum the outputs as they
+/// are seen. (A transition with no output represents the additive identity,
+/// or `0` in this case.) For example, when looking up `feb`, the transition
+/// `f` has output `2`, the transition `e` has output `0`, and the transition
+/// `b` also has output `0`. The sum of these is `2`, which is exactly the
+/// value we associated with `feb`.
+///
+/// For another more interesting example, consider `jul`. The `j` transition
+/// has output `1`, the `u` transition has output `5` and the `l` transition
+/// has output `1`. Summing these together gets us `7`, which is again the
+/// correct value associated with `jul`. Notice that if we instead looked up
+/// the `jun` key, then the `n` transition would be followed instead of the
+/// `l` transition, which has no output. Therefore, the `jun` key equals
+/// `1+5+0=6`.
+///
+/// The trick to transducers is that there exists a unique path through the
+/// transducer for every key, and its outputs are stored appropriately along
+/// this path such that the correct value is returned when they are all summed
+/// together. This process also enables the data that makes up each value to be
+/// shared across many values in the transducer in exactly the same way that
+/// keys are shared. This is yet another form of compression!
+///
+/// # Bonus: a billion strings
+///
+/// The amount of compression one can get from automata can be absolutely
+/// ridiculous. Consider the particular case of storing all billion strings
+/// in the range `0000000001-1000000000`, e.g.,
+///
+/// ```plain
+/// 0000000001
+/// 0000000002
+/// ...
+/// 0000000100
+/// 0000000101
+/// ...
+/// 0999999999
+/// 1000000000
+/// ```
+///
+/// The corresponding automaton looks like this:
+///
+/// ![finite state automaton - one billion strings](https://burntsushi.net/stuff/one-billion.png)
+///
+/// Indeed, the on disk size of this automaton is a mere **251 bytes**.
+///
+/// Of course, this is a bit of a pathological best case, but it does serve
+/// to show how good compression can be in the optimal case.
+///
+/// Also, check out the
+/// [corresponding transducer](https://burntsushi.net/stuff/one-billion-map.svg)
+/// that maps each string to its integer value. It's a bit bigger, but still
+/// only takes up **896 bytes** of space on disk. This demonstrates that
+/// output values are also compressible.
+///
+/// # Does this crate produce minimal transducers?
+///
+/// For any non-trivial sized set of keys, it is unlikely that this crate will
+/// produce a minimal transducer. As far as this author knows, guaranteeing a
+/// minimal transducer requires working memory proportional to the number of
+/// states. This can be quite costly and is anathema to the main design goal of
+/// this crate: provide the ability to work with gigantic sets of strings with
+/// constant memory overhead.
+///
+/// Instead, construction of a finite state transducer uses a cache of
+/// states. More frequently used states are cached and reused, which provides
+/// reasonably good compression ratios. (No comprehensive benchmarks exist to
+/// back up this claim.)
+///
+/// It is possible that this crate may expose a way to guarantee minimal
+/// construction of transducers at the expense of exorbitant memory
+/// requirements.
+///
+/// # Bibliography
+///
+/// I initially got the idea to use finite state tranducers to represent
+/// ordered sets/maps from
+/// [Michael
+/// McCandless'](http://blog.mikemccandless.com/2010/12/using-finite-state-transducers-in.html)
+/// work on incorporating transducers in Lucene.
+///
+/// However, my work would also not have been possible without the hard work
+/// of many academics, especially
+/// [Jan Daciuk](http://galaxy.eti.pg.gda.pl/katedry/kiw/pracownicy/Jan.Daciuk/personal/).
+///
+/// * [Incremental construction of minimal acyclic finite-state automata](https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601)
+/// (Section 3 provides a decent overview of the algorithm used to construct
+/// transducers in this crate, assuming all outputs are `0`.)
+/// * [Direct Construction of Minimal Acyclic Subsequential Transducers](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.3698&rep=rep1&type=pdf)
+/// (The whole thing. The proof is dense but illuminating. The algorithm at
+/// the end is the money shot, namely, it incorporates output values.)
+/// * [Experiments with Automata Compression](https://www.researchgate.net/profile/Jiri-Dvorsky/publication/221568039_Word_Random_Access_Compression/links/0c96052c095630d5b3000000/Word-Random-Access-Compression.pdf#page=116), [Smaller Representation of Finite State Automata](https://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf)
+/// (various compression techniques for representing states/transitions)
+/// * [Jan Daciuk's dissertation](http://www.pg.gda.pl/~jandac/thesis.ps.gz)
+/// (excellent for in depth overview)
+/// * [Comparison of Construction Algorithms for Minimal, Acyclic, Deterministic, Finite-State Automata from Sets of Strings](https://www.cs.mun.ca/~harold/Courses/Old/CS4750/Diary/q3p2qx4lv71m5vew.pdf)
+/// (excellent for surface level overview)
+#[derive(Clone)]
+pub struct Fst<D> {
+ meta: Meta,
+ data: D,
+}
+
+#[derive(Debug, Clone)]
+struct Meta {
+ version: u64,
+ root_addr: CompiledAddr,
+ ty: FstType,
+ len: usize,
+ /// A checksum is missing when the FST version is <= 2. (Checksums were
+ /// added in version 3.)
+ checksum: Option<u32>,
+}
+
+impl Fst<Vec<u8>> {
+ /// Create a new FST from an iterator of lexicographically ordered byte
+ /// strings. Every key's value is set to `0`.
+ ///
+ /// If the iterator does not yield values in lexicographic order, then an
+ /// error is returned.
+ ///
+ /// Note that this is a convenience function to build an FST in memory.
+ /// To build an FST that streams to an arbitrary `io::Write`, use
+ /// `raw::Builder`.
+ pub fn from_iter_set<K, I>(iter: I) -> Result<Fst<Vec<u8>>>
+ where
+ K: AsRef<[u8]>,
+ I: IntoIterator<Item = K>,
+ {
+ let mut builder = Builder::memory();
+ for k in iter {
+ builder.add(k)?;
+ }
+ Ok(builder.into_fst())
+ }
+
+ /// Create a new FST from an iterator of lexicographically ordered byte
+ /// strings. The iterator should consist of tuples, where the first element
+ /// is the byte string and the second element is its corresponding value.
+ ///
+ /// If the iterator does not yield unique keys in lexicographic order, then
+ /// an error is returned.
+ ///
+ /// Note that this is a convenience function to build an FST in memory.
+ /// To build an FST that streams to an arbitrary `io::Write`, use
+ /// `raw::Builder`.
+ pub fn from_iter_map<K, I>(iter: I) -> Result<Fst<Vec<u8>>>
+ where
+ K: AsRef<[u8]>,
+ I: IntoIterator<Item = (K, u64)>,
+ {
+ let mut builder = Builder::memory();
+ for (k, v) in iter {
+ builder.insert(k, v)?;
+ }
+ Ok(builder.into_fst())
+ }
+}
+
+impl<D: AsRef<[u8]>> Fst<D> {
+ /// Creates a transducer from its representation as a raw byte sequence.
+ ///
+ /// This operation is intentionally very cheap (no allocations and no
+ /// copies). In particular, no verification on the integrity of the
+ /// FST is performed. Callers may opt into integrity checks via the
+ /// [`Fst::verify`](struct.Fst.html#method.verify) method.
+ ///
+ /// The fst must have been written with a compatible finite state
+ /// transducer builder (`Builder` qualifies). If the format is invalid or
+ /// if there is a mismatch between the API version of this library and the
+ /// fst, then an error is returned.
+ #[inline]
+ pub fn new(data: D) -> Result<Fst<D>> {
+ let bytes = data.as_ref();
+ if bytes.len() < 36 {
+ return Err(Error::Format { size: bytes.len() }.into());
+ }
+ // The read_u64 unwraps below are OK because they can never fail.
+ // They can only fail when there is an IO error or if there is an
+ // unexpected EOF. However, we are reading from a byte slice (no
+ // IO errors possible) and we've confirmed the byte slice is at least
+ // N bytes (no unexpected EOF).
+ let version = bytes::read_u64_le(&bytes);
+ if version == 0 || version > VERSION {
+ return Err(
+ Error::Version { expected: VERSION, got: version }.into()
+ );
+ }
+ let ty = bytes::read_u64_le(&bytes[8..]);
+
+ let (end, checksum) = if version <= 2 {
+ (bytes.len(), None)
+ } else {
+ let checksum = bytes::read_u32_le(&bytes[bytes.len() - 4..]);
+ (bytes.len() - 4, Some(checksum))
+ };
+ let root_addr = {
+ let last = &bytes[end - 8..];
+ u64_to_usize(bytes::read_u64_le(last))
+ };
+ let len = {
+ let last2 = &bytes[end - 16..];
+ u64_to_usize(bytes::read_u64_le(last2))
+ };
+ // The root node is always the last node written, so its address should
+ // be near the end. After the root node is written, we still have to
+ // write the root *address* and the number of keys in the FST, along
+ // with the checksum. That's 20 bytes. The extra byte used below (21
+ // and not 20) comes from the fact that the root address points to
+ // the last byte in the root node, rather than the byte immediately
+ // following the root node.
+ //
+ // If this check passes, it is still possible that the FST is invalid
+ // but probably unlikely. If this check reports a false positive, then
+ // the program will probably panic. In the worst case, the FST will
+ // operate but be subtly wrong. (This would require the bytes to be in
+ // a format expected by an FST, which is incredibly unlikely.)
+ //
+ // The special check for EMPTY_ADDRESS is needed since an empty FST
+ // has a root node that is empty and final, which means it has the
+ // special address `0`. In that case, the FST is the smallest it can
+ // be: the version, type, root address and number of nodes. That's
+ // 36 bytes (8 byte u64 each).
+ //
+ // And finally, our calculation changes somewhat based on version.
+ // If the FST version is less than 3, then it does not have a checksum.
+ let (empty_total, addr_offset) =
+ if version <= 2 { (32, 17) } else { (36, 21) };
+ if (root_addr == EMPTY_ADDRESS && bytes.len() != empty_total)
+ && root_addr + addr_offset != bytes.len()
+ {
+ return Err(Error::Format { size: bytes.len() }.into());
+ }
+ let meta = Meta { version, root_addr, ty, len, checksum };
+ Ok(Fst { meta, data })
+ }
+
+ /// Retrieves the value associated with a key.
+ ///
+ /// If the key does not exist, then `None` is returned.
+ #[inline]
+ pub fn get<B: AsRef<[u8]>>(&self, key: B) -> Option<Output> {
+ self.as_ref().get(key.as_ref())
+ }
+
+ /// Returns true if and only if the given key is in this FST.
+ #[inline]
+ pub fn contains_key<B: AsRef<[u8]>>(&self, key: B) -> bool {
+ self.as_ref().contains_key(key.as_ref())
+ }
+
+ /// Retrieves the key associated with the given value.
+ ///
+ /// This is like `get_key_into`, but will return the key itself without
+ /// allowing the caller to reuse an allocation.
+ ///
+ /// If the given value does not exist, then `None` is returned.
+ ///
+    /// If the values in this FST are not monotonically increasing when sorted
+    /// lexicographically by key, then this routine has unspecified behavior.
+ #[inline]
+ pub fn get_key(&self, value: u64) -> Option<Vec<u8>> {
+ let mut key = vec![];
+ if self.get_key_into(value, &mut key) {
+ Some(key)
+ } else {
+ None
+ }
+ }
+
+ /// Retrieves the key associated with the given value.
+ ///
+ /// If the given value does not exist, then `false` is returned. In this
+ /// case, the contents of `key` are unspecified.
+ ///
+    /// The given buffer is not cleared before the key is written to it.
+ ///
+    /// If the values in this FST are not monotonically increasing when sorted
+    /// lexicographically by key, then this routine has unspecified behavior.
+ #[inline]
+ pub fn get_key_into(&self, value: u64, key: &mut Vec<u8>) -> bool {
+ self.as_ref().get_key_into(value, key)
+ }
+
+ /// Return a lexicographically ordered stream of all key-value pairs in
+ /// this fst.
+ #[inline]
+ pub fn stream(&self) -> Stream<'_> {
+ StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream()
+ }
+
+ /// Return a builder for range queries.
+ ///
+ /// A range query returns a subset of key-value pairs in this fst in a
+ /// range given in lexicographic order.
+ #[inline]
+ pub fn range(&self) -> StreamBuilder<'_> {
+ StreamBuilder::new(self.as_ref(), AlwaysMatch)
+ }
+
+ /// Executes an automaton on the keys of this FST.
+ #[inline]
+ pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<'_, A> {
+ StreamBuilder::new(self.as_ref(), aut)
+ }
+
+ /// Executes an automaton on the keys of this FST and yields matching
+ /// keys along with the corresponding matching states in the given
+ /// automaton.
+ #[inline]
+ pub fn search_with_state<A: Automaton>(
+ &self,
+ aut: A,
+ ) -> StreamWithStateBuilder<'_, A> {
+ StreamWithStateBuilder::new(self.as_ref(), aut)
+ }
+
+ /// Returns the number of keys in this fst.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.as_ref().len()
+ }
+
+ /// Returns true if and only if this fst has no keys.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.as_ref().is_empty()
+ }
+
+ /// Returns the number of bytes used by this fst.
+ #[inline]
+ pub fn size(&self) -> usize {
+ self.as_ref().size()
+ }
+
+ /// Attempts to verify this FST by computing its checksum.
+ ///
+ /// This will scan over all of the bytes in the underlying FST, so this
+ /// may be an expensive operation depending on the size of the FST.
+ ///
+ /// This returns an error in two cases:
+ ///
+ /// 1. When a checksum does not exist, which is the case for FSTs that were
+ /// produced by the `fst` crate before version `0.4`.
+ /// 2. When the checksum in the FST does not match the computed checksum
+ /// performed by this procedure.
+ #[inline]
+ pub fn verify(&self) -> Result<()> {
+ use crate::raw::crc32::CheckSummer;
+
+ let expected = match self.as_ref().meta.checksum {
+ None => return Err(Error::ChecksumMissing.into()),
+ Some(expected) => expected,
+ };
+ let mut summer = CheckSummer::new();
+ summer.update(&self.as_bytes()[..self.as_bytes().len() - 4]);
+ let got = summer.masked();
+ if expected == got {
+ return Ok(());
+ }
+ Err(Error::ChecksumMismatch { expected, got }.into())
+ }
+
+ /// Creates a new fst operation with this fst added to it.
+ ///
+ /// The `OpBuilder` type can be used to add additional fst streams
+ /// and perform set operations like union, intersection, difference and
+ /// symmetric difference on the keys of the fst. These set operations also
+ /// allow one to specify how conflicting values are merged in the stream.
+ #[inline]
+ pub fn op(&self) -> OpBuilder<'_> {
+ OpBuilder::new().add(self)
+ }
+
+ /// Returns true if and only if the `self` fst is disjoint with the fst
+ /// `stream`.
+ ///
+ /// `stream` must be a lexicographically ordered sequence of byte strings
+ /// with associated values.
+ #[inline]
+ pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool
+ where
+ I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
+ S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
+ {
+ self.op().add(stream).intersection().next().is_none()
+ }
+
+ /// Returns true if and only if the `self` fst is a subset of the fst
+ /// `stream`.
+ ///
+ /// `stream` must be a lexicographically ordered sequence of byte strings
+ /// with associated values.
+ #[inline]
+ pub fn is_subset<'f, I, S>(&self, stream: I) -> bool
+ where
+ I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
+ S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
+ {
+ let mut op = self.op().add(stream).intersection();
+ let mut count = 0;
+ while let Some(_) = op.next() {
+ count += 1;
+ }
+ count == self.len()
+ }
+
+ /// Returns true if and only if the `self` fst is a superset of the fst
+ /// `stream`.
+ ///
+ /// `stream` must be a lexicographically ordered sequence of byte strings
+ /// with associated values.
+ #[inline]
+ pub fn is_superset<'f, I, S>(&self, stream: I) -> bool
+ where
+ I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
+ S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
+ {
+ let mut op = self.op().add(stream).union();
+ let mut count = 0;
+ while let Some(_) = op.next() {
+ count += 1;
+ }
+ count == self.len()
+ }
+
+ /// Returns the underlying type of this fst.
+ ///
+ /// FstType is a convention used to indicate the type of the underlying
+ /// transducer.
+ ///
+ /// This crate reserves the range 0-255 (inclusive) but currently leaves
+ /// the meaning of 0-255 unspecified.
+ #[inline]
+ pub fn fst_type(&self) -> FstType {
+ self.as_ref().fst_type()
+ }
+
+ /// Returns the root node of this fst.
+ #[inline]
+ pub fn root(&self) -> Node<'_> {
+ self.as_ref().root()
+ }
+
+ /// Returns the node at the given address.
+ ///
+ /// Node addresses can be obtained by reading transitions on `Node` values.
+ #[inline]
+ pub fn node(&self, addr: CompiledAddr) -> Node<'_> {
+ self.as_ref().node(addr)
+ }
+
+ /// Returns a copy of the binary contents of this FST.
+ #[inline]
+ pub fn to_vec(&self) -> Vec<u8> {
+ self.as_ref().to_vec()
+ }
+
+ /// Returns the binary contents of this FST.
+ #[inline]
+ pub fn as_bytes(&self) -> &[u8] {
+ self.as_ref().as_bytes()
+ }
+
+ #[inline]
+ fn as_ref(&self) -> FstRef {
+ FstRef { meta: &self.meta, data: self.data.as_ref() }
+ }
+}
+
+impl<D> Fst<D> {
+ /// Returns the underlying data which constitutes the FST itself.
+ #[inline]
+ pub fn into_inner(self) -> D {
+ self.data
+ }
+
+ /// Returns a borrow to the underlying data which constitutes the FST itself.
+ #[inline]
+ pub fn as_inner(&self) -> &D {
+ &self.data
+ }
+
+ /// Maps the underlying data of the fst to another data type.
+ #[inline]
+ pub fn map_data<F, T>(self, mut f: F) -> Result<Fst<T>>
+ where
+ F: FnMut(D) -> T,
+ T: AsRef<[u8]>,
+ {
+ Fst::new(f(self.into_inner()))
+ }
+}
+
+impl<'a, 'f, D: AsRef<[u8]>> IntoStreamer<'a> for &'f Fst<D> {
+ type Item = (&'a [u8], Output);
+ type Into = Stream<'f>;
+
+ #[inline]
+ fn into_stream(self) -> Stream<'f> {
+ StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream()
+ }
+}
+
+struct FstRef<'f> {
+ meta: &'f Meta,
+ data: &'f [u8],
+}
+
+impl<'f> FstRef<'f> {
+ #[inline]
+ fn get(&self, key: &[u8]) -> Option<Output> {
+ let mut node = self.root();
+ let mut out = Output::zero();
+ for &b in key {
+ node = match node.find_input(b) {
+ None => return None,
+ Some(i) => {
+ let t = node.transition(i);
+ out = out.cat(t.out);
+ self.node(t.addr)
+ }
+ }
+ }
+ if !node.is_final() {
+ None
+ } else {
+ Some(out.cat(node.final_output()))
+ }
+ }
+
+ #[inline]
+ fn contains_key(&self, key: &[u8]) -> bool {
+ let mut node = self.root();
+ for &b in key {
+ node = match node.find_input(b) {
+ None => return false,
+ Some(i) => self.node(node.transition_addr(i)),
+ }
+ }
+ node.is_final()
+ }
+
+ #[inline]
+ fn get_key_into(&self, mut value: u64, key: &mut Vec<u8>) -> bool {
+ let mut node = self.root();
+ while value != 0 || !node.is_final() {
+ let trans = node
+ .transitions()
+ .take_while(|t| t.out.value() <= value)
+ .last();
+ node = match trans {
+ None => return false,
+ Some(t) => {
+ value -= t.out.value();
+ key.push(t.inp);
+ self.node(t.addr)
+ }
+ };
+ }
+ true
+ }
+
+ #[inline]
+ fn len(&self) -> usize {
+ self.meta.len
+ }
+
+ #[inline]
+ fn is_empty(&self) -> bool {
+ self.meta.len == 0
+ }
+
+ #[inline]
+ fn size(&self) -> usize {
+ self.as_bytes().len()
+ }
+
+ #[inline]
+ fn fst_type(&self) -> FstType {
+ self.meta.ty
+ }
+
+ #[inline]
+ fn root_addr(&self) -> CompiledAddr {
+ self.meta.root_addr
+ }
+
+ #[inline]
+ fn root(&self) -> Node<'f> {
+ self.node(self.root_addr())
+ }
+
+ #[inline]
+ fn node(&self, addr: CompiledAddr) -> Node<'f> {
+ Node::new(self.meta.version, addr, self.as_bytes())
+ }
+
+ #[inline]
+ fn to_vec(&self) -> Vec<u8> {
+ self.as_bytes().to_vec()
+ }
+
+ #[inline]
+ fn as_bytes(&self) -> &'f [u8] {
+ self.data
+ }
+
+ #[inline]
+ fn empty_final_output(&self) -> Option<Output> {
+ let root = self.root();
+ if root.is_final() {
+ Some(root.final_output())
+ } else {
+ None
+ }
+ }
+}
+
+/// A builder for constructing range queries on streams.
+///
+/// Once all bounds are set, one should call `into_stream` to get a
+/// `Stream`.
+///
+/// Bounds are not additive. That is, if `ge` is called twice on the same
+/// builder, then the second setting wins.
+///
+/// The `A` type parameter corresponds to an optional automaton to filter
+/// the stream. By default, no filtering is done.
+///
+/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
+pub struct StreamBuilder<'f, A = AlwaysMatch> {
+ fst: FstRef<'f>,
+ aut: A,
+ min: Bound,
+ max: Bound,
+}
+
+impl<'f, A: Automaton> StreamBuilder<'f, A> {
+ fn new(fst: FstRef<'f>, aut: A) -> StreamBuilder<'f, A> {
+ StreamBuilder {
+ fst,
+ aut,
+ min: Bound::Unbounded,
+ max: Bound::Unbounded,
+ }
+ }
+
+ /// Specify a greater-than-or-equal-to bound.
+ pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> {
+ self.min = Bound::Included(bound.as_ref().to_owned());
+ self
+ }
+
+ /// Specify a greater-than bound.
+ pub fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> {
+ self.min = Bound::Excluded(bound.as_ref().to_owned());
+ self
+ }
+
+ /// Specify a less-than-or-equal-to bound.
+ pub fn le<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> {
+ self.max = Bound::Included(bound.as_ref().to_owned());
+ self
+ }
+
+ /// Specify a less-than bound.
+ pub fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> StreamBuilder<'f, A> {
+ self.max = Bound::Excluded(bound.as_ref().to_owned());
+ self
+ }
+}
+
+impl<'a, 'f, A: Automaton> IntoStreamer<'a> for StreamBuilder<'f, A> {
+ type Item = (&'a [u8], Output);
+ type Into = Stream<'f, A>;
+
+ fn into_stream(self) -> Stream<'f, A> {
+ Stream::new(self.fst, self.aut, self.min, self.max)
+ }
+}
+
+/// A builder for constructing range queries on streams that include automaton
+/// states.
+///
+/// In general, one should use `StreamBuilder` unless you have a specific need
+/// for accessing the states of the underlying automaton that is being used to
+/// filter this stream.
+///
+/// Once all bounds are set, one should call `into_stream` to get a
+/// `Stream`.
+///
+/// Bounds are not additive. That is, if `ge` is called twice on the same
+/// builder, then the second setting wins.
+///
+/// The `A` type parameter corresponds to an optional automaton to filter
+/// the stream. By default, no filtering is done.
+///
+/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
+pub struct StreamWithStateBuilder<'f, A = AlwaysMatch> {
+ fst: FstRef<'f>,
+ aut: A,
+ min: Bound,
+ max: Bound,
+}
+
+impl<'f, A: Automaton> StreamWithStateBuilder<'f, A> {
+ fn new(fst: FstRef<'f>, aut: A) -> StreamWithStateBuilder<'f, A> {
+ StreamWithStateBuilder {
+ fst,
+ aut,
+ min: Bound::Unbounded,
+ max: Bound::Unbounded,
+ }
+ }
+
+ /// Specify a greater-than-or-equal-to bound.
+ pub fn ge<T: AsRef<[u8]>>(
+ mut self,
+ bound: T,
+ ) -> StreamWithStateBuilder<'f, A> {
+ self.min = Bound::Included(bound.as_ref().to_owned());
+ self
+ }
+
+ /// Specify a greater-than bound.
+ pub fn gt<T: AsRef<[u8]>>(
+ mut self,
+ bound: T,
+ ) -> StreamWithStateBuilder<'f, A> {
+ self.min = Bound::Excluded(bound.as_ref().to_owned());
+ self
+ }
+
+ /// Specify a less-than-or-equal-to bound.
+ pub fn le<T: AsRef<[u8]>>(
+ mut self,
+ bound: T,
+ ) -> StreamWithStateBuilder<'f, A> {
+ self.max = Bound::Included(bound.as_ref().to_owned());
+ self
+ }
+
+ /// Specify a less-than bound.
+ pub fn lt<T: AsRef<[u8]>>(
+ mut self,
+ bound: T,
+ ) -> StreamWithStateBuilder<'f, A> {
+ self.max = Bound::Excluded(bound.as_ref().to_owned());
+ self
+ }
+}
+
+impl<'a, 'f, A: 'a + Automaton> IntoStreamer<'a>
+ for StreamWithStateBuilder<'f, A>
+where
+ A::State: Clone,
+{
+ type Item = (&'a [u8], Output, A::State);
+ type Into = StreamWithState<'f, A>;
+
+ fn into_stream(self) -> StreamWithState<'f, A> {
+ StreamWithState::new(self.fst, self.aut, self.min, self.max)
+ }
+}
+
+#[derive(Debug)]
+enum Bound {
+ Included(Vec<u8>),
+ Excluded(Vec<u8>),
+ Unbounded,
+}
+
+impl Bound {
+ #[inline]
+ fn exceeded_by(&self, inp: &[u8]) -> bool {
+ match *self {
+ Bound::Included(ref v) => inp > v,
+ Bound::Excluded(ref v) => inp >= v,
+ Bound::Unbounded => false,
+ }
+ }
+
+ #[inline]
+ fn is_empty(&self) -> bool {
+ match *self {
+ Bound::Included(ref v) => v.is_empty(),
+ Bound::Excluded(ref v) => v.is_empty(),
+ Bound::Unbounded => true,
+ }
+ }
+
+ #[inline]
+ fn is_inclusive(&self) -> bool {
+ match *self {
+ Bound::Excluded(_) => false,
+ _ => true,
+ }
+ }
+}
+
+/// A lexicographically ordered stream of key-value pairs from an fst.
+///
+/// The `A` type parameter corresponds to an optional automaton to filter
+/// the stream. By default, no filtering is done.
+///
+/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
+pub struct Stream<'f, A: Automaton = AlwaysMatch>(StreamWithState<'f, A>);
+
+impl<'f, A: Automaton> Stream<'f, A> {
+ fn new(fst: FstRef<'f>, aut: A, min: Bound, max: Bound) -> Stream<'f, A> {
+ Stream(StreamWithState::new(fst, aut, min, max))
+ }
+
+ /// Convert this stream into a vector of byte strings and outputs.
+ ///
+ /// Note that this creates a new allocation for every key in the stream.
+ pub fn into_byte_vec(mut self) -> Vec<(Vec<u8>, u64)> {
+ let mut vs = vec![];
+ while let Some((k, v)) = self.next() {
+ vs.push((k.to_vec(), v.value()));
+ }
+ vs
+ }
+
+ /// Convert this stream into a vector of Unicode strings and outputs.
+ ///
+ /// If any key is not valid UTF-8, then iteration on the stream is stopped
+ /// and a UTF-8 decoding error is returned.
+ ///
+ /// Note that this creates a new allocation for every key in the stream.
+ pub fn into_str_vec(mut self) -> Result<Vec<(String, u64)>> {
+ let mut vs = vec![];
+ while let Some((k, v)) = self.next() {
+ let k = String::from_utf8(k.to_vec()).map_err(Error::from)?;
+ vs.push((k, v.value()));
+ }
+ Ok(vs)
+ }
+
+ /// Convert this stream into a vector of byte strings.
+ ///
+ /// Note that this creates a new allocation for every key in the stream.
+ pub fn into_byte_keys(mut self) -> Vec<Vec<u8>> {
+ let mut vs = vec![];
+ while let Some((k, _)) = self.next() {
+ vs.push(k.to_vec());
+ }
+ vs
+ }
+
+ /// Convert this stream into a vector of Unicode strings.
+ ///
+ /// If any key is not valid UTF-8, then iteration on the stream is stopped
+ /// and a UTF-8 decoding error is returned.
+ ///
+ /// Note that this creates a new allocation for every key in the stream.
+ pub fn into_str_keys(mut self) -> Result<Vec<String>> {
+ let mut vs = vec![];
+ while let Some((k, _)) = self.next() {
+ let k = String::from_utf8(k.to_vec()).map_err(Error::from)?;
+ vs.push(k);
+ }
+ Ok(vs)
+ }
+
+ /// Convert this stream into a vector of outputs.
+ pub fn into_values(mut self) -> Vec<u64> {
+ let mut vs = vec![];
+ while let Some((_, v)) = self.next() {
+ vs.push(v.value());
+ }
+ vs
+ }
+}
+
+impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> {
+ type Item = (&'a [u8], Output);
+
+ fn next(&'a mut self) -> Option<(&'a [u8], Output)> {
+ self.0.next_with(|_| ()).map(|(key, out, _)| (key, out))
+ }
+}
+
/// A lexicographically ordered stream of key-value-state triples from an fst
/// and an automaton.
///
/// The key-values are from the underlying FST while the states are from the
/// automaton.
///
/// The `A` type parameter corresponds to an optional automaton to filter
/// the stream. By default, no filtering is done.
///
/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
pub struct StreamWithState<'f, A = AlwaysMatch>
where
    A: Automaton,
{
    // The fst being streamed.
    fst: FstRef<'f>,
    // The automaton used to filter keys and produce the yielded states.
    aut: A,
    // The bytes of the key currently being built during traversal.
    inp: Vec<u8>,
    // The output for the empty key, yielded (at most once) before any
    // other key.
    empty_output: Option<Output>,
    // Explicit depth-first traversal stack.
    stack: Vec<StreamState<'f, A::State>>,
    // The upper bound; iteration stops permanently once it is exceeded.
    end_at: Bound,
}
+
/// A single frame in the depth-first traversal stack of `StreamWithState`.
#[derive(Clone, Debug)]
struct StreamState<'f, S> {
    // The node this frame is visiting.
    node: Node<'f>,
    // The index of the next transition of `node` to follow.
    trans: usize,
    // The accumulated output on the path from the root to `node`.
    out: Output,
    // The automaton state upon arrival at `node`.
    aut_state: S,
}
+
impl<'f, A: Automaton> StreamWithState<'f, A> {
    /// Creates a stream positioned at the smallest key satisfying `min`;
    /// iteration terminates once a key exceeds `max`.
    fn new(
        fst: FstRef<'f>,
        aut: A,
        min: Bound,
        max: Bound,
    ) -> StreamWithState<'f, A> {
        let mut rdr = StreamWithState {
            fst,
            aut,
            inp: Vec::with_capacity(16),
            empty_output: None,
            stack: vec![],
            end_at: max,
        };
        rdr.seek_min(min);
        rdr
    }

    /// Seeks the underlying stream such that the next key to be read is the
    /// smallest key in the underlying fst that satisfies the given minimum
    /// bound.
    ///
    /// This theoretically should be straight-forward, but we need to make
    /// sure our stack is correct, which includes accounting for automaton
    /// states.
    fn seek_min(&mut self, min: Bound) {
        if min.is_empty() {
            if min.is_inclusive() {
                // The empty key is admitted by the range, so surface its
                // output (if the fst contains the empty key at all).
                self.empty_output = self.fst.empty_final_output();
            }
            // Start the traversal from the root with everything zeroed.
            self.stack = vec![StreamState {
                node: self.fst.root(),
                trans: 0,
                out: Output::zero(),
                aut_state: self.aut.start(),
            }];
            return;
        }
        let (key, inclusive) = match min {
            Bound::Excluded(ref min) => (min, false),
            Bound::Included(ref min) => (min, true),
            Bound::Unbounded => unreachable!(),
        };
        // At this point, we need to find the starting location of `min` in
        // the FST. However, as we search, we need to maintain a stack of
        // reader states so that the reader can pick up where we left off.
        // N.B. We do not necessarily need to stop in a final state, unlike
        // the one-off `find` method. For the example, the given bound might
        // not actually exist in the FST.
        let mut node = self.fst.root();
        let mut out = Output::zero();
        let mut aut_state = self.aut.start();
        for &b in key {
            match node.find_input(b) {
                Some(i) => {
                    let t = node.transition(i);
                    let prev_state = aut_state;
                    aut_state = self.aut.accept(&prev_state, b);
                    self.inp.push(b);
                    // Record the frame as if transition `i` had already
                    // been taken, so resuming continues at `i + 1`.
                    self.stack.push(StreamState {
                        node,
                        trans: i + 1,
                        out,
                        aut_state: prev_state,
                    });
                    out = out.cat(t.out);
                    node = self.fst.node(t.addr);
                }
                None => {
                    // This is a little tricky. We're in this case if the
                    // given bound is not a prefix of any key in the FST.
                    // Since this is a minimum bound, we need to find the
                    // first transition in this node that proceeds the current
                    // input byte.
                    self.stack.push(StreamState {
                        node,
                        trans: node
                            .transitions()
                            .position(|t| t.inp > b)
                            .unwrap_or(node.len()),
                        out,
                        aut_state,
                    });
                    return;
                }
            }
        }
        if !self.stack.is_empty() {
            let last = self.stack.len() - 1;
            if inclusive {
                // Rewind one step so the bound key itself is emitted first.
                self.stack[last].trans -= 1;
                self.inp.pop();
            } else {
                // Exclusive bound: descend into the bound key's node so
                // iteration resumes with keys strictly greater than it.
                let node = self.stack[last].node;
                let trans = self.stack[last].trans;
                self.stack.push(StreamState {
                    node: self.fst.node(node.transition(trans - 1).addr),
                    trans: 0,
                    out,
                    aut_state,
                });
            }
        }
    }

    /// Advances the stream, applying `map` to the automaton state of an
    /// emitted key. Returns `None` once the stream is exhausted or the
    /// upper bound has been passed.
    fn next_with<T>(
        &mut self,
        mut map: impl FnMut(&A::State) -> T,
    ) -> Option<(&[u8], Output, T)> {
        if let Some(out) = self.empty_output.take() {
            if self.end_at.exceeded_by(&[]) {
                // Even the empty key lies past the upper bound.
                self.stack.clear();
                return None;
            }

            let start = self.aut.start();
            if self.aut.is_match(&start) {
                return Some((&[], out, map(&start)));
            }
        }
        while let Some(state) = self.stack.pop() {
            if state.trans >= state.node.len()
                || !self.aut.can_match(&state.aut_state)
            {
                // This node is exhausted (or the automaton can never match
                // below it); drop the byte that led here and backtrack.
                if state.node.addr() != self.fst.root_addr() {
                    self.inp.pop().unwrap();
                }
                continue;
            }
            let trans = state.node.transition(state.trans);
            let out = state.out.cat(trans.out);
            let next_state = self.aut.accept(&state.aut_state, trans.inp);
            let t = map(&next_state);
            let mut is_match = self.aut.is_match(&next_state);
            let next_node = self.fst.node(trans.addr);
            self.inp.push(trans.inp);
            if next_node.is_final() {
                // Let the automaton refine the match decision at end-of-key.
                if let Some(eof_state) = self.aut.accept_eof(&next_state) {
                    is_match = self.aut.is_match(&eof_state);
                }
            }
            // Re-push the current frame (advanced by one transition), then
            // the child frame, giving depth-first lexicographic order.
            self.stack.push(StreamState { trans: state.trans + 1, ..state });
            self.stack.push(StreamState {
                node: next_node,
                trans: 0,
                out,
                aut_state: next_state,
            });
            if self.end_at.exceeded_by(&self.inp) {
                // We are done, forever.
                self.stack.clear();
                return None;
            }
            if next_node.is_final() && is_match {
                return Some((
                    &self.inp,
                    out.cat(next_node.final_output()),
                    t,
                ));
            }
        }
        None
    }
}
+
+impl<'a, 'f, A: 'a + Automaton> Streamer<'a> for StreamWithState<'f, A>
+where
+ A::State: Clone,
+{
+ type Item = (&'a [u8], Output, A::State);
+
+ fn next(&'a mut self) -> Option<(&'a [u8], Output, A::State)> {
+ self.next_with(|state| state.clone())
+ }
+}
+
/// An output is a value that is associated with a key in a finite state
/// transducer.
///
/// Outputs must satisfy an algebra: there is an additive identity (zero)
/// and three binary operations — `prefix`, concatenation (`cat`) and
/// subtraction (`sub`). `prefix` and concatenation are commutative while
/// subtraction is not; subtraction is only defined when the first operand
/// is greater than or equal to the second.
///
/// Currently, output values must be `u64`. However, in theory, an output
/// value can be anything that satisfies the above algebra. Future versions
/// of this crate may make outputs generic on this algebra.
#[derive(Copy, Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
pub struct Output(u64);

impl Output {
    /// Create a new output from a `u64`.
    #[inline]
    pub fn new(v: u64) -> Output {
        Output(v)
    }

    /// Create a zero output (the additive identity).
    #[inline]
    pub fn zero() -> Output {
        Output(0)
    }

    /// Retrieve the value inside this output.
    #[inline]
    pub fn value(self) -> u64 {
        self.0
    }

    /// Returns true if this is a zero output.
    #[inline]
    pub fn is_zero(self) -> bool {
        self.0 == 0
    }

    /// Returns the prefix (the minimum) of this output and `other`.
    #[inline]
    pub fn prefix(self, other: Output) -> Output {
        if self.0 <= other.0 {
            self
        } else {
            other
        }
    }

    /// Returns the concatenation (the sum) of this output and `other`.
    #[inline]
    pub fn cat(self, other: Output) -> Output {
        Output(self.0 + other.0)
    }

    /// Returns the subtraction of `other` from this output.
    ///
    /// This function panics if `self < other`.
    #[inline]
    pub fn sub(self, other: Output) -> Output {
        Output(
            self.0
                .checked_sub(other.0)
                .expect("BUG: underflow subtraction not allowed"),
        )
    }
}
+
/// A transition from one node to another.
#[derive(Copy, Clone, Hash, Eq, PartialEq)]
pub struct Transition {
    /// The byte input associated with this transition.
    pub inp: u8,
    /// The output associated with this transition.
    pub out: Output,
    /// The address of the node that this transition points to.
    pub addr: CompiledAddr,
}
+
impl Default for Transition {
    /// A transition on input byte `0` with zero output, pointing at the
    /// sentinel "no node" address.
    #[inline]
    fn default() -> Transition {
        Transition { inp: 0, out: Output::zero(), addr: NONE_ADDRESS }
    }
}
+
+impl fmt::Debug for Transition {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ if self.out.is_zero() {
+ write!(f, "{} -> {}", self.inp as char, self.addr)
+ } else {
+ write!(
+ f,
+ "({}, {}) -> {}",
+ self.inp as char,
+ self.out.value(),
+ self.addr
+ )
+ }
+ }
+}
+
// Converts a node address from `u64` to `usize`.
// On 64-bit targets this cast is lossless.
#[inline]
#[cfg(target_pointer_width = "64")]
fn u64_to_usize(n: u64) -> usize {
    n as usize
}

// Converts a node address from `u64` to `usize`.
// On targets with pointers narrower than 64 bits the address may not fit;
// panic with an explanatory message rather than silently truncating.
#[inline]
#[cfg(not(target_pointer_width = "64"))]
fn u64_to_usize(n: u64) -> usize {
    if n > std::usize::MAX as u64 {
        panic!(
            "\
Cannot convert node address {} to a pointer sized variable. If this FST
is very large and was generated on a system with a larger pointer size
than this system, then it is not possible to read this FST on this
system.",
            n
        );
    }
    n as usize
}
diff --git a/vendor/fst/src/raw/node.rs b/vendor/fst/src/raw/node.rs
new file mode 100644
index 000000000..820b29eac
--- /dev/null
+++ b/vendor/fst/src/raw/node.rs
@@ -0,0 +1,1012 @@
+use std::cmp;
+use std::fmt;
+use std::io;
+use std::ops::Range;
+
+use crate::bytes;
+use crate::raw::build::BuilderNode;
+use crate::raw::common_inputs::{COMMON_INPUTS, COMMON_INPUTS_INV};
+use crate::raw::{
+ u64_to_usize, CompiledAddr, Output, Transition, EMPTY_ADDRESS,
+};
+
/// The threshold (in number of transitions) at which an index is created for
/// a node's transitions. This speeds up lookup time at the expense of FST
/// size. (The index is only consulted when reading FSTs of version >= 2;
/// see `StateAnyTrans::trans_index_size` and `find_input`.)
const TRANS_INDEX_THRESHOLD: usize = 32;
+
/// Node represents a single state in a finite state transducer.
///
/// Nodes are very cheap to construct. Notably, they satisfy the `Copy` trait.
#[derive(Clone, Copy)]
pub struct Node<'f> {
    // The FST's bytes, truncated just past this node's state byte.
    data: &'f [u8],
    // The FST format version this node was read with.
    version: u64,
    // The decoded state byte, which determines this node's layout.
    state: State,
    // The address of this node's state byte; the node's encoding is read
    // backwards from here.
    start: CompiledAddr,
    // The smallest index of `data` belonging to this node's encoding
    // (see `as_slice`).
    end: usize,
    // Whether this node is a final ("match") state.
    is_final: bool,
    // The number of transitions out of this node.
    ntrans: usize,
    // Packed byte widths for transition addresses and output values.
    sizes: PackSizes,
    // The output emitted when a key ends at this (final) node.
    final_output: Output,
}
+
+impl<'f> fmt::Debug for Node<'f> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ writeln!(f, "NODE@{}", self.start)?;
+ writeln!(f, " end_addr: {}", self.end)?;
+ writeln!(f, " size: {} bytes", self.as_slice().len())?;
+ writeln!(f, " state: {:?}", self.state)?;
+ writeln!(f, " is_final: {}", self.is_final())?;
+ writeln!(f, " final_output: {:?}", self.final_output())?;
+ writeln!(f, " # transitions: {}", self.len())?;
+ writeln!(f, " transitions:")?;
+ for t in self.transitions() {
+ writeln!(f, " {:?}", t)?;
+ }
+ Ok(())
+ }
+}
+
impl<'f> Node<'f> {
    /// Creates a new node at the address given.
    ///
    /// `data` should be a slice to an entire FST.
    pub(crate) fn new(
        version: u64,
        addr: CompiledAddr,
        data: &[u8],
    ) -> Node<'_> {
        let state = State::new(data, addr);
        match state {
            State::EmptyFinal => Node {
                data: &[],
                version,
                state: State::EmptyFinal,
                start: EMPTY_ADDRESS,
                end: EMPTY_ADDRESS,
                is_final: true,
                ntrans: 0,
                sizes: PackSizes::new(),
                final_output: Output::zero(),
            },
            State::OneTransNext(s) => {
                // Nodes end at their state byte, so truncate just past it.
                let data = &data[..addr + 1];
                Node {
                    data,
                    version,
                    state,
                    start: addr,
                    end: s.end_addr(data),
                    is_final: false,
                    sizes: PackSizes::new(),
                    ntrans: 1,
                    final_output: Output::zero(),
                }
            }
            State::OneTrans(s) => {
                let data = &data[..addr + 1];
                let sizes = s.sizes(data);
                Node {
                    data,
                    version,
                    state,
                    start: addr,
                    end: s.end_addr(data, sizes),
                    is_final: false,
                    ntrans: 1,
                    sizes,
                    final_output: Output::zero(),
                }
            }
            State::AnyTrans(s) => {
                let data = &data[..addr + 1];
                let sizes = s.sizes(data);
                let ntrans = s.ntrans(data);
                Node {
                    data,
                    version,
                    state,
                    start: addr,
                    end: s.end_addr(version, data, sizes, ntrans),
                    is_final: s.is_final_state(),
                    ntrans,
                    sizes,
                    final_output: s.final_output(version, data, sizes, ntrans),
                }
            }
        }
    }

    /// Returns an iterator over all transitions in this node in lexicographic
    /// order.
    #[inline]
    pub fn transitions<'n>(&'n self) -> Transitions<'f, 'n> {
        Transitions { node: self, range: 0..self.len() }
    }

    /// Returns the transition at index `i`.
    #[inline(always)]
    pub fn transition(&self, i: usize) -> Transition {
        // The `inline(always)` annotation on this function appears to
        // dramatically speed up FST traversal. In particular, measuring the
        // time it takes to run `fst range something-big.fst` shows almost a 2x
        // improvement with this single `inline(always)` annotation.

        match self.state {
            State::OneTransNext(s) => {
                assert!(i == 0);
                Transition {
                    inp: s.input(self),
                    out: Output::zero(),
                    addr: s.trans_addr(self),
                }
            }
            State::OneTrans(s) => {
                assert!(i == 0);
                Transition {
                    inp: s.input(self),
                    out: s.output(self),
                    addr: s.trans_addr(self),
                }
            }
            State::AnyTrans(s) => Transition {
                inp: s.input(self, i),
                out: s.output(self, i),
                addr: s.trans_addr(self, i),
            },
            State::EmptyFinal => panic!("out of bounds"),
        }
    }

    /// Returns the transition address of the `i`th transition.
    #[inline]
    pub fn transition_addr(&self, i: usize) -> CompiledAddr {
        match self.state {
            State::OneTransNext(s) => {
                assert!(i == 0);
                s.trans_addr(self)
            }
            State::OneTrans(s) => {
                assert!(i == 0);
                s.trans_addr(self)
            }
            State::AnyTrans(s) => s.trans_addr(self, i),
            State::EmptyFinal => panic!("out of bounds"),
        }
    }

    /// Finds the `i`th transition corresponding to the given input byte.
    ///
    /// If no transition for this byte exists, then `None` is returned.
    #[inline]
    pub fn find_input(&self, b: u8) -> Option<usize> {
        match self.state {
            State::OneTransNext(s) if s.input(self) == b => Some(0),
            State::OneTransNext(_) => None,
            State::OneTrans(s) if s.input(self) == b => Some(0),
            State::OneTrans(_) => None,
            State::AnyTrans(s) => s.find_input(self, b),
            State::EmptyFinal => None,
        }
    }

    /// If this node is final and has a terminal output value, then it is
    /// returned. Otherwise, a zero output is returned.
    #[inline]
    pub fn final_output(&self) -> Output {
        self.final_output
    }

    /// Returns true if and only if this node corresponds to a final or "match"
    /// state in the finite state transducer.
    #[inline]
    pub fn is_final(&self) -> bool {
        self.is_final
    }

    /// Returns the number of transitions in this node.
    ///
    /// The maximum number of transitions is 256.
    #[inline]
    pub fn len(&self) -> usize {
        self.ntrans
    }

    /// Returns true if and only if this node has zero transitions.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.ntrans == 0
    }

    /// Return the address of this node.
    #[inline]
    pub fn addr(&self) -> CompiledAddr {
        self.start
    }

    #[doc(hidden)]
    #[inline]
    pub fn as_slice(&self) -> &[u8] {
        &self.data[self.end..]
    }

    #[doc(hidden)]
    #[inline]
    pub fn state(&self) -> &'static str {
        match self.state {
            State::OneTransNext(_) => "OTN",
            State::OneTrans(_) => "OT",
            State::AnyTrans(_) => "AT",
            State::EmptyFinal => "EF",
        }
    }

    /// Serializes `node` to `wtr`, choosing the most compact of the three
    /// encodings that can represent it.
    fn compile<W: io::Write>(
        wtr: W,
        last_addr: CompiledAddr,
        addr: CompiledAddr,
        node: &BuilderNode,
    ) -> io::Result<()> {
        assert!(node.trans.len() <= 256);
        if node.trans.is_empty()
            && node.is_final
            && node.final_output.is_zero()
        {
            // The canonical empty-final node is implicit (EMPTY_ADDRESS);
            // nothing needs to be written.
            return Ok(());
        } else if node.trans.len() != 1 || node.is_final {
            StateAnyTrans::compile(wtr, addr, node)
        } else {
            // Exactly one non-final transition: use the "next" encoding when
            // it points at the node compiled immediately before this one
            // with no output, otherwise the explicit one-transition form.
            if node.trans[0].addr == last_addr && node.trans[0].out.is_zero() {
                StateOneTransNext::compile(wtr, addr, node.trans[0].inp)
            } else {
                StateOneTrans::compile(wtr, addr, node.trans[0])
            }
        }
    }
}
+
impl BuilderNode {
    /// Serializes this in-memory node to `wtr`.
    ///
    /// `addr` is the address this node will occupy, and `last_addr` is the
    /// address of the most recently compiled node, which enables the
    /// compact "next" encoding (see `Node::compile`).
    pub fn compile_to<W: io::Write>(
        &self,
        wtr: W,
        last_addr: CompiledAddr,
        addr: CompiledAddr,
    ) -> io::Result<()> {
        Node::compile(wtr, last_addr, addr, self)
    }
}
+
/// The decoded interpretation of a node's state byte, selected by its two
/// high bits (see `State::new`).
#[derive(Clone, Copy, Debug)]
enum State {
    // One transition, zero output, target is the immediately preceding node.
    OneTransNext(StateOneTransNext),
    // One transition with explicitly encoded output and target address.
    OneTrans(StateOneTrans),
    // Any number of transitions (including zero); may be a final state.
    AnyTrans(StateAnyTrans),
    // The implicit empty final state at EMPTY_ADDRESS.
    EmptyFinal,
}
+
// State byte layout: one trans flag (1), next flag (1), common input (6 bits).
#[derive(Clone, Copy, Debug)]
struct StateOneTransNext(u8);
// State byte layout: one trans flag (1), next flag (0), common input (6 bits).
#[derive(Clone, Copy, Debug)]
struct StateOneTrans(u8);
// State byte layout: one trans flag (0), final flag, # transitions (6 bits).
#[derive(Clone, Copy, Debug)]
struct StateAnyTrans(u8);
+
+impl State {
+ fn new(data: &[u8], addr: CompiledAddr) -> State {
+ if addr == EMPTY_ADDRESS {
+ return State::EmptyFinal;
+ }
+ let v = data[addr];
+ match (v & 0b11_000000) >> 6 {
+ 0b11 => State::OneTransNext(StateOneTransNext(v)),
+ 0b10 => State::OneTrans(StateOneTrans(v)),
+ _ => State::AnyTrans(StateAnyTrans(v)),
+ }
+ }
+}
+
impl StateOneTransNext {
    /// Serializes a one-transition, zero-output node whose target is the
    /// immediately preceding node (so no address needs to be written).
    fn compile<W: io::Write>(
        mut wtr: W,
        _: CompiledAddr,
        input: u8,
    ) -> io::Result<()> {
        let mut state = StateOneTransNext::new();
        state.set_common_input(input);
        // Only spell the input byte out when it did not fit into the state
        // byte's common-input index.
        if state.common_input().is_none() {
            wtr.write_all(&[input])?;
        }
        wtr.write_all(&[state.0])?;
        Ok(())
    }

    #[inline]
    fn new() -> StateOneTransNext {
        StateOneTransNext(0b11_000000)
    }

    /// Stores the common-input index for `input` in the low 6 bits (0 if
    /// the byte is not a common input).
    #[inline]
    fn set_common_input(&mut self, input: u8) {
        self.0 = (self.0 & 0b11_000000) | common_idx(input, 0b111111);
    }

    /// Returns the input byte if it was encoded in the state byte itself.
    #[inline]
    fn common_input(&self) -> Option<u8> {
        common_input(self.0 & 0b00_111111)
    }

    /// Number of extra bytes used to store the input byte (0 or 1).
    #[inline]
    fn input_len(&self) -> usize {
        if self.common_input().is_none() {
            1
        } else {
            0
        }
    }

    /// The smallest index of `data` belonging to this node's encoding.
    #[inline]
    fn end_addr(&self, data: &[u8]) -> usize {
        data.len() - 1 - self.input_len()
    }

    /// The input byte of this node's single transition.
    #[inline]
    fn input(&self, node: &Node<'_>) -> u8 {
        if let Some(inp) = self.common_input() {
            inp
        } else {
            node.data[node.start - 1]
        }
    }

    /// The target address: always the node serialized immediately before
    /// this one.
    #[inline]
    fn trans_addr(&self, node: &Node<'_>) -> CompiledAddr {
        node.end as CompiledAddr - 1
    }
}
+
impl StateOneTrans {
    /// Serializes a one-transition node whose output and/or target address
    /// must be written explicitly (unlike `StateOneTransNext`).
    fn compile<W: io::Write>(
        mut wtr: W,
        addr: CompiledAddr,
        trans: Transition,
    ) -> io::Result<()> {
        let out = trans.out.value();
        // A zero output is omitted entirely (pack size 0).
        let output_pack_size =
            if out == 0 { 0 } else { bytes::pack_uint(&mut wtr, out)? };
        let trans_pack_size = pack_delta(&mut wtr, addr, trans.addr)?;

        let mut pack_sizes = PackSizes::new();
        pack_sizes.set_output_pack_size(output_pack_size);
        pack_sizes.set_transition_pack_size(trans_pack_size);
        wtr.write_all(&[pack_sizes.encode()])?;

        let mut state = StateOneTrans::new();
        state.set_common_input(trans.inp);
        // Only spell the input byte out when it did not fit into the state
        // byte's common-input index.
        if state.common_input().is_none() {
            wtr.write_all(&[trans.inp])?;
        }
        wtr.write_all(&[state.0])?;
        Ok(())
    }

    #[inline]
    fn new() -> StateOneTrans {
        StateOneTrans(0b10_000000)
    }

    /// Stores the common-input index for `input` in the low 6 bits (0 if
    /// the byte is not a common input).
    #[inline]
    fn set_common_input(&mut self, input: u8) {
        self.0 = (self.0 & 0b10_000000) | common_idx(input, 0b111111);
    }

    /// Returns the input byte if it was encoded in the state byte itself.
    #[inline]
    fn common_input(&self) -> Option<u8> {
        common_input(self.0 & 0b00_111111)
    }

    /// Number of extra bytes used to store the input byte (0 or 1).
    #[inline]
    fn input_len(&self) -> usize {
        if self.common_input().is_none() {
            1
        } else {
            0
        }
    }

    /// Decodes the pack-sizes byte, which precedes the (optional) input
    /// byte in the node's backwards layout.
    #[inline]
    fn sizes(&self, data: &[u8]) -> PackSizes {
        let i = data.len() - 1 - self.input_len() - 1;
        PackSizes::decode(data[i])
    }

    /// The smallest index of `data` belonging to this node's encoding.
    #[inline]
    fn end_addr(&self, data: &[u8], sizes: PackSizes) -> usize {
        data.len() - 1
            - self.input_len()
            - 1 // pack size
            - sizes.transition_pack_size()
            - sizes.output_pack_size()
    }

    /// The input byte of this node's single transition.
    #[inline]
    fn input(&self, node: &Node<'_>) -> u8 {
        if let Some(inp) = self.common_input() {
            inp
        } else {
            node.data[node.start - 1]
        }
    }

    /// The output of this node's single transition (zero when omitted).
    #[inline]
    fn output(&self, node: &Node<'_>) -> Output {
        let osize = node.sizes.output_pack_size();
        if osize == 0 {
            return Output::zero();
        }
        let tsize = node.sizes.transition_pack_size();
        let i = node.start
            - self.input_len()
            - 1 // pack size
            - tsize - osize;
        Output::new(bytes::unpack_uint(&node.data[i..], osize as u8))
    }

    /// The delta-decoded target address of this node's single transition.
    #[inline]
    fn trans_addr(&self, node: &Node<'_>) -> CompiledAddr {
        let tsize = node.sizes.transition_pack_size();
        let i = node.start
            - self.input_len()
            - 1 // pack size
            - tsize;
        unpack_delta(&node.data[i..], tsize, node.end)
    }
}
+
impl StateAnyTrans {
    /// Serializes a general node: any number of transitions, optionally
    /// final, with per-node uniform pack sizes for outputs and addresses.
    fn compile<W: io::Write>(
        mut wtr: W,
        addr: CompiledAddr,
        node: &BuilderNode,
    ) -> io::Result<()> {
        assert!(node.trans.len() <= 256);

        // Compute the widest delta/output so all entries share one width.
        let mut tsize = 0;
        let mut osize = bytes::pack_size(node.final_output.value());
        let mut any_outs = !node.final_output.is_zero();
        for t in &node.trans {
            tsize = cmp::max(tsize, pack_delta_size(addr, t.addr));
            osize = cmp::max(osize, bytes::pack_size(t.out.value()));
            any_outs = any_outs || !t.out.is_zero();
        }

        let mut pack_sizes = PackSizes::new();
        if any_outs {
            pack_sizes.set_output_pack_size(osize);
        } else {
            // All outputs are zero, so omit the output section entirely.
            pack_sizes.set_output_pack_size(0);
        }
        pack_sizes.set_transition_pack_size(tsize);

        let mut state = StateAnyTrans::new();
        state.set_final_state(node.is_final);
        state.set_state_ntrans(node.trans.len() as u8);

        // Sections are written in reverse since nodes are read backwards
        // from the trailing state byte.
        if any_outs {
            if node.is_final {
                bytes::pack_uint_in(
                    &mut wtr,
                    node.final_output.value(),
                    osize,
                )?;
            }
            for t in node.trans.iter().rev() {
                bytes::pack_uint_in(&mut wtr, t.out.value(), osize)?;
            }
        }
        for t in node.trans.iter().rev() {
            pack_delta_in(&mut wtr, addr, t.addr, tsize)?;
        }
        for t in node.trans.iter().rev() {
            wtr.write_all(&[t.inp])?;
        }
        if node.trans.len() > TRANS_INDEX_THRESHOLD {
            // A value of 255 indicates that no transition exists for the byte
            // at that index. (Except when there are 256 transitions.) Namely,
            // any value greater than or equal to the number of transitions in
            // this node indicates an absent transition.
            let mut index = [255u8; 256];
            for (i, t) in node.trans.iter().enumerate() {
                index[t.inp as usize] = i as u8;
            }
            wtr.write_all(&index)?;
        }

        wtr.write_all(&[pack_sizes.encode()])?;
        if state.state_ntrans().is_none() {
            if node.trans.len() == 256 {
                // 256 can't be represented in a u8, so we abuse the fact that
                // the # of transitions can never be 1 here, since 1 is always
                // encoded in the state byte.
                wtr.write_all(&[1])?;
            } else {
                wtr.write_all(&[node.trans.len() as u8])?;
            }
        }
        wtr.write_all(&[state.0])?;
        Ok(())
    }

    #[inline]
    fn new() -> StateAnyTrans {
        StateAnyTrans(0b00_000000)
    }

    /// Sets the final-state flag (bit 6) when `yes` is true.
    #[inline]
    fn set_final_state(&mut self, yes: bool) {
        if yes {
            self.0 |= 0b01_000000;
        }
    }

    /// Returns true if the final-state flag (bit 6) is set.
    #[inline]
    fn is_final_state(&self) -> bool {
        self.0 & 0b01_000000 == 0b01_000000
    }

    /// Stores the transition count in the low 6 bits when it fits; larger
    /// counts are written as a separate byte (see `compile`/`ntrans`).
    #[inline]
    fn set_state_ntrans(&mut self, n: u8) {
        if n <= 0b00_111111 {
            self.0 = (self.0 & 0b11_000000) | n;
        }
    }

    /// The transition count embedded in the state byte, if non-zero.
    #[inline]
    fn state_ntrans(&self) -> Option<u8> {
        let n = self.0 & 0b00_111111;
        if n == 0 {
            None
        } else {
            Some(n)
        }
    }

    /// Decodes the pack-sizes byte from the node's backwards layout.
    #[inline]
    fn sizes(&self, data: &[u8]) -> PackSizes {
        let i = data.len() - 1 - self.ntrans_len() - 1;
        PackSizes::decode(data[i])
    }

    /// Total bytes used by the input bytes, transition addresses and
    /// (version >= 2) the optional 256-byte transition index.
    #[inline]
    fn total_trans_size(
        &self,
        version: u64,
        sizes: PackSizes,
        ntrans: usize,
    ) -> usize {
        let index_size = self.trans_index_size(version, ntrans);
        ntrans + (ntrans * sizes.transition_pack_size()) + index_size
    }

    /// Size of the byte-to-transition index: 256 bytes when present
    /// (version >= 2 and enough transitions), 0 otherwise.
    #[inline]
    fn trans_index_size(&self, version: u64, ntrans: usize) -> usize {
        if version >= 2 && ntrans > TRANS_INDEX_THRESHOLD {
            256
        } else {
            0
        }
    }

    /// Number of extra bytes used to store the transition count (0 or 1).
    #[inline]
    fn ntrans_len(&self) -> usize {
        if self.state_ntrans().is_none() {
            1
        } else {
            0
        }
    }

    /// The number of transitions, read from the state byte or the
    /// out-of-band count byte.
    #[inline]
    fn ntrans(&self, data: &[u8]) -> usize {
        if let Some(n) = self.state_ntrans() {
            n as usize
        } else {
            let n = data[data.len() - 2] as usize;
            if n == 1 {
                // "1" is never a normal legal value here, because if there
                // is only 1 transition, then it is encoded in the state byte.
                256
            } else {
                n
            }
        }
    }

    /// The output emitted when a key ends at this node, or zero when the
    /// node is not final or outputs were omitted.
    #[inline]
    fn final_output(
        &self,
        version: u64,
        data: &[u8],
        sizes: PackSizes,
        ntrans: usize,
    ) -> Output {
        let osize = sizes.output_pack_size();
        if osize == 0 || !self.is_final_state() {
            return Output::zero();
        }
        let at = data.len() - 1
            - self.ntrans_len()
            - 1 // pack size
            - self.total_trans_size(version, sizes, ntrans)
            - (ntrans * osize) // output values
            - osize; // the desired output value
        Output::new(bytes::unpack_uint(&data[at..], osize as u8))
    }

    /// The smallest index of `data` belonging to this node's encoding.
    #[inline]
    fn end_addr(
        &self,
        version: u64,
        data: &[u8],
        sizes: PackSizes,
        ntrans: usize,
    ) -> usize {
        let osize = sizes.output_pack_size();
        // The final-output slot only exists for final states.
        let final_osize = if !self.is_final_state() { 0 } else { osize };
        data.len() - 1
            - self.ntrans_len()
            - 1 // pack size
            - self.total_trans_size(version, sizes, ntrans)
            - (ntrans * osize) // output values
            - final_osize // final output
    }

    /// The delta-decoded target address of the `i`th transition.
    #[inline]
    fn trans_addr(&self, node: &Node<'_>, i: usize) -> CompiledAddr {
        assert!(i < node.ntrans);
        let tsize = node.sizes.transition_pack_size();
        let at = node.start
            - self.ntrans_len()
            - 1 // pack size
            - self.trans_index_size(node.version, node.ntrans)
            - node.ntrans // inputs
            - (i * tsize) // the previous transition addresses
            - tsize; // the desired transition address
        unpack_delta(&node.data[at..], tsize, node.end)
    }

    /// The input byte of the `i`th transition.
    #[inline]
    fn input(&self, node: &Node<'_>, i: usize) -> u8 {
        let at = node.start
            - self.ntrans_len()
            - 1 // pack size
            - self.trans_index_size(node.version, node.ntrans)
            - i
            - 1; // the input byte
        node.data[at]
    }

    /// Finds the transition index for input byte `b`, using the 256-byte
    /// index when present (version >= 2) and a linear scan otherwise.
    #[inline]
    fn find_input(&self, node: &Node<'_>, b: u8) -> Option<usize> {
        if node.version >= 2 && node.ntrans > TRANS_INDEX_THRESHOLD {
            let start = node.start
                - self.ntrans_len()
                - 1 // pack size
                - self.trans_index_size(node.version, node.ntrans);
            let i = node.data[start + b as usize] as usize;
            if i >= node.ntrans {
                None
            } else {
                Some(i)
            }
        } else {
            let start = node.start
                - self.ntrans_len()
                - 1 // pack size
                - node.ntrans; // inputs
            let end = start + node.ntrans;
            let inputs = &node.data[start..end];
            // Inputs are stored in reverse, so flip the found position.
            inputs.iter().position(|&b2| b == b2).map(|i| node.ntrans - i - 1)
        }
    }

    /// The output of the `i`th transition (zero when outputs were omitted).
    #[inline]
    fn output(&self, node: &Node<'_>, i: usize) -> Output {
        let osize = node.sizes.output_pack_size();
        if osize == 0 {
            return Output::zero();
        }
        let at = node.start
            - self.ntrans_len()
            - 1 // pack size
            - self.total_trans_size(node.version, node.sizes, node.ntrans)
            - (i * osize) // the previous outputs
            - osize; // the desired output value
        Output::new(bytes::unpack_uint(&node.data[at..], osize as u8))
    }
}
+
// A pair of byte widths packed into one byte:
// high 4 bits is the transition address packed size,
// low 4 bits is the output value packed size.
//
// `0` is a legal value which means there are no transitions/outputs.
#[derive(Clone, Copy, Debug)]
struct PackSizes(u8);

impl PackSizes {
    /// Both sizes zero.
    #[inline]
    fn new() -> PackSizes {
        PackSizes(0)
    }

    /// Reconstructs the pair from a serialized byte.
    #[inline]
    fn decode(v: u8) -> PackSizes {
        PackSizes(v)
    }

    /// The serialized byte form.
    #[inline]
    fn encode(&self) -> u8 {
        self.0
    }

    /// Sets the transition-address width (must fit a u64, i.e. <= 8).
    #[inline]
    fn set_transition_pack_size(&mut self, size: u8) {
        assert!(size <= 8);
        self.0 = (size << 4) | (self.0 & 0b0000_1111);
    }

    /// The transition-address width in bytes.
    #[inline]
    fn transition_pack_size(&self) -> usize {
        (self.0 >> 4) as usize
    }

    /// Sets the output-value width (must fit a u64, i.e. <= 8).
    #[inline]
    fn set_output_pack_size(&mut self, size: u8) {
        assert!(size <= 8);
        self.0 = (self.0 & 0b1111_0000) | size;
    }

    /// The output-value width in bytes.
    #[inline]
    fn output_pack_size(&self) -> usize {
        (self.0 & 0b0000_1111) as usize
    }
}
+
/// An iterator over all transitions in a node.
///
/// `'f` is the lifetime of the underlying fst and `'n` is the lifetime of
/// the underlying `Node`.
pub struct Transitions<'f, 'n> {
    // The node whose transitions are being enumerated.
    node: &'n Node<'f>,
    // The half-open range of transition indices not yet yielded.
    range: Range<usize>,
}

impl<'f, 'n> Iterator for Transitions<'f, 'n> {
    type Item = Transition;

    #[inline]
    fn next(&mut self) -> Option<Transition> {
        // Decode each remaining index lazily via `Node::transition`.
        self.range.next().map(|i| self.node.transition(i))
    }
}
+
+/// common_idx translate a byte to an index in the COMMON_INPUTS_INV array.
+///
+/// I wonder if it would be prudent to store this mapping in the FST itself.
+/// The advantage of doing so would mean that common inputs would reflect the
+/// specific data in the FST. The problem of course is that this table has to
+/// be computed up front, which is pretty much at odds with the streaming
+/// nature of the builder.
+///
+/// Nevertheless, the *caller* may have a priori knowledge that could be
+/// supplied to the builder manually, which could then be embedded in the FST.
+#[inline]
+fn common_idx(input: u8, max: u8) -> u8 {
+ let val = ((COMMON_INPUTS[input as usize] as u32 + 1) % 256) as u8;
+ if val > max {
+ 0
+ } else {
+ val
+ }
+}
+
+/// common_input translates a common input index stored in a serialized FST
+/// to the corresponding byte.
+#[inline]
+fn common_input(idx: u8) -> Option<u8> {
+ if idx == 0 {
+ None
+ } else {
+ Some(COMMON_INPUTS_INV[(idx - 1) as usize])
+ }
+}
+
// Packs the delta between `node_addr` and `trans_addr` into `wtr` using the
// minimal number of bytes, returning how many bytes were written.
#[inline]
fn pack_delta<W: io::Write>(
    wtr: W,
    node_addr: CompiledAddr,
    trans_addr: CompiledAddr,
) -> io::Result<u8> {
    let nbytes = pack_delta_size(node_addr, trans_addr);
    pack_delta_in(wtr, node_addr, trans_addr, nbytes)?;
    Ok(nbytes)
}
+
// Packs the delta between `node_addr` and `trans_addr` into exactly
// `nbytes` bytes.
#[inline]
fn pack_delta_in<W: io::Write>(
    wtr: W,
    node_addr: CompiledAddr,
    trans_addr: CompiledAddr,
    nbytes: u8,
) -> io::Result<()> {
    // EMPTY_ADDRESS is preserved as a sentinel rather than delta-encoded;
    // `unpack_delta` special-cases it symmetrically.
    let delta_addr = if trans_addr == EMPTY_ADDRESS {
        EMPTY_ADDRESS
    } else {
        node_addr - trans_addr
    };
    bytes::pack_uint_in(wtr, delta_addr as u64, nbytes)
}
+
// Returns the minimal number of bytes needed to pack the delta between
// `node_addr` and `trans_addr` (with EMPTY_ADDRESS passed through as-is,
// matching `pack_delta_in`).
#[inline]
fn pack_delta_size(node_addr: CompiledAddr, trans_addr: CompiledAddr) -> u8 {
    let delta_addr = if trans_addr == EMPTY_ADDRESS {
        EMPTY_ADDRESS
    } else {
        node_addr - trans_addr
    };
    bytes::pack_size(delta_addr as u64)
}
+
// Decodes a delta-encoded transition address relative to `node_addr`.
// The EMPTY_ADDRESS sentinel is passed through unchanged, mirroring
// `pack_delta_in`.
#[inline]
fn unpack_delta(
    slice: &[u8],
    trans_pack_size: usize,
    node_addr: usize,
) -> CompiledAddr {
    let delta = bytes::unpack_uint(slice, trans_pack_size as u8);
    let delta_addr = u64_to_usize(delta);
    if delta_addr == EMPTY_ADDRESS {
        EMPTY_ADDRESS
    } else {
        node_addr - delta_addr
    }
}
+
#[cfg(test)]
mod tests {
    use quickcheck::{quickcheck, TestResult};

    use crate::raw::build::BuilderNode;
    use crate::raw::node::Node;
    use crate::raw::{Builder, CompiledAddr, Output, Transition, VERSION};
    use crate::stream::Streamer;

    // An address that can never equal a freshly compiled node's address,
    // so the compact "next" encoding is never chosen in these tests.
    const NEVER_LAST: CompiledAddr = std::u64::MAX as CompiledAddr;

    // Property: streaming an FST back yields exactly the (sorted, deduped)
    // keys that were inserted.
    #[test]
    fn prop_emits_inputs() {
        fn p(mut bs: Vec<Vec<u8>>) -> TestResult {
            bs.sort();
            bs.dedup();

            let mut bfst = Builder::memory();
            for word in &bs {
                bfst.add(word).unwrap();
            }
            let fst = bfst.into_fst();
            let mut rdr = fst.stream();
            let mut words = vec![];
            while let Some(w) = rdr.next() {
                words.push(w.0.to_owned());
            }
            TestResult::from_bool(bs == words)
        }
        quickcheck(p as fn(Vec<Vec<u8>>) -> TestResult)
    }

    // Asserts that a decoded node matches the builder node it came from;
    // panics (via assert) on any mismatch, otherwise returns true.
    fn nodes_equal(compiled: &Node, uncompiled: &BuilderNode) -> bool {
        println!("{:?}", compiled);
        assert_eq!(compiled.is_final(), uncompiled.is_final);
        assert_eq!(compiled.len(), uncompiled.trans.len());
        assert_eq!(compiled.final_output(), uncompiled.final_output);
        for (ct, ut) in
            compiled.transitions().zip(uncompiled.trans.iter().cloned())
        {
            assert_eq!(ct.inp, ut.inp);
            assert_eq!(ct.out, ut.out);
            assert_eq!(ct.addr, ut.addr);
        }
        true
    }

    // Compiles `node` at address 24 into a buffer padded with 24 zero
    // bytes (presumably so small transition addresses resolve in-bounds —
    // TODO confirm) and returns the node's address plus the raw bytes.
    fn compile(node: &BuilderNode) -> (CompiledAddr, Vec<u8>) {
        let mut buf = vec![0; 24];
        node.compile_to(&mut buf, NEVER_LAST, 24).unwrap();
        (buf.len() as CompiledAddr - 1, buf)
    }

    // Compile-then-decode round trip; asserts equality along the way.
    fn roundtrip(bnode: &BuilderNode) -> bool {
        let (addr, bytes) = compile(bnode);
        let node = Node::new(VERSION, addr, &bytes);
        nodes_equal(&node, &bnode)
    }

    // Shorthand for a zero-output transition.
    fn trans(addr: CompiledAddr, inp: u8) -> Transition {
        Transition { inp, out: Output::zero(), addr }
    }

    #[test]
    fn bin_no_trans() {
        let bnode = BuilderNode {
            is_final: false,
            final_output: Output::zero(),
            trans: vec![],
        };
        let (addr, buf) = compile(&bnode);
        let node = Node::new(VERSION, addr, &buf);
        assert_eq!(node.as_slice().len(), 3);
        roundtrip(&bnode);
    }

    #[test]
    fn bin_one_trans_common() {
        let bnode = BuilderNode {
            is_final: false,
            final_output: Output::zero(),
            trans: vec![trans(20, b'a')],
        };
        let (addr, buf) = compile(&bnode);
        let node = Node::new(VERSION, addr, &buf);
        assert_eq!(node.as_slice().len(), 3);
        roundtrip(&bnode);
    }

    #[test]
    fn bin_one_trans_not_common() {
        let bnode = BuilderNode {
            is_final: false,
            final_output: Output::zero(),
            trans: vec![trans(2, b'\xff')],
        };
        let (addr, buf) = compile(&bnode);
        let node = Node::new(VERSION, addr, &buf);
        // One byte larger than the common-input case: the input byte must
        // be spelled out.
        assert_eq!(node.as_slice().len(), 4);
        roundtrip(&bnode);
    }

    #[test]
    fn bin_many_trans() {
        let bnode = BuilderNode {
            is_final: false,
            final_output: Output::zero(),
            trans: vec![
                trans(2, b'a'),
                trans(3, b'b'),
                trans(4, b'c'),
                trans(5, b'd'),
                trans(6, b'e'),
                trans(7, b'f'),
            ],
        };
        let (addr, buf) = compile(&bnode);
        let node = Node::new(VERSION, addr, &buf);
        assert_eq!(node.as_slice().len(), 14);
        roundtrip(&bnode);
    }

    // Exercises the special 256-transition encoding (count byte of 1).
    #[test]
    fn node_max_trans() {
        let bnode = BuilderNode {
            is_final: false,
            final_output: Output::zero(),
            trans: (0..256).map(|i| trans(0, i as u8)).collect(),
        };
        let (addr, buf) = compile(&bnode);
        let node = Node::new(VERSION, addr, &buf);
        assert_eq!(node.transitions().count(), 256);
        assert_eq!(node.len(), node.transitions().count());
        roundtrip(&bnode);
    }
}
diff --git a/vendor/fst/src/raw/ops.rs b/vendor/fst/src/raw/ops.rs
new file mode 100644
index 000000000..9baf85540
--- /dev/null
+++ b/vendor/fst/src/raw/ops.rs
@@ -0,0 +1,643 @@
+use std::cmp;
+use std::collections::BinaryHeap;
+use std::iter::FromIterator;
+
+use crate::raw::Output;
+use crate::stream::{IntoStreamer, Streamer};
+
+/// Permits stream operations to be heterogeneous with respect to streams.
+type BoxedStream<'f> =
+ Box<dyn for<'a> Streamer<'a, Item = (&'a [u8], Output)> + 'f>;
+
+/// A value indexed by a stream.
+///
+/// Indexed values are used to indicate the presence of a key in multiple
+/// streams during a set operation. Namely, the index corresponds to the stream
+/// (by the order in which it was added to the operation, starting at `0`)
+/// and the value corresponds to the value associated with a particular key
+/// in that stream.
+#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct IndexedValue {
+ /// The index of the stream that produced this value (starting at `0`).
+ pub index: usize,
+ /// The value.
+ pub value: u64,
+}
+
+/// A builder for collecting fst streams on which to perform set operations
+/// on the keys of fsts.
+///
+/// Set operations include intersection, union, difference and symmetric
+/// difference. The result of each set operation is itself a stream that emits
+/// pairs of keys and a sequence of each occurrence of that key in the
+/// participating streams. This information allows one to perform set
+/// operations on fsts and customize how conflicting output values are handled.
+///
+/// All set operations work efficiently on an arbitrary number of
+/// streams with memory proportional to the number of streams.
+///
+/// The algorithmic complexity of all set operations is `O(n1 + n2 + n3 + ...)`
+/// where `n1, n2, n3, ...` correspond to the number of elements in each
+/// stream.
+///
+/// The `'f` lifetime parameter refers to the lifetime of the underlying set.
+pub struct OpBuilder<'f> {
+ streams: Vec<BoxedStream<'f>>,
+}
+
+impl<'f> OpBuilder<'f> {
+ /// Create a new set operation builder.
+ #[inline]
+ pub fn new() -> OpBuilder<'f> {
+ OpBuilder { streams: vec![] }
+ }
+
+ /// Add a stream to this set operation.
+ ///
+ /// This is useful for a chaining style pattern, e.g.,
+ /// `builder.add(stream1).add(stream2).union()`.
+ ///
+ /// The stream must emit a lexicographically ordered sequence of key-value
+ /// pairs.
+ pub fn add<I, S>(mut self, stream: I) -> OpBuilder<'f>
+ where
+ I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
+ S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
+ {
+ self.push(stream);
+ self
+ }
+
+ /// Add a stream to this set operation.
+ ///
+ /// The stream must emit a lexicographically ordered sequence of key-value
+ /// pairs.
+ pub fn push<I, S>(&mut self, stream: I)
+ where
+ I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
+ S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
+ {
+ self.streams.push(Box::new(stream.into_stream()));
+ }
+
+ /// Performs a union operation on all streams that have been added.
+ ///
+ /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
+ /// first element of the tuple is the byte string key. The second element
+ /// of the tuple is a list of all occurrences of that key in participating
+ /// streams. The `IndexedValue` contains an index and the value associated
+ /// with that key in that stream. The index uniquely identifies each
+ /// stream, which is an integer that is auto-incremented when a stream
+ /// is added to this operation (starting at `0`).
+ #[inline]
+ pub fn union(self) -> Union<'f> {
+ Union {
+ heap: StreamHeap::new(self.streams),
+ outs: vec![],
+ cur_slot: None,
+ }
+ }
+
+ /// Performs an intersection operation on all streams that have been added.
+ ///
+ /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
+ /// first element of the tuple is the byte string key. The second element
+ /// of the tuple is a list of all occurrences of that key in participating
+ /// streams. The `IndexedValue` contains an index and the value associated
+ /// with that key in that stream. The index uniquely identifies each
+ /// stream, which is an integer that is auto-incremented when a stream
+ /// is added to this operation (starting at `0`).
+ #[inline]
+ pub fn intersection(self) -> Intersection<'f> {
+ Intersection {
+ heap: StreamHeap::new(self.streams),
+ outs: vec![],
+ cur_slot: None,
+ }
+ }
+
+ /// Performs a difference operation with respect to the first stream added.
+ /// That is, this returns a stream of all elements in the first stream
+ /// that don't exist in any other stream that has been added.
+ ///
+ /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
+ /// first element of the tuple is the byte string key. The second element
+ /// of the tuple is a list of all occurrences of that key in participating
+ /// streams. The `IndexedValue` contains an index and the value associated
+ /// with that key in that stream. The index uniquely identifies each
+ /// stream, which is an integer that is auto-incremented when a stream
+ /// is added to this operation (starting at `0`).
+ ///
+ /// The interface is the same for all the operations, but due to the nature
+/// of `difference`, each yielded key contains exactly one `IndexedValue` with
+ /// `index` set to 0.
+ #[inline]
+ pub fn difference(mut self) -> Difference<'f> {
+ let first = self.streams.swap_remove(0);
+ Difference {
+ set: first,
+ key: vec![],
+ heap: StreamHeap::new(self.streams),
+ outs: vec![],
+ }
+ }
+
+ /// Performs a symmetric difference operation on all of the streams that
+ /// have been added.
+ ///
+ /// When there are only two streams, then the keys returned correspond to
+ /// keys that are in either stream but *not* in both streams.
+ ///
+ /// More generally, for any number of streams, keys that occur in an odd
+ /// number of streams are returned.
+ ///
+ /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
+ /// first element of the tuple is the byte string key. The second element
+ /// of the tuple is a list of all occurrences of that key in participating
+ /// streams. The `IndexedValue` contains an index and the value associated
+ /// with that key in that stream. The index uniquely identifies each
+ /// stream, which is an integer that is auto-incremented when a stream
+ /// is added to this operation (starting at `0`).
+ #[inline]
+ pub fn symmetric_difference(self) -> SymmetricDifference<'f> {
+ SymmetricDifference {
+ heap: StreamHeap::new(self.streams),
+ outs: vec![],
+ cur_slot: None,
+ }
+ }
+}
+
+impl<'f, I, S> Extend<I> for OpBuilder<'f>
+where
+ I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
+ S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
+{
+ fn extend<T>(&mut self, it: T)
+ where
+ T: IntoIterator<Item = I>,
+ {
+ for stream in it {
+ self.push(stream);
+ }
+ }
+}
+
+impl<'f, I, S> FromIterator<I> for OpBuilder<'f>
+where
+ I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
+ S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
+{
+ fn from_iter<T>(it: T) -> OpBuilder<'f>
+ where
+ T: IntoIterator<Item = I>,
+ {
+ let mut op = OpBuilder::new();
+ op.extend(it);
+ op
+ }
+}
+
+/// A stream of set union over multiple fst streams in lexicographic order.
+///
+/// The `'f` lifetime parameter refers to the lifetime of the underlying map.
+pub struct Union<'f> {
+ heap: StreamHeap<'f>,
+ outs: Vec<IndexedValue>,
+ cur_slot: Option<Slot>,
+}
+
+impl<'a, 'f> Streamer<'a> for Union<'f> {
+ type Item = (&'a [u8], &'a [IndexedValue]);
+
+ fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> {
+ if let Some(slot) = self.cur_slot.take() {
+ self.heap.refill(slot);
+ }
+ let slot = match self.heap.pop() {
+ None => return None,
+ Some(slot) => {
+ self.cur_slot = Some(slot);
+ self.cur_slot.as_ref().unwrap()
+ }
+ };
+ self.outs.clear();
+ self.outs.push(slot.indexed_value());
+ while let Some(slot2) = self.heap.pop_if_equal(slot.input()) {
+ self.outs.push(slot2.indexed_value());
+ self.heap.refill(slot2);
+ }
+ Some((slot.input(), &self.outs))
+ }
+}
+
+/// A stream of set intersection over multiple fst streams in lexicographic
+/// order.
+///
+/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
+pub struct Intersection<'f> {
+ heap: StreamHeap<'f>,
+ outs: Vec<IndexedValue>,
+ cur_slot: Option<Slot>,
+}
+
+impl<'a, 'f> Streamer<'a> for Intersection<'f> {
+ type Item = (&'a [u8], &'a [IndexedValue]);
+
+ fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> {
+ if let Some(slot) = self.cur_slot.take() {
+ self.heap.refill(slot);
+ }
+ loop {
+ let slot = match self.heap.pop() {
+ None => return None,
+ Some(slot) => slot,
+ };
+ self.outs.clear();
+ self.outs.push(slot.indexed_value());
+ let mut popped: usize = 1;
+ while let Some(slot2) = self.heap.pop_if_equal(slot.input()) {
+ self.outs.push(slot2.indexed_value());
+ self.heap.refill(slot2);
+ popped += 1;
+ }
+ if popped < self.heap.num_slots() {
+ self.heap.refill(slot);
+ } else {
+ self.cur_slot = Some(slot);
+ let key = self.cur_slot.as_ref().unwrap().input();
+ return Some((key, &self.outs));
+ }
+ }
+ }
+}
+
+/// A stream of set difference over multiple fst streams in lexicographic
+/// order.
+///
+/// The difference operation is taken with respect to the first stream and the
+/// rest of the streams. i.e., All elements in the first stream that do not
+/// appear in any other streams.
+///
+/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
+pub struct Difference<'f> {
+ set: BoxedStream<'f>,
+ key: Vec<u8>,
+ heap: StreamHeap<'f>,
+ outs: Vec<IndexedValue>,
+}
+
+impl<'a, 'f> Streamer<'a> for Difference<'f> {
+ type Item = (&'a [u8], &'a [IndexedValue]);
+
+ fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> {
+ loop {
+ match self.set.next() {
+ None => return None,
+ Some((key, out)) => {
+ self.key.clear();
+ self.key.extend(key);
+ self.outs.clear();
+ self.outs
+ .push(IndexedValue { index: 0, value: out.value() });
+ }
+ };
+ let mut unique = true;
+ while let Some(slot) = self.heap.pop_if_le(&self.key) {
+ if slot.input() == &*self.key {
+ unique = false;
+ }
+ self.heap.refill(slot);
+ }
+ if unique {
+ return Some((&self.key, &self.outs));
+ }
+ }
+ }
+}
+
+/// A stream of set symmetric difference over multiple fst streams in
+/// lexicographic order.
+///
+/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
+pub struct SymmetricDifference<'f> {
+ heap: StreamHeap<'f>,
+ outs: Vec<IndexedValue>,
+ cur_slot: Option<Slot>,
+}
+
+impl<'a, 'f> Streamer<'a> for SymmetricDifference<'f> {
+ type Item = (&'a [u8], &'a [IndexedValue]);
+
+ fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> {
+ if let Some(slot) = self.cur_slot.take() {
+ self.heap.refill(slot);
+ }
+ loop {
+ let slot = match self.heap.pop() {
+ None => return None,
+ Some(slot) => slot,
+ };
+ self.outs.clear();
+ self.outs.push(slot.indexed_value());
+ let mut popped: usize = 1;
+ while let Some(slot2) = self.heap.pop_if_equal(slot.input()) {
+ self.outs.push(slot2.indexed_value());
+ self.heap.refill(slot2);
+ popped += 1;
+ }
+ // This key is in the symmetric difference if and only if it
+ // appears in an odd number of sets.
+ if popped % 2 == 0 {
+ self.heap.refill(slot);
+ } else {
+ self.cur_slot = Some(slot);
+ let key = self.cur_slot.as_ref().unwrap().input();
+ return Some((key, &self.outs));
+ }
+ }
+ }
+}
+
+struct StreamHeap<'f> {
+ rdrs: Vec<BoxedStream<'f>>,
+ heap: BinaryHeap<Slot>,
+}
+
+impl<'f> StreamHeap<'f> {
+ fn new(streams: Vec<BoxedStream<'f>>) -> StreamHeap<'f> {
+ let mut u = StreamHeap { rdrs: streams, heap: BinaryHeap::new() };
+ for i in 0..u.rdrs.len() {
+ u.refill(Slot::new(i));
+ }
+ u
+ }
+
+ fn pop(&mut self) -> Option<Slot> {
+ self.heap.pop()
+ }
+
+ fn peek_is_duplicate(&self, key: &[u8]) -> bool {
+ self.heap.peek().map(|s| s.input() == key).unwrap_or(false)
+ }
+
+ fn pop_if_equal(&mut self, key: &[u8]) -> Option<Slot> {
+ if self.peek_is_duplicate(key) {
+ self.pop()
+ } else {
+ None
+ }
+ }
+
+ fn pop_if_le(&mut self, key: &[u8]) -> Option<Slot> {
+ if self.heap.peek().map(|s| s.input() <= key).unwrap_or(false) {
+ self.pop()
+ } else {
+ None
+ }
+ }
+
+ fn num_slots(&self) -> usize {
+ self.rdrs.len()
+ }
+
+ fn refill(&mut self, mut slot: Slot) {
+ if let Some((input, output)) = self.rdrs[slot.idx].next() {
+ slot.set_input(input);
+ slot.set_output(output);
+ self.heap.push(slot);
+ }
+ }
+}
+
+#[derive(Debug, Eq, PartialEq)]
+struct Slot {
+ idx: usize,
+ input: Vec<u8>,
+ output: Output,
+}
+
+impl Slot {
+ fn new(rdr_idx: usize) -> Slot {
+ Slot {
+ idx: rdr_idx,
+ input: Vec::with_capacity(64),
+ output: Output::zero(),
+ }
+ }
+
+ fn indexed_value(&self) -> IndexedValue {
+ IndexedValue { index: self.idx, value: self.output.value() }
+ }
+
+ fn input(&self) -> &[u8] {
+ &self.input
+ }
+
+ fn set_input(&mut self, input: &[u8]) {
+ self.input.clear();
+ self.input.extend(input);
+ }
+
+ fn set_output(&mut self, output: Output) {
+ self.output = output;
+ }
+}
+
+impl PartialOrd for Slot {
+ fn partial_cmp(&self, other: &Slot) -> Option<cmp::Ordering> {
+ (&self.input, self.output)
+ .partial_cmp(&(&other.input, other.output))
+ .map(|ord| ord.reverse())
+ }
+}
+
+impl Ord for Slot {
+ fn cmp(&self, other: &Slot) -> cmp::Ordering {
+ self.partial_cmp(other).unwrap()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::raw::tests::{fst_map, fst_set};
+ use crate::raw::Fst;
+ use crate::stream::{IntoStreamer, Streamer};
+
+ use super::OpBuilder;
+
+ fn s(string: &str) -> String {
+ string.to_owned()
+ }
+
+ macro_rules! create_set_op {
+ ($name:ident, $op:ident) => {
+ fn $name(sets: Vec<Vec<&str>>) -> Vec<String> {
+ let fsts: Vec<Fst<_>> =
+ sets.into_iter().map(fst_set).collect();
+ let op: OpBuilder = fsts.iter().collect();
+ let mut stream = op.$op().into_stream();
+ let mut keys = vec![];
+ while let Some((key, _)) = stream.next() {
+ keys.push(String::from_utf8(key.to_vec()).unwrap());
+ }
+ keys
+ }
+ };
+ }
+
+ macro_rules! create_map_op {
+ ($name:ident, $op:ident) => {
+ fn $name(sets: Vec<Vec<(&str, u64)>>) -> Vec<(String, u64)> {
+ let fsts: Vec<Fst<_>> =
+ sets.into_iter().map(fst_map).collect();
+ let op: OpBuilder = fsts.iter().collect();
+ let mut stream = op.$op().into_stream();
+ let mut keys = vec![];
+ while let Some((key, outs)) = stream.next() {
+ let merged = outs.iter().fold(0, |a, b| a + b.value);
+ let s = String::from_utf8(key.to_vec()).unwrap();
+ keys.push((s, merged));
+ }
+ keys
+ }
+ };
+ }
+
+ create_set_op!(fst_union, union);
+ create_set_op!(fst_intersection, intersection);
+ create_set_op!(fst_symmetric_difference, symmetric_difference);
+ create_set_op!(fst_difference, difference);
+ create_map_op!(fst_union_map, union);
+ create_map_op!(fst_intersection_map, intersection);
+ create_map_op!(fst_symmetric_difference_map, symmetric_difference);
+ create_map_op!(fst_difference_map, difference);
+
+ #[test]
+ fn union_set() {
+ let v = fst_union(vec![vec!["a", "b", "c"], vec!["x", "y", "z"]]);
+ assert_eq!(v, vec!["a", "b", "c", "x", "y", "z"]);
+ }
+
+ #[test]
+ fn union_set_dupes() {
+ let v = fst_union(vec![vec!["aa", "b", "cc"], vec!["b", "cc", "z"]]);
+ assert_eq!(v, vec!["aa", "b", "cc", "z"]);
+ }
+
+ #[test]
+ fn union_map() {
+ let v = fst_union_map(vec![
+ vec![("a", 1), ("b", 2), ("c", 3)],
+ vec![("x", 1), ("y", 2), ("z", 3)],
+ ]);
+ assert_eq!(
+ v,
+ vec![
+ (s("a"), 1),
+ (s("b"), 2),
+ (s("c"), 3),
+ (s("x"), 1),
+ (s("y"), 2),
+ (s("z"), 3),
+ ]
+ );
+ }
+
+ #[test]
+ fn union_map_dupes() {
+ let v = fst_union_map(vec![
+ vec![("aa", 1), ("b", 2), ("cc", 3)],
+ vec![("b", 1), ("cc", 2), ("z", 3)],
+ vec![("b", 1)],
+ ]);
+ assert_eq!(
+ v,
+ vec![(s("aa"), 1), (s("b"), 4), (s("cc"), 5), (s("z"), 3),]
+ );
+ }
+
+ #[test]
+ fn intersect_set() {
+ let v =
+ fst_intersection(vec![vec!["a", "b", "c"], vec!["x", "y", "z"]]);
+ assert_eq!(v, Vec::<String>::new());
+ }
+
+ #[test]
+ fn intersect_set_dupes() {
+ let v = fst_intersection(vec![
+ vec!["aa", "b", "cc"],
+ vec!["b", "cc", "z"],
+ ]);
+ assert_eq!(v, vec!["b", "cc"]);
+ }
+
+ #[test]
+ fn intersect_map() {
+ let v = fst_intersection_map(vec![
+ vec![("a", 1), ("b", 2), ("c", 3)],
+ vec![("x", 1), ("y", 2), ("z", 3)],
+ ]);
+ assert_eq!(v, Vec::<(String, u64)>::new());
+ }
+
+ #[test]
+ fn intersect_map_dupes() {
+ let v = fst_intersection_map(vec![
+ vec![("aa", 1), ("b", 2), ("cc", 3)],
+ vec![("b", 1), ("cc", 2), ("z", 3)],
+ vec![("b", 1)],
+ ]);
+ assert_eq!(v, vec![(s("b"), 4)]);
+ }
+
+ #[test]
+ fn symmetric_difference() {
+ let v = fst_symmetric_difference(vec![
+ vec!["a", "b", "c"],
+ vec!["a", "b"],
+ vec!["a"],
+ ]);
+ assert_eq!(v, vec!["a", "c"]);
+ }
+
+ #[test]
+ fn symmetric_difference_map() {
+ let v = fst_symmetric_difference_map(vec![
+ vec![("a", 1), ("b", 2), ("c", 3)],
+ vec![("a", 1), ("b", 2)],
+ vec![("a", 1)],
+ ]);
+ assert_eq!(v, vec![(s("a"), 3), (s("c"), 3)]);
+ }
+
+ #[test]
+ fn difference() {
+ let v = fst_difference(vec![
+ vec!["a", "b", "c"],
+ vec!["a", "b"],
+ vec!["a"],
+ ]);
+ assert_eq!(v, vec!["c"]);
+ }
+
+ #[test]
+ fn difference2() {
+ // Regression test: https://github.com/BurntSushi/fst/issues/19
+ let v = fst_difference(vec![vec!["a", "c"], vec!["b", "c"]]);
+ assert_eq!(v, vec!["a"]);
+ let v = fst_difference(vec![vec!["bar", "foo"], vec!["baz", "foo"]]);
+ assert_eq!(v, vec!["bar"]);
+ }
+
+ #[test]
+ fn difference_map() {
+ let v = fst_difference_map(vec![
+ vec![("a", 1), ("b", 2), ("c", 3)],
+ vec![("a", 1), ("b", 2)],
+ vec![("a", 1)],
+ ]);
+ assert_eq!(v, vec![(s("c"), 3)]);
+ }
+}
diff --git a/vendor/fst/src/raw/registry.rs b/vendor/fst/src/raw/registry.rs
new file mode 100644
index 000000000..3b00a5b0c
--- /dev/null
+++ b/vendor/fst/src/raw/registry.rs
@@ -0,0 +1,264 @@
+use crate::raw::build::BuilderNode;
+use crate::raw::{CompiledAddr, NONE_ADDRESS};
+
+#[derive(Debug)]
+pub struct Registry {
+ table: Vec<RegistryCell>,
+ table_size: usize, // number of rows
+ mru_size: usize, // number of columns
+}
+
+#[derive(Debug)]
+struct RegistryCache<'a> {
+ cells: &'a mut [RegistryCell],
+}
+
+#[derive(Clone, Debug)]
+pub struct RegistryCell {
+ addr: CompiledAddr,
+ node: BuilderNode,
+}
+
+#[derive(Debug)]
+pub enum RegistryEntry<'a> {
+ Found(CompiledAddr),
+ NotFound(&'a mut RegistryCell),
+ Rejected,
+}
+
+impl Registry {
+ pub fn new(table_size: usize, mru_size: usize) -> Registry {
+ let empty_cell = RegistryCell::none();
+ let ncells = table_size.checked_mul(mru_size).unwrap();
+ Registry { table: vec![empty_cell; ncells], table_size, mru_size }
+ }
+
+ pub fn entry<'a>(&'a mut self, node: &BuilderNode) -> RegistryEntry<'a> {
+ if self.table.is_empty() {
+ return RegistryEntry::Rejected;
+ }
+ let bucket = self.hash(node);
+ let start = self.mru_size * bucket;
+ let end = start + self.mru_size;
+ RegistryCache { cells: &mut self.table[start..end] }.entry(node)
+ }
+
+ fn hash(&self, node: &BuilderNode) -> usize {
+ // Basic FNV-1a hash as described:
+ // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+ //
+ // In unscientific experiments, this provides the same compression
+ // as `std::hash::SipHasher` but is much much faster.
+ const FNV_PRIME: u64 = 1099511628211;
+ let mut h = 14695981039346656037;
+ h = (h ^ (node.is_final as u64)).wrapping_mul(FNV_PRIME);
+ h = (h ^ node.final_output.value()).wrapping_mul(FNV_PRIME);
+ for t in &node.trans {
+ h = (h ^ (t.inp as u64)).wrapping_mul(FNV_PRIME);
+ h = (h ^ t.out.value()).wrapping_mul(FNV_PRIME);
+ h = (h ^ (t.addr as u64)).wrapping_mul(FNV_PRIME);
+ }
+ (h as usize) % self.table_size
+ }
+}
+
+impl<'a> RegistryCache<'a> {
+ fn entry(mut self, node: &BuilderNode) -> RegistryEntry<'a> {
+ if self.cells.len() == 1 {
+ let cell = &mut self.cells[0];
+ if !cell.is_none() && &cell.node == node {
+ RegistryEntry::Found(cell.addr)
+ } else {
+ cell.node.clone_from(node);
+ RegistryEntry::NotFound(cell)
+ }
+ } else if self.cells.len() == 2 {
+ let cell1 = &mut self.cells[0];
+ if !cell1.is_none() && &cell1.node == node {
+ return RegistryEntry::Found(cell1.addr);
+ }
+
+ let cell2 = &mut self.cells[1];
+ if !cell2.is_none() && &cell2.node == node {
+ let addr = cell2.addr;
+ self.cells.swap(0, 1);
+ return RegistryEntry::Found(addr);
+ }
+
+ self.cells[1].node.clone_from(node);
+ self.cells.swap(0, 1);
+ RegistryEntry::NotFound(&mut self.cells[0])
+ } else {
+ let find = |c: &RegistryCell| !c.is_none() && &c.node == node;
+ if let Some(i) = self.cells.iter().position(find) {
+ let addr = self.cells[i].addr;
+ self.promote(i); // most recently used
+ RegistryEntry::Found(addr)
+ } else {
+ let last = self.cells.len() - 1;
+ self.cells[last].node.clone_from(node); // discard LRU
+ self.promote(last);
+ RegistryEntry::NotFound(&mut self.cells[0])
+ }
+ }
+ }
+
+ fn promote(&mut self, mut i: usize) {
+ assert!(i < self.cells.len());
+ while i > 0 {
+ self.cells.swap(i - 1, i);
+ i -= 1;
+ }
+ }
+}
+
+impl RegistryCell {
+ fn none() -> RegistryCell {
+ RegistryCell { addr: NONE_ADDRESS, node: BuilderNode::default() }
+ }
+
+ fn is_none(&self) -> bool {
+ self.addr == NONE_ADDRESS
+ }
+
+ pub fn insert(&mut self, addr: CompiledAddr) {
+ self.addr = addr;
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::{Registry, RegistryCache, RegistryCell, RegistryEntry};
+ use crate::raw::build::BuilderNode;
+ use crate::raw::{Output, Transition};
+
+ fn assert_rejected(entry: RegistryEntry) {
+ match entry {
+ RegistryEntry::Rejected => {}
+ entry => panic!("expected rejected entry, got: {:?}", entry),
+ }
+ }
+
+ fn assert_not_found(entry: RegistryEntry) {
+ match entry {
+ RegistryEntry::NotFound(_) => {}
+ entry => panic!("expected nout found entry, got: {:?}", entry),
+ }
+ }
+
+ fn assert_insert_and_found(reg: &mut Registry, bnode: &BuilderNode) {
+ match reg.entry(&bnode) {
+ RegistryEntry::NotFound(cell) => cell.insert(1234),
+ entry => panic!("unexpected not found entry, got: {:?}", entry),
+ }
+ match reg.entry(&bnode) {
+ RegistryEntry::Found(addr) => assert_eq!(addr, 1234),
+ entry => panic!("unexpected found entry, got: {:?}", entry),
+ }
+ }
+
+ #[test]
+ fn empty_is_ok() {
+ let mut reg = Registry::new(0, 0);
+ let bnode = BuilderNode {
+ is_final: false,
+ final_output: Output::zero(),
+ trans: vec![],
+ };
+ assert_rejected(reg.entry(&bnode));
+ }
+
+ #[test]
+ fn one_final_is_ok() {
+ let mut reg = Registry::new(1, 1);
+ let bnode = BuilderNode {
+ is_final: true,
+ final_output: Output::zero(),
+ trans: vec![],
+ };
+ assert_insert_and_found(&mut reg, &bnode);
+ }
+
+ #[test]
+ fn one_with_trans_is_ok() {
+ let mut reg = Registry::new(1, 1);
+ let bnode = BuilderNode {
+ is_final: false,
+ final_output: Output::zero(),
+ trans: vec![Transition {
+ addr: 0,
+ inp: b'a',
+ out: Output::zero(),
+ }],
+ };
+ assert_insert_and_found(&mut reg, &bnode);
+ assert_not_found(
+ reg.entry(&BuilderNode { is_final: true, ..bnode.clone() }),
+ );
+ assert_not_found(reg.entry(&BuilderNode {
+ trans: vec![Transition {
+ addr: 0,
+ inp: b'b',
+ out: Output::zero(),
+ }],
+ ..bnode.clone()
+ }));
+ assert_not_found(reg.entry(&BuilderNode {
+ trans: vec![Transition {
+ addr: 0,
+ inp: b'a',
+ out: Output::new(1),
+ }],
+ ..bnode.clone()
+ }));
+ }
+
+ #[test]
+ fn cache_works() {
+ let mut reg = Registry::new(1, 1);
+
+ let bnode1 = BuilderNode { is_final: true, ..BuilderNode::default() };
+ assert_insert_and_found(&mut reg, &bnode1);
+
+ let bnode2 =
+ BuilderNode { final_output: Output::new(1), ..bnode1.clone() };
+ assert_insert_and_found(&mut reg, &bnode2);
+ assert_not_found(reg.entry(&bnode1));
+ }
+
+ #[test]
+ fn promote() {
+ let bn = BuilderNode::default();
+ let mut bnodes = vec![
+ RegistryCell { addr: 1, node: bn.clone() },
+ RegistryCell { addr: 2, node: bn.clone() },
+ RegistryCell { addr: 3, node: bn.clone() },
+ RegistryCell { addr: 4, node: bn.clone() },
+ ];
+ let mut cache = RegistryCache { cells: &mut bnodes };
+
+ cache.promote(0);
+ assert_eq!(cache.cells[0].addr, 1);
+ assert_eq!(cache.cells[1].addr, 2);
+ assert_eq!(cache.cells[2].addr, 3);
+ assert_eq!(cache.cells[3].addr, 4);
+
+ cache.promote(1);
+ assert_eq!(cache.cells[0].addr, 2);
+ assert_eq!(cache.cells[1].addr, 1);
+ assert_eq!(cache.cells[2].addr, 3);
+ assert_eq!(cache.cells[3].addr, 4);
+
+ cache.promote(3);
+ assert_eq!(cache.cells[0].addr, 4);
+ assert_eq!(cache.cells[1].addr, 2);
+ assert_eq!(cache.cells[2].addr, 1);
+ assert_eq!(cache.cells[3].addr, 3);
+
+ cache.promote(2);
+ assert_eq!(cache.cells[0].addr, 1);
+ assert_eq!(cache.cells[1].addr, 4);
+ assert_eq!(cache.cells[2].addr, 2);
+ assert_eq!(cache.cells[3].addr, 3);
+ }
+}
diff --git a/vendor/fst/src/raw/registry_minimal.rs b/vendor/fst/src/raw/registry_minimal.rs
new file mode 100644
index 000000000..663b0123a
--- /dev/null
+++ b/vendor/fst/src/raw/registry_minimal.rs
@@ -0,0 +1,53 @@
+// This module is a drop-in but inefficient replacement of the LRU registry.
+// In particular, this registry will never forget a node. In other words, if
+// this registry is used during construction, then you're guaranteed a minimal
+// FST.
+//
+// This is really only meant to be used for debugging and experiments. It is
+// a memory/CPU hog.
+//
+// One "easy" improvement here is to use an FNV hash instead of the super
+// expensive SipHasher.
+
+#![allow(dead_code)]
+
+use std::collections::hash_map::{Entry, HashMap};
+
+use crate::raw::build::BuilderNode;
+use crate::raw::CompiledAddr;
+
+#[derive(Debug)]
+pub struct Registry {
+ table: HashMap<BuilderNode, RegistryCell>,
+}
+
+#[derive(Debug)]
+pub enum RegistryEntry<'a> {
+ Found(CompiledAddr),
+ NotFound(&'a mut RegistryCell),
+ Rejected,
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct RegistryCell(CompiledAddr);
+
+impl Registry {
+ pub fn new(table_size: usize, _lru_size: usize) -> Registry {
+ Registry { table: HashMap::with_capacity(table_size) }
+ }
+
+ pub fn entry<'a>(&'a mut self, bnode: &BuilderNode) -> RegistryEntry<'a> {
+ match self.table.entry(bnode.clone()) {
+ Entry::Occupied(v) => RegistryEntry::Found(v.get().0),
+ Entry::Vacant(v) => {
+ RegistryEntry::NotFound(v.insert(RegistryCell(0)))
+ }
+ }
+ }
+}
+
+impl RegistryCell {
+ pub fn insert(&mut self, addr: CompiledAddr) {
+ self.0 = addr;
+ }
+}
diff --git a/vendor/fst/src/raw/tests.rs b/vendor/fst/src/raw/tests.rs
new file mode 100644
index 000000000..c9e77a5f7
--- /dev/null
+++ b/vendor/fst/src/raw/tests.rs
@@ -0,0 +1,589 @@
+use crate::automaton::AlwaysMatch;
+use crate::error::Error;
+use crate::raw::{self, Bound, Builder, Fst, Output, Stream, VERSION};
+use crate::stream::Streamer;
+
+const TEXT: &'static str = include_str!("./../../data/words-100000");
+
+pub fn fst_set<I, S>(ss: I) -> Fst<Vec<u8>>
+where
+ I: IntoIterator<Item = S>,
+ S: AsRef<[u8]>,
+{
+ let mut bfst = Builder::memory();
+ let mut ss: Vec<Vec<u8>> =
+ ss.into_iter().map(|s| s.as_ref().to_vec()).collect();
+ ss.sort();
+ ss.dedup();
+ for s in ss.iter().into_iter() {
+ bfst.add(s).unwrap();
+ }
+ let fst = bfst.into_fst();
+ assert_eq!(fst.len(), ss.len());
+ fst
+}
+
+pub fn fst_map<I, S>(ss: I) -> Fst<Vec<u8>>
+where
+ I: IntoIterator<Item = (S, u64)>,
+ S: AsRef<[u8]>,
+{
+ let mut bfst = Builder::memory();
+ let mut ss: Vec<(Vec<u8>, u64)> =
+ ss.into_iter().map(|(s, o)| (s.as_ref().to_vec(), o)).collect();
+ ss.sort();
+ ss.dedup();
+ for (s, o) in ss.into_iter() {
+ bfst.insert(s, o).unwrap();
+ }
+ bfst.into_fst()
+}
+
+pub fn fst_inputs<D: AsRef<[u8]>>(fst: &Fst<D>) -> Vec<Vec<u8>> {
+ let mut words = vec![];
+ let mut rdr = fst.stream();
+ while let Some((word, _)) = rdr.next() {
+ words.push(word.to_vec());
+ }
+ words
+}
+
+pub fn fst_inputs_outputs<D: AsRef<[u8]>>(
+ fst: &Fst<D>,
+) -> Vec<(Vec<u8>, u64)> {
+ let mut words = vec![];
+ let mut rdr = fst.stream();
+ while let Some((word, out)) = rdr.next() {
+ words.push((word.to_vec(), out.value()));
+ }
+ words
+}
+
+macro_rules! test_set {
+ ($name:ident, $($s:expr),+) => {
+ #[test]
+ fn $name() {
+ let mut items = vec![$($s),*];
+ let fst = fst_set(&items);
+ let mut rdr = fst.stream();
+ items.sort();
+ items.dedup();
+ for item in &items {
+ assert_eq!(rdr.next().unwrap().0, item.as_bytes());
+ }
+ assert_eq!(rdr.next(), None);
+ for item in &items {
+ assert!(fst.get(item).is_some());
+ }
+ }
+ }
+}
+
+macro_rules! test_set_fail {
+ ($name:ident, $($s:expr),+) => {
+ #[test]
+ #[should_panic]
+ fn $name() {
+ let mut bfst = Builder::memory();
+ $(bfst.add($s).unwrap();)*
+ }
+ }
+}
+
+test_set!(fst_set_only_empty, "");
+test_set!(fst_set_one, "a");
+test_set!(fst_set_dupe_empty, "", "");
+test_set!(fst_set_dupe1, "a", "a");
+test_set!(fst_set_dupe2, "a", "b", "b");
+test_set!(fst_set_two1, "a", "b");
+test_set!(fst_set_two2, "a", "ab");
+test_set!(fst_set_jan, "jam", "jbm", "jcm", "jdm", "jem", "jfm", "jgm");
+
+test_set_fail!(fst_set_order1, "b", "a");
+test_set_fail!(fst_set_order2, "a", "b", "c", "a");
+
+#[test]
+fn fst_set_100000() {
+ let words: Vec<Vec<u8>> =
+ TEXT.lines().map(|s| s.as_bytes().to_vec()).collect();
+ let fst = fst_set(words.clone());
+ assert_eq!(words, fst_inputs(&fst));
+ for word in &words {
+ assert!(
+ fst.get(word).is_some(),
+ "failed to find word: {}",
+ std::str::from_utf8(word).unwrap()
+ );
+ }
+}
+
+macro_rules! test_map {
+ ($name:ident, $($s:expr, $o:expr),+) => {
+ #[test]
+ fn $name() {
+ let fst = fst_map(vec![$(($s, $o)),*]);
+ let mut rdr = fst.stream();
+ $({
+ let (s, o) = rdr.next().unwrap();
+ assert_eq!((s, o.value()), ($s.as_bytes(), $o));
+ })*
+ assert_eq!(rdr.next(), None);
+ $({
+ assert_eq!(fst.get($s.as_bytes()), Some(Output::new($o)));
+ })*
+ }
+ }
+}
+
+macro_rules! test_map_fail {
+ ($name:ident, $($s:expr, $o:expr),+) => {
+ #[test]
+ #[should_panic]
+ fn $name() {
+ let mut bfst = Builder::memory();
+ $(bfst.insert($s, $o).unwrap();)*
+ }
+ }
+}
+
+test_map!(fst_map_only_empty1, "", 0);
+test_map!(fst_map_only_empty2, "", 100);
+test_map!(fst_map_only_empty3, "", 9999999999);
+test_map!(fst_map_one1, "a", 0);
+test_map!(fst_map_one2, "a", 100);
+test_map!(fst_map_one3, "a", 999999999);
+test_map!(fst_map_two, "a", 1, "b", 2);
+test_map!(fst_map_many1, "a", 34786, "ab", 26);
+test_map!(
+ fst_map_many2,
+ "a",
+ 34786,
+ "ab",
+ 26,
+ "abc",
+ 58976,
+ "abcd",
+ 25,
+ "z",
+ 58,
+ "zabc",
+ 6798
+);
+test_map!(fst_map_many3, "a", 1, "ab", 0, "abc", 0);
+
+test_map_fail!(fst_map_dupe_empty, "", 0, "", 0);
+test_map_fail!(fst_map_dupe1, "a", 0, "a", 0);
+test_map_fail!(fst_map_dupe2, "a", 0, "b", 0, "b", 0);
+test_map_fail!(fst_map_order1, "b", 0, "a", 0);
+test_map_fail!(fst_map_order2, "a", 0, "b", 0, "c", 0, "a", 0);
+
+#[test]
+fn fst_map_100000_increments() {
+ let words: Vec<(Vec<u8>, u64)> = TEXT
+ .lines()
+ .enumerate()
+ .map(|(i, s)| (s.as_bytes().to_vec(), i as u64))
+ .collect();
+ let fst = fst_map(words.clone());
+ assert_eq!(words, fst_inputs_outputs(&fst));
+ for &(ref word, out) in &words {
+ assert_eq!(fst.get(word), Some(Output::new(out)));
+ }
+}
+
+#[test]
+fn fst_map_100000_lengths() {
+ let words: Vec<(Vec<u8>, u64)> = TEXT
+ .lines()
+ .map(|s| (s.as_bytes().to_vec(), s.len() as u64))
+ .collect();
+ let fst = fst_map(words.clone());
+ assert_eq!(words, fst_inputs_outputs(&fst));
+ for &(ref word, out) in &words {
+ assert_eq!(fst.get(word), Some(Output::new(out)));
+ }
+}
+
+#[test]
+fn invalid_version() {
+ match Fst::new(vec![0; 36]) {
+ Err(Error::Fst(raw::Error::Version { got, .. })) => assert_eq!(got, 0),
+ Err(err) => panic!("expected version error, got {:?}", err),
+ Ok(_) => panic!("expected version error, got FST"),
+ }
+}
+
+#[test]
+fn invalid_version_crate_too_old() {
+ let mut buf = vec![0; 36];
+ crate::bytes::write_u64_le(VERSION + 1, &mut buf);
+ match Fst::new(buf) {
+ Err(Error::Fst(raw::Error::Version { got, .. })) => {
+ assert_eq!(got, VERSION + 1);
+ }
+ Err(err) => panic!("expected version error, got {:?}", err),
+ Ok(_) => panic!("expected version error, got FST"),
+ }
+}
+
+#[test]
+fn invalid_format() {
+ match Fst::new(vec![0; 0]) {
+ Err(Error::Fst(raw::Error::Format { .. })) => {}
+ Err(err) => panic!("expected format error, got {:?}", err),
+ Ok(_) => panic!("expected format error, got FST"),
+ }
+}
+
+#[test]
+fn fst_set_zero() {
+ let fst = fst_set::<_, String>(vec![]);
+ let mut rdr = fst.stream();
+ assert_eq!(rdr.next(), None);
+}
+
+macro_rules! test_range {
+ (
+ $name:ident,
+ min: $min:expr,
+ max: $max:expr,
+ imin: $imin:expr,
+ imax: $imax:expr,
+ $($s:expr),*
+ ) => {
+ #[test]
+ fn $name() {
+ let items: Vec<&'static str> = vec![$($s),*];
+ let items: Vec<_> = items
+ .into_iter()
+ .enumerate()
+ .map(|(i, k)| (k, i as u64))
+ .collect();
+ let fst = fst_map(items.clone());
+ let mut rdr = Stream::new(fst.as_ref(), AlwaysMatch, $min, $max);
+ for i in $imin..$imax {
+ assert_eq!(
+ rdr.next().unwrap(),
+ (items[i].0.as_bytes(), Output::new(items[i].1)),
+ );
+ }
+ assert_eq!(rdr.next(), None);
+ }
+ }
+}
+
+test_range! {
+ fst_range_empty_1,
+ min: Bound::Unbounded, max: Bound::Unbounded,
+ imin: 0, imax: 0,
+}
+
+test_range! {
+ fst_range_empty_2,
+ min: Bound::Unbounded, max: Bound::Unbounded,
+ imin: 0, imax: 1,
+ ""
+}
+
+test_range! {
+ fst_range_empty_3,
+ min: Bound::Included(vec![]), max: Bound::Unbounded,
+ imin: 0, imax: 1,
+ ""
+}
+
+test_range! {
+ fst_range_empty_4,
+ min: Bound::Excluded(vec![]), max: Bound::Unbounded,
+ imin: 0, imax: 0,
+ ""
+}
+
+test_range! {
+ fst_range_empty_5,
+ min: Bound::Included(vec![]), max: Bound::Unbounded,
+ imin: 0, imax: 2,
+ "", "a"
+}
+
+test_range! {
+ fst_range_empty_6,
+ min: Bound::Excluded(vec![]), max: Bound::Unbounded,
+ imin: 1, imax: 2,
+ "", "a"
+}
+
+test_range! {
+ fst_range_empty_7,
+ min: Bound::Unbounded, max: Bound::Unbounded,
+ imin: 0, imax: 2,
+ "", "a"
+}
+
+test_range! {
+ fst_range_empty_8,
+ min: Bound::Unbounded, max: Bound::Included(vec![]),
+ imin: 0, imax: 1,
+ ""
+}
+
+test_range! {
+ fst_range_empty_9,
+ min: Bound::Unbounded, max: Bound::Excluded(vec![]),
+ imin: 0, imax: 0,
+ ""
+}
+
+test_range! {
+ fst_range_empty_10,
+ min: Bound::Unbounded, max: Bound::Included(vec![]),
+ imin: 0, imax: 1,
+ "", "a"
+}
+
+test_range! {
+ fst_range_empty_11,
+ min: Bound::Included(vec![]), max: Bound::Included(vec![]),
+ imin: 0, imax: 1,
+ ""
+}
+
+test_range! {
+ fst_range_1,
+ min: Bound::Included(vec![b'a']), max: Bound::Included(vec![b'z']),
+ imin: 0, imax: 4,
+ "a", "b", "y", "z"
+}
+
+test_range! {
+ fst_range_2,
+ min: Bound::Excluded(vec![b'a']), max: Bound::Included(vec![b'y']),
+ imin: 1, imax: 3,
+ "a", "b", "y", "z"
+}
+
+test_range! {
+ fst_range_3,
+ min: Bound::Excluded(vec![b'a']), max: Bound::Excluded(vec![b'y']),
+ imin: 1, imax: 2,
+ "a", "b", "y", "z"
+}
+
+test_range! {
+ fst_range_4,
+ min: Bound::Unbounded, max: Bound::Unbounded,
+ imin: 0, imax: 4,
+ "a", "b", "y", "z"
+}
+
+test_range! {
+ fst_range_5,
+ min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded,
+ imin: 0, imax: 0,
+ "a", "ab", "abc", "abcd", "abcde"
+}
+
+test_range! {
+ fst_range_6,
+ min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded,
+ imin: 5, imax: 6,
+ "a", "ab", "abc", "abcd", "abcde", "abe"
+}
+
+test_range! {
+ fst_range_7,
+ min: Bound::Excluded(b"abd".to_vec()), max: Bound::Unbounded,
+ imin: 5, imax: 6,
+ "a", "ab", "abc", "abcd", "abcde", "abe"
+}
+
+test_range! {
+ fst_range_8,
+ min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded,
+ imin: 5, imax: 6,
+ "a", "ab", "abc", "abcd", "abcde", "xyz"
+}
+
+test_range! {
+ fst_range_9,
+ min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()),
+ imin: 0, imax: 5,
+ "a", "ab", "abc", "abcd", "abcde", "abe"
+}
+
+test_range! {
+ fst_range_10,
+ min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()),
+ imin: 0, imax: 6,
+ "a", "ab", "abc", "abcd", "abcde", "abd"
+}
+
+test_range! {
+ fst_range_11,
+ min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()),
+ imin: 0, imax: 6,
+ "a", "ab", "abc", "abcd", "abcde", "abd", "abdx"
+}
+
+test_range! {
+ fst_range_12,
+ min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()),
+ imin: 0, imax: 5,
+ "a", "ab", "abc", "abcd", "abcde", "abe"
+}
+
+test_range! {
+ fst_range_13,
+ min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()),
+ imin: 0, imax: 5,
+ "a", "ab", "abc", "abcd", "abcde", "abd"
+}
+
+test_range! {
+ fst_range_14,
+ min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()),
+ imin: 0, imax: 5,
+ "a", "ab", "abc", "abcd", "abcde", "abd", "abdx"
+}
+
+test_range! {
+ fst_range_15,
+ min: Bound::Included(vec![b'd']), max: Bound::Included(vec![b'c']),
+ imin: 0, imax: 0,
+ "a", "b", "c", "d", "e", "f"
+}
+
+test_range! {
+ fst_range_16,
+ min: Bound::Included(vec![b'c']), max: Bound::Included(vec![b'c']),
+ imin: 2, imax: 3,
+ "a", "b", "c", "d", "e", "f"
+}
+
+test_range! {
+ fst_range_17,
+ min: Bound::Excluded(vec![b'c']), max: Bound::Excluded(vec![b'c']),
+ imin: 0, imax: 0,
+ "a", "b", "c", "d", "e", "f"
+}
+
+test_range! {
+ fst_range_18,
+ min: Bound::Included(vec![b'c']), max: Bound::Excluded(vec![b'c']),
+ imin: 0, imax: 0,
+ "a", "b", "c", "d", "e", "f"
+}
+
+test_range! {
+ fst_range_19,
+ min: Bound::Included(vec![b'c']), max: Bound::Excluded(vec![b'd']),
+ imin: 2, imax: 3,
+ "a", "b", "c", "d", "e", "f"
+}
+
+#[test]
+fn one_vec_multiple_fsts() {
+ let mut bfst1 = Builder::memory();
+ bfst1.add(b"bar").unwrap();
+ bfst1.add(b"baz").unwrap();
+ let bytes = bfst1.into_inner().unwrap();
+ let fst1_len = bytes.len();
+
+ let mut bfst2 = Builder::new(bytes).unwrap();
+ bfst2.add(b"bar").unwrap();
+ bfst2.add(b"foo").unwrap();
+
+ let bytes = bfst2.into_inner().unwrap();
+ let slice1 = &bytes[0..fst1_len];
+ let slice2 = &bytes[fst1_len..bytes.len()];
+
+ let fst1 = Fst::new(slice1).unwrap();
+ let fst2 = Fst::new(slice2).unwrap();
+
+ assert_eq!(fst_inputs(&fst1), vec![b"bar".to_vec(), b"baz".to_vec()]);
+ assert_eq!(fst_inputs(&fst2), vec![b"bar".to_vec(), b"foo".to_vec()]);
+}
+
+#[test]
+fn bytes_written() {
+ let mut bfst1 = Builder::memory();
+ bfst1.add(b"bar").unwrap();
+ bfst1.add(b"baz").unwrap();
+ let counted_len = bfst1.bytes_written();
+ let bytes = bfst1.into_inner().unwrap();
+ let fst1_len = bytes.len() as u64;
+ let footer_size = 28;
+ assert_eq!(counted_len + footer_size, fst1_len);
+}
+
+#[test]
+fn get_key_simple() {
+ let map = fst_map(vec![("abc", 2), ("xyz", 3)]);
+ assert_eq!(map.get_key(0), None);
+ assert_eq!(map.get_key(1), None);
+ assert_eq!(map.get_key(2), Some(b"abc".to_vec()));
+ assert_eq!(map.get_key(3), Some(b"xyz".to_vec()));
+ assert_eq!(map.get_key(4), None);
+}
+
+#[test]
+fn get_key_words() {
+ let words: Vec<(Vec<u8>, u64)> = TEXT
+ .lines()
+ .enumerate()
+ .map(|(i, line)| (line.as_bytes().to_vec(), i as u64))
+ .collect();
+ let map = fst_map(words.clone());
+ for (key, value) in words {
+ assert_eq!(map.get_key(value), Some(key));
+ }
+}
+
+#[test]
+fn get_key_words_discontiguous() {
+ let words: Vec<(Vec<u8>, u64)> = TEXT
+ .lines()
+ .enumerate()
+ .map(|(i, line)| (line.as_bytes().to_vec(), i as u64 * 2))
+ .collect();
+ let map = fst_map(words.clone());
+ for (key, value) in words {
+ assert_eq!(map.get_key(value), Some(key));
+ }
+}
+
+#[test]
+fn verify_ok_nonempty() {
+ let words: Vec<(Vec<u8>, u64)> = TEXT
+ .lines()
+ .enumerate()
+ .map(|(i, line)| (line.as_bytes().to_vec(), i as u64 * 2))
+ .collect();
+ let map = fst_map(words.clone());
+ assert!(map.verify().is_ok());
+}
+
+#[test]
+fn verify_ok_empty() {
+ let map = fst_map(Vec::<(&str, u64)>::new());
+ assert!(map.verify().is_ok());
+}
+
+#[test]
+fn verify_err() {
+ let mut b = Builder::memory();
+ b.add(b"bar").unwrap();
+ b.add(b"baz").unwrap();
+ let mut bytes = b.into_inner().unwrap();
+
+ {
+ let fst = Fst::new(&bytes).unwrap();
+ assert!(fst.verify().is_ok());
+ }
+
+ bytes[17] = b'\xFF';
+ {
+ let fst = Fst::new(&bytes).unwrap();
+ assert!(fst.verify().is_err());
+ }
+}