Diffstat (limited to 'vendor/snap/src/compress.rs')
-rw-r--r-- | vendor/snap/src/compress.rs | 539
1 file changed, 539 insertions, 0 deletions
diff --git a/vendor/snap/src/compress.rs b/vendor/snap/src/compress.rs
new file mode 100644
index 000000000..1a6638df4
--- /dev/null
+++ b/vendor/snap/src/compress.rs
@@ -0,0 +1,539 @@
+use std::fmt;
+use std::ops::{Deref, DerefMut};
+use std::ptr;
+
+use crate::bytes;
+use crate::error::{Error, Result};
+use crate::{MAX_BLOCK_SIZE, MAX_INPUT_SIZE};
+
+/// The total number of slots we permit for our hash table of 4 byte repeat
+/// sequences.
+const MAX_TABLE_SIZE: usize = 1 << 14;
+
+/// The size of a small hash table. This is useful for reducing overhead when
+/// compressing very small blocks of bytes.
+const SMALL_TABLE_SIZE: usize = 1 << 10;
+
+/// The total number of bytes that we always leave uncompressed at the end
+/// of the buffer. This in particular affords us some wiggle room during
+/// compression such that faster copy operations can be used.
+const INPUT_MARGIN: usize = 16 - 1;
+
+/// The minimum block size that we're willing to consider for compression.
+/// Anything smaller than this gets emitted as a literal.
+const MIN_NON_LITERAL_BLOCK_SIZE: usize = 1 + 1 + INPUT_MARGIN;
+
+/// Nice names for the various Snappy tags.
+enum Tag {
+    Literal = 0b00,
+    Copy1 = 0b01,
+    Copy2 = 0b10,
+    // Compression never actually emits a Copy4 operation and decompression
+    // uses tricks so that we never explicitly do case analysis on the copy
+    // operation type, therefore leading to the fact that we never use Copy4.
+    #[allow(dead_code)]
+    Copy4 = 0b11,
+}
+
+/// Returns the maximum compressed size given the uncompressed size.
+///
+/// If the uncompressed size exceeds the maximum allowable size then this
+/// returns 0.
+pub fn max_compress_len(input_len: usize) -> usize {
+    let input_len = input_len as u64;
+    if input_len > MAX_INPUT_SIZE {
+        return 0;
+    }
+    let max = 32 + input_len + (input_len / 6);
+    if max > MAX_INPUT_SIZE {
+        0
+    } else {
+        max as usize
+    }
+}
+
+/// Encoder is a raw encoder for compressing bytes in the Snappy format.
+///
+/// The encoder does not use the Snappy frame format and simply compresses the
+/// given bytes in one big Snappy block (that is, it has a single header).
+///
+/// Unless you explicitly need the low-level control, you should use
+/// [`read::FrameEncoder`](../read/struct.FrameEncoder.html)
+/// or
+/// [`write::FrameEncoder`](../write/struct.FrameEncoder.html)
+/// instead, which compresses to the Snappy frame format.
+///
+/// It is beneficial to reuse an Encoder when possible.
+pub struct Encoder {
+    small: [u16; SMALL_TABLE_SIZE],
+    big: Vec<u16>,
+}
+
+impl fmt::Debug for Encoder {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "Encoder(...)")
+    }
+}
+
+impl Encoder {
+    /// Return a new encoder that can be used for compressing bytes.
+    pub fn new() -> Encoder {
+        Encoder { small: [0; SMALL_TABLE_SIZE], big: vec![] }
+    }
+
+    /// Compresses all bytes in `input` into `output`.
+    ///
+    /// `input` can be any arbitrary sequence of bytes.
+    ///
+    /// `output` must be large enough to hold the maximum possible compressed
+    /// size of `input`, which can be computed using `max_compress_len`.
+    ///
+    /// On success, this returns the number of bytes written to `output`.
+    ///
+    /// # Errors
+    ///
+    /// This method returns an error in the following circumstances:
+    ///
+    /// * The total number of bytes to compress exceeds `2^32 - 1`.
+    /// * `output` has length less than `max_compress_len(input.len())`.
+    pub fn compress(
+        &mut self,
+        mut input: &[u8],
+        output: &mut [u8],
+    ) -> Result<usize> {
+        match max_compress_len(input.len()) {
+            0 => {
+                return Err(Error::TooBig {
+                    given: input.len() as u64,
+                    max: MAX_INPUT_SIZE,
+                });
+            }
+            min if output.len() < min => {
+                return Err(Error::BufferTooSmall {
+                    given: output.len() as u64,
+                    min: min as u64,
+                });
+            }
+            _ => {}
+        }
+        // Handle an edge case specially.
+        if input.is_empty() {
+            // Encodes a varint of 0, denoting the total size of uncompressed
+            // bytes.
+            output[0] = 0;
+            return Ok(1);
+        }
+        // Write the Snappy header, which is just the total number of
+        // uncompressed bytes.
+        let mut d = bytes::write_varu64(output, input.len() as u64);
+        while !input.is_empty() {
+            // Find the next block.
+            let mut src = input;
+            if src.len() > MAX_BLOCK_SIZE {
+                src = &src[..MAX_BLOCK_SIZE as usize];
+            }
+            input = &input[src.len()..];
+
+            // If the block is smallish, then don't waste time on it and just
+            // emit a literal.
+            let mut block = Block::new(src, output, d);
+            if block.src.len() < MIN_NON_LITERAL_BLOCK_SIZE {
+                let lit_end = block.src.len();
+                unsafe {
+                    // SAFETY: next_emit is zero (in bounds) and the end is
+                    // the length of the block (in bounds).
+                    block.emit_literal(lit_end);
+                }
+            } else {
+                let table = self.block_table(block.src.len());
+                block.compress(table);
+            }
+            d = block.d;
+        }
+        Ok(d)
+    }
+
+    /// Compresses all bytes in `input` into a freshly allocated `Vec`.
+    ///
+    /// This is just like the `compress` method, except it allocates a `Vec`
+    /// with the right size for you. (This is intended to be a convenience
+    /// method.)
+    ///
+    /// This method returns an error under the same circumstances that
+    /// `compress` does.
+    pub fn compress_vec(&mut self, input: &[u8]) -> Result<Vec<u8>> {
+        let mut buf = vec![0; max_compress_len(input.len())];
+        let n = self.compress(input, &mut buf)?;
+        buf.truncate(n);
+        Ok(buf)
+    }
+}
+
+struct Block<'s, 'd> {
+    src: &'s [u8],
+    s: usize,
+    s_limit: usize,
+    dst: &'d mut [u8],
+    d: usize,
+    next_emit: usize,
+}
+
+impl<'s, 'd> Block<'s, 'd> {
+    #[inline(always)]
+    fn new(src: &'s [u8], dst: &'d mut [u8], d: usize) -> Block<'s, 'd> {
+        Block {
+            src: src,
+            s: 0,
+            s_limit: src.len(),
+            dst: dst,
+            d: d,
+            next_emit: 0,
+        }
+    }
+
+    #[inline(always)]
+    fn compress(&mut self, mut table: BlockTable<'_>) {
+        debug_assert!(!table.is_empty());
+        debug_assert!(self.src.len() >= MIN_NON_LITERAL_BLOCK_SIZE);
+
+        self.s += 1;
+        self.s_limit -= INPUT_MARGIN;
+        let mut next_hash =
+            table.hash(bytes::read_u32_le(&self.src[self.s..]));
+        loop {
+            let mut skip = 32;
+            let mut candidate;
+            let mut s_next = self.s;
+            loop {
+                self.s = s_next;
+                let bytes_between_hash_lookups = skip >> 5;
+                s_next = self.s + bytes_between_hash_lookups;
+                skip += bytes_between_hash_lookups;
+                if s_next > self.s_limit {
+                    return self.done();
+                }
+                unsafe {
+                    // SAFETY: next_hash is always computed by table.hash
+                    // which is guaranteed to be in bounds.
+                    candidate = *table.get_unchecked(next_hash) as usize;
+                    *table.get_unchecked_mut(next_hash) = self.s as u16;
+
+                    let srcp = self.src.as_ptr();
+                    // SAFETY: s_next is guaranteed to be less than s_limit by
+                    // the conditional above, which implies s_next is in
+                    // bounds.
+                    let x = bytes::loadu_u32_le(srcp.add(s_next));
+                    next_hash = table.hash(x);
+                    // SAFETY: self.s is always less than s_next, so it is also
+                    // in bounds by the argument above.
+                    //
+                    // candidate is extracted from table, which is only ever
+                    // set to valid positions in the block and is therefore
+                    // also in bounds.
+                    //
+                    // We only need to compare cur/cand for equality, so we
+                    // don't need to bother with endianness. cur corresponds to
+                    // the bytes at the current position and cand corresponds
+                    // to a potential match. If they're equal, we declare
+                    // victory and move below to try and extend the match.
+                    let cur = bytes::loadu_u32_ne(srcp.add(self.s));
+                    let cand = bytes::loadu_u32_ne(srcp.add(candidate));
+                    if cur == cand {
+                        break;
+                    }
+                }
+            }
+            // While the above found a candidate for compression, before we
+            // emit a copy operation for it, we need to make sure that we emit
+            // any bytes between the last copy operation and this one as a
+            // literal.
+            let lit_end = self.s;
+            unsafe {
+                // SAFETY: next_emit is set to a previous value of self.s,
+                // which is guaranteed to be less than s_limit (in bounds).
+                // lit_end is set to the current value of self.s, also
+                // guaranteed to be less than s_limit (in bounds).
+                self.emit_literal(lit_end);
+            }
+            loop {
+                // Look for more matching bytes starting at the position of
+                // the candidate and the current src position. We increment
+                // self.s and candidate by 4 since we already know the first 4
+                // bytes match.
+                let base = self.s;
+                self.s += 4;
+                unsafe {
+                    // SAFETY: candidate is always set to a value from our
+                    // hash table, which only contains positions in self.src
+                    // that have been seen for this block that occurred before
+                    // self.s.
+                    self.extend_match(candidate + 4);
+                }
+                let (offset, len) = (base - candidate, self.s - base);
+                self.emit_copy(offset, len);
+                self.next_emit = self.s;
+                if self.s >= self.s_limit {
+                    return self.done();
+                }
+                // Update the hash table with the byte sequences
+                // self.src[self.s - 1..self.s + 3] and
+                // self.src[self.s..self.s + 4]. Instead of reading 4 bytes
+                // twice, we read 8 bytes once.
+                //
+                // If we happen to get a hit on self.src[self.s..self.s + 4],
+                // then continue this loop and extend the match.
+                unsafe {
+                    let srcp = self.src.as_ptr();
+                    // SAFETY: self.s can never exceed s_limit given by the
+                    // conditional above and self.s is guaranteed to be
+                    // non-zero and is therefore in bounds.
+                    let x = bytes::loadu_u64_le(srcp.add(self.s - 1));
+                    // The lower 4 bytes of x correspond to
+                    // self.src[self.s - 1..self.s + 3].
+                    let prev_hash = table.hash(x as u32);
+                    // SAFETY: Hash values are guaranteed to be in bounds.
+                    *table.get_unchecked_mut(prev_hash) = (self.s - 1) as u16;
+                    // The lower 4 bytes of x>>8 correspond to
+                    // self.src[self.s..self.s + 4].
+                    let cur_hash = table.hash((x >> 8) as u32);
+                    // SAFETY: Hash values are guaranteed to be in bounds.
+                    candidate = *table.get_unchecked(cur_hash) as usize;
+                    *table.get_unchecked_mut(cur_hash) = self.s as u16;
+
+                    // SAFETY: candidate is set from table, which always
+                    // contains valid positions in the current block.
+                    let y = bytes::loadu_u32_le(srcp.add(candidate));
+                    if (x >> 8) as u32 != y {
+                        // If we didn't get a hit, update the next hash
+                        // and move on. Our initial 8 byte read continues to
+                        // pay off.
+                        next_hash = table.hash((x >> 16) as u32);
+                        self.s += 1;
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    /// Emits one or more copy operations with the given offset and length.
+    /// offset must be in the range [1, 65535] and len must be in the range
+    /// [4, 65535].
+    #[inline(always)]
+    fn emit_copy(&mut self, offset: usize, mut len: usize) {
+        debug_assert!(1 <= offset && offset <= 65535);
+        // Copy operations only allow lengths up to 64, but we'll allow bigger
+        // lengths and emit as many operations as we need.
+        //
+        // N.B. Since our block size is 64KB, we never actually emit a copy 4
+        // operation.
+        debug_assert!(4 <= len && len <= 65535);
+
+        // Emit copy 2 operations until we don't have to.
+        // We check on 68 here and emit a shorter copy than 64 below because
+        // it is cheaper to, e.g., encode a length 67 copy as a length 60
+        // copy 2 followed by a length 7 copy 1 than to encode it as a length
+        // 64 copy 2 followed by a length 3 copy 2. The key here is that a
+        // copy 1 operation requires at least length 4 which forces a length 3
+        // copy to use a copy 2 operation.
+        while len >= 68 {
+            self.emit_copy2(offset, 64);
+            len -= 64;
+        }
+        if len > 64 {
+            self.emit_copy2(offset, 60);
+            len -= 60;
+        }
+        // If we can squeeze the last copy into a copy 1 operation, do it.
+        if len <= 11 && offset <= 2047 {
+            self.dst[self.d] = (((offset >> 8) as u8) << 5)
+                | (((len - 4) as u8) << 2)
+                | (Tag::Copy1 as u8);
+            self.dst[self.d + 1] = offset as u8;
+            self.d += 2;
+        } else {
+            self.emit_copy2(offset, len);
+        }
+    }
+
+    /// Emits a "copy 2" operation with the given offset and length. The
+    /// offset and length must be valid for a copy 2 operation. i.e., offset
+    /// must be in the range [1, 65535] and len must be in the range [1, 64].
+    #[inline(always)]
+    fn emit_copy2(&mut self, offset: usize, len: usize) {
+        debug_assert!(1 <= offset && offset <= 65535);
+        debug_assert!(1 <= len && len <= 64);
+        self.dst[self.d] = (((len - 1) as u8) << 2) | (Tag::Copy2 as u8);
+        bytes::write_u16_le(offset as u16, &mut self.dst[self.d + 1..]);
+        self.d += 3;
+    }
+
+    /// Attempts to extend a match from the current position in self.src with
+    /// the candidate position given.
+    ///
+    /// This method uses unaligned loads and elides bounds checks, so the
+    /// caller must guarantee that cand points to a valid location in self.src
+    /// and is less than the current position in src.
+    #[inline(always)]
+    unsafe fn extend_match(&mut self, mut cand: usize) {
+        debug_assert!(cand < self.s);
+        while self.s + 8 <= self.src.len() {
+            let srcp = self.src.as_ptr();
+            // SAFETY: The loop invariant guarantees that there are at least
+            // 8 bytes to read at self.src + self.s. Since cand must be
+            // guaranteed by the caller to be valid and less than self.s, it
+            // also has enough room to read 8 bytes.
+            //
+            // TODO(ag): Despite my best efforts, I couldn't get this to
+            // autovectorize with 128-bit loads. The logic after the loads
+            // appears to be a little too clever...
+            let x = bytes::loadu_u64_ne(srcp.add(self.s));
+            let y = bytes::loadu_u64_ne(srcp.add(cand));
+            if x == y {
+                // If all 8 bytes are equal, move on...
+                self.s += 8;
+                cand += 8;
+            } else {
+                // Otherwise, find the last byte that was equal. We can do
+                // this efficiently by interpreting x/y as little endian
+                // numbers, which lets us use the number of trailing zeroes
+                // as a proxy for the number of equivalent bits (after an XOR).
+                let z = x.to_le() ^ y.to_le();
+                self.s += z.trailing_zeros() as usize / 8;
+                return;
+            }
+        }
+        // When we have fewer than 8 bytes left in the block, fall back to the
+        // slow loop.
+        while self.s < self.src.len() && self.src[self.s] == self.src[cand] {
+            self.s += 1;
+            cand += 1;
+        }
+    }
+
+    /// Executes any cleanup when the current block has finished compressing.
+    /// In particular, it emits any leftover bytes as a literal.
+    #[inline(always)]
+    fn done(&mut self) {
+        if self.next_emit < self.src.len() {
+            let lit_end = self.src.len();
+            unsafe {
+                // SAFETY: Both next_emit and lit_end are trivially in bounds
+                // given the conditional and definition above.
+                self.emit_literal(lit_end);
+            }
+        }
+    }
+
+    /// Emits a literal from self.src[self.next_emit..lit_end].
+    ///
+    /// This uses unaligned loads and elides bounds checks, so the caller must
+    /// guarantee that self.src[self.next_emit..lit_end] is valid.
+    #[inline(always)]
+    unsafe fn emit_literal(&mut self, lit_end: usize) {
+        let lit_start = self.next_emit;
+        let len = lit_end - lit_start;
+        let n = len.checked_sub(1).unwrap();
+        if n <= 59 {
+            self.dst[self.d] = ((n as u8) << 2) | (Tag::Literal as u8);
+            self.d += 1;
+            if len <= 16 && lit_start + 16 <= self.src.len() {
+                // SAFETY: lit_start is equivalent to self.next_emit, which is
+                // only set to self.s immediately following a copy emit. The
+                // conditional above also ensures that there are at least 16
+                // bytes of room in both src and dst.
+                //
+                // dst is big enough because the buffer is guaranteed to
+                // be big enough to hold the biggest possible compressed size
+                // plus an extra 32 bytes, which exceeds the 16 byte copy here.
+                let srcp = self.src.as_ptr().add(lit_start);
+                let dstp = self.dst.as_mut_ptr().add(self.d);
+                ptr::copy_nonoverlapping(srcp, dstp, 16);
+                self.d += len;
+                return;
+            }
+        } else if n < 256 {
+            self.dst[self.d] = (60 << 2) | (Tag::Literal as u8);
+            self.dst[self.d + 1] = n as u8;
+            self.d += 2;
+        } else {
+            self.dst[self.d] = (61 << 2) | (Tag::Literal as u8);
+            bytes::write_u16_le(n as u16, &mut self.dst[self.d + 1..]);
+            self.d += 3;
+        }
+        // SAFETY: lit_start is equivalent to self.next_emit, which is only set
+        // to self.s immediately following a copy, which implies that it always
+        // points to valid bytes in self.src.
+        //
+        // We can't guarantee that there are at least len bytes though, which
+        // must be guaranteed by the caller and is why this method is unsafe.
+        let srcp = self.src.as_ptr().add(lit_start);
+        let dstp = self.dst.as_mut_ptr().add(self.d);
+        ptr::copy_nonoverlapping(srcp, dstp, len);
+        self.d += len;
+    }
+}
+
+/// `BlockTable` is a map from 4 byte sequences to positions of their most
+/// recent occurrence in a block. In particular, this table lets us quickly
+/// find candidates for compression.
+///
+/// We expose the `hash` method so that callers can be fastidious about the
+/// number of times a hash is computed.
+struct BlockTable<'a> {
+    table: &'a mut [u16],
+    /// The number of bits required to shift the hash such that the result
+    /// is less than table.len().
+    shift: u32,
+}
+
+impl Encoder {
+    fn block_table(&mut self, block_size: usize) -> BlockTable<'_> {
+        let mut shift: u32 = 32 - 8;
+        let mut table_size = 256;
+        while table_size < MAX_TABLE_SIZE && table_size < block_size {
+            shift -= 1;
+            table_size *= 2;
+        }
+        // If our block size is small, then use a small stack allocated table
+        // instead of putting a bigger one on the heap. This particular
+        // optimization is important if the caller is using Snappy to compress
+        // many small blocks. (The memset savings alone is considerable.)
+        let table: &mut [u16] = if table_size <= SMALL_TABLE_SIZE {
+            &mut self.small[0..table_size]
+        } else {
+            if self.big.is_empty() {
+                // Interestingly, using `self.big.resize` here led to some
+                // very weird code getting generated that led to a large
+                // slow down. Forcing the issue with a new vec seems to
+                // fix it. ---AG
+                self.big = vec![0; MAX_TABLE_SIZE];
+            }
+            &mut self.big[0..table_size]
+        };
+        for x in &mut *table {
+            *x = 0;
+        }
+        BlockTable { table: table, shift: shift }
+    }
+}
+
+impl<'a> BlockTable<'a> {
+    #[inline(always)]
+    fn hash(&self, x: u32) -> usize {
+        (x.wrapping_mul(0x1E35A7BD) >> self.shift) as usize
+    }
+}
+
+impl<'a> Deref for BlockTable<'a> {
+    type Target = [u16];
+    fn deref(&self) -> &[u16] {
+        self.table
+    }
+}
+
+impl<'a> DerefMut for BlockTable<'a> {
+    fn deref_mut(&mut self) -> &mut [u16] {
+        self.table
+    }
+}
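For orientation, a minimal usage sketch (not part of the diff above): it drives the raw encoder the way the `compress` doc comments describe, and assumes the crate exposes this module through the public `snap::raw` API as the upstream `snap` crate does.

// Hedged usage sketch; assumes snap::raw::{Encoder, max_compress_len} and
// snap::Error from the public crate API.
fn compress_block(input: &[u8]) -> Result<Vec<u8>, snap::Error> {
    // Allocate the worst-case output size up front, per the `compress`
    // contract documented above.
    let mut output = vec![0; snap::raw::max_compress_len(input.len())];
    let mut enc = snap::raw::Encoder::new();
    // `compress` returns the number of bytes actually written to `output`.
    let n = enc.compress(input, &mut output)?;
    output.truncate(n);
    Ok(output)
}

Equivalently, `compress_vec` performs the allocation and truncation internally.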
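The bit-packing in `emit_copy`/`emit_copy2` is easier to follow as a standalone sketch. The helper names below are invented for illustration only and mirror the arithmetic in the diff.

// Illustrative only; not part of the vendored file.
const TAG_COPY1: u8 = 0b01;
const TAG_COPY2: u8 = 0b10;

// Copy 1: len in [4, 11], offset in [1, 2047], encoded in 2 bytes. The high
// 3 bits of the offset share the tag byte; the low 8 bits follow.
fn encode_copy1(offset: u16, len: u8) -> [u8; 2] {
    [(((offset >> 8) as u8) << 5) | ((len - 4) << 2) | TAG_COPY1, offset as u8]
}

// Copy 2: len in [1, 64], offset in [1, 65535], encoded as a tag byte
// followed by the offset as a little-endian u16.
fn encode_copy2(offset: u16, len: u8) -> [u8; 3] {
    let [lo, hi] = offset.to_le_bytes();
    [((len - 1) << 2) | TAG_COPY2, lo, hi]
}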
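Likewise, a sketch of the literal header layout used by `emit_literal`: the length is stored as `len - 1`, either directly in the tag byte or, for tag values 60 and 61, in one or two trailing length bytes. The helper name is invented for this example.

// Illustrative only; encodes just the literal header, after which the
// literal bytes themselves are copied verbatim.
const TAG_LITERAL: u8 = 0b00;

fn encode_literal_header(len: usize) -> Vec<u8> {
    let n = len - 1;
    if n <= 59 {
        // Length fits directly in the tag byte.
        vec![((n as u8) << 2) | TAG_LITERAL]
    } else if n < 256 {
        // Tag value 60: one extra length byte follows.
        vec![(60 << 2) | TAG_LITERAL, n as u8]
    } else {
        // Tag value 61: two extra length bytes, little endian. Blocks are at
        // most 64KB, so a u16 length always suffices here.
        let [lo, hi] = (n as u16).to_le_bytes();
        vec![(61 << 2) | TAG_LITERAL, lo, hi]
    }
}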