diff options
Diffstat (limited to 'third_party/rust/deflate/src/output_writer.rs')
-rw-r--r-- | third_party/rust/deflate/src/output_writer.rs | 154 |
1 files changed, 154 insertions, 0 deletions
diff --git a/third_party/rust/deflate/src/output_writer.rs b/third_party/rust/deflate/src/output_writer.rs new file mode 100644 index 0000000000..508ba5c675 --- /dev/null +++ b/third_party/rust/deflate/src/output_writer.rs @@ -0,0 +1,154 @@ +use std::u16; + +use lzvalue::LZValue; +use huffman_table::{NUM_LITERALS_AND_LENGTHS, NUM_DISTANCE_CODES, END_OF_BLOCK_POSITION, + get_distance_code, get_length_code}; + +/// The type used for representing how many times a literal, length or distance code has been ouput +/// to the current buffer. +/// As we are limiting the blocks to be at most 2^16 bytes long, we can represent frequencies using +/// 16-bit values. +pub type FrequencyType = u16; + +/// The maximum number of literals/lengths in the buffer, which in practice also means the maximum +/// number of literals/lengths output before a new block is started. +/// This should not be larger than the maximum value `FrequencyType` can represent to prevent +/// overflowing (which would degrade, or in the worst case break compression). +pub const MAX_BUFFER_LENGTH: usize = 1024 * 31; + +#[derive(Debug, PartialEq)] +pub enum BufferStatus { + NotFull, + Full, +} + +/// Struct that buffers lz77 data and keeps track of the usage of different codes +pub struct DynamicWriter { + buffer: Vec<LZValue>, + // The two last length codes are not actually used, but only participates in code construction + // Therefore, we ignore them to get the correct number of lengths + frequencies: [FrequencyType; NUM_LITERALS_AND_LENGTHS], + distance_frequencies: [FrequencyType; NUM_DISTANCE_CODES], +} + +impl DynamicWriter { + #[inline] + pub fn check_buffer_length(&self) -> BufferStatus { + if self.buffer.len() >= MAX_BUFFER_LENGTH { + BufferStatus::Full + } else { + BufferStatus::NotFull + } + } + + #[inline] + pub fn write_literal(&mut self, literal: u8) -> BufferStatus { + debug_assert!(self.buffer.len() < MAX_BUFFER_LENGTH); + self.buffer.push(LZValue::literal(literal)); + self.frequencies[usize::from(literal)] += 1; + self.check_buffer_length() + } + + #[inline] + pub fn write_length_distance(&mut self, length: u16, distance: u16) -> BufferStatus { + self.buffer.push(LZValue::length_distance(length, distance)); + let l_code_num = get_length_code(length); + // As we limit the buffer to 2^16 values, this should be safe from overflowing. + if cfg!(debug_assertions) { + self.frequencies[l_code_num] += 1; + } else { + // #Safety + // None of the values in the table of length code numbers will give a value + // that is out of bounds. + // There is a test to ensure that these functions can not produce too large values. + unsafe { + *self.frequencies.get_unchecked_mut(l_code_num) += 1; + } + } + let d_code_num = get_distance_code(distance); + // The compiler seems to be able to evade the bounds check here somehow. + self.distance_frequencies[usize::from(d_code_num)] += 1; + self.check_buffer_length() + } + + pub fn buffer_length(&self) -> usize { + self.buffer.len() + } + + pub fn get_buffer(&self) -> &[LZValue] { + &self.buffer + } + + pub fn new() -> DynamicWriter { + let mut w = DynamicWriter { + buffer: Vec::with_capacity(MAX_BUFFER_LENGTH), + frequencies: [0; NUM_LITERALS_AND_LENGTHS], + distance_frequencies: [0; NUM_DISTANCE_CODES], + }; + // This will always be 1, + // since there will always only be one end of block marker in each block + w.frequencies[END_OF_BLOCK_POSITION] = 1; + w + } + + /// Special output function used with RLE compression + /// that avoids bothering to lookup a distance code. + #[inline] + pub fn write_length_rle(&mut self, length: u16) -> BufferStatus { + self.buffer.push(LZValue::length_distance(length, 1)); + let l_code_num = get_length_code(length); + // As we limit the buffer to 2^16 values, this should be safe from overflowing. + if cfg!(debug_assertions) { + self.frequencies[l_code_num] += 1; + } else { + // #Safety + // None of the values in the table of length code numbers will give a value + // that is out of bounds. + // There is a test to ensure that these functions won't produce too large values. + unsafe { + *self.frequencies.get_unchecked_mut(l_code_num) += 1; + } + } + self.distance_frequencies[0] += 1; + self.check_buffer_length() + } + + pub fn get_frequencies(&self) -> (&[u16], &[u16]) { + (&self.frequencies, &self.distance_frequencies) + } + + pub fn clear_frequencies(&mut self) { + self.frequencies = [0; NUM_LITERALS_AND_LENGTHS]; + self.distance_frequencies = [0; NUM_DISTANCE_CODES]; + self.frequencies[END_OF_BLOCK_POSITION] = 1; + } + + pub fn clear_data(&mut self) { + self.buffer.clear() + } + + pub fn clear(&mut self) { + self.clear_frequencies(); + self.clear_data(); + } +} + +#[cfg(test)] +mod test { + use super::*; + use huffman_table::{get_length_code, get_distance_code}; + #[test] + /// Ensure that these function won't produce values that would overflow the output_writer + /// tables since we use some unsafe indexing. + fn array_bounds() { + let w = DynamicWriter::new(); + + for i in 0..u16::max_value() { + get_length_code(i) < w.frequencies.len(); + } + + for i in 0..u16::max_value() { + get_distance_code(i) < w.distance_frequencies.len() as u8; + } + } +} |