//! This module contains functionality for doing lz77 compression of data. #![macro_use] use std::cmp; use std::ops::{Range, RangeFrom}; use std::iter::{self, Iterator}; use std::slice::Iter; use std::fmt; use input_buffer::InputBuffer; use matching::longest_match; #[cfg(test)] use lzvalue::{LZValue, LZType}; use huffman_table; use chained_hash_table::{ChainedHashTable, update_hash}; #[cfg(test)] use compression_options::{HIGH_MAX_HASH_CHECKS, HIGH_LAZY_IF_LESS_THAN}; use output_writer::{BufferStatus, DynamicWriter}; use compress::Flush; use rle::process_chunk_greedy_rle; const MAX_MATCH: usize = huffman_table::MAX_MATCH as usize; const MIN_MATCH: usize = huffman_table::MIN_MATCH as usize; const NO_RLE: u16 = 43212; /// An enum describing whether we use lazy or greedy matching. #[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum MatchingType { /// Use greedy matching: the matching algorithm simply uses a match right away /// if found. Greedy, /// Use lazy matching: after finding a match, the next input byte is checked, to see /// if there is a better match starting at that byte. /// /// As a special case, if max_hash_checks is set to 0, compression using only run-length /// (i.e maximum match distance of 1) is performed instead. Lazy, } impl fmt::Display for MatchingType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { MatchingType::Greedy => write!(f, "Greedy matching"), MatchingType::Lazy => write!(f, "Lazy matching"), } } } /// A struct that contains the hash table, and keeps track of where we are in the input data pub struct LZ77State { /// Struct containing hash chains that will be used to find matches. hash_table: ChainedHashTable, /// True if this is the first window that is being processed. is_first_window: bool, /// Set to true when the last block has been processed. is_last_block: bool, /// How many bytes the last match in the previous window extended into the current one. overlap: usize, /// How many bytes of input the current block contains. current_block_input_bytes: u64, /// The maximum number of hash entries to search. max_hash_checks: u16, /// Only lazy match if we have a match length less than this. lazy_if_less_than: u16, /// Whether to use greedy or lazy parsing matching_type: MatchingType, /// Keep track of the previous match and byte in case the buffer is full when lazy matching. match_state: ChunkState, /// Keep track of how many bytes in the lookahead that was part of a match, but has not been /// added to the hash chain yet. bytes_to_hash: usize, /// Keep track of if sync flush was used. If this is the case, the two first bytes needs to be /// hashed. was_synced: bool, } impl LZ77State { /// Creates a new LZ77 state pub fn new( max_hash_checks: u16, lazy_if_less_than: u16, matching_type: MatchingType, ) -> LZ77State { LZ77State { hash_table: ChainedHashTable::new(), is_first_window: true, is_last_block: false, overlap: 0, current_block_input_bytes: 0, max_hash_checks: max_hash_checks, lazy_if_less_than: lazy_if_less_than, matching_type: matching_type, match_state: ChunkState::new(), bytes_to_hash: 0, was_synced: false, } } /// Resets the state excluding max_hash_checks and lazy_if_less_than pub fn reset(&mut self) { self.hash_table.reset(); self.is_first_window = true; self.is_last_block = false; self.overlap = 0; self.current_block_input_bytes = 0; self.match_state = ChunkState::new(); self.bytes_to_hash = 0 } pub fn set_last(&mut self) { self.is_last_block = true; } /// Is this the last block we are outputting? pub fn is_last_block(&self) -> bool { self.is_last_block } /// How many bytes of input the current block contains. pub fn current_block_input_bytes(&self) -> u64 { self.current_block_input_bytes } /// Sets the number of input bytes for the current block to 0. pub fn reset_input_bytes(&mut self) { self.current_block_input_bytes = 0; } /// Is there a buffered byte that has not been output yet? pub fn pending_byte(&self) -> bool { self.match_state.add } /// Returns 1 if pending_byte is true, 0 otherwise. pub fn pending_byte_as_num(&self) -> usize { // This could be implemented by using `as usize` as the documentation states this would give // the same result, but not sure if that should be relied upon. if self.match_state.add { 1 } else { 0 } } } const DEFAULT_WINDOW_SIZE: usize = 32768; #[derive(Debug)] /// Status after calling `process_chunk`. pub enum ProcessStatus { /// All the input data was processed. Ok, /// The output buffer was full. /// /// The argument is the position of the next byte to be checked by `process_chunk`. BufferFull(usize), } #[derive(Debug)] /// A struct to keep track of status between calls of `process_chunk_lazy` /// /// This is needed as the output buffer might become full before having output all pending data. pub struct ChunkState { /// Length of the last match that was found, if any. current_length: u16, /// Distance of the last match that was found, if any. current_distance: u16, /// The previous byte checked in process_chunk. prev_byte: u8, /// The current byte being checked in process_chunk. cur_byte: u8, /// Whether prev_byte still needs to be output. add: bool, } impl ChunkState { pub fn new() -> ChunkState { ChunkState { current_length: 0, current_distance: 0, prev_byte: 0, cur_byte: 0, add: false, } } } pub fn buffer_full(position: usize) -> ProcessStatus { ProcessStatus::BufferFull(position) } fn process_chunk( data: &[u8], iterated_data: &Range, mut match_state: &mut ChunkState, hash_table: &mut ChainedHashTable, writer: &mut DynamicWriter, max_hash_checks: u16, lazy_if_less_than: usize, matching_type: MatchingType, ) -> (usize, ProcessStatus) { let avoid_rle = if cfg!(test) { // Avoid RLE if lazy_if_less than is a specific value. // This is used in some tests, ideally we should probably do this in a less clunky way, // but we use a value here that is higher than the maximum sensible one anyhow, and will // be truncated by deflate_state for calls from outside the library. lazy_if_less_than == NO_RLE as usize } else { false }; match matching_type { MatchingType::Greedy => { process_chunk_greedy(data, iterated_data, hash_table, writer, max_hash_checks) } MatchingType::Lazy => { if max_hash_checks > 0 || avoid_rle { process_chunk_lazy( data, iterated_data, &mut match_state, hash_table, writer, max_hash_checks, lazy_if_less_than, ) } else { // Use the RLE method if max_hash_checks is set to 0. process_chunk_greedy_rle(data, iterated_data, writer) } } } } /// Add the specified number of bytes to the hash table from the iterators /// adding `start` to the position supplied to the hash table. fn add_to_hash_table( bytes_to_add: usize, insert_it: &mut iter::Zip, Iter>, hash_it: &mut Iter, hash_table: &mut ChainedHashTable, ) { let taker = insert_it.by_ref().take(bytes_to_add); let mut hash_taker = hash_it.by_ref().take(bytes_to_add); // Update the hash manually here to keep it in a register. let mut hash = hash_table.current_hash(); // Advance the iterators and add the bytes we jump over to the hash table and // checksum for (ipos, _) in taker { if let Some(&i_hash_byte) = hash_taker.next() { hash = update_hash(hash, i_hash_byte); hash_table.add_with_hash(ipos, hash); } } // Write the hash back once we are done. hash_table.set_hash(hash); } /// Write the specified literal `byte` to the writer `w`, and return /// `ProcessStatus::BufferFull($pos)` if the buffer is full after writing. /// /// `pos` should indicate the byte to start at in the next call to `process_chunk`, /// `is_hashed` should be set to true of the byte at pos has been added to the hash chain. macro_rules! write_literal{ ($w:ident, $byte:expr, $pos:expr) => { let b_status = $w.write_literal($byte); if let BufferStatus::Full = b_status { return (0, buffer_full($pos)); } }; } /// If the match is only 3 bytes long and the distance is more than 8 * 1024, it's likely to take /// up more space than it would save. #[inline] fn match_too_far(match_len: usize, match_dist: usize) -> bool { const TOO_FAR: usize = 8 * 1024; match_len == MIN_MATCH && match_dist > TOO_FAR } ///Create the iterators used when processing through a chunk of data. fn create_iterators<'a>( data: &'a [u8], iterated_data: &Range, ) -> ( usize, iter::Zip, Iter<'a, u8>>, Iter<'a, u8>, ) { let end = cmp::min(data.len(), iterated_data.end); let start = iterated_data.start; let current_chunk = &data[start..end]; let insert_it = (start..).zip(current_chunk.iter()); let hash_it = { let hash_start = if data.len() - start > 2 { start + 2 } else { data.len() }; (&data[hash_start..]).iter() }; (end, insert_it, hash_it) } fn process_chunk_lazy( data: &[u8], iterated_data: &Range, state: &mut ChunkState, mut hash_table: &mut ChainedHashTable, writer: &mut DynamicWriter, max_hash_checks: u16, lazy_if_less_than: usize, ) -> (usize, ProcessStatus) { let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data); const NO_LENGTH: u16 = 0; // The previous match length, if any. let mut prev_length = state.current_length; // The distance of the previous match if any. let mut prev_distance = state.current_distance; state.current_length = 0; state.current_distance = 0; // The number of bytes past end that was added due to finding a match that extends into // the lookahead window. let mut overlap = 0; // Set to true if we found a match that is equal to or longer than `lazy_if_less_than`, // indicating that we won't lazy match (check for a better match at the next byte). // If we had a good match, carry this over from the previous call. let mut ignore_next = prev_length as usize >= lazy_if_less_than; // This is to output the correct byte in case there is one pending to be output // from the previous call. state.prev_byte = state.cur_byte; // Iterate through the slice, adding literals or length/distance pairs while let Some((position, &b)) = insert_it.next() { state.cur_byte = b; if let Some(&hash_byte) = hash_it.next() { hash_table.add_hash_value(position, hash_byte); // Only lazy match if we have a match shorter than a set value // TODO: This should be cleaned up a bit if !ignore_next { let (mut match_len, match_dist) = { // If there already was a decent match at the previous byte // and we are lazy matching, do less match checks in this step. let max_hash_checks = if prev_length >= 32 { max_hash_checks >> 2 } else { max_hash_checks }; // Check if we can find a better match here than the one we had at // the previous byte. longest_match( data, hash_table, position, prev_length as usize, max_hash_checks, ) }; // If the match is only 3 bytes long and very far back, it's probably not worth // outputting. if match_too_far(match_len, match_dist) { match_len = NO_LENGTH as usize; }; if match_len >= lazy_if_less_than { // We found a decent match, so we won't check for a better one at the next byte. ignore_next = true; } state.current_length = match_len as u16; state.current_distance = match_dist as u16; } else { // We already had a decent match, so we don't bother checking for another one. state.current_length = NO_LENGTH; state.current_distance = 0; // Make sure we check again next time. ignore_next = false; }; if prev_length >= state.current_length && prev_length >= MIN_MATCH as u16 { // The previous match was better so we add it. // Casting note: length and distance is already bounded by the longest match // function. Usize is just used for convenience. let b_status = writer.write_length_distance(prev_length as u16, prev_distance as u16); // We add the bytes to the hash table and checksum. // Since we've already added two of them, we need to add two less than // the length. let bytes_to_add = prev_length - 2; add_to_hash_table( bytes_to_add as usize, &mut insert_it, &mut hash_it, &mut hash_table, ); // If the match is longer than the current window, we have note how many // bytes we overlap, since we don't need to do any matching on these bytes // in the next call of this function. // We don't have to worry about setting overlap to 0 if this is false, as the // function will stop after this condition is true, and overlap is not altered // elsewhere. if position + prev_length as usize > end { // We need to subtract 1 since the byte at pos is also included. overlap = position + prev_length as usize - end - 1; }; state.add = false; // Note that there is no current match. state.current_length = 0; state.current_distance = 0; if let BufferStatus::Full = b_status { // MATCH(lazy) return (overlap, buffer_full(position + prev_length as usize - 1)); } ignore_next = false; } else if state.add { // We found a better match (or there was no previous match) // so output the previous byte. // BETTER OR NO MATCH write_literal!(writer, state.prev_byte, position + 1); } else { state.add = true } prev_length = state.current_length; prev_distance = state.current_distance; state.prev_byte = b; } else { // If there is a match at this point, it will not have been added, so we need to add it. if prev_length >= MIN_MATCH as u16 { let b_status = writer.write_length_distance(prev_length as u16, prev_distance as u16); state.current_length = 0; state.current_distance = 0; state.add = false; // As this will be a 3-length match at the end of the input data, there can't be any // overlap. // TODO: Not sure if we need to signal that the buffer is full here. // It's only needed in the case of syncing. if let BufferStatus::Full = b_status { // TODO: These bytes should be hashed when doing a sync flush. // This can't be done here as the new input data does not exist yet. return (0, buffer_full(end)); } else { return (0, ProcessStatus::Ok); } }; if state.add { // We may still have a leftover byte at this point, so we add it here if needed. state.add = false; // ADD write_literal!(writer, state.prev_byte, position + 1); }; // We are at the last two bytes we want to add, so there is no point // searching for matches here. // AFTER ADD write_literal!(writer, b, position + 1); } } (overlap, ProcessStatus::Ok) } fn process_chunk_greedy( data: &[u8], iterated_data: &Range, mut hash_table: &mut ChainedHashTable, writer: &mut DynamicWriter, max_hash_checks: u16, ) -> (usize, ProcessStatus) { let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data); const NO_LENGTH: usize = 0; // The number of bytes past end that was added due to finding a match that extends into // the lookahead window. let mut overlap = 0; // Iterate through the slice, adding literals or length/distance pairs. while let Some((position, &b)) = insert_it.next() { if let Some(&hash_byte) = hash_it.next() { hash_table.add_hash_value(position, hash_byte); // TODO: This should be cleaned up a bit. let (match_len, match_dist) = { longest_match(data, hash_table, position, NO_LENGTH, max_hash_checks) }; if match_len >= MIN_MATCH as usize && !match_too_far(match_len, match_dist) { // Casting note: length and distance is already bounded by the longest match // function. Usize is just used for convenience. let b_status = writer.write_length_distance(match_len as u16, match_dist as u16); // We add the bytes to the hash table and checksum. // Since we've already added one of them, we need to add one less than // the length. let bytes_to_add = match_len - 1; add_to_hash_table( bytes_to_add, &mut insert_it, &mut hash_it, &mut hash_table, ); // If the match is longer than the current window, we have note how many // bytes we overlap, since we don't need to do any matching on these bytes // in the next call of this function. if position + match_len > end { // We need to subtract 1 since the byte at pos is also included. overlap = position + match_len - end; }; if let BufferStatus::Full = b_status { // MATCH return (overlap, buffer_full(position + match_len)); } } else { // NO MATCH write_literal!(writer, b, position + 1); } } else { // We are at the last two bytes we want to add, so there is no point // searching for matches here. // END write_literal!(writer, b, position + 1); } } (overlap, ProcessStatus::Ok) } #[derive(Eq, PartialEq, Clone, Copy, Debug)] pub enum LZ77Status { /// Waiting for more input before doing any processing NeedInput, /// The output buffer is full, so the current block needs to be ended so the /// buffer can be flushed. EndBlock, /// All pending data has been processed. Finished, } #[cfg(test)] pub fn lz77_compress_block_finish( data: &[u8], state: &mut LZ77State, buffer: &mut InputBuffer, mut writer: &mut DynamicWriter, ) -> (usize, LZ77Status) { let (consumed, status, _) = lz77_compress_block(data, state, buffer, &mut writer, Flush::Finish); (consumed, status) } /// Compress a slice with lz77 compression. /// /// This function processes one window at a time, and returns when there is no input left, /// or it determines it's time to end a block. /// /// Returns the number of bytes of the input that were consumed, a status describing /// whether there is no input, it's time to finish, or it's time to end the block, and the position /// of the first byte in the input buffer that has not been output (but may have been checked for /// matches). pub fn lz77_compress_block( data: &[u8], state: &mut LZ77State, buffer: &mut InputBuffer, mut writer: &mut DynamicWriter, flush: Flush, ) -> (usize, LZ77Status, usize) { // Currently we only support the maximum window size let window_size = DEFAULT_WINDOW_SIZE; // Indicates whether we should try to process all the data including the lookahead, or if we // should wait until we have at least one window size of data before doing anything. let finish = flush == Flush::Finish || flush == Flush::Sync; let sync = flush == Flush::Sync; let mut current_position = 0; // The current status of the encoding. let mut status = LZ77Status::EndBlock; // Whether warm up the hash chain with the two first values. let mut add_initial = true; // If we have synced, add the two first bytes to the hash as they couldn't be added before. if state.was_synced { if buffer.current_end() > 2 { let pos_add = buffer.current_end() - 2; for (n, &b) in data.iter().take(2).enumerate() { state.hash_table.add_hash_value(n + pos_add, b); } add_initial = false; } state.was_synced = false; } // Add data to the input buffer and keep a reference to the slice of data not added yet. let mut remaining_data = buffer.add_data(data); loop { // Note if there is a pending byte from the previous call to process_chunk, // so we get the block input size right. let pending_previous = state.pending_byte_as_num(); assert!(writer.buffer_length() <= (window_size * 2)); // The process is a bit different for the first 32k bytes. // TODO: There is a lot of duplicate code between the two branches here, we should be able // to simplify this. if state.is_first_window { // Don't do anything until we are either flushing, or we have at least one window of // data. if buffer.current_end() >= (window_size * 2) + MAX_MATCH || finish { if buffer.get_buffer().len() >= 2 && add_initial && state.current_block_input_bytes == 0 { let b = buffer.get_buffer(); // Warm up the hash with the two first values, so we can find matches at // index 0. state.hash_table.add_initial_hash_values(b[0], b[1]); add_initial = false; } let first_chunk_end = cmp::min(window_size, buffer.current_end()); let start = state.overlap; let (overlap, p_status) = process_chunk( buffer.get_buffer(), &(start..first_chunk_end), &mut state.match_state, &mut state.hash_table, &mut writer, state.max_hash_checks, state.lazy_if_less_than as usize, state.matching_type, ); state.overlap = overlap; state.bytes_to_hash = overlap; // If the buffer is full, we want to end the block. if let ProcessStatus::BufferFull(written) = p_status { state.overlap = if overlap > 0 { overlap } else { written }; status = LZ77Status::EndBlock; current_position = written - state.pending_byte_as_num(); state.current_block_input_bytes += (written - start + overlap + pending_previous - state.pending_byte_as_num()) as u64; break; } // Update the length of how many input bytes the current block is representing, // taking into account pending bytes. state.current_block_input_bytes += (first_chunk_end - start + overlap + pending_previous - state.pending_byte_as_num()) as u64; // We are at the first window so we don't need to slide the hash table yet. // If finishing or syncing, we stop here. if first_chunk_end >= buffer.current_end() && finish { current_position = first_chunk_end - state.pending_byte_as_num(); if !sync { state.set_last(); state.is_first_window = false; } else { state.overlap = first_chunk_end; state.was_synced = true; } debug_assert!( !state.pending_byte(), "Bug! Ended compression wit a pending byte!" ); status = LZ77Status::Finished; break; } // Otherwise, continue. state.is_first_window = false; } else { status = LZ77Status::NeedInput; break; } } else if buffer.current_end() >= (window_size * 2) + MAX_MATCH || finish { if buffer.current_end() >= window_size + 2 { for (n, &h) in buffer.get_buffer()[window_size + 2..] .iter() .enumerate() .take(state.bytes_to_hash) { state.hash_table.add_hash_value(window_size + n, h); } state.bytes_to_hash = 0; } // This isn't the first chunk, so we start reading at one window in in the // buffer plus any additional overlap from earlier. let start = window_size + state.overlap; // Determine where we have to stop iterating to slide the buffer and hash, // or stop because we are at the end of the input data. let end = cmp::min(window_size * 2, buffer.current_end()); let (overlap, p_status) = process_chunk( buffer.get_buffer(), &(start..end), &mut state.match_state, &mut state.hash_table, &mut writer, state.max_hash_checks, state.lazy_if_less_than as usize, state.matching_type, ); state.bytes_to_hash = overlap; if let ProcessStatus::BufferFull(written) = p_status { state.current_block_input_bytes += (written - start + pending_previous - state.pending_byte_as_num()) as u64; // If the buffer is full, return and end the block. // If overlap is non-zero, the buffer was full after outputting the last byte, // otherwise we have to skip to the point in the buffer where we stopped in the // next call. state.overlap = if overlap > 0 { // If we are at the end of the window, make sure we slide the buffer and the // hash table. if state.max_hash_checks > 0 { state.hash_table.slide(window_size); } remaining_data = buffer.slide(remaining_data.unwrap_or(&[])); overlap } else { written - window_size }; current_position = written - state.pending_byte_as_num(); // Status is already EndBlock at this point. // status = LZ77Status::EndBlock; break; } state.current_block_input_bytes += (end - start + overlap + pending_previous - state.pending_byte_as_num()) as u64; // The buffer is not full, but we still need to note if there is any overlap into the // next window. state.overlap = overlap; if remaining_data.is_none() && finish && end == buffer.current_end() { current_position = buffer.current_end(); debug_assert!( !state.pending_byte(), "Bug! Ended compression wit a pending byte!" ); // We stopped before or at the window size, so we are at the end. if !sync { // If we are finishing and not syncing, we simply indicate that we are done. state.set_last(); } else { // For sync flushing we need to slide the buffer and the hash chains so that the // next call to this function starts at the right place. // There won't be any overlap, since when syncing, we process to the end of the. // pending data. state.overlap = buffer.current_end() - window_size; state.was_synced = true; } status = LZ77Status::Finished; break; } else { // We are not at the end, so slide and continue. // We slide the hash table back to make space for new hash values // We only need to remember 2^15 bytes back (the maximum distance allowed by the // deflate spec). if state.max_hash_checks > 0 { state.hash_table.slide(window_size); } // Also slide the buffer, discarding data we no longer need and adding new data. remaining_data = buffer.slide(remaining_data.unwrap_or(&[])); } } else { // The caller has not indicated that they want to finish or flush, and there is less // than a window + lookahead of new data, so we wait for more. status = LZ77Status::NeedInput; break; } } ( data.len() - remaining_data.unwrap_or(&[]).len(), status, current_position, ) } #[cfg(test)] pub fn decompress_lz77(input: &[LZValue]) -> Vec { decompress_lz77_with_backbuffer(input, &[]) } #[cfg(test)] pub fn decompress_lz77_with_backbuffer(input: &[LZValue], back_buffer: &[u8]) -> Vec { let mut output = Vec::new(); for p in input { match p.value() { LZType::Literal(l) => output.push(l), LZType::StoredLengthDistance(l, d) => { // We found a match, so we have to get the data that the match refers to. let d = d as usize; let prev_output_len = output.len(); // Get match data from the back buffer if the match extends that far. let consumed = if d > output.len() { let into_back_buffer = d - output.len(); assert!( into_back_buffer <= back_buffer.len(), "ERROR: Attempted to refer to a match in non-existing data!\ into_back_buffer: {}, back_buffer len {}, d {}, l {:?}", into_back_buffer, back_buffer.len(), d, l ); let start = back_buffer.len() - into_back_buffer; let end = cmp::min(back_buffer.len(), start + l.actual_length() as usize); output.extend_from_slice(&back_buffer[start..end]); end - start } else { 0 }; // Get match data from the currently decompressed data. let start = prev_output_len.saturating_sub(d); let mut n = 0; while n < (l.actual_length() as usize).saturating_sub(consumed) { let b = output[start + n]; output.push(b); n += 1; } } } } output } #[cfg(test)] pub struct TestStruct { state: LZ77State, buffer: InputBuffer, writer: DynamicWriter, } #[cfg(test)] impl TestStruct { fn new() -> TestStruct { TestStruct::with_config( HIGH_MAX_HASH_CHECKS, HIGH_LAZY_IF_LESS_THAN, MatchingType::Lazy, ) } fn with_config( max_hash_checks: u16, lazy_if_less_than: u16, matching_type: MatchingType, ) -> TestStruct { TestStruct { state: LZ77State::new(max_hash_checks, lazy_if_less_than, matching_type), buffer: InputBuffer::empty(), writer: DynamicWriter::new(), } } fn compress_block(&mut self, data: &[u8], flush: bool) -> (usize, LZ77Status, usize) { lz77_compress_block( data, &mut self.state, &mut self.buffer, &mut self.writer, if flush { Flush::Finish } else { Flush::None }, ) } } #[cfg(test)] pub fn lz77_compress(data: &[u8]) -> Option> { lz77_compress_conf( data, HIGH_MAX_HASH_CHECKS, HIGH_LAZY_IF_LESS_THAN, MatchingType::Lazy, ) } /// Compress a slice, not storing frequency information /// /// This is a convenience function for compression with fixed huffman values /// Only used in tests for now #[cfg(test)] pub fn lz77_compress_conf( data: &[u8], max_hash_checks: u16, lazy_if_less_than: u16, matching_type: MatchingType, ) -> Option> { let mut test_boxed = Box::new(TestStruct::with_config( max_hash_checks, lazy_if_less_than, matching_type, )); let mut out = Vec::::with_capacity(data.len() / 3); { let test = test_boxed.as_mut(); let mut slice = data; while !test.state.is_last_block { let bytes_written = lz77_compress_block_finish( slice, &mut test.state, &mut test.buffer, &mut test.writer, ).0; slice = &slice[bytes_written..]; out.extend(test.writer.get_buffer()); test.writer.clear(); } } Some(out) } #[cfg(test)] mod test { use super::*; use lzvalue::{LZValue, LZType, lit, ld}; use chained_hash_table::WINDOW_SIZE; use compression_options::DEFAULT_LAZY_IF_LESS_THAN; use test_utils::get_test_data; use output_writer::MAX_BUFFER_LENGTH; /// Helper function to print the output from the lz77 compression function fn print_output(input: &[LZValue]) { let mut output = vec![]; for l in input { match l.value() { LZType::Literal(l) => output.push(l), LZType::StoredLengthDistance(l, d) => { output.extend(format!("", l.actual_length()).into_bytes()); output.extend(format!("", d).into_bytes()) } } } println!("\"{}\"", String::from_utf8(output).unwrap()); } /// Test that a short string from an example on SO compresses correctly #[test] fn compress_short() { let test_bytes = String::from("Deflate late").into_bytes(); let res = lz77_compress(&test_bytes).unwrap(); let decompressed = decompress_lz77(&res); assert_eq!(test_bytes, decompressed); assert_eq!(*res.last().unwrap(), LZValue::length_distance(4, 5)); } /// Test that compression is working for a longer file #[test] fn compress_long() { let input = get_test_data(); let compressed = lz77_compress(&input).unwrap(); assert!(compressed.len() < input.len()); let decompressed = decompress_lz77(&compressed); println!("compress_long length: {}", input.len()); // This is to check where the compression fails, if it were to for (n, (&a, &b)) in input.iter().zip(decompressed.iter()).enumerate() { if a != b { println!("First difference at {}, input: {}, output: {}", n, a, b); break; } } assert_eq!(input.len(), decompressed.len()); assert!(&decompressed == &input); } /// Check that lazy matching is working as intended #[test] fn lazy() { // We want to match on `badger` rather than `nba` as it is longer // let data = b" nba nbadg badger nbadger"; let data = b"nba badger nbadger"; let compressed = lz77_compress(data).unwrap(); let test = compressed[compressed.len() - 1]; if let LZType::StoredLengthDistance(l, _) = test.value() { assert_eq!(l.actual_length(), 6); } else { print_output(&compressed); panic!(); } } fn roundtrip(data: &[u8]) { let compressed = super::lz77_compress(&data).unwrap(); let decompressed = decompress_lz77(&compressed); assert!(decompressed == data); } // Check that data with the exact window size is working properly #[test] #[allow(unused)] fn exact_window_size() { use std::io::Write; let mut data = vec![0; WINDOW_SIZE]; roundtrip(&data); { data.write(&[22; WINDOW_SIZE]); } roundtrip(&data); { data.write(&[55; WINDOW_SIZE]); } roundtrip(&data); } /// Test that matches at the window border are working correctly #[test] fn border() { use chained_hash_table::WINDOW_SIZE; let mut data = vec![35; WINDOW_SIZE]; data.extend(b"Test"); let compressed = super::lz77_compress(&data).unwrap(); assert!(compressed.len() < data.len()); let decompressed = decompress_lz77(&compressed); assert_eq!(decompressed.len(), data.len()); assert!(decompressed == data); } #[test] fn border_multiple_blocks() { use chained_hash_table::WINDOW_SIZE; let mut data = vec![0; (WINDOW_SIZE * 2) + 50]; data.push(1); let compressed = super::lz77_compress(&data).unwrap(); assert!(compressed.len() < data.len()); let decompressed = decompress_lz77(&compressed); assert_eq!(decompressed.len(), data.len()); assert!(decompressed == data); } #[test] fn compress_block_status() { use input_buffer::InputBuffer; let data = b"Test data data"; let mut writer = DynamicWriter::new(); let mut buffer = InputBuffer::empty(); let mut state = LZ77State::new(4096, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy); let status = lz77_compress_block_finish(data, &mut state, &mut buffer, &mut writer); assert_eq!(status.1, LZ77Status::Finished); assert!(&buffer.get_buffer()[..data.len()] == data); assert_eq!(buffer.current_end(), data.len()); } #[test] fn compress_block_multiple_windows() { use input_buffer::InputBuffer; use output_writer::DynamicWriter; let data = get_test_data(); assert!(data.len() > (WINDOW_SIZE * 2) + super::MAX_MATCH); let mut writer = DynamicWriter::new(); let mut buffer = InputBuffer::empty(); let mut state = LZ77State::new(0, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy); let (bytes_consumed, status) = lz77_compress_block_finish(&data, &mut state, &mut buffer, &mut writer); assert_eq!( buffer.get_buffer().len(), (WINDOW_SIZE * 2) + super::MAX_MATCH ); assert_eq!(status, LZ77Status::EndBlock); let buf_len = buffer.get_buffer().len(); assert!(buffer.get_buffer()[..] == data[..buf_len]); writer.clear(); let (_, status) = lz77_compress_block_finish( &data[bytes_consumed..], &mut state, &mut buffer, &mut writer, ); assert_eq!(status, LZ77Status::EndBlock); } #[test] fn multiple_inputs() { let data = b"Badger badger bababa test data 25 asfgestghresjkgh"; let comp1 = lz77_compress(data).unwrap(); let comp2 = { const SPLIT: usize = 25; let first_part = &data[..SPLIT]; let second_part = &data[SPLIT..]; let mut state = TestStruct::new(); let (bytes_written, status, _) = state.compress_block(first_part, false); assert_eq!(bytes_written, first_part.len()); assert_eq!(status, LZ77Status::NeedInput); let (bytes_written, status, _) = state.compress_block(second_part, true); assert_eq!(bytes_written, second_part.len()); assert_eq!(status, LZ77Status::Finished); Vec::from(state.writer.get_buffer()) }; assert!(comp1 == comp2); } #[test] /// Test that the exit from process_chunk when buffer is full is working correctly. fn buffer_fill() { let data = get_test_data(); // The comments above these calls refers the positions with the // corersponding comments in process_chunk_{greedy/lazy}. // POS BETTER OR NO MATCH buffer_test_literals(&data); // POS END // POS NO MATCH buffer_test_last_bytes(MatchingType::Greedy, &data); // POS ADD // POS AFTER ADD buffer_test_last_bytes(MatchingType::Lazy, &data); // POS MATCH buffer_test_match(MatchingType::Greedy); // POS MATCH(lazy) buffer_test_match(MatchingType::Lazy); // POS END buffer_test_add_end(&data); } /// Test buffer fill when a byte is added due to no match being found. fn buffer_test_literals(data: &[u8]) { let mut state = TestStruct::with_config(0, NO_RLE, MatchingType::Lazy); let (bytes_consumed, status, position) = state.compress_block(&data, false); // There should be enough data for the block to have ended. assert_eq!(status, LZ77Status::EndBlock); assert!(bytes_consumed <= (WINDOW_SIZE * 2) + MAX_MATCH); // The buffer should be full. assert_eq!(state.writer.get_buffer().len(), MAX_BUFFER_LENGTH); assert_eq!(position, state.writer.get_buffer().len()); // Since all literals have been input, the block should have the exact number of litlens // as there were input bytes. assert_eq!( state.state.current_block_input_bytes() as usize, MAX_BUFFER_LENGTH ); state.state.reset_input_bytes(); let mut out = decompress_lz77(state.writer.get_buffer()); state.writer.clear(); // The buffer should now be cleared. assert_eq!(state.writer.get_buffer().len(), 0); assert!(data[..out.len()] == out[..]); let _ = state.compress_block(&data[bytes_consumed..], false); // We should have some new data in the buffer at this point. assert!(state.writer.get_buffer().len() > 0); assert_eq!( state.state.current_block_input_bytes() as usize, MAX_BUFFER_LENGTH ); out.extend_from_slice(&decompress_lz77(state.writer.get_buffer())); assert!(data[..out.len()] == out[..]); } /// Test buffer fill at the last two bytes that are not hashed. fn buffer_test_last_bytes(matching_type: MatchingType, data: &[u8]) { const BYTES_USED: usize = MAX_BUFFER_LENGTH; assert!( &data[..BYTES_USED] == &decompress_lz77(&lz77_compress_conf( &data[..BYTES_USED], 0, NO_RLE, matching_type, ).unwrap()) [..] ); assert!( &data[..BYTES_USED + 1] == &decompress_lz77(&lz77_compress_conf( &data[..BYTES_USED + 1], 0, NO_RLE, matching_type, ).unwrap()) [..] ); } /// Test buffer fill when buffer is full at a match. fn buffer_test_match(matching_type: MatchingType) { // TODO: Also test this for the second block to make sure // buffer is slid. let mut state = TestStruct::with_config(1, 0, matching_type); for _ in 0..MAX_BUFFER_LENGTH - 4 { assert!(state.writer.write_literal(0) == BufferStatus::NotFull); } state.compress_block(&[1, 2, 3, 1, 2, 3, 4], true); assert!(*state.writer.get_buffer().last().unwrap() == LZValue::length_distance(3, 3)); } /// Test buffer fill for the lazy match algorithm when adding a pending byte at the end. fn buffer_test_add_end(_data: &[u8]) { // This is disabled while the buffer size has not been stabilized. /* let mut state = TestStruct::with_config(DEFAULT_MAX_HASH_CHECKS, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy); // For the test file, this is how much data needs to be added to get the buffer // full at the right spot to test that this buffer full exit is workong correctly. for i in 0..31743 { assert!(state.writer.write_literal(0) == BufferStatus::NotFull, "Buffer pos: {}", i); } let dec = decompress_lz77(&state.writer.get_buffer()[pos..]); assert!(dec.len() > 0); assert!(dec[..] == data[..dec.len()]); */ } /// Check that decompressing lz77-data that refers to the back-buffer works. #[test] fn test_decompress_with_backbuffer() { let bb = vec![5; WINDOW_SIZE]; let lz_compressed = [lit(2), lit(4), ld(4, 20), lit(1), lit(1), ld(5, 10)]; let dec = decompress_lz77_with_backbuffer(&lz_compressed, &bb); // ------------l2 l4 <-ld4,20-> l1 l1 <---ld5,10--> assert!(dec == [2, 4, 5, 5, 5, 5, 1, 1, 5, 5, 2, 4, 5]); } } #[cfg(all(test, feature = "benchmarks"))] mod bench { use test_std::Bencher; use test_utils::get_test_data; use super::lz77_compress; #[bench] fn test_file_zlib_lz77_only(b: &mut Bencher) { let test_data = get_test_data(); b.iter(|| lz77_compress(&test_data)); } }