use std::cmp; use chained_hash_table::{ChainedHashTable, WINDOW_SIZE}; use huffman_table; const MAX_MATCH: usize = huffman_table::MAX_MATCH as usize; #[cfg(test)] const MIN_MATCH: usize = huffman_table::MIN_MATCH as usize; /// Get the length of the checked match /// The function returns number of bytes at and including `current_pos` that are the same as the /// ones at `pos_to_check` #[inline] pub fn get_match_length(data: &[u8], current_pos: usize, pos_to_check: usize) -> usize { // Unsafe version using unaligned loads for comparison. // Faster when benching the matching function alone, // but not as significant when running the full thing. /* type Comp = u64; use std::mem::size_of; let max = cmp::min(data.len() - current_pos, MAX_MATCH); let mut left = max; let s = size_of::(); unsafe { let mut cur = data.as_ptr().offset(current_pos as isize); let mut tc = data.as_ptr().offset(pos_to_check as isize); while left >= s && (*(cur as *const Comp) == *(tc as *const Comp)) { left -= s; cur = cur.offset(s as isize); tc = tc.offset(s as isize); } while left > 0 && *cur == *tc { left -= 1; cur = cur.offset(1); tc = tc.offset(1); } } max - left */ // Slightly faster than naive in single bench. // Does not use unaligned loads. // let l = cmp::min(MAX_MATCH, data.len() - current_pos); // let a = unsafe{&data.get_unchecked(current_pos..current_pos + l)}; // let b = unsafe{&data.get_unchecked(pos_to_check..)}; // let mut len = 0; // for (l, r) in a // .iter() // .zip(b.iter()) { // if *l == *r { // len += 1; // continue; // } else { // break; // } // } // len as usize // Naive version data[current_pos..] .iter() .zip(data[pos_to_check..].iter()) .take(MAX_MATCH) .take_while(|&(&a, &b)| a == b) .count() } /// Try finding the position and length of the longest match in the input data. /// # Returns /// (length, distance from position) /// If no match is found that was better than `prev_length` or at all, or we are at the start, /// the length value returned will be 2. /// /// # Arguments: /// `data`: The data to search in. /// `hash_table`: Hash table to use for searching. /// `position`: The position in the data to match against. /// `prev_length`: The length of the previous `longest_match` check to compare against. /// `max_hash_checks`: The maximum number of matching hash chain positions to check. pub fn longest_match( data: &[u8], hash_table: &ChainedHashTable, position: usize, prev_length: usize, max_hash_checks: u16, ) -> (usize, usize) { // debug_assert_eq!(position, hash_table.current_head() as usize); // If we already have a match at the maximum length, // or we can't grow further, we stop here. if prev_length >= MAX_MATCH || position + prev_length >= data.len() { return (0, 0); } let limit = if position > WINDOW_SIZE { position - WINDOW_SIZE } else { 0 }; // Make sure the length is at least one to simplify the matching code, as // otherwise the matching code might underflow. let prev_length = cmp::max(prev_length, 1); let max_length = cmp::min(data.len() - position, MAX_MATCH); // The position in the hash chain we are currently checking. let mut current_head = position; // The best match length we've found so far, and it's distance. let mut best_length = prev_length; let mut best_distance = 0; // The position of the previous value in the hash chain. let mut prev_head; for _ in 0..max_hash_checks { prev_head = current_head; current_head = hash_table.get_prev(current_head) as usize; if current_head >= prev_head || current_head < limit { // If the current hash chain value refers to itself, or is referring to // a value that's higher (we only move backwars through the chain), // we are at the end and can stop. break; } // We only check further if the match length can actually increase // Checking if the end byte and the potential next byte matches is generally // more likely to give a quick answer rather than checking from the start first, given // that the hashes match. // If there is no previous match, best_length will be 1 and the two first bytes will // be checked instead. // Since we've made sure best_length is always at least 1, this shouldn't underflow. if data[position + best_length - 1..position + best_length + 1] == data[current_head + best_length - 1..current_head + best_length + 1] { // Actually check how many bytes match. // At the moment this will check the two bytes we just checked again, // though adding code for skipping these bytes may not result in any speed // gain due to the added complexity. let length = get_match_length(data, position, current_head); if length > best_length { best_length = length; best_distance = position - current_head; if length == max_length { // We are at the max length, so there is no point // searching any longer break; } } } } if best_length > prev_length { (best_length, best_distance) } else { (0, 0) } } /// Try finding the position and length of the longest match in the input data using fast zlib /// hash skipping algorithm. /// # Returns /// (length, distance from position) /// If no match is found that was better than `prev_length` or at all, or we are at the start, /// the length value returned will be 2. /// /// # Arguments: /// `data`: The data to search in. /// `hash_table`: Hash table to use for searching. /// `position`: The position in the data to match against. /// `prev_length`: The length of the previous `longest_match` check to compare against. /// `max_hash_checks`: The maximum number of matching hash chain positions to check. #[cfg(test)] pub fn longest_match_fast( data: &[u8], hash_table: &ChainedHashTable, position: usize, prev_length: usize, max_hash_checks: u16, ) -> (usize, usize) { // debug_assert_eq!(position, hash_table.current_head() as usize); // If we already have a match at the maximum length, // or we can't grow further, we stop here. if prev_length >= MAX_MATCH || position + prev_length >= data.len() { return (0, 0); } let limit = if position > WINDOW_SIZE { position - WINDOW_SIZE } else { 0 }; // Make sure the length is at least one to simplify the matching code, as // otherwise the matching code might underflow. let prev_length = cmp::max(prev_length, 1); let max_length = cmp::min((data.len() - position), MAX_MATCH); // The position in the hash chain we are currently checking. let mut current_head = position; // The best match length we've found so far, and it's distance. let mut best_length = prev_length; let mut best_distance = 0; // The offset from the start of the match of the hash chain we are traversing. let mut offset = 0; // The position of the previous value in the hash chain. let mut prev_head; for _ in 0..max_hash_checks { prev_head = current_head; current_head = hash_table.get_prev(current_head) as usize; if current_head >= prev_head || current_head < limit + offset { // If the current hash chain value refers to itself, or is referring to // a value that's higher (we only move backwars through the chain), // we are at the end and can stop. break; } let offset_head = current_head - offset; // We only check further if the match length can actually increase // Checking if the end byte and the potential next byte matches is generally // more likely to give a quick answer rather than checking from the start first, given // that the hashes match. // If there is no previous match, best_length will be 1 and the two first bytes will // be checked instead. // Since we've made sure best_length is always at least 1, this shouldn't underflow. if data[position + best_length - 1..position + best_length + 1] == data[offset_head + best_length - 1..offset_head + best_length + 1] { // Actually check how many bytes match. // At the moment this will check the two bytes we just checked again, // though adding code for skipping these bytes may not result in any speed // gain due to the added complexity. let length = get_match_length(data, position, offset_head); if length > best_length { best_length = length; best_distance = position - offset_head; if length == max_length { // We are at the max length, so there is no point // searching any longer break; } // Find the position in the match where the next has position is the furthest away. // By moving to a different hash chain we can potentially skip a lot of checks, // saving time. // We avoid doing this for matches that extend past the starting position, as // those will contain positions that are not in the hash table yet. if best_distance > best_length { offset = hash_table.farthest_next(offset_head, length); current_head = offset_head + offset; } } } } if best_length > prev_length { (best_length, best_distance) } else { (0, 0) } } // Get the longest match from the current position of the hash table. #[inline] #[cfg(test)] pub fn longest_match_current(data: &[u8], hash_table: &ChainedHashTable) -> (usize, usize) { use compression_options::MAX_HASH_CHECKS; longest_match( data, hash_table, hash_table.current_head() as usize, MIN_MATCH as usize - 1, MAX_HASH_CHECKS, ) } #[cfg(test)] mod test { use chained_hash_table::{filled_hash_table, HASH_BYTES, ChainedHashTable}; use super::{get_match_length, longest_match, longest_match_fast}; /// Test that match lengths are calculated correctly #[test] fn match_length() { let test_arr = [5u8, 5, 5, 5, 5, 9, 9, 2, 3, 5, 5, 5, 5, 5]; let l = get_match_length(&test_arr, 9, 0); assert_eq!(l, 5); let l2 = get_match_length(&test_arr, 9, 7); assert_eq!(l2, 0); let l3 = get_match_length(&test_arr, 10, 0); assert_eq!(l3, 4); } /// Test that we get the longest of the matches #[test] fn get_longest_match() { let test_data = b"xTest data, Test_data,zTest data"; let hash_table = filled_hash_table(&test_data[..23 + 1 + HASH_BYTES - 1]); let (length, distance) = super::longest_match_current(test_data, &hash_table); // We check that we get the longest match, rather than the shorter, but closer one. assert_eq!(distance, 22); assert_eq!(length, 9); let test_arr2 = [ 10u8, 10, 10, 10, 10, 10, 10, 10, 2, 3, 5, 10, 10, 10, 10, 10, ]; let hash_table = filled_hash_table(&test_arr2[..HASH_BYTES + 1 + 1 + 2]); let (length, distance) = super::longest_match_current(&test_arr2, &hash_table); assert_eq!(distance, 1); assert_eq!(length, 4); } /// Make sure we can get a match at index zero #[test] fn match_index_zero() { let test_data = b"AAAAAAA"; let mut hash_table = ChainedHashTable::from_starting_values(test_data[0], test_data[1]); for (n, &b) in test_data[2..5].iter().enumerate() { hash_table.add_hash_value(n, b); } let (match_length, match_dist) = longest_match(test_data, &hash_table, 1, 0, 4096); assert_eq!(match_dist, 1); assert!(match_length == 6); } /// Test for fast_zlib algorithm. /// Check that it doesn't give worse matches than the default one. /// ignored by default as it's slow, and best ran in release mode. #[ignore] #[test] fn fast_match_at_least_equal() { use test_utils::get_test_data; for start_pos in 10000..50000 { const NUM_CHECKS: u16 = 400; let data = get_test_data(); let hash_table = filled_hash_table(&data[..start_pos + 1]); let pos = hash_table.current_head() as usize; let naive_match = longest_match(&data[..], &hash_table, pos, 0, NUM_CHECKS); let fast_match = longest_match_fast(&data[..], &hash_table, pos, 0, NUM_CHECKS); if fast_match.0 > naive_match.0 { println!("Fast match found better match!"); } assert!(fast_match.0 >= naive_match.0, "naive match had better length! start_pos: {}, naive: {:?}, fast {:?}" , start_pos, naive_match, fast_match); assert!(fast_match.1 >= naive_match.1, "naive match had better dist! start_pos: {} naive {:?}, fast {:?}" , start_pos, naive_match, fast_match); } } } #[cfg(all(test, feature = "benchmarks"))] mod bench { use test_std::Bencher; use test_utils::get_test_data; use chained_hash_table::filled_hash_table; use super::{longest_match, longest_match_fast}; #[bench] fn matching(b: &mut Bencher) { const POS: usize = 29000; let data = get_test_data(); let hash_table = filled_hash_table(&data[..POS + 1]); let pos = hash_table.current_head() as usize; println!("M: {:?}", longest_match(&data[..], &hash_table, pos, 0, 4096)); b.iter( || longest_match(&data[..], &hash_table, pos, 0, 4096) ); } #[bench] fn fast_matching(b: &mut Bencher) { const POS: usize = 29000; let data = get_test_data(); let hash_table = filled_hash_table(&data[..POS + 1]); let pos = hash_table.current_head() as usize; println!("M: {:?}", longest_match_fast(&data[..], &hash_table, pos, 0, 4096)); b.iter( || longest_match_fast(&data[..], &hash_table, pos, 0, 4096) ); } }