use std::convert::TryInto; /// pub mod decode { /// The error returned by [`decode()`][super::decode()]. #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error("{}", message)] Corrupt { message: &'static str }, } } /// Decode `data` as EWAH bitmap. pub fn decode(data: &[u8]) -> Result<(Vec, &[u8]), decode::Error> { use self::decode::Error; use crate::decode; let (num_bits, data) = decode::u32(data).ok_or(Error::Corrupt { message: "eof reading amount of bits", })?; let (len, data) = decode::u32(data).ok_or(Error::Corrupt { message: "eof reading chunk length", })?; let len = len as usize; // NOTE: git does this by copying all bytes first, and then it will change the endianness in a separate loop. // Maybe it's faster, but we can't do it without unsafe. Let's leave it to the optimizer and maybe // one day somebody will find out that it's worth it to use unsafe here. let (mut bits, data) = decode::split_at_pos(data, len * std::mem::size_of::()).ok_or(Error::Corrupt { message: "eof while reading bit data", })?; let mut buf = std::vec::Vec::::with_capacity(len); for _ in 0..len { let (bit_num, rest) = bits.split_at(std::mem::size_of::()); bits = rest; buf.push(u64::from_be_bytes(bit_num.try_into().unwrap())) } let (rlw, data) = decode::u32(data).ok_or(Error::Corrupt { message: "eof while reading run length width", })?; Ok(( Vec { num_bits, bits: buf, rlw: rlw.into(), }, data, )) } mod access { use std::convert::{TryFrom, TryInto}; use super::Vec; impl Vec { /// Call `f(index)` for each bit that is true, given the index of the bit that identifies it uniquely within the bit array. /// If `f` returns `None` the iteration will be stopped and `None` is returned. /// /// The index is sequential like in any other vector. pub fn for_each_set_bit(&self, mut f: impl FnMut(usize) -> Option<()>) -> Option<()> { let mut index = 0usize; let mut iter = self.bits.iter(); while let Some(word) = iter.next() { if rlw_runbit_is_set(word) { let len = rlw_running_len_bits(word); for _ in 0..len { f(index)?; index += 1; } } else { index += usize::try_from(rlw_running_len_bits(word)).ok()?; } for _ in 0..rlw_literal_words(word) { let word = iter .next() .expect("BUG: ran out of words while going through uncompressed portion"); for bit_index in 0..64 { if word & (1 << bit_index) != 0 { f(index)?; } index += 1; } } } Some(()) } /// The amount of bits we are currently holding. pub fn num_bits(&self) -> usize { self.num_bits.try_into().expect("we are not on 16 bit systems") } } #[inline] fn rlw_running_len_bits(w: &u64) -> u64 { rlw_running_len(w) * 64 } #[inline] fn rlw_running_len(w: &u64) -> u64 { (w >> 1) & RLW_LARGEST_RUNNING_COUNT } #[inline] fn rlw_literal_words(w: &u64) -> u64 { w >> (1 + RLW_RUNNING_BITS) } #[inline] fn rlw_runbit_is_set(w: &u64) -> bool { w & 1 == 1 } const RLW_RUNNING_BITS: u64 = 4 * 8; const RLW_LARGEST_RUNNING_COUNT: u64 = (1 << RLW_RUNNING_BITS) - 1; } /// A growable collection of u64 that are seen as stream of individual bits. #[allow(dead_code)] #[derive(Clone)] pub struct Vec { num_bits: u32, bits: std::vec::Vec, /// RLW is an offset into the `bits` buffer, so `1` translates into &bits\[1] essentially. rlw: u64, }