diff options
Diffstat (limited to 'vendor/gix-pack/src/cache')
-rw-r--r-- | vendor/gix-pack/src/cache/delta/from_offsets.rs | 161 | ||||
-rw-r--r-- | vendor/gix-pack/src/cache/delta/mod.rs | 216 | ||||
-rw-r--r-- | vendor/gix-pack/src/cache/delta/traverse/mod.rs | 177 | ||||
-rw-r--r-- | vendor/gix-pack/src/cache/delta/traverse/resolve.rs | 154 | ||||
-rw-r--r-- | vendor/gix-pack/src/cache/delta/traverse/util.rs | 63 | ||||
-rw-r--r-- | vendor/gix-pack/src/cache/lru.rs | 165 | ||||
-rw-r--r-- | vendor/gix-pack/src/cache/mod.rs | 55 | ||||
-rw-r--r-- | vendor/gix-pack/src/cache/object.rs | 123 |
8 files changed, 1114 insertions, 0 deletions
diff --git a/vendor/gix-pack/src/cache/delta/from_offsets.rs b/vendor/gix-pack/src/cache/delta/from_offsets.rs new file mode 100644 index 000000000..8acb4a802 --- /dev/null +++ b/vendor/gix-pack/src/cache/delta/from_offsets.rs @@ -0,0 +1,161 @@ +use std::{ + convert::TryFrom, + fs, io, + io::{BufRead, Read, Seek, SeekFrom}, + sync::atomic::{AtomicBool, Ordering}, + time::Instant, +}; + +use gix_features::progress::{self, Progress}; + +use crate::{cache::delta::Tree, data}; + +/// Returned by [`Tree::from_offsets_in_pack()`] +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("{message}")] + Io { source: io::Error, message: &'static str }, + #[error(transparent)] + Header(#[from] crate::data::header::decode::Error), + #[error("Could find object with id {id} in this pack. Thin packs are not supported")] + UnresolvedRefDelta { id: gix_hash::ObjectId }, + #[error(transparent)] + Tree(#[from] crate::cache::delta::Error), + #[error("Interrupted")] + Interrupted, +} + +const PACK_HEADER_LEN: usize = 12; + +/// Generate tree from certain input +impl<T> Tree<T> { + /// Create a new `Tree` from any data sorted by offset, ascending as returned by the `data_sorted_by_offsets` iterator. + /// * `get_pack_offset(item: &T`) -> data::Offset` is a function returning the pack offset of the given item, which can be used + /// for obtaining the objects entry within the pack. + /// * `pack_path` is the path to the pack file itself and from which to read the entry data, which is a pack file matching the offsets + /// returned by `get_pack_offset(…)`. + /// * `progress` is used to track progress when creating the tree. + /// * `resolve_in_pack_id(gix_hash::oid) -> Option<data::Offset>` takes an object ID and tries to resolve it to an object within this pack if + /// possible. Failing to do so aborts the operation, and this function is not expected to be called in usual packs. It's a theoretical + /// possibility though as old packs might have referred to their objects using the 20 bytes hash, instead of their encoded offset from the base. + /// + /// Note that the sort order is ascending. The given pack file path must match the provided offsets. + pub fn from_offsets_in_pack( + pack_path: impl AsRef<std::path::Path>, + data_sorted_by_offsets: impl Iterator<Item = T>, + get_pack_offset: impl Fn(&T) -> data::Offset, + resolve_in_pack_id: impl Fn(&gix_hash::oid) -> Option<data::Offset>, + mut progress: impl Progress, + should_interrupt: &AtomicBool, + object_hash: gix_hash::Kind, + ) -> Result<Self, Error> { + let mut r = io::BufReader::with_capacity( + 8192 * 8, // this value directly corresponds to performance, 8k (default) is about 4x slower than 64k + fs::File::open(pack_path).map_err(|err| Error::Io { + source: err, + message: "open pack path", + })?, + ); + + let anticipated_num_objects = if let Some(num_objects) = data_sorted_by_offsets.size_hint().1 { + progress.init(Some(num_objects), progress::count("objects")); + num_objects + } else { + 0 + }; + let mut tree = Tree::with_capacity(anticipated_num_objects)?; + + { + // safety check - assure ourselves it's a pack we can handle + let mut buf = [0u8; PACK_HEADER_LEN]; + r.read_exact(&mut buf).map_err(|err| Error::Io { + source: err, + message: "reading header buffer with at least 12 bytes failed - pack file truncated?", + })?; + crate::data::header::decode(&buf)?; + } + + let then = Instant::now(); + + let mut previous_cursor_position = None::<u64>; + + let hash_len = object_hash.len_in_bytes(); + for (idx, data) in data_sorted_by_offsets.enumerate() { + let pack_offset = get_pack_offset(&data); + if let Some(previous_offset) = previous_cursor_position { + Self::advance_cursor_to_pack_offset(&mut r, pack_offset, previous_offset)?; + }; + let entry = crate::data::Entry::from_read(&mut r, pack_offset, hash_len).map_err(|err| Error::Io { + source: err, + message: "EOF while parsing header", + })?; + previous_cursor_position = Some(pack_offset + entry.header_size() as u64); + + use crate::data::entry::Header::*; + match entry.header { + Tree | Blob | Commit | Tag => { + tree.add_root(pack_offset, data)?; + } + RefDelta { base_id } => { + resolve_in_pack_id(base_id.as_ref()) + .ok_or(Error::UnresolvedRefDelta { id: base_id }) + .and_then(|base_pack_offset| { + tree.add_child(base_pack_offset, pack_offset, data).map_err(Into::into) + })?; + } + OfsDelta { base_distance } => { + let base_pack_offset = pack_offset + .checked_sub(base_distance) + .expect("in bound distance for deltas"); + tree.add_child(base_pack_offset, pack_offset, data)?; + } + }; + progress.inc(); + if idx % 10_000 == 0 && should_interrupt.load(Ordering::SeqCst) { + return Err(Error::Interrupted); + } + } + + progress.show_throughput(then); + Ok(tree) + } + + fn advance_cursor_to_pack_offset( + r: &mut io::BufReader<fs::File>, + pack_offset: u64, + previous_offset: u64, + ) -> Result<(), Error> { + let bytes_to_skip: u64 = pack_offset + .checked_sub(previous_offset) + .expect("continuously ascending pack offsets"); + if bytes_to_skip == 0 { + return Ok(()); + } + let buf = r.fill_buf().map_err(|err| Error::Io { + source: err, + message: "skip bytes", + })?; + if buf.is_empty() { + // This means we have reached the end of file and can't make progress anymore, before we have satisfied our need + // for more + return Err(Error::Io { + source: io::Error::new( + io::ErrorKind::UnexpectedEof, + "ran out of bytes before reading desired amount of bytes", + ), + message: "index file is damaged or corrupt", + }); + } + if bytes_to_skip <= u64::try_from(buf.len()).expect("sensible buffer size") { + // SAFETY: bytes_to_skip <= buf.len() <= usize::MAX + r.consume(bytes_to_skip as usize); + } else { + r.seek(SeekFrom::Start(pack_offset)).map_err(|err| Error::Io { + source: err, + message: "seek to next entry", + })?; + } + Ok(()) + } +} diff --git a/vendor/gix-pack/src/cache/delta/mod.rs b/vendor/gix-pack/src/cache/delta/mod.rs new file mode 100644 index 000000000..f4c1b6fc6 --- /dev/null +++ b/vendor/gix-pack/src/cache/delta/mod.rs @@ -0,0 +1,216 @@ +/// Returned when using various methods on a [`Tree`] +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("Pack offsets must only increment. The previous pack offset was {last_pack_offset}, the current one is {pack_offset}")] + InvariantIncreasingPackOffset { + /// The last seen pack offset + last_pack_offset: crate::data::Offset, + /// The invariant violating offset + pack_offset: crate::data::Offset, + }, +} + +/// +pub mod traverse; + +/// +pub mod from_offsets; + +/// An item stored within the [`Tree`] +pub struct Item<T> { + /// The offset into the pack file at which the pack entry's data is located. + pub offset: crate::data::Offset, + /// The offset of the next item in the pack file. + pub next_offset: crate::data::Offset, + /// Data to store with each Item, effectively data associated with each entry in a pack. + pub data: T, + /// Indices into our Tree's `items`, one for each pack entry that depends on us. + /// + /// Limited to u32 as that's the maximum amount of objects in a pack. + children: Vec<u32>, +} + +/// Identify what kind of node we have last seen +enum NodeKind { + Root, + Child, +} + +/// A tree that allows one-time iteration over all nodes and their children, consuming it in the process, +/// while being shareable among threads without a lock. +/// It does this by making the guarantee that iteration only happens once. +pub struct Tree<T> { + /// The root nodes, i.e. base objects + root_items: Vec<Item<T>>, + /// The child nodes, i.e. those that rely a base object, like ref and ofs delta objects + child_items: Vec<Item<T>>, + /// The last encountered node was either a root or a child. + last_seen: Option<NodeKind>, + /// Future child offsets, associating their offset into the pack with their index in the items array. + /// (parent_offset, child_index) + future_child_offsets: Vec<(crate::data::Offset, usize)>, +} + +impl<T> Tree<T> { + /// Instantiate a empty tree capable of storing `num_objects` amounts of items. + pub fn with_capacity(num_objects: usize) -> Result<Self, Error> { + Ok(Tree { + root_items: Vec::with_capacity(num_objects / 2), + child_items: Vec::with_capacity(num_objects / 2), + last_seen: None, + future_child_offsets: Vec::new(), + }) + } + + fn num_items(&self) -> usize { + self.root_items.len() + self.child_items.len() + } + + fn assert_is_incrementing_and_update_next_offset(&mut self, offset: crate::data::Offset) -> Result<(), Error> { + let items = match &self.last_seen { + Some(NodeKind::Root) => &mut self.root_items, + Some(NodeKind::Child) => &mut self.child_items, + None => return Ok(()), + }; + let item = &mut items.last_mut().expect("last seen won't lie"); + if offset <= item.offset { + return Err(Error::InvariantIncreasingPackOffset { + last_pack_offset: item.offset, + pack_offset: offset, + }); + } + item.next_offset = offset; + Ok(()) + } + + fn set_pack_entries_end_and_resolve_ref_offsets( + &mut self, + pack_entries_end: crate::data::Offset, + ) -> Result<(), traverse::Error> { + if !self.future_child_offsets.is_empty() { + for (parent_offset, child_index) in self.future_child_offsets.drain(..) { + if let Ok(i) = self.child_items.binary_search_by_key(&parent_offset, |i| i.offset) { + self.child_items[i].children.push(child_index as u32); + } else if let Ok(i) = self.root_items.binary_search_by_key(&parent_offset, |i| i.offset) { + self.root_items[i].children.push(child_index as u32); + } else { + return Err(traverse::Error::OutOfPackRefDelta { + base_pack_offset: parent_offset, + }); + } + } + } + + self.assert_is_incrementing_and_update_next_offset(pack_entries_end) + .expect("BUG: pack now is smaller than all previously seen entries"); + Ok(()) + } + + /// Add a new root node, one that only has children but is not a child itself, at the given pack `offset` and associate + /// custom `data` with it. + pub fn add_root(&mut self, offset: crate::data::Offset, data: T) -> Result<(), Error> { + self.assert_is_incrementing_and_update_next_offset(offset)?; + self.last_seen = NodeKind::Root.into(); + self.root_items.push(Item { + offset, + next_offset: 0, + data, + children: Default::default(), + }); + Ok(()) + } + + /// Add a child of the item at `base_offset` which itself resides at pack `offset` and associate custom `data` with it. + pub fn add_child( + &mut self, + base_offset: crate::data::Offset, + offset: crate::data::Offset, + data: T, + ) -> Result<(), Error> { + self.assert_is_incrementing_and_update_next_offset(offset)?; + + let next_child_index = self.child_items.len(); + if let Ok(i) = self.child_items.binary_search_by_key(&base_offset, |i| i.offset) { + self.child_items[i].children.push(next_child_index as u32); + } else if let Ok(i) = self.root_items.binary_search_by_key(&base_offset, |i| i.offset) { + self.root_items[i].children.push(next_child_index as u32); + } else { + self.future_child_offsets.push((base_offset, next_child_index)); + } + + self.last_seen = NodeKind::Child.into(); + self.child_items.push(Item { + offset, + next_offset: 0, + data, + children: Default::default(), + }); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + mod tree { + mod from_offsets_in_pack { + use std::sync::atomic::AtomicBool; + + use crate as pack; + + const SMALL_PACK_INDEX: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx"; + const SMALL_PACK: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack"; + + const INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx"; + const PACK_FOR_INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack"; + + use gix_testtools::fixture_path; + + #[test] + fn v1() -> Result<(), Box<dyn std::error::Error>> { + tree(INDEX_V1, PACK_FOR_INDEX_V1) + } + + #[test] + fn v2() -> Result<(), Box<dyn std::error::Error>> { + tree(SMALL_PACK_INDEX, SMALL_PACK) + } + + fn tree(index_path: &str, pack_path: &str) -> Result<(), Box<dyn std::error::Error>> { + let idx = pack::index::File::at(fixture_path(index_path), gix_hash::Kind::Sha1)?; + crate::cache::delta::Tree::from_offsets_in_pack( + fixture_path(pack_path), + idx.sorted_offsets().into_iter(), + |ofs| *ofs, + |id| idx.lookup(id).map(|index| idx.pack_offset_at_index(index)), + gix_features::progress::Discard, + &AtomicBool::new(false), + gix_hash::Kind::Sha1, + )?; + Ok(()) + } + } + } + + #[test] + fn size_of_pack_tree_item() { + use super::Item; + assert_eq!(std::mem::size_of::<[Item<()>; 7_500_000]>(), 300_000_000); + } + + #[test] + fn size_of_pack_verify_data_structure() { + use super::Item; + pub struct EntryWithDefault { + _index_entry: crate::index::Entry, + _kind: gix_object::Kind, + _object_size: u64, + _decompressed_size: u64, + _compressed_size: u64, + _header_size: u16, + _level: u16, + } + + assert_eq!(std::mem::size_of::<[Item<EntryWithDefault>; 7_500_000]>(), 840_000_000); + } +} diff --git a/vendor/gix-pack/src/cache/delta/traverse/mod.rs b/vendor/gix-pack/src/cache/delta/traverse/mod.rs new file mode 100644 index 000000000..bfe2ec687 --- /dev/null +++ b/vendor/gix-pack/src/cache/delta/traverse/mod.rs @@ -0,0 +1,177 @@ +use std::sync::atomic::{AtomicBool, Ordering}; + +use gix_features::{ + parallel::in_parallel_with_slice, + progress::{self, Progress}, + threading::{lock, Mutable, OwnShared}, +}; + +use crate::{ + cache::delta::{traverse::util::ItemSliceSend, Item, Tree}, + data::EntryRange, +}; + +mod resolve; +pub(crate) mod util; + +/// Returned by [`Tree::traverse()`] +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("{message}")] + ZlibInflate { + source: gix_features::zlib::inflate::Error, + message: &'static str, + }, + #[error("The resolver failed to obtain the pack entry bytes for the entry at {pack_offset}")] + ResolveFailed { pack_offset: u64 }, + #[error("One of the object inspectors failed")] + Inspect(#[from] Box<dyn std::error::Error + Send + Sync>), + #[error("Interrupted")] + Interrupted, + #[error( + "The base at {base_pack_offset} was referred to by a ref-delta, but it was never added to the tree as if the pack was still thin." + )] + OutOfPackRefDelta { + /// The base's offset which was from a resolved ref-delta that didn't actually get added to the tree + base_pack_offset: crate::data::Offset, + }, +} + +/// Additional context passed to the `inspect_object(…)` function of the [`Tree::traverse()`] method. +pub struct Context<'a, S> { + /// The pack entry describing the object + pub entry: &'a crate::data::Entry, + /// The offset at which `entry` ends in the pack, useful to learn about the exact range of `entry` within the pack. + pub entry_end: u64, + /// The decompressed object itself, ready to be decoded. + pub decompressed: &'a [u8], + /// Custom state known to the function + pub state: &'a mut S, + /// The depth at which this object resides in the delta-tree. It represents the amount of base objects, with 0 indicating + /// an 'undeltified' object, and higher values indicating delta objects with the given amount of bases. + pub level: u16, +} + +/// Options for [`Tree::traverse()`]. +pub struct Options<'a, P1, P2> { + /// is a progress instance to track progress for each object in the traversal. + pub object_progress: P1, + /// is a progress instance to track the overall progress. + pub size_progress: P2, + /// If `Some`, only use the given amount of threads. Otherwise, the amount of threads to use will be selected based on + /// the amount of available logical cores. + pub thread_limit: Option<usize>, + /// Abort the operation if the value is `true`. + pub should_interrupt: &'a AtomicBool, + /// specifies what kind of hashes we expect to be stored in oid-delta entries, which is viable to decoding them + /// with the correct size. + pub object_hash: gix_hash::Kind, +} + +/// The outcome of [`Tree::traverse()`] +pub struct Outcome<T> { + /// The items that have no children in the pack, i.e. base objects. + pub roots: Vec<Item<T>>, + /// The items that children to a root object, i.e. delta objects. + pub children: Vec<Item<T>>, +} + +impl<T> Tree<T> +where + T: Send, +{ + /// Traverse this tree of delta objects with a function `inspect_object` to process each object at will. + /// + /// * `should_run_in_parallel() -> bool` returns true if the underlying pack is big enough to warrant parallel traversal at all. + /// * `resolve(EntrySlice, &mut Vec<u8>) -> Option<()>` resolves the bytes in the pack for the given `EntrySlice` and stores them in the + /// output vector. It returns `Some(())` if the object existed in the pack, or `None` to indicate a resolution error, which would abort the + /// operation as well. + /// * `pack_entries_end` marks one-past-the-last byte of the last entry in the pack, as the last entries size would otherwise + /// be unknown as it's not part of the index file. + /// * `new_thread_state() -> State` is a function to create state to be used in each thread, invoked once per thread. + /// * `inspect_object(node_data: &mut T, progress: Progress, context: Context<ThreadLocal State>) -> Result<(), CustomError>` is a function + /// running for each thread receiving fully decoded objects along with contextual information, which either succeeds with `Ok(())` + /// or returns a `CustomError`. + /// Note that `node_data` can be modified to allow storing maintaining computation results on a per-object basis. + /// + /// This method returns a vector of all tree items, along with their potentially modified custom node data. + /// + /// _Note_ that this method consumed the Tree to assure safe parallel traversal with mutation support. + pub fn traverse<F, P1, P2, MBFN, S, E>( + mut self, + resolve: F, + pack_entries_end: u64, + new_thread_state: impl Fn() -> S + Send + Clone, + inspect_object: MBFN, + Options { + thread_limit, + object_progress, + mut size_progress, + should_interrupt, + object_hash, + }: Options<'_, P1, P2>, + ) -> Result<Outcome<T>, Error> + where + F: for<'r> Fn(EntryRange, &'r mut Vec<u8>) -> Option<()> + Send + Clone, + P1: Progress, + P2: Progress, + MBFN: Fn(&mut T, &mut <P1 as Progress>::SubProgress, Context<'_, S>) -> Result<(), E> + Send + Clone, + E: std::error::Error + Send + Sync + 'static, + { + self.set_pack_entries_end_and_resolve_ref_offsets(pack_entries_end)?; + let object_progress = OwnShared::new(Mutable::new(object_progress)); + + let num_objects = self.num_items(); + let object_counter = { + let mut progress = lock(&object_progress); + progress.init(Some(num_objects), progress::count("objects")); + progress.counter() + }; + size_progress.init(None, progress::bytes()); + let size_counter = size_progress.counter(); + let child_items = self.child_items.as_mut_slice(); + + let start = std::time::Instant::now(); + in_parallel_with_slice( + &mut self.root_items, + thread_limit, + { + let object_progress = object_progress.clone(); + let child_items = ItemSliceSend(child_items as *mut [Item<T>]); + move |thread_index| { + ( + Vec::<u8>::with_capacity(4096), + lock(&object_progress) + .add_child_with_id(format!("thread {thread_index}"), gix_features::progress::UNKNOWN), + new_thread_state(), + resolve.clone(), + inspect_object.clone(), + ItemSliceSend(child_items.0), + ) + } + }, + { + move |node, state| { + resolve::deltas( + object_counter.clone(), + size_counter.clone(), + node, + state, + object_hash.len_in_bytes(), + ) + } + }, + || (!should_interrupt.load(Ordering::Relaxed)).then(|| std::time::Duration::from_millis(50)), + |_| (), + )?; + + lock(&object_progress).show_throughput(start); + size_progress.show_throughput(start); + + Ok(Outcome { + roots: self.root_items, + children: self.child_items, + }) + } +} diff --git a/vendor/gix-pack/src/cache/delta/traverse/resolve.rs b/vendor/gix-pack/src/cache/delta/traverse/resolve.rs new file mode 100644 index 000000000..fc94d87ef --- /dev/null +++ b/vendor/gix-pack/src/cache/delta/traverse/resolve.rs @@ -0,0 +1,154 @@ +use std::{cell::RefCell, collections::BTreeMap, sync::atomic::Ordering}; + +use gix_features::{progress::Progress, zlib}; + +use crate::{ + cache::delta::{ + traverse::{ + util::{ItemSliceSend, Node}, + Context, Error, + }, + Item, + }, + data::EntryRange, +}; + +pub(crate) fn deltas<T, F, P, MBFN, S, E>( + object_counter: Option<gix_features::progress::StepShared>, + size_counter: Option<gix_features::progress::StepShared>, + node: &mut crate::cache::delta::Item<T>, + (bytes_buf, ref mut progress, state, resolve, modify_base, child_items): &mut ( + Vec<u8>, + P, + S, + F, + MBFN, + ItemSliceSend<Item<T>>, + ), + hash_len: usize, +) -> Result<(), Error> +where + T: Send, + F: for<'r> Fn(EntryRange, &'r mut Vec<u8>) -> Option<()>, + P: Progress, + MBFN: Fn(&mut T, &mut P, Context<'_, S>) -> Result<(), E>, + E: std::error::Error + Send + Sync + 'static, +{ + let mut decompressed_bytes_by_pack_offset = BTreeMap::new(); + let bytes_buf = RefCell::new(bytes_buf); + let decompress_from_resolver = |slice: EntryRange| -> Result<(crate::data::Entry, u64, Vec<u8>), Error> { + let mut bytes_buf = bytes_buf.borrow_mut(); + bytes_buf.resize((slice.end - slice.start) as usize, 0); + resolve(slice.clone(), &mut bytes_buf).ok_or(Error::ResolveFailed { + pack_offset: slice.start, + })?; + let entry = crate::data::Entry::from_bytes(&bytes_buf, slice.start, hash_len); + let compressed = &bytes_buf[entry.header_size()..]; + let decompressed_len = entry.decompressed_size as usize; + Ok((entry, slice.end, decompress_all_at_once(compressed, decompressed_len)?)) + }; + + // Traverse the tree breadth first and loose the data produced for the base as it won't be needed anymore. + progress.init(None, gix_features::progress::count_with_decimals("objects", 2)); + + // each node is a base, and its children always start out as deltas which become a base after applying them. + // These will be pushed onto our stack until all are processed + let root_level = 0; + let mut nodes: Vec<_> = vec![( + root_level, + Node { + item: node, + child_items: child_items.0, + }, + )]; + while let Some((level, mut base)) = nodes.pop() { + let (base_entry, entry_end, base_bytes) = if level == root_level { + decompress_from_resolver(base.entry_slice())? + } else { + decompressed_bytes_by_pack_offset + .remove(&base.offset()) + .expect("we store the resolved delta buffer when done") + }; + + // anything done here must be repeated further down for leaf-nodes. + // This way we avoid retaining their decompressed memory longer than needed (they have no children, + // thus their memory can be released right away, using 18% less peak memory on the linux kernel). + { + modify_base( + base.data(), + progress, + Context { + entry: &base_entry, + entry_end, + decompressed: &base_bytes, + state, + level, + }, + ) + .map_err(|err| Box::new(err) as Box<dyn std::error::Error + Send + Sync>)?; + object_counter.as_ref().map(|c| c.fetch_add(1, Ordering::SeqCst)); + size_counter + .as_ref() + .map(|c| c.fetch_add(base_bytes.len(), Ordering::SeqCst)); + } + + for mut child in base.into_child_iter() { + let (mut child_entry, entry_end, delta_bytes) = decompress_from_resolver(child.entry_slice())?; + let (base_size, consumed) = crate::data::delta::decode_header_size(&delta_bytes); + let mut header_ofs = consumed; + assert_eq!( + base_bytes.len(), + base_size as usize, + "recorded base size in delta does not match" + ); + let (result_size, consumed) = crate::data::delta::decode_header_size(&delta_bytes[consumed..]); + header_ofs += consumed; + + let mut fully_resolved_delta_bytes = bytes_buf.borrow_mut(); + fully_resolved_delta_bytes.resize(result_size as usize, 0); + crate::data::delta::apply(&base_bytes, &mut fully_resolved_delta_bytes, &delta_bytes[header_ofs..]); + + // FIXME: this actually invalidates the "pack_offset()" computation, which is not obvious to consumers + // at all + child_entry.header = base_entry.header; // assign the actual object type, instead of 'delta' + if child.has_children() { + decompressed_bytes_by_pack_offset.insert( + child.offset(), + (child_entry, entry_end, fully_resolved_delta_bytes.to_owned()), + ); + nodes.push((level + 1, child)); + } else { + modify_base( + child.data(), + progress, + Context { + entry: &child_entry, + entry_end, + decompressed: &fully_resolved_delta_bytes, + state, + level: level + 1, + }, + ) + .map_err(|err| Box::new(err) as Box<dyn std::error::Error + Send + Sync>)?; + object_counter.as_ref().map(|c| c.fetch_add(1, Ordering::SeqCst)); + size_counter + .as_ref() + .map(|c| c.fetch_add(base_bytes.len(), Ordering::SeqCst)); + } + } + } + + Ok(()) +} + +fn decompress_all_at_once(b: &[u8], decompressed_len: usize) -> Result<Vec<u8>, Error> { + let mut out = Vec::new(); + out.resize(decompressed_len, 0); + zlib::Inflate::default() + .once(b, &mut out) + .map_err(|err| Error::ZlibInflate { + source: err, + message: "Failed to decompress entry", + })?; + Ok(out) +} diff --git a/vendor/gix-pack/src/cache/delta/traverse/util.rs b/vendor/gix-pack/src/cache/delta/traverse/util.rs new file mode 100644 index 000000000..e7caf2ff5 --- /dev/null +++ b/vendor/gix-pack/src/cache/delta/traverse/util.rs @@ -0,0 +1,63 @@ +use crate::cache::delta::Item; + +pub struct ItemSliceSend<T>(pub *mut [T]) +where + T: Send; + +impl<T> Clone for ItemSliceSend<T> +where + T: Send, +{ + fn clone(&self) -> Self { + ItemSliceSend(self.0) + } +} + +// SAFETY: T is `Send`, and we only ever access one T at a time. And, ptrs need that assurance, I wonder if it's always right. +#[allow(unsafe_code)] +unsafe impl<T> Send for ItemSliceSend<T> where T: Send {} + +/// An item returned by `iter_root_chunks`, allowing access to the `data` stored alongside nodes in a [`Tree`]. +pub struct Node<'a, T> { + pub item: &'a mut Item<T>, + pub child_items: *mut [Item<T>], +} + +impl<'a, T> Node<'a, T> { + /// Returns the offset into the pack at which the `Node`s data is located. + pub fn offset(&self) -> u64 { + self.item.offset + } + + /// Returns the slice into the data pack at which the pack entry is located. + pub fn entry_slice(&self) -> crate::data::EntryRange { + self.item.offset..self.item.next_offset + } + + /// Returns the node data associated with this node. + pub fn data(&mut self) -> &mut T { + &mut self.item.data + } + + /// Returns true if this node has children, e.g. is not a leaf in the tree. + pub fn has_children(&self) -> bool { + !self.item.children.is_empty() + } + + /// Transform this `Node` into an iterator over its children. + /// + /// Children are `Node`s referring to pack entries whose base object is this pack entry. + pub fn into_child_iter(self) -> impl Iterator<Item = Node<'a, T>> + 'a { + let children = self.child_items; + self.item.children.iter().map(move |&index| { + // SAFETY: The children array is alive by the 'a lifetime. + // SAFETY: The index is a valid index into the children array. + // SAFETY: The resulting mutable pointer cannot be yielded by any other node. + #[allow(unsafe_code)] + Node { + item: unsafe { &mut *(children as *mut Item<T>).add(index as usize) }, + child_items: children, + } + }) + } +} diff --git a/vendor/gix-pack/src/cache/lru.rs b/vendor/gix-pack/src/cache/lru.rs new file mode 100644 index 000000000..bba4f5d33 --- /dev/null +++ b/vendor/gix-pack/src/cache/lru.rs @@ -0,0 +1,165 @@ +use super::DecodeEntry; + +#[cfg(feature = "pack-cache-lru-dynamic")] +mod memory { + use std::num::NonZeroUsize; + + use clru::WeightScale; + + use super::DecodeEntry; + + struct Entry { + data: Vec<u8>, + kind: gix_object::Kind, + compressed_size: usize, + } + + type Key = (u32, u64); + struct CustomScale; + + impl WeightScale<Key, Entry> for CustomScale { + fn weight(&self, _key: &Key, value: &Entry) -> usize { + value.data.len() + } + } + + /// An LRU cache with hash map backing and an eviction rule based on the memory usage for object data in bytes. + pub struct MemoryCappedHashmap { + inner: clru::CLruCache<Key, Entry, std::collections::hash_map::RandomState, CustomScale>, + free_list: Vec<Vec<u8>>, + debug: gix_features::cache::Debug, + } + + impl MemoryCappedHashmap { + /// Return a new instance which evicts least recently used items if it uses more than `memory_cap_in_bytes` + /// object data. + pub fn new(memory_cap_in_bytes: usize) -> MemoryCappedHashmap { + MemoryCappedHashmap { + inner: clru::CLruCache::with_config( + clru::CLruCacheConfig::new(NonZeroUsize::new(memory_cap_in_bytes).expect("non zero")) + .with_scale(CustomScale), + ), + free_list: Vec::new(), + debug: gix_features::cache::Debug::new(format!("MemoryCappedHashmap({memory_cap_in_bytes}B)")), + } + } + } + + impl DecodeEntry for MemoryCappedHashmap { + fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: gix_object::Kind, compressed_size: usize) { + self.debug.put(); + if let Ok(Some(previous_entry)) = self.inner.put_with_weight( + (pack_id, offset), + Entry { + data: self + .free_list + .pop() + .map(|mut v| { + v.clear(); + v.resize(data.len(), 0); + v.copy_from_slice(data); + v + }) + .unwrap_or_else(|| Vec::from(data)), + kind, + compressed_size, + }, + ) { + self.free_list.push(previous_entry.data) + } + } + + fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec<u8>) -> Option<(gix_object::Kind, usize)> { + let res = self.inner.get(&(pack_id, offset)).map(|e| { + out.resize(e.data.len(), 0); + out.copy_from_slice(&e.data); + (e.kind, e.compressed_size) + }); + if res.is_some() { + self.debug.hit() + } else { + self.debug.miss() + } + res + } + } +} + +#[cfg(feature = "pack-cache-lru-dynamic")] +pub use memory::MemoryCappedHashmap; + +#[cfg(feature = "pack-cache-lru-static")] +mod _static { + use super::DecodeEntry; + struct Entry { + pack_id: u32, + offset: u64, + data: Vec<u8>, + kind: gix_object::Kind, + compressed_size: usize, + } + + /// A cache using a least-recently-used implementation capable of storing the `SIZE` most recent objects. + /// The cache must be small as the search is 'naive' and the underlying data structure is a linked list. + /// Values of 64 seem to improve performance. + pub struct StaticLinkedList<const SIZE: usize> { + inner: uluru::LRUCache<Entry, SIZE>, + free_list: Vec<Vec<u8>>, + debug: gix_features::cache::Debug, + } + + impl<const SIZE: usize> Default for StaticLinkedList<SIZE> { + fn default() -> Self { + StaticLinkedList { + inner: Default::default(), + free_list: Vec::new(), + debug: gix_features::cache::Debug::new(format!("StaticLinkedList<{SIZE}>")), + } + } + } + + impl<const SIZE: usize> DecodeEntry for StaticLinkedList<SIZE> { + fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: gix_object::Kind, compressed_size: usize) { + self.debug.put(); + if let Some(previous) = self.inner.insert(Entry { + offset, + pack_id, + data: self + .free_list + .pop() + .map(|mut v| { + v.clear(); + v.resize(data.len(), 0); + v.copy_from_slice(data); + v + }) + .unwrap_or_else(|| Vec::from(data)), + kind, + compressed_size, + }) { + self.free_list.push(previous.data) + } + } + + fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec<u8>) -> Option<(gix_object::Kind, usize)> { + let res = self.inner.lookup(|e: &mut Entry| { + if e.pack_id == pack_id && e.offset == offset { + out.resize(e.data.len(), 0); + out.copy_from_slice(&e.data); + Some((e.kind, e.compressed_size)) + } else { + None + } + }); + if res.is_some() { + self.debug.hit() + } else { + self.debug.miss() + } + res + } + } +} + +#[cfg(feature = "pack-cache-lru-static")] +pub use _static::StaticLinkedList; diff --git a/vendor/gix-pack/src/cache/mod.rs b/vendor/gix-pack/src/cache/mod.rs new file mode 100644 index 000000000..cf4b94df8 --- /dev/null +++ b/vendor/gix-pack/src/cache/mod.rs @@ -0,0 +1,55 @@ +use std::ops::DerefMut; + +use gix_object::Kind; + +/// A trait to model putting objects at a given pack `offset` into a cache, and fetching them. +/// +/// It is used to speed up [pack traversals][crate::index::File::traverse()]. +pub trait DecodeEntry { + /// Store a fully decoded object at `offset` of `kind` with `compressed_size` and `data` in the cache. + /// + /// It is up to the cache implementation whether that actually happens or not. + fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: gix_object::Kind, compressed_size: usize); + /// Attempt to fetch the object at `offset` and store its decoded bytes in `out`, as previously stored with [`DecodeEntry::put()`], and return + /// its (object `kind`, `decompressed_size`) + fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec<u8>) -> Option<(gix_object::Kind, usize)>; +} + +/// A cache that stores nothing and retrieves nothing, thus it _never_ caches. +#[derive(Default)] +pub struct Never; + +impl DecodeEntry for Never { + fn put(&mut self, _pack_id: u32, _offset: u64, _data: &[u8], _kind: gix_object::Kind, _compressed_size: usize) {} + fn get(&mut self, _pack_id: u32, _offset: u64, _out: &mut Vec<u8>) -> Option<(gix_object::Kind, usize)> { + None + } +} + +impl<T: DecodeEntry + ?Sized> DecodeEntry for Box<T> { + fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: Kind, compressed_size: usize) { + self.deref_mut().put(pack_id, offset, data, kind, compressed_size) + } + + fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec<u8>) -> Option<(Kind, usize)> { + self.deref_mut().get(pack_id, offset, out) + } +} + +/// A way of storing and retrieving entire objects to and from a cache. +pub trait Object { + /// Put the object going by `id` of `kind` with `data` into the cache. + fn put(&mut self, id: gix_hash::ObjectId, kind: gix_object::Kind, data: &[u8]); + + /// Try to retrieve the object named `id` and place its data into `out` if available and return `Some(kind)` if found. + fn get(&mut self, id: &gix_hash::ObjectId, out: &mut Vec<u8>) -> Option<gix_object::Kind>; +} + +/// Various implementations of [`DecodeEntry`] using least-recently-used algorithms. +#[cfg(any(feature = "pack-cache-lru-dynamic", feature = "pack-cache-lru-static"))] +pub mod lru; + +pub mod object; + +/// +pub(crate) mod delta; diff --git a/vendor/gix-pack/src/cache/object.rs b/vendor/gix-pack/src/cache/object.rs new file mode 100644 index 000000000..e64f47a8c --- /dev/null +++ b/vendor/gix-pack/src/cache/object.rs @@ -0,0 +1,123 @@ +//! # Note +//! +//! This module is a bit 'misplaced' if spelled out like 'gix_pack::cache::object::*' but is best placed here for code re-use and +//! general usefulness. +use crate::cache; + +#[cfg(feature = "object-cache-dynamic")] +mod memory { + use std::num::NonZeroUsize; + + use clru::WeightScale; + + use crate::cache; + + struct Entry { + data: Vec<u8>, + kind: gix_object::Kind, + } + + type Key = gix_hash::ObjectId; + + struct CustomScale; + + impl WeightScale<Key, Entry> for CustomScale { + fn weight(&self, key: &Key, value: &Entry) -> usize { + value.data.len() + std::mem::size_of::<Entry>() + key.as_bytes().len() + } + } + + /// An LRU cache with hash map backing and an eviction rule based on the memory usage for object data in bytes. + pub struct MemoryCappedHashmap { + inner: clru::CLruCache<Key, Entry, gix_hashtable::hash::Builder, CustomScale>, + free_list: Vec<Vec<u8>>, + debug: gix_features::cache::Debug, + } + + impl MemoryCappedHashmap { + /// The amount of bytes we can hold in total, or the value we saw in `new(…)`. + pub fn capacity(&self) -> usize { + self.inner.capacity() + } + /// Return a new instance which evicts least recently used items if it uses more than `memory_cap_in_bytes` + /// object data. + pub fn new(memory_cap_in_bytes: usize) -> MemoryCappedHashmap { + MemoryCappedHashmap { + inner: clru::CLruCache::with_config( + clru::CLruCacheConfig::new(NonZeroUsize::new(memory_cap_in_bytes).expect("non zero")) + .with_hasher(gix_hashtable::hash::Builder::default()) + .with_scale(CustomScale), + ), + free_list: Vec::new(), + debug: gix_features::cache::Debug::new(format!("MemoryCappedObjectHashmap({memory_cap_in_bytes}B)")), + } + } + } + + impl cache::Object for MemoryCappedHashmap { + /// Put the object going by `id` of `kind` with `data` into the cache. + fn put(&mut self, id: gix_hash::ObjectId, kind: gix_object::Kind, data: &[u8]) { + self.debug.put(); + if let Ok(Some(previous_entry)) = self.inner.put_with_weight( + id, + Entry { + data: self + .free_list + .pop() + .map(|mut v| { + v.clear(); + v.resize(data.len(), 0); + v.copy_from_slice(data); + v + }) + .unwrap_or_else(|| Vec::from(data)), + kind, + }, + ) { + self.free_list.push(previous_entry.data) + } + } + + /// Try to retrieve the object named `id` and place its data into `out` if available and return `Some(kind)` if found. + fn get(&mut self, id: &gix_hash::ObjectId, out: &mut Vec<u8>) -> Option<gix_object::Kind> { + let res = self.inner.get(id).map(|e| { + out.resize(e.data.len(), 0); + out.copy_from_slice(&e.data); + e.kind + }); + if res.is_some() { + self.debug.hit() + } else { + self.debug.miss() + } + res + } + } +} +#[cfg(feature = "object-cache-dynamic")] +pub use memory::MemoryCappedHashmap; + +/// A cache implementation that doesn't do any caching. +pub struct Never; + +impl cache::Object for Never { + /// Noop + fn put(&mut self, _id: gix_hash::ObjectId, _kind: gix_object::Kind, _data: &[u8]) {} + + /// Noop + fn get(&mut self, _id: &gix_hash::ObjectId, _out: &mut Vec<u8>) -> Option<gix_object::Kind> { + None + } +} + +impl<T: cache::Object + ?Sized> cache::Object for Box<T> { + fn put(&mut self, id: gix_hash::ObjectId, kind: gix_object::Kind, data: &[u8]) { + use std::ops::DerefMut; + self.deref_mut().put(id, kind, data) + } + + fn get(&mut self, id: &gix_hash::ObjectId, out: &mut Vec<u8>) -> Option<gix_object::Kind> { + use std::ops::DerefMut; + self.deref_mut().get(id, out) + } +} |