| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:41:41 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:41:41 +0000 |
| commit | 10ee2acdd26a7f1298c6f6d6b7af9b469fe29b87 (patch) | |
| tree | bdffd5d80c26cf4a7a518281a204be1ace85b4c1 | /vendor/gix-pack/src |
| parent | Releasing progress-linux version 1.70.0+dfsg1-9~progress7.99u1. (diff) | |
| download | rustc-10ee2acdd26a7f1298c6f6d6b7af9b469fe29b87.tar.xz, rustc-10ee2acdd26a7f1298c6f6d6b7af9b469fe29b87.zip | |
Merging upstream version 1.70.0+dfsg2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/gix-pack/src')
66 files changed, 9389 insertions, 0 deletions
diff --git a/vendor/gix-pack/src/bundle/find.rs b/vendor/gix-pack/src/bundle/find.rs new file mode 100644 index 000000000..d39ed49a9 --- /dev/null +++ b/vendor/gix-pack/src/bundle/find.rs @@ -0,0 +1,63 @@ +impl crate::Bundle { + /// Find an object with the given [`ObjectId`][gix_hash::ObjectId] and place its data into `out`. + /// + /// [`cache`][crate::cache::DecodeEntry] is used to accelerate the lookup. + /// + /// **Note** that ref deltas are automatically resolved within this pack only, which makes this implementation unusable + /// for thin packs, which by now are expected to be resolved already. + pub fn find<'a>( + &self, + id: impl AsRef<gix_hash::oid>, + out: &'a mut Vec<u8>, + cache: &mut impl crate::cache::DecodeEntry, + ) -> Result<Option<(gix_object::Data<'a>, crate::data::entry::Location)>, crate::data::decode::Error> { + let idx = match self.index.lookup(id) { + Some(idx) => idx, + None => return Ok(None), + }; + self.get_object_by_index(idx, out, cache).map(Some) + } + + /// Special-use function to get an object given an index previously returned from + /// internal_find_pack_index. + /// + /// # Panics + /// + /// If `index` is out of bounds. + pub fn get_object_by_index<'a>( + &self, + idx: u32, + out: &'a mut Vec<u8>, + cache: &mut impl crate::cache::DecodeEntry, + ) -> Result<(gix_object::Data<'a>, crate::data::entry::Location), crate::data::decode::Error> { + let ofs = self.index.pack_offset_at_index(idx); + let pack_entry = self.pack.entry(ofs); + let header_size = pack_entry.header_size(); + self.pack + .decode_entry( + pack_entry, + out, + |id, _out| { + self.index.lookup(id).map(|idx| { + crate::data::decode::entry::ResolvedBase::InPack( + self.pack.entry(self.index.pack_offset_at_index(idx)), + ) + }) + }, + cache, + ) + .map(move |r| { + ( + gix_object::Data { + kind: r.kind, + data: out.as_slice(), + }, + crate::data::entry::Location { + pack_id: self.pack.id, + pack_offset: ofs, + entry_size: r.compressed_size + header_size, + }, + ) + }) + } +} diff --git a/vendor/gix-pack/src/bundle/init.rs b/vendor/gix-pack/src/bundle/init.rs new file mode 100644 index 000000000..3ba5257ed --- /dev/null +++ b/vendor/gix-pack/src/bundle/init.rs @@ -0,0 +1,46 @@ +use std::path::{Path, PathBuf}; + +use crate::Bundle; + +/// Returned by [`Bundle::at()`] +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("An 'idx' extension is expected of an index file: '{0}'")] + InvalidPath(PathBuf), + #[error(transparent)] + Pack(#[from] crate::data::header::decode::Error), + #[error(transparent)] + Index(#[from] crate::index::init::Error), +} + +/// Initialization +impl Bundle { + /// Create a `Bundle` from `path`, which is either a pack file _(*.pack)_ or an index file _(*.idx)_. + /// + /// The corresponding complementary file is expected to be present. + /// + /// The `object_hash` is a way to read (and write) the same file format with different hashes, as the hash kind + /// isn't stored within the file format itself. 
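// A minimal usage sketch, assuming a complete `pack-*.idx`/`pack-*.pack` pair exists on disk;
// the path below is purely hypothetical. Either file of the pair may be passed, as the
// counterpart is derived by swapping the extension.
fn open_bundle() -> Result<Bundle, Error> {
    Bundle::at("objects/pack/pack-deadbeef.idx", gix_hash::Kind::Sha1)
}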
+ pub fn at(path: impl AsRef<Path>, object_hash: gix_hash::Kind) -> Result<Self, Error> { + Self::at_inner(path.as_ref(), object_hash) + } + + fn at_inner(path: &Path, object_hash: gix_hash::Kind) -> Result<Self, Error> { + let ext = path + .extension() + .and_then(|e| e.to_str()) + .ok_or_else(|| Error::InvalidPath(path.to_owned()))?; + Ok(match ext { + "idx" => Self { + index: crate::index::File::at(path, object_hash)?, + pack: crate::data::File::at(path.with_extension("pack"), object_hash)?, + }, + "pack" => Self { + pack: crate::data::File::at(path, object_hash)?, + index: crate::index::File::at(path.with_extension("idx"), object_hash)?, + }, + _ => return Err(Error::InvalidPath(path.to_owned())), + }) + } +} diff --git a/vendor/gix-pack/src/bundle/mod.rs b/vendor/gix-pack/src/bundle/mod.rs new file mode 100644 index 000000000..076b355d9 --- /dev/null +++ b/vendor/gix-pack/src/bundle/mod.rs @@ -0,0 +1,60 @@ +/// +pub mod init; + +mod find; +/// +#[cfg(not(feature = "wasm"))] +pub mod write; + +/// +pub mod verify { + use std::sync::atomic::AtomicBool; + + use gix_features::progress::Progress; + + /// + pub mod integrity { + /// Returned by [`Bundle::verify_integrity()`][crate::Bundle::verify_integrity()]. + pub struct Outcome<P> { + /// The computed checksum of the index which matched the stored one. + pub actual_index_checksum: gix_hash::ObjectId, + /// The packs traversal outcome + pub pack_traverse_outcome: crate::index::traverse::Statistics, + /// The provided progress instance. + pub progress: P, + } + } + + use crate::Bundle; + + impl Bundle { + /// Similar to [`crate::index::File::verify_integrity()`] but more convenient to call as the presence of the + /// pack file is a given. + pub fn verify_integrity<C, P, F>( + &self, + progress: P, + should_interrupt: &AtomicBool, + options: crate::index::verify::integrity::Options<F>, + ) -> Result<integrity::Outcome<P>, crate::index::traverse::Error<crate::index::verify::integrity::Error>> + where + P: Progress, + C: crate::cache::DecodeEntry, + F: Fn() -> C + Send + Clone, + { + self.index + .verify_integrity( + Some(crate::index::verify::PackContext { + data: &self.pack, + options, + }), + progress, + should_interrupt, + ) + .map(|o| integrity::Outcome { + actual_index_checksum: o.actual_index_checksum, + pack_traverse_outcome: o.pack_traverse_statistics.expect("pack is set"), + progress: o.progress, + }) + } + } +} diff --git a/vendor/gix-pack/src/bundle/write/error.rs b/vendor/gix-pack/src/bundle/write/error.rs new file mode 100644 index 000000000..883c34029 --- /dev/null +++ b/vendor/gix-pack/src/bundle/write/error.rs @@ -0,0 +1,17 @@ +use std::io; + +use gix_tempfile::handle::Writable; + +/// The error returned by [`Bundle::write_to_directory()`][crate::Bundle::write_to_directory()] +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("An IO error occurred when reading the pack or creating a temporary file")] + Io(#[from] io::Error), + #[error(transparent)] + PackIter(#[from] crate::data::input::Error), + #[error("Could not move a temporary file into its desired place")] + Persist(#[from] gix_tempfile::handle::persist::Error<Writable>), + #[error(transparent)] + IndexWrite(#[from] crate::index::write::Error), +} diff --git a/vendor/gix-pack/src/bundle/write/mod.rs b/vendor/gix-pack/src/bundle/write/mod.rs new file mode 100644 index 000000000..fc0284b53 --- /dev/null +++ b/vendor/gix-pack/src/bundle/write/mod.rs @@ -0,0 +1,378 @@ +use std::{ + io, + io::Write, + marker::PhantomData, + path::{Path, 
PathBuf}, + sync::{atomic::AtomicBool, Arc}, +}; + +use gix_features::{interrupt, progress, progress::Progress}; +use gix_tempfile::{AutoRemove, ContainingDirectory}; + +use crate::data; + +mod error; +pub use error::Error; + +mod types; +use types::{LockWriter, PassThrough}; +pub use types::{Options, Outcome}; + +use crate::bundle::write::types::SharedTempFile; + +type ThinPackLookupFn = Box<dyn for<'a> FnMut(gix_hash::ObjectId, &'a mut Vec<u8>) -> Option<gix_object::Data<'a>>>; +type ThinPackLookupFnSend = + Box<dyn for<'a> FnMut(gix_hash::ObjectId, &'a mut Vec<u8>) -> Option<gix_object::Data<'a>> + Send + 'static>; + +/// The progress ids used in [`write_to_directory()`][crate::Bundle::write_to_directory()]. +/// +/// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. +#[derive(Debug, Copy, Clone)] +pub enum ProgressId { + /// The amount of bytes read from the input pack data file. + ReadPackBytes, + /// A root progress counting logical steps towards an index file on disk. + /// + /// Underneath will be more progress information related to actually producing the index. + IndexingSteps(PhantomData<crate::index::write::ProgressId>), +} + +impl From<ProgressId> for gix_features::progress::Id { + fn from(v: ProgressId) -> Self { + match v { + ProgressId::ReadPackBytes => *b"BWRB", + ProgressId::IndexingSteps(_) => *b"BWCI", + } + } +} + +impl crate::Bundle { + /// Given a `pack` data stream, write it along with a generated index into the `directory` if `Some` or discard all output if `None`. + /// + /// In the latter case, the functionality provided here is more a kind of pack data stream validation. + /// + /// * `progress` provides detailed progress information which can be discarded with [`gix_features::progress::Discard`]. + /// * `should_interrupt` is checked regularly and when true, the whole operation will stop. + /// * `thin_pack_base_object_lookup_fn` If set, we expect to see a thin-pack with objects that reference their base object by object id which is + /// expected to exist in the object database the bundle is contained within. + /// `options` further configure how the task is performed. + /// + /// # Note + /// + /// * the resulting pack may be empty, that is, contains zero objects in some situations. This is a valid reply by a server and should + /// be accounted for. + /// - Empty packs always have the same name and not handling this case will result in at most one superfluous pack. 
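// A minimal sketch of driving this call, assuming the pack stream comes from a file and the
// default `Options` are acceptable; `should_interrupt` would normally be shared with a signal
// handler rather than created locally, and progress is discarded as the docs above suggest.
fn index_pack_file(pack: &Path, directory: &Path) -> Result<Outcome, Error> {
    let should_interrupt = AtomicBool::new(false);
    crate::Bundle::write_to_directory(
        io::BufReader::new(std::fs::File::open(pack)?),
        Some(directory),
        gix_features::progress::Discard,
        &should_interrupt,
        None, // no thin-pack lookup: the pack is expected to be self-contained
        Options::default(),
    )
}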
+ pub fn write_to_directory<P>( + pack: impl io::BufRead, + directory: Option<impl AsRef<Path>>, + mut progress: P, + should_interrupt: &AtomicBool, + thin_pack_base_object_lookup_fn: Option<ThinPackLookupFn>, + options: Options, + ) -> Result<Outcome, Error> + where + P: Progress, + { + let mut read_progress = progress.add_child_with_id("read pack", ProgressId::ReadPackBytes.into()); + read_progress.init(None, progress::bytes()); + let pack = progress::Read { + inner: pack, + progress: progress::ThroughputOnDrop::new(read_progress), + }; + + let object_hash = options.object_hash; + let data_file = Arc::new(parking_lot::Mutex::new(io::BufWriter::with_capacity( + 64 * 1024, + match directory.as_ref() { + Some(directory) => gix_tempfile::new(directory, ContainingDirectory::Exists, AutoRemove::Tempfile)?, + None => gix_tempfile::new(std::env::temp_dir(), ContainingDirectory::Exists, AutoRemove::Tempfile)?, + }, + ))); + let (pack_entries_iter, pack_version): ( + Box<dyn Iterator<Item = Result<data::input::Entry, data::input::Error>>>, + _, + ) = match thin_pack_base_object_lookup_fn { + Some(thin_pack_lookup_fn) => { + let pack = interrupt::Read { + inner: pack, + should_interrupt, + }; + let buffered_pack = io::BufReader::new(pack); + let pack_entries_iter = data::input::LookupRefDeltaObjectsIter::new( + data::input::BytesToEntriesIter::new_from_header( + buffered_pack, + options.iteration_mode, + data::input::EntryDataMode::KeepAndCrc32, + object_hash, + )?, + thin_pack_lookup_fn, + ); + let pack_version = pack_entries_iter.inner.version(); + let pack_entries_iter = data::input::EntriesToBytesIter::new( + pack_entries_iter, + LockWriter { + writer: data_file.clone(), + }, + pack_version, + gix_hash::Kind::Sha1, // Thin packs imply a pack being transported, and there we only ever know SHA1 at the moment. + ); + (Box::new(pack_entries_iter), pack_version) + } + None => { + let pack = PassThrough { + reader: interrupt::Read { + inner: pack, + should_interrupt, + }, + writer: Some(data_file.clone()), + }; + // This buf-reader is required to assure we call 'read()' in order to fill the (extra) buffer. Otherwise all the counting + // we do with the wrapped pack reader doesn't work as it does not expect anyone to call BufRead functions directly. + // However, this is exactly what's happening in the ZipReader implementation that is eventually used. + // The performance impact of this is probably negligible, compared to all the other work that is done anyway :D. + let buffered_pack = io::BufReader::new(pack); + let pack_entries_iter = data::input::BytesToEntriesIter::new_from_header( + buffered_pack, + options.iteration_mode, + data::input::EntryDataMode::Crc32, + object_hash, + )?; + let pack_version = pack_entries_iter.version(); + (Box::new(pack_entries_iter), pack_version) + } + }; + let WriteOutcome { + outcome, + data_path, + index_path, + keep_path, + } = crate::Bundle::inner_write( + directory, + progress, + options, + data_file, + pack_entries_iter, + should_interrupt, + pack_version, + )?; + + Ok(Outcome { + index: outcome, + object_hash, + pack_version, + data_path, + index_path, + keep_path, + }) + } + + /// Equivalent to [`write_to_directory()`][crate::Bundle::write_to_directory()] but offloads reading of the pack into its own thread, hence the `Send + 'static'` bounds. + /// + /// # Note + /// + /// As it sends portions of the input to a thread it requires the 'static lifetime for the interrupt flags. 
This can only + /// be satisfied by a static AtomicBool which is only suitable for programs that only run one of these operations at a time + /// or don't mind that all of them abort when the flag is set. + pub fn write_to_directory_eagerly<P>( + pack: impl io::Read + Send + 'static, + pack_size: Option<u64>, + directory: Option<impl AsRef<Path>>, + mut progress: P, + should_interrupt: &'static AtomicBool, + thin_pack_base_object_lookup_fn: Option<ThinPackLookupFnSend>, + options: Options, + ) -> Result<Outcome, Error> + where + P: Progress, + P::SubProgress: 'static, + { + let mut read_progress = progress.add_child_with_id("read pack", ProgressId::ReadPackBytes.into()); /* Bundle Write Read pack Bytes*/ + read_progress.init(pack_size.map(|s| s as usize), progress::bytes()); + let pack = progress::Read { + inner: pack, + progress: progress::ThroughputOnDrop::new(read_progress), + }; + + let data_file = Arc::new(parking_lot::Mutex::new(io::BufWriter::new(match directory.as_ref() { + Some(directory) => gix_tempfile::new(directory, ContainingDirectory::Exists, AutoRemove::Tempfile)?, + None => gix_tempfile::new(std::env::temp_dir(), ContainingDirectory::Exists, AutoRemove::Tempfile)?, + }))); + let object_hash = options.object_hash; + let eight_pages = 4096 * 8; + let (pack_entries_iter, pack_version): ( + Box<dyn Iterator<Item = Result<data::input::Entry, data::input::Error>> + Send + 'static>, + _, + ) = match thin_pack_base_object_lookup_fn { + Some(thin_pack_lookup_fn) => { + let pack = interrupt::Read { + inner: pack, + should_interrupt, + }; + let buffered_pack = io::BufReader::with_capacity(eight_pages, pack); + let pack_entries_iter = data::input::LookupRefDeltaObjectsIter::new( + data::input::BytesToEntriesIter::new_from_header( + buffered_pack, + options.iteration_mode, + data::input::EntryDataMode::KeepAndCrc32, + object_hash, + )?, + thin_pack_lookup_fn, + ); + let pack_kind = pack_entries_iter.inner.version(); + (Box::new(pack_entries_iter), pack_kind) + } + None => { + let pack = PassThrough { + reader: interrupt::Read { + inner: pack, + should_interrupt, + }, + writer: Some(data_file.clone()), + }; + let buffered_pack = io::BufReader::with_capacity(eight_pages, pack); + let pack_entries_iter = data::input::BytesToEntriesIter::new_from_header( + buffered_pack, + options.iteration_mode, + data::input::EntryDataMode::Crc32, + object_hash, + )?; + let pack_kind = pack_entries_iter.version(); + (Box::new(pack_entries_iter), pack_kind) + } + }; + let num_objects = pack_entries_iter.size_hint().0; + let pack_entries_iter = + gix_features::parallel::EagerIterIf::new(move || num_objects > 25_000, pack_entries_iter, 5_000, 5); + + let WriteOutcome { + outcome, + data_path, + index_path, + keep_path, + } = crate::Bundle::inner_write( + directory, + progress, + options, + data_file, + pack_entries_iter, + should_interrupt, + pack_version, + )?; + + Ok(Outcome { + index: outcome, + object_hash, + pack_version, + data_path, + index_path, + keep_path, + }) + } + + fn inner_write( + directory: Option<impl AsRef<Path>>, + mut progress: impl Progress, + Options { + thread_limit, + iteration_mode: _, + index_version: index_kind, + object_hash, + }: Options, + data_file: SharedTempFile, + pack_entries_iter: impl Iterator<Item = Result<data::input::Entry, data::input::Error>>, + should_interrupt: &AtomicBool, + pack_version: data::Version, + ) -> Result<WriteOutcome, Error> { + let indexing_progress = progress.add_child_with_id( + "create index file", + 
ProgressId::IndexingSteps(Default::default()).into(), + ); + Ok(match directory { + Some(directory) => { + let directory = directory.as_ref(); + let mut index_file = gix_tempfile::new(directory, ContainingDirectory::Exists, AutoRemove::Tempfile)?; + + let outcome = crate::index::File::write_data_iter_to_stream( + index_kind, + { + let data_file = Arc::clone(&data_file); + move || new_pack_file_resolver(data_file) + }, + pack_entries_iter, + thread_limit, + indexing_progress, + &mut index_file, + should_interrupt, + object_hash, + pack_version, + )?; + + let data_path = directory.join(format!("pack-{}.pack", outcome.data_hash.to_hex())); + let index_path = data_path.with_extension("idx"); + let keep_path = data_path.with_extension("keep"); + + std::fs::write(&keep_path, b"")?; + Arc::try_unwrap(data_file) + .expect("only one handle left after pack was consumed") + .into_inner() + .into_inner() + .map_err(|err| Error::from(err.into_error()))? + .persist(&data_path)?; + index_file + .persist(&index_path) + .map_err(|err| { + progress.info(format!( + "pack file at {} is retained despite failing to move the index file into place. You can use plumbing to make it usable.", + data_path.display() + )); + err + })?; + WriteOutcome { + outcome, + data_path: Some(data_path), + index_path: Some(index_path), + keep_path: Some(keep_path), + } + } + None => WriteOutcome { + outcome: crate::index::File::write_data_iter_to_stream( + index_kind, + move || new_pack_file_resolver(data_file), + pack_entries_iter, + thread_limit, + indexing_progress, + io::sink(), + should_interrupt, + object_hash, + pack_version, + )?, + data_path: None, + index_path: None, + keep_path: None, + }, + }) + } +} + +fn new_pack_file_resolver( + data_file: SharedTempFile, +) -> io::Result<impl Fn(data::EntryRange, &mut Vec<u8>) -> Option<()> + Send + Clone> { + let mut guard = data_file.lock(); + guard.flush()?; + let mapped_file = Arc::new(crate::mmap::read_only( + &guard.get_mut().with_mut(|f| f.path().to_owned())?, + )?); + let pack_data_lookup = move |range: std::ops::Range<u64>, out: &mut Vec<u8>| -> Option<()> { + mapped_file + .get(range.start as usize..range.end as usize) + .map(|pack_entry| out.copy_from_slice(pack_entry)) + }; + Ok(pack_data_lookup) +} + +struct WriteOutcome { + outcome: crate::index::write::Outcome, + data_path: Option<PathBuf>, + index_path: Option<PathBuf>, + keep_path: Option<PathBuf>, +} diff --git a/vendor/gix-pack/src/bundle/write/types.rs b/vendor/gix-pack/src/bundle/write/types.rs new file mode 100644 index 000000000..56c14ac59 --- /dev/null +++ b/vendor/gix-pack/src/bundle/write/types.rs @@ -0,0 +1,120 @@ +use std::{hash::Hash, io, io::SeekFrom, path::PathBuf, sync::Arc}; + +use gix_tempfile::handle::Writable; + +/// Configuration for [write_to_directory][crate::Bundle::write_to_directory()] or +/// [write_to_directory_eagerly][crate::Bundle::write_to_directory_eagerly()] +#[derive(Debug, Clone)] +pub struct Options { + /// The amount of threads to use at most when resolving the pack. If `None`, all logical cores are used. + pub thread_limit: Option<usize>, + /// Determine how much processing to spend on protecting against corruption or recovering from errors. + pub iteration_mode: crate::data::input::Mode, + /// The version of pack index to write, should be [`crate::index::Version::default()`] + pub index_version: crate::index::Version, + /// The kind of hash to use when writing the bundle. 
+ pub object_hash: gix_hash::Kind, +} + +impl Default for Options { + /// Options which favor speed and correctness and write the most commonly supported index file. + fn default() -> Self { + Options { + thread_limit: None, + iteration_mode: crate::data::input::Mode::Verify, + index_version: Default::default(), + object_hash: Default::default(), + } + } +} + +/// Returned by [write_to_directory][crate::Bundle::write_to_directory()] or +/// [write_to_directory_eagerly][crate::Bundle::write_to_directory_eagerly()] +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Outcome { + /// The successful result of the index write operation + pub index: crate::index::write::Outcome, + /// The version of the pack + pub pack_version: crate::data::Version, + /// The kind of hash stored within the pack and indices + pub object_hash: gix_hash::Kind, + + /// The path to the pack index file + pub index_path: Option<PathBuf>, + /// The path to the pack data file + pub data_path: Option<PathBuf>, + /// The path to the `.keep` file to prevent collection of the newly written pack until refs are pointing to it. + /// + /// The file is created right before moving the pack data and index data into place (i.e. `data_path` and `index_path`) + /// and is expected to be removed by the caller when ready. + pub keep_path: Option<PathBuf>, +} + +impl Outcome { + /// Instantiate a bundle from the newly written index and data file that are represented by this `Outcome` + pub fn to_bundle(&self) -> Option<Result<crate::Bundle, crate::bundle::init::Error>> { + self.index_path + .as_ref() + .map(|path| crate::Bundle::at(path, self.object_hash)) + } +} + +pub(crate) type SharedTempFile = Arc<parking_lot::Mutex<std::io::BufWriter<gix_tempfile::Handle<Writable>>>>; + +pub(crate) struct PassThrough<R> { + pub reader: R, + pub writer: Option<SharedTempFile>, +} + +impl<R> io::Read for PassThrough<R> +where + R: io::Read, +{ + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + let bytes_read = self.reader.read(buf)?; + if let Some(writer) = self.writer.as_mut() { + use std::io::Write; + writer.lock().write_all(&buf[..bytes_read])?; + } + Ok(bytes_read) + } +} +impl<R> io::BufRead for PassThrough<R> +where + R: io::BufRead, +{ + fn fill_buf(&mut self) -> io::Result<&[u8]> { + self.reader.fill_buf() + } + + fn consume(&mut self, amt: usize) { + self.reader.consume(amt) + } +} + +pub(crate) struct LockWriter { + pub writer: SharedTempFile, +} + +impl io::Write for LockWriter { + fn write(&mut self, buf: &[u8]) -> io::Result<usize> { + self.writer.lock().write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + self.writer.lock().flush() + } +} + +impl io::Read for LockWriter { + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + self.writer.lock().get_mut().read(buf) + } +} + +impl io::Seek for LockWriter { + fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> { + self.writer.lock().seek(pos) + } +} diff --git a/vendor/gix-pack/src/cache/delta/from_offsets.rs b/vendor/gix-pack/src/cache/delta/from_offsets.rs new file mode 100644 index 000000000..8acb4a802 --- /dev/null +++ b/vendor/gix-pack/src/cache/delta/from_offsets.rs @@ -0,0 +1,161 @@ +use std::{ + convert::TryFrom, + fs, io, + io::{BufRead, Read, Seek, SeekFrom}, + sync::atomic::{AtomicBool, Ordering}, + time::Instant, +}; + +use gix_features::progress::{self, Progress}; + +use crate::{cache::delta::Tree, data}; + +/// Returned by 
[`Tree::from_offsets_in_pack()`] +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("{message}")] + Io { source: io::Error, message: &'static str }, + #[error(transparent)] + Header(#[from] crate::data::header::decode::Error), + #[error("Could find object with id {id} in this pack. Thin packs are not supported")] + UnresolvedRefDelta { id: gix_hash::ObjectId }, + #[error(transparent)] + Tree(#[from] crate::cache::delta::Error), + #[error("Interrupted")] + Interrupted, +} + +const PACK_HEADER_LEN: usize = 12; + +/// Generate tree from certain input +impl<T> Tree<T> { + /// Create a new `Tree` from any data sorted by offset, ascending as returned by the `data_sorted_by_offsets` iterator. + /// * `get_pack_offset(item: &T`) -> data::Offset` is a function returning the pack offset of the given item, which can be used + /// for obtaining the objects entry within the pack. + /// * `pack_path` is the path to the pack file itself and from which to read the entry data, which is a pack file matching the offsets + /// returned by `get_pack_offset(…)`. + /// * `progress` is used to track progress when creating the tree. + /// * `resolve_in_pack_id(gix_hash::oid) -> Option<data::Offset>` takes an object ID and tries to resolve it to an object within this pack if + /// possible. Failing to do so aborts the operation, and this function is not expected to be called in usual packs. It's a theoretical + /// possibility though as old packs might have referred to their objects using the 20 bytes hash, instead of their encoded offset from the base. + /// + /// Note that the sort order is ascending. The given pack file path must match the provided offsets. + pub fn from_offsets_in_pack( + pack_path: impl AsRef<std::path::Path>, + data_sorted_by_offsets: impl Iterator<Item = T>, + get_pack_offset: impl Fn(&T) -> data::Offset, + resolve_in_pack_id: impl Fn(&gix_hash::oid) -> Option<data::Offset>, + mut progress: impl Progress, + should_interrupt: &AtomicBool, + object_hash: gix_hash::Kind, + ) -> Result<Self, Error> { + let mut r = io::BufReader::with_capacity( + 8192 * 8, // this value directly corresponds to performance, 8k (default) is about 4x slower than 64k + fs::File::open(pack_path).map_err(|err| Error::Io { + source: err, + message: "open pack path", + })?, + ); + + let anticipated_num_objects = if let Some(num_objects) = data_sorted_by_offsets.size_hint().1 { + progress.init(Some(num_objects), progress::count("objects")); + num_objects + } else { + 0 + }; + let mut tree = Tree::with_capacity(anticipated_num_objects)?; + + { + // safety check - assure ourselves it's a pack we can handle + let mut buf = [0u8; PACK_HEADER_LEN]; + r.read_exact(&mut buf).map_err(|err| Error::Io { + source: err, + message: "reading header buffer with at least 12 bytes failed - pack file truncated?", + })?; + crate::data::header::decode(&buf)?; + } + + let then = Instant::now(); + + let mut previous_cursor_position = None::<u64>; + + let hash_len = object_hash.len_in_bytes(); + for (idx, data) in data_sorted_by_offsets.enumerate() { + let pack_offset = get_pack_offset(&data); + if let Some(previous_offset) = previous_cursor_position { + Self::advance_cursor_to_pack_offset(&mut r, pack_offset, previous_offset)?; + }; + let entry = crate::data::Entry::from_read(&mut r, pack_offset, hash_len).map_err(|err| Error::Io { + source: err, + message: "EOF while parsing header", + })?; + previous_cursor_position = Some(pack_offset + entry.header_size() as u64); + + use crate::data::entry::Header::*; + 
match entry.header { + Tree | Blob | Commit | Tag => { + tree.add_root(pack_offset, data)?; + } + RefDelta { base_id } => { + resolve_in_pack_id(base_id.as_ref()) + .ok_or(Error::UnresolvedRefDelta { id: base_id }) + .and_then(|base_pack_offset| { + tree.add_child(base_pack_offset, pack_offset, data).map_err(Into::into) + })?; + } + OfsDelta { base_distance } => { + let base_pack_offset = pack_offset + .checked_sub(base_distance) + .expect("in bound distance for deltas"); + tree.add_child(base_pack_offset, pack_offset, data)?; + } + }; + progress.inc(); + if idx % 10_000 == 0 && should_interrupt.load(Ordering::SeqCst) { + return Err(Error::Interrupted); + } + } + + progress.show_throughput(then); + Ok(tree) + } + + fn advance_cursor_to_pack_offset( + r: &mut io::BufReader<fs::File>, + pack_offset: u64, + previous_offset: u64, + ) -> Result<(), Error> { + let bytes_to_skip: u64 = pack_offset + .checked_sub(previous_offset) + .expect("continuously ascending pack offsets"); + if bytes_to_skip == 0 { + return Ok(()); + } + let buf = r.fill_buf().map_err(|err| Error::Io { + source: err, + message: "skip bytes", + })?; + if buf.is_empty() { + // This means we have reached the end of file and can't make progress anymore, before we have satisfied our need + // for more + return Err(Error::Io { + source: io::Error::new( + io::ErrorKind::UnexpectedEof, + "ran out of bytes before reading desired amount of bytes", + ), + message: "index file is damaged or corrupt", + }); + } + if bytes_to_skip <= u64::try_from(buf.len()).expect("sensible buffer size") { + // SAFETY: bytes_to_skip <= buf.len() <= usize::MAX + r.consume(bytes_to_skip as usize); + } else { + r.seek(SeekFrom::Start(pack_offset)).map_err(|err| Error::Io { + source: err, + message: "seek to next entry", + })?; + } + Ok(()) + } +} diff --git a/vendor/gix-pack/src/cache/delta/mod.rs b/vendor/gix-pack/src/cache/delta/mod.rs new file mode 100644 index 000000000..f4c1b6fc6 --- /dev/null +++ b/vendor/gix-pack/src/cache/delta/mod.rs @@ -0,0 +1,216 @@ +/// Returned when using various methods on a [`Tree`] +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("Pack offsets must only increment. The previous pack offset was {last_pack_offset}, the current one is {pack_offset}")] + InvariantIncreasingPackOffset { + /// The last seen pack offset + last_pack_offset: crate::data::Offset, + /// The invariant violating offset + pack_offset: crate::data::Offset, + }, +} + +/// +pub mod traverse; + +/// +pub mod from_offsets; + +/// An item stored within the [`Tree`] +pub struct Item<T> { + /// The offset into the pack file at which the pack entry's data is located. + pub offset: crate::data::Offset, + /// The offset of the next item in the pack file. + pub next_offset: crate::data::Offset, + /// Data to store with each Item, effectively data associated with each entry in a pack. + pub data: T, + /// Indices into our Tree's `items`, one for each pack entry that depends on us. + /// + /// Limited to u32 as that's the maximum amount of objects in a pack. + children: Vec<u32>, +} + +/// Identify what kind of node we have last seen +enum NodeKind { + Root, + Child, +} + +/// A tree that allows one-time iteration over all nodes and their children, consuming it in the process, +/// while being shareable among threads without a lock. +/// It does this by making the guarantee that iteration only happens once. +pub struct Tree<T> { + /// The root nodes, i.e. base objects + root_items: Vec<Item<T>>, + /// The child nodes, i.e. 
those that rely a base object, like ref and ofs delta objects + child_items: Vec<Item<T>>, + /// The last encountered node was either a root or a child. + last_seen: Option<NodeKind>, + /// Future child offsets, associating their offset into the pack with their index in the items array. + /// (parent_offset, child_index) + future_child_offsets: Vec<(crate::data::Offset, usize)>, +} + +impl<T> Tree<T> { + /// Instantiate a empty tree capable of storing `num_objects` amounts of items. + pub fn with_capacity(num_objects: usize) -> Result<Self, Error> { + Ok(Tree { + root_items: Vec::with_capacity(num_objects / 2), + child_items: Vec::with_capacity(num_objects / 2), + last_seen: None, + future_child_offsets: Vec::new(), + }) + } + + fn num_items(&self) -> usize { + self.root_items.len() + self.child_items.len() + } + + fn assert_is_incrementing_and_update_next_offset(&mut self, offset: crate::data::Offset) -> Result<(), Error> { + let items = match &self.last_seen { + Some(NodeKind::Root) => &mut self.root_items, + Some(NodeKind::Child) => &mut self.child_items, + None => return Ok(()), + }; + let item = &mut items.last_mut().expect("last seen won't lie"); + if offset <= item.offset { + return Err(Error::InvariantIncreasingPackOffset { + last_pack_offset: item.offset, + pack_offset: offset, + }); + } + item.next_offset = offset; + Ok(()) + } + + fn set_pack_entries_end_and_resolve_ref_offsets( + &mut self, + pack_entries_end: crate::data::Offset, + ) -> Result<(), traverse::Error> { + if !self.future_child_offsets.is_empty() { + for (parent_offset, child_index) in self.future_child_offsets.drain(..) { + if let Ok(i) = self.child_items.binary_search_by_key(&parent_offset, |i| i.offset) { + self.child_items[i].children.push(child_index as u32); + } else if let Ok(i) = self.root_items.binary_search_by_key(&parent_offset, |i| i.offset) { + self.root_items[i].children.push(child_index as u32); + } else { + return Err(traverse::Error::OutOfPackRefDelta { + base_pack_offset: parent_offset, + }); + } + } + } + + self.assert_is_incrementing_and_update_next_offset(pack_entries_end) + .expect("BUG: pack now is smaller than all previously seen entries"); + Ok(()) + } + + /// Add a new root node, one that only has children but is not a child itself, at the given pack `offset` and associate + /// custom `data` with it. + pub fn add_root(&mut self, offset: crate::data::Offset, data: T) -> Result<(), Error> { + self.assert_is_incrementing_and_update_next_offset(offset)?; + self.last_seen = NodeKind::Root.into(); + self.root_items.push(Item { + offset, + next_offset: 0, + data, + children: Default::default(), + }); + Ok(()) + } + + /// Add a child of the item at `base_offset` which itself resides at pack `offset` and associate custom `data` with it. 
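// A small sketch of building a tree by hand, assuming strictly increasing pack offsets as the
// insertion methods require; the offsets and the `()` payloads are made up for illustration.
fn tiny_tree() -> Result<Tree<()>, Error> {
    let mut tree = Tree::with_capacity(2)?;
    tree.add_root(12, ())?;      // a base object right after the 12-byte pack header
    tree.add_child(12, 40, ())?; // a delta at offset 40 whose base is the root above
    Ok(tree)
}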
+ pub fn add_child( + &mut self, + base_offset: crate::data::Offset, + offset: crate::data::Offset, + data: T, + ) -> Result<(), Error> { + self.assert_is_incrementing_and_update_next_offset(offset)?; + + let next_child_index = self.child_items.len(); + if let Ok(i) = self.child_items.binary_search_by_key(&base_offset, |i| i.offset) { + self.child_items[i].children.push(next_child_index as u32); + } else if let Ok(i) = self.root_items.binary_search_by_key(&base_offset, |i| i.offset) { + self.root_items[i].children.push(next_child_index as u32); + } else { + self.future_child_offsets.push((base_offset, next_child_index)); + } + + self.last_seen = NodeKind::Child.into(); + self.child_items.push(Item { + offset, + next_offset: 0, + data, + children: Default::default(), + }); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + mod tree { + mod from_offsets_in_pack { + use std::sync::atomic::AtomicBool; + + use crate as pack; + + const SMALL_PACK_INDEX: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx"; + const SMALL_PACK: &str = "objects/pack/pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.pack"; + + const INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx"; + const PACK_FOR_INDEX_V1: &str = "objects/pack/pack-c0438c19fb16422b6bbcce24387b3264416d485b.pack"; + + use gix_testtools::fixture_path; + + #[test] + fn v1() -> Result<(), Box<dyn std::error::Error>> { + tree(INDEX_V1, PACK_FOR_INDEX_V1) + } + + #[test] + fn v2() -> Result<(), Box<dyn std::error::Error>> { + tree(SMALL_PACK_INDEX, SMALL_PACK) + } + + fn tree(index_path: &str, pack_path: &str) -> Result<(), Box<dyn std::error::Error>> { + let idx = pack::index::File::at(fixture_path(index_path), gix_hash::Kind::Sha1)?; + crate::cache::delta::Tree::from_offsets_in_pack( + fixture_path(pack_path), + idx.sorted_offsets().into_iter(), + |ofs| *ofs, + |id| idx.lookup(id).map(|index| idx.pack_offset_at_index(index)), + gix_features::progress::Discard, + &AtomicBool::new(false), + gix_hash::Kind::Sha1, + )?; + Ok(()) + } + } + } + + #[test] + fn size_of_pack_tree_item() { + use super::Item; + assert_eq!(std::mem::size_of::<[Item<()>; 7_500_000]>(), 300_000_000); + } + + #[test] + fn size_of_pack_verify_data_structure() { + use super::Item; + pub struct EntryWithDefault { + _index_entry: crate::index::Entry, + _kind: gix_object::Kind, + _object_size: u64, + _decompressed_size: u64, + _compressed_size: u64, + _header_size: u16, + _level: u16, + } + + assert_eq!(std::mem::size_of::<[Item<EntryWithDefault>; 7_500_000]>(), 840_000_000); + } +} diff --git a/vendor/gix-pack/src/cache/delta/traverse/mod.rs b/vendor/gix-pack/src/cache/delta/traverse/mod.rs new file mode 100644 index 000000000..bfe2ec687 --- /dev/null +++ b/vendor/gix-pack/src/cache/delta/traverse/mod.rs @@ -0,0 +1,177 @@ +use std::sync::atomic::{AtomicBool, Ordering}; + +use gix_features::{ + parallel::in_parallel_with_slice, + progress::{self, Progress}, + threading::{lock, Mutable, OwnShared}, +}; + +use crate::{ + cache::delta::{traverse::util::ItemSliceSend, Item, Tree}, + data::EntryRange, +}; + +mod resolve; +pub(crate) mod util; + +/// Returned by [`Tree::traverse()`] +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("{message}")] + ZlibInflate { + source: gix_features::zlib::inflate::Error, + message: &'static str, + }, + #[error("The resolver failed to obtain the pack entry bytes for the entry at {pack_offset}")] + ResolveFailed { pack_offset: u64 }, + #[error("One of the object inspectors 
failed")] + Inspect(#[from] Box<dyn std::error::Error + Send + Sync>), + #[error("Interrupted")] + Interrupted, + #[error( + "The base at {base_pack_offset} was referred to by a ref-delta, but it was never added to the tree as if the pack was still thin." + )] + OutOfPackRefDelta { + /// The base's offset which was from a resolved ref-delta that didn't actually get added to the tree + base_pack_offset: crate::data::Offset, + }, +} + +/// Additional context passed to the `inspect_object(…)` function of the [`Tree::traverse()`] method. +pub struct Context<'a, S> { + /// The pack entry describing the object + pub entry: &'a crate::data::Entry, + /// The offset at which `entry` ends in the pack, useful to learn about the exact range of `entry` within the pack. + pub entry_end: u64, + /// The decompressed object itself, ready to be decoded. + pub decompressed: &'a [u8], + /// Custom state known to the function + pub state: &'a mut S, + /// The depth at which this object resides in the delta-tree. It represents the amount of base objects, with 0 indicating + /// an 'undeltified' object, and higher values indicating delta objects with the given amount of bases. + pub level: u16, +} + +/// Options for [`Tree::traverse()`]. +pub struct Options<'a, P1, P2> { + /// is a progress instance to track progress for each object in the traversal. + pub object_progress: P1, + /// is a progress instance to track the overall progress. + pub size_progress: P2, + /// If `Some`, only use the given amount of threads. Otherwise, the amount of threads to use will be selected based on + /// the amount of available logical cores. + pub thread_limit: Option<usize>, + /// Abort the operation if the value is `true`. + pub should_interrupt: &'a AtomicBool, + /// specifies what kind of hashes we expect to be stored in oid-delta entries, which is viable to decoding them + /// with the correct size. + pub object_hash: gix_hash::Kind, +} + +/// The outcome of [`Tree::traverse()`] +pub struct Outcome<T> { + /// The items that have no children in the pack, i.e. base objects. + pub roots: Vec<Item<T>>, + /// The items that children to a root object, i.e. delta objects. + pub children: Vec<Item<T>>, +} + +impl<T> Tree<T> +where + T: Send, +{ + /// Traverse this tree of delta objects with a function `inspect_object` to process each object at will. + /// + /// * `should_run_in_parallel() -> bool` returns true if the underlying pack is big enough to warrant parallel traversal at all. + /// * `resolve(EntrySlice, &mut Vec<u8>) -> Option<()>` resolves the bytes in the pack for the given `EntrySlice` and stores them in the + /// output vector. It returns `Some(())` if the object existed in the pack, or `None` to indicate a resolution error, which would abort the + /// operation as well. + /// * `pack_entries_end` marks one-past-the-last byte of the last entry in the pack, as the last entries size would otherwise + /// be unknown as it's not part of the index file. + /// * `new_thread_state() -> State` is a function to create state to be used in each thread, invoked once per thread. + /// * `inspect_object(node_data: &mut T, progress: Progress, context: Context<ThreadLocal State>) -> Result<(), CustomError>` is a function + /// running for each thread receiving fully decoded objects along with contextual information, which either succeeds with `Ok(())` + /// or returns a `CustomError`. + /// Note that `node_data` can be modified to allow storing maintaining computation results on a per-object basis. 
+ /// + /// This method returns a vector of all tree items, along with their potentially modified custom node data. + /// + /// _Note_ that this method consumed the Tree to assure safe parallel traversal with mutation support. + pub fn traverse<F, P1, P2, MBFN, S, E>( + mut self, + resolve: F, + pack_entries_end: u64, + new_thread_state: impl Fn() -> S + Send + Clone, + inspect_object: MBFN, + Options { + thread_limit, + object_progress, + mut size_progress, + should_interrupt, + object_hash, + }: Options<'_, P1, P2>, + ) -> Result<Outcome<T>, Error> + where + F: for<'r> Fn(EntryRange, &'r mut Vec<u8>) -> Option<()> + Send + Clone, + P1: Progress, + P2: Progress, + MBFN: Fn(&mut T, &mut <P1 as Progress>::SubProgress, Context<'_, S>) -> Result<(), E> + Send + Clone, + E: std::error::Error + Send + Sync + 'static, + { + self.set_pack_entries_end_and_resolve_ref_offsets(pack_entries_end)?; + let object_progress = OwnShared::new(Mutable::new(object_progress)); + + let num_objects = self.num_items(); + let object_counter = { + let mut progress = lock(&object_progress); + progress.init(Some(num_objects), progress::count("objects")); + progress.counter() + }; + size_progress.init(None, progress::bytes()); + let size_counter = size_progress.counter(); + let child_items = self.child_items.as_mut_slice(); + + let start = std::time::Instant::now(); + in_parallel_with_slice( + &mut self.root_items, + thread_limit, + { + let object_progress = object_progress.clone(); + let child_items = ItemSliceSend(child_items as *mut [Item<T>]); + move |thread_index| { + ( + Vec::<u8>::with_capacity(4096), + lock(&object_progress) + .add_child_with_id(format!("thread {thread_index}"), gix_features::progress::UNKNOWN), + new_thread_state(), + resolve.clone(), + inspect_object.clone(), + ItemSliceSend(child_items.0), + ) + } + }, + { + move |node, state| { + resolve::deltas( + object_counter.clone(), + size_counter.clone(), + node, + state, + object_hash.len_in_bytes(), + ) + } + }, + || (!should_interrupt.load(Ordering::Relaxed)).then(|| std::time::Duration::from_millis(50)), + |_| (), + )?; + + lock(&object_progress).show_throughput(start); + size_progress.show_throughput(start); + + Ok(Outcome { + roots: self.root_items, + children: self.child_items, + }) + } +} diff --git a/vendor/gix-pack/src/cache/delta/traverse/resolve.rs b/vendor/gix-pack/src/cache/delta/traverse/resolve.rs new file mode 100644 index 000000000..fc94d87ef --- /dev/null +++ b/vendor/gix-pack/src/cache/delta/traverse/resolve.rs @@ -0,0 +1,154 @@ +use std::{cell::RefCell, collections::BTreeMap, sync::atomic::Ordering}; + +use gix_features::{progress::Progress, zlib}; + +use crate::{ + cache::delta::{ + traverse::{ + util::{ItemSliceSend, Node}, + Context, Error, + }, + Item, + }, + data::EntryRange, +}; + +pub(crate) fn deltas<T, F, P, MBFN, S, E>( + object_counter: Option<gix_features::progress::StepShared>, + size_counter: Option<gix_features::progress::StepShared>, + node: &mut crate::cache::delta::Item<T>, + (bytes_buf, ref mut progress, state, resolve, modify_base, child_items): &mut ( + Vec<u8>, + P, + S, + F, + MBFN, + ItemSliceSend<Item<T>>, + ), + hash_len: usize, +) -> Result<(), Error> +where + T: Send, + F: for<'r> Fn(EntryRange, &'r mut Vec<u8>) -> Option<()>, + P: Progress, + MBFN: Fn(&mut T, &mut P, Context<'_, S>) -> Result<(), E>, + E: std::error::Error + Send + Sync + 'static, +{ + let mut decompressed_bytes_by_pack_offset = BTreeMap::new(); + let bytes_buf = RefCell::new(bytes_buf); + let decompress_from_resolver = |slice: 
EntryRange| -> Result<(crate::data::Entry, u64, Vec<u8>), Error> { + let mut bytes_buf = bytes_buf.borrow_mut(); + bytes_buf.resize((slice.end - slice.start) as usize, 0); + resolve(slice.clone(), &mut bytes_buf).ok_or(Error::ResolveFailed { + pack_offset: slice.start, + })?; + let entry = crate::data::Entry::from_bytes(&bytes_buf, slice.start, hash_len); + let compressed = &bytes_buf[entry.header_size()..]; + let decompressed_len = entry.decompressed_size as usize; + Ok((entry, slice.end, decompress_all_at_once(compressed, decompressed_len)?)) + }; + + // Traverse the tree breadth first and loose the data produced for the base as it won't be needed anymore. + progress.init(None, gix_features::progress::count_with_decimals("objects", 2)); + + // each node is a base, and its children always start out as deltas which become a base after applying them. + // These will be pushed onto our stack until all are processed + let root_level = 0; + let mut nodes: Vec<_> = vec![( + root_level, + Node { + item: node, + child_items: child_items.0, + }, + )]; + while let Some((level, mut base)) = nodes.pop() { + let (base_entry, entry_end, base_bytes) = if level == root_level { + decompress_from_resolver(base.entry_slice())? + } else { + decompressed_bytes_by_pack_offset + .remove(&base.offset()) + .expect("we store the resolved delta buffer when done") + }; + + // anything done here must be repeated further down for leaf-nodes. + // This way we avoid retaining their decompressed memory longer than needed (they have no children, + // thus their memory can be released right away, using 18% less peak memory on the linux kernel). + { + modify_base( + base.data(), + progress, + Context { + entry: &base_entry, + entry_end, + decompressed: &base_bytes, + state, + level, + }, + ) + .map_err(|err| Box::new(err) as Box<dyn std::error::Error + Send + Sync>)?; + object_counter.as_ref().map(|c| c.fetch_add(1, Ordering::SeqCst)); + size_counter + .as_ref() + .map(|c| c.fetch_add(base_bytes.len(), Ordering::SeqCst)); + } + + for mut child in base.into_child_iter() { + let (mut child_entry, entry_end, delta_bytes) = decompress_from_resolver(child.entry_slice())?; + let (base_size, consumed) = crate::data::delta::decode_header_size(&delta_bytes); + let mut header_ofs = consumed; + assert_eq!( + base_bytes.len(), + base_size as usize, + "recorded base size in delta does not match" + ); + let (result_size, consumed) = crate::data::delta::decode_header_size(&delta_bytes[consumed..]); + header_ofs += consumed; + + let mut fully_resolved_delta_bytes = bytes_buf.borrow_mut(); + fully_resolved_delta_bytes.resize(result_size as usize, 0); + crate::data::delta::apply(&base_bytes, &mut fully_resolved_delta_bytes, &delta_bytes[header_ofs..]); + + // FIXME: this actually invalidates the "pack_offset()" computation, which is not obvious to consumers + // at all + child_entry.header = base_entry.header; // assign the actual object type, instead of 'delta' + if child.has_children() { + decompressed_bytes_by_pack_offset.insert( + child.offset(), + (child_entry, entry_end, fully_resolved_delta_bytes.to_owned()), + ); + nodes.push((level + 1, child)); + } else { + modify_base( + child.data(), + progress, + Context { + entry: &child_entry, + entry_end, + decompressed: &fully_resolved_delta_bytes, + state, + level: level + 1, + }, + ) + .map_err(|err| Box::new(err) as Box<dyn std::error::Error + Send + Sync>)?; + object_counter.as_ref().map(|c| c.fetch_add(1, Ordering::SeqCst)); + size_counter + .as_ref() + .map(|c| 
c.fetch_add(base_bytes.len(), Ordering::SeqCst)); + } + } + } + + Ok(()) +} + +fn decompress_all_at_once(b: &[u8], decompressed_len: usize) -> Result<Vec<u8>, Error> { + let mut out = Vec::new(); + out.resize(decompressed_len, 0); + zlib::Inflate::default() + .once(b, &mut out) + .map_err(|err| Error::ZlibInflate { + source: err, + message: "Failed to decompress entry", + })?; + Ok(out) +} diff --git a/vendor/gix-pack/src/cache/delta/traverse/util.rs b/vendor/gix-pack/src/cache/delta/traverse/util.rs new file mode 100644 index 000000000..e7caf2ff5 --- /dev/null +++ b/vendor/gix-pack/src/cache/delta/traverse/util.rs @@ -0,0 +1,63 @@ +use crate::cache::delta::Item; + +pub struct ItemSliceSend<T>(pub *mut [T]) +where + T: Send; + +impl<T> Clone for ItemSliceSend<T> +where + T: Send, +{ + fn clone(&self) -> Self { + ItemSliceSend(self.0) + } +} + +// SAFETY: T is `Send`, and we only ever access one T at a time. And, ptrs need that assurance, I wonder if it's always right. +#[allow(unsafe_code)] +unsafe impl<T> Send for ItemSliceSend<T> where T: Send {} + +/// An item returned by `iter_root_chunks`, allowing access to the `data` stored alongside nodes in a [`Tree`]. +pub struct Node<'a, T> { + pub item: &'a mut Item<T>, + pub child_items: *mut [Item<T>], +} + +impl<'a, T> Node<'a, T> { + /// Returns the offset into the pack at which the `Node`s data is located. + pub fn offset(&self) -> u64 { + self.item.offset + } + + /// Returns the slice into the data pack at which the pack entry is located. + pub fn entry_slice(&self) -> crate::data::EntryRange { + self.item.offset..self.item.next_offset + } + + /// Returns the node data associated with this node. + pub fn data(&mut self) -> &mut T { + &mut self.item.data + } + + /// Returns true if this node has children, e.g. is not a leaf in the tree. + pub fn has_children(&self) -> bool { + !self.item.children.is_empty() + } + + /// Transform this `Node` into an iterator over its children. + /// + /// Children are `Node`s referring to pack entries whose base object is this pack entry. + pub fn into_child_iter(self) -> impl Iterator<Item = Node<'a, T>> + 'a { + let children = self.child_items; + self.item.children.iter().map(move |&index| { + // SAFETY: The children array is alive by the 'a lifetime. + // SAFETY: The index is a valid index into the children array. + // SAFETY: The resulting mutable pointer cannot be yielded by any other node. + #[allow(unsafe_code)] + Node { + item: unsafe { &mut *(children as *mut Item<T>).add(index as usize) }, + child_items: children, + } + }) + } +} diff --git a/vendor/gix-pack/src/cache/lru.rs b/vendor/gix-pack/src/cache/lru.rs new file mode 100644 index 000000000..bba4f5d33 --- /dev/null +++ b/vendor/gix-pack/src/cache/lru.rs @@ -0,0 +1,165 @@ +use super::DecodeEntry; + +#[cfg(feature = "pack-cache-lru-dynamic")] +mod memory { + use std::num::NonZeroUsize; + + use clru::WeightScale; + + use super::DecodeEntry; + + struct Entry { + data: Vec<u8>, + kind: gix_object::Kind, + compressed_size: usize, + } + + type Key = (u32, u64); + struct CustomScale; + + impl WeightScale<Key, Entry> for CustomScale { + fn weight(&self, _key: &Key, value: &Entry) -> usize { + value.data.len() + } + } + + /// An LRU cache with hash map backing and an eviction rule based on the memory usage for object data in bytes. 
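// A minimal sketch, assuming the `pack-cache-lru-dynamic` feature is enabled: entries are
// weighed by their decompressed size, so the cap below bounds retained object data at 64 MiB.
// The pack id, offset and compressed size are illustrative values that are merely stored.
fn cache_roundtrip(data: &[u8]) -> Option<(gix_object::Kind, usize)> {
    let mut cache = MemoryCappedHashmap::new(64 * 1024 * 1024);
    cache.put(1, 4096, data, gix_object::Kind::Blob, data.len() / 2);
    let mut out = Vec::new();
    cache.get(1, 4096, &mut out)
}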
+ pub struct MemoryCappedHashmap { + inner: clru::CLruCache<Key, Entry, std::collections::hash_map::RandomState, CustomScale>, + free_list: Vec<Vec<u8>>, + debug: gix_features::cache::Debug, + } + + impl MemoryCappedHashmap { + /// Return a new instance which evicts least recently used items if it uses more than `memory_cap_in_bytes` + /// object data. + pub fn new(memory_cap_in_bytes: usize) -> MemoryCappedHashmap { + MemoryCappedHashmap { + inner: clru::CLruCache::with_config( + clru::CLruCacheConfig::new(NonZeroUsize::new(memory_cap_in_bytes).expect("non zero")) + .with_scale(CustomScale), + ), + free_list: Vec::new(), + debug: gix_features::cache::Debug::new(format!("MemoryCappedHashmap({memory_cap_in_bytes}B)")), + } + } + } + + impl DecodeEntry for MemoryCappedHashmap { + fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: gix_object::Kind, compressed_size: usize) { + self.debug.put(); + if let Ok(Some(previous_entry)) = self.inner.put_with_weight( + (pack_id, offset), + Entry { + data: self + .free_list + .pop() + .map(|mut v| { + v.clear(); + v.resize(data.len(), 0); + v.copy_from_slice(data); + v + }) + .unwrap_or_else(|| Vec::from(data)), + kind, + compressed_size, + }, + ) { + self.free_list.push(previous_entry.data) + } + } + + fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec<u8>) -> Option<(gix_object::Kind, usize)> { + let res = self.inner.get(&(pack_id, offset)).map(|e| { + out.resize(e.data.len(), 0); + out.copy_from_slice(&e.data); + (e.kind, e.compressed_size) + }); + if res.is_some() { + self.debug.hit() + } else { + self.debug.miss() + } + res + } + } +} + +#[cfg(feature = "pack-cache-lru-dynamic")] +pub use memory::MemoryCappedHashmap; + +#[cfg(feature = "pack-cache-lru-static")] +mod _static { + use super::DecodeEntry; + struct Entry { + pack_id: u32, + offset: u64, + data: Vec<u8>, + kind: gix_object::Kind, + compressed_size: usize, + } + + /// A cache using a least-recently-used implementation capable of storing the `SIZE` most recent objects. + /// The cache must be small as the search is 'naive' and the underlying data structure is a linked list. + /// Values of 64 seem to improve performance. 
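// A minimal sketch, assuming the `pack-cache-lru-static` feature is enabled: a small
// fixed-size cache handed to `Bundle::find()` so repeatedly resolved delta bases are reused.
fn find_with_small_cache<'a>(
    bundle: &crate::Bundle,
    id: &gix_hash::oid,
    buf: &'a mut Vec<u8>,
) -> Result<Option<(gix_object::Data<'a>, crate::data::entry::Location)>, crate::data::decode::Error> {
    let mut cache = StaticLinkedList::<64>::default();
    bundle.find(id, buf, &mut cache)
}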
+ pub struct StaticLinkedList<const SIZE: usize> { + inner: uluru::LRUCache<Entry, SIZE>, + free_list: Vec<Vec<u8>>, + debug: gix_features::cache::Debug, + } + + impl<const SIZE: usize> Default for StaticLinkedList<SIZE> { + fn default() -> Self { + StaticLinkedList { + inner: Default::default(), + free_list: Vec::new(), + debug: gix_features::cache::Debug::new(format!("StaticLinkedList<{SIZE}>")), + } + } + } + + impl<const SIZE: usize> DecodeEntry for StaticLinkedList<SIZE> { + fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: gix_object::Kind, compressed_size: usize) { + self.debug.put(); + if let Some(previous) = self.inner.insert(Entry { + offset, + pack_id, + data: self + .free_list + .pop() + .map(|mut v| { + v.clear(); + v.resize(data.len(), 0); + v.copy_from_slice(data); + v + }) + .unwrap_or_else(|| Vec::from(data)), + kind, + compressed_size, + }) { + self.free_list.push(previous.data) + } + } + + fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec<u8>) -> Option<(gix_object::Kind, usize)> { + let res = self.inner.lookup(|e: &mut Entry| { + if e.pack_id == pack_id && e.offset == offset { + out.resize(e.data.len(), 0); + out.copy_from_slice(&e.data); + Some((e.kind, e.compressed_size)) + } else { + None + } + }); + if res.is_some() { + self.debug.hit() + } else { + self.debug.miss() + } + res + } + } +} + +#[cfg(feature = "pack-cache-lru-static")] +pub use _static::StaticLinkedList; diff --git a/vendor/gix-pack/src/cache/mod.rs b/vendor/gix-pack/src/cache/mod.rs new file mode 100644 index 000000000..cf4b94df8 --- /dev/null +++ b/vendor/gix-pack/src/cache/mod.rs @@ -0,0 +1,55 @@ +use std::ops::DerefMut; + +use gix_object::Kind; + +/// A trait to model putting objects at a given pack `offset` into a cache, and fetching them. +/// +/// It is used to speed up [pack traversals][crate::index::File::traverse()]. +pub trait DecodeEntry { + /// Store a fully decoded object at `offset` of `kind` with `compressed_size` and `data` in the cache. + /// + /// It is up to the cache implementation whether that actually happens or not. + fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: gix_object::Kind, compressed_size: usize); + /// Attempt to fetch the object at `offset` and store its decoded bytes in `out`, as previously stored with [`DecodeEntry::put()`], and return + /// its (object `kind`, `decompressed_size`) + fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec<u8>) -> Option<(gix_object::Kind, usize)>; +} + +/// A cache that stores nothing and retrieves nothing, thus it _never_ caches. +#[derive(Default)] +pub struct Never; + +impl DecodeEntry for Never { + fn put(&mut self, _pack_id: u32, _offset: u64, _data: &[u8], _kind: gix_object::Kind, _compressed_size: usize) {} + fn get(&mut self, _pack_id: u32, _offset: u64, _out: &mut Vec<u8>) -> Option<(gix_object::Kind, usize)> { + None + } +} + +impl<T: DecodeEntry + ?Sized> DecodeEntry for Box<T> { + fn put(&mut self, pack_id: u32, offset: u64, data: &[u8], kind: Kind, compressed_size: usize) { + self.deref_mut().put(pack_id, offset, data, kind, compressed_size) + } + + fn get(&mut self, pack_id: u32, offset: u64, out: &mut Vec<u8>) -> Option<(Kind, usize)> { + self.deref_mut().get(pack_id, offset, out) + } +} + +/// A way of storing and retrieving entire objects to and from a cache. +pub trait Object { + /// Put the object going by `id` of `kind` with `data` into the cache. 
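// A small sketch of code written against this trait, assuming any `Object` implementation:
// serve from the cache when possible, otherwise decode elsewhere and remember the result.
fn get_or_insert(
    cache: &mut impl Object,
    id: gix_hash::ObjectId,
    decode: impl FnOnce() -> (gix_object::Kind, Vec<u8>),
    out: &mut Vec<u8>,
) -> gix_object::Kind {
    if let Some(kind) = cache.get(&id, out) {
        return kind;
    }
    let (kind, data) = decode();
    cache.put(id, kind, &data);
    out.clear();
    out.extend_from_slice(&data);
    kind
}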
+ fn put(&mut self, id: gix_hash::ObjectId, kind: gix_object::Kind, data: &[u8]); + + /// Try to retrieve the object named `id` and place its data into `out` if available and return `Some(kind)` if found. + fn get(&mut self, id: &gix_hash::ObjectId, out: &mut Vec<u8>) -> Option<gix_object::Kind>; +} + +/// Various implementations of [`DecodeEntry`] using least-recently-used algorithms. +#[cfg(any(feature = "pack-cache-lru-dynamic", feature = "pack-cache-lru-static"))] +pub mod lru; + +pub mod object; + +/// +pub(crate) mod delta; diff --git a/vendor/gix-pack/src/cache/object.rs b/vendor/gix-pack/src/cache/object.rs new file mode 100644 index 000000000..e64f47a8c --- /dev/null +++ b/vendor/gix-pack/src/cache/object.rs @@ -0,0 +1,123 @@ +//! # Note +//! +//! This module is a bit 'misplaced' if spelled out like 'gix_pack::cache::object::*' but is best placed here for code re-use and +//! general usefulness. +use crate::cache; + +#[cfg(feature = "object-cache-dynamic")] +mod memory { + use std::num::NonZeroUsize; + + use clru::WeightScale; + + use crate::cache; + + struct Entry { + data: Vec<u8>, + kind: gix_object::Kind, + } + + type Key = gix_hash::ObjectId; + + struct CustomScale; + + impl WeightScale<Key, Entry> for CustomScale { + fn weight(&self, key: &Key, value: &Entry) -> usize { + value.data.len() + std::mem::size_of::<Entry>() + key.as_bytes().len() + } + } + + /// An LRU cache with hash map backing and an eviction rule based on the memory usage for object data in bytes. + pub struct MemoryCappedHashmap { + inner: clru::CLruCache<Key, Entry, gix_hashtable::hash::Builder, CustomScale>, + free_list: Vec<Vec<u8>>, + debug: gix_features::cache::Debug, + } + + impl MemoryCappedHashmap { + /// The amount of bytes we can hold in total, or the value we saw in `new(…)`. + pub fn capacity(&self) -> usize { + self.inner.capacity() + } + /// Return a new instance which evicts least recently used items if it uses more than `memory_cap_in_bytes` + /// object data. + pub fn new(memory_cap_in_bytes: usize) -> MemoryCappedHashmap { + MemoryCappedHashmap { + inner: clru::CLruCache::with_config( + clru::CLruCacheConfig::new(NonZeroUsize::new(memory_cap_in_bytes).expect("non zero")) + .with_hasher(gix_hashtable::hash::Builder::default()) + .with_scale(CustomScale), + ), + free_list: Vec::new(), + debug: gix_features::cache::Debug::new(format!("MemoryCappedObjectHashmap({memory_cap_in_bytes}B)")), + } + } + } + + impl cache::Object for MemoryCappedHashmap { + /// Put the object going by `id` of `kind` with `data` into the cache. + fn put(&mut self, id: gix_hash::ObjectId, kind: gix_object::Kind, data: &[u8]) { + self.debug.put(); + if let Ok(Some(previous_entry)) = self.inner.put_with_weight( + id, + Entry { + data: self + .free_list + .pop() + .map(|mut v| { + v.clear(); + v.resize(data.len(), 0); + v.copy_from_slice(data); + v + }) + .unwrap_or_else(|| Vec::from(data)), + kind, + }, + ) { + self.free_list.push(previous_entry.data) + } + } + + /// Try to retrieve the object named `id` and place its data into `out` if available and return `Some(kind)` if found. + fn get(&mut self, id: &gix_hash::ObjectId, out: &mut Vec<u8>) -> Option<gix_object::Kind> { + let res = self.inner.get(id).map(|e| { + out.resize(e.data.len(), 0); + out.copy_from_slice(&e.data); + e.kind + }); + if res.is_some() { + self.debug.hit() + } else { + self.debug.miss() + } + res + } + } +} +#[cfg(feature = "object-cache-dynamic")] +pub use memory::MemoryCappedHashmap; + +/// A cache implementation that doesn't do any caching. 
+pub struct Never; + +impl cache::Object for Never { + /// Noop + fn put(&mut self, _id: gix_hash::ObjectId, _kind: gix_object::Kind, _data: &[u8]) {} + + /// Noop + fn get(&mut self, _id: &gix_hash::ObjectId, _out: &mut Vec<u8>) -> Option<gix_object::Kind> { + None + } +} + +impl<T: cache::Object + ?Sized> cache::Object for Box<T> { + fn put(&mut self, id: gix_hash::ObjectId, kind: gix_object::Kind, data: &[u8]) { + use std::ops::DerefMut; + self.deref_mut().put(id, kind, data) + } + + fn get(&mut self, id: &gix_hash::ObjectId, out: &mut Vec<u8>) -> Option<gix_object::Kind> { + use std::ops::DerefMut; + self.deref_mut().get(id, out) + } +} diff --git a/vendor/gix-pack/src/data/delta.rs b/vendor/gix-pack/src/data/delta.rs new file mode 100644 index 000000000..a898e4aaf --- /dev/null +++ b/vendor/gix-pack/src/data/delta.rs @@ -0,0 +1,70 @@ +/// Given the decompressed pack delta `d`, decode a size in bytes (either the base object size or the result object size) +/// Equivalent to [this canonical git function](https://github.com/git/git/blob/311531c9de557d25ac087c1637818bd2aad6eb3a/delta.h#L89) +pub fn decode_header_size(d: &[u8]) -> (u64, usize) { + let mut i = 0; + let mut size = 0u64; + let mut consumed = 0; + for cmd in d.iter() { + consumed += 1; + size |= (*cmd as u64 & 0x7f) << i; + i += 7; + if *cmd & 0x80 == 0 { + break; + } + } + (size, consumed) +} + +pub fn apply(base: &[u8], mut target: &mut [u8], data: &[u8]) { + let mut i = 0; + while let Some(cmd) = data.get(i) { + i += 1; + match cmd { + cmd if cmd & 0b1000_0000 != 0 => { + let (mut ofs, mut size): (u32, u32) = (0, 0); + if cmd & 0b0000_0001 != 0 { + ofs = data[i] as u32; + i += 1; + } + if cmd & 0b0000_0010 != 0 { + ofs |= (data[i] as u32) << 8; + i += 1; + } + if cmd & 0b0000_0100 != 0 { + ofs |= (data[i] as u32) << 16; + i += 1; + } + if cmd & 0b0000_1000 != 0 { + ofs |= (data[i] as u32) << 24; + i += 1; + } + if cmd & 0b0001_0000 != 0 { + size = data[i] as u32; + i += 1; + } + if cmd & 0b0010_0000 != 0 { + size |= (data[i] as u32) << 8; + i += 1; + } + if cmd & 0b0100_0000 != 0 { + size |= (data[i] as u32) << 16; + i += 1; + } + if size == 0 { + size = 0x10000; // 65536 + } + let ofs = ofs as usize; + std::io::Write::write(&mut target, &base[ofs..ofs + size as usize]) + .expect("delta copy from base: byte slices must match"); + } + 0 => panic!("encountered unsupported command code: 0"), + size => { + std::io::Write::write(&mut target, &data[i..i + *size as usize]) + .expect("delta copy data: slice sizes to match up"); + i += *size as usize; + } + } + } + assert_eq!(i, data.len()); + assert_eq!(target.len(), 0); +} diff --git a/vendor/gix-pack/src/data/entry/decode.rs b/vendor/gix-pack/src/data/entry/decode.rs new file mode 100644 index 000000000..79d7aecff --- /dev/null +++ b/vendor/gix-pack/src/data/entry/decode.rs @@ -0,0 +1,125 @@ +use std::io; + +use gix_features::decode::{leb64, leb64_from_read}; + +use super::{BLOB, COMMIT, OFS_DELTA, REF_DELTA, TAG, TREE}; +use crate::data; + +/// Decoding +impl data::Entry { + /// Decode an entry from the given entry data `d`, providing the `pack_offset` to allow tracking the start of the entry data section. + /// + /// # Panics + /// + /// If we cannot understand the header, garbage data is likely to trigger this. 
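To make the copy and insert commands handled by data/delta.rs above concrete, here is a sketch of a test that could sit next to that module (it relies on module-internal access to `decode_header_size` and `apply`; the byte values are hand-assembled for illustration):

    #[test]
    fn apply_copy_then_insert() {
        // Base object and a hand-rolled delta: base size 11, result size 10,
        // one copy command (5 bytes from offset 0) and one insert command (5 literal bytes).
        let base = b"hello world";
        let delta = [
            11u8, 10, // varint-encoded base size and result size
            0b1001_0000, 5, // copy: only the first size bit set, so size = 5 and offset = 0
            5, b',', b' ', b'g', b'i', b'x', // insert the next 5 bytes verbatim
        ];

        let (base_size, consumed) = decode_header_size(&delta);
        assert_eq!((base_size, consumed), (11, 1));
        let (result_size, also_consumed) = decode_header_size(&delta[consumed..]);
        assert_eq!((result_size, also_consumed), (10, 1));

        let mut out = vec![0u8; result_size as usize];
        apply(base, &mut out, &delta[consumed + also_consumed..]);
        assert_eq!(out, b"hello, gix".to_vec());
    }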
+ pub fn from_bytes(d: &[u8], pack_offset: data::Offset, hash_len: usize) -> data::Entry { + let (type_id, size, mut consumed) = parse_header_info(d); + + use crate::data::entry::Header::*; + let object = match type_id { + OFS_DELTA => { + let (distance, leb_bytes) = leb64(&d[consumed..]); + let delta = OfsDelta { + base_distance: distance, + }; + consumed += leb_bytes; + delta + } + REF_DELTA => { + let delta = RefDelta { + base_id: gix_hash::ObjectId::from(&d[consumed..][..hash_len]), + }; + consumed += hash_len; + delta + } + BLOB => Blob, + TREE => Tree, + COMMIT => Commit, + TAG => Tag, + _ => panic!("We currently don't support any V3 features or extensions"), + }; + data::Entry { + header: object, + decompressed_size: size, + data_offset: pack_offset + consumed as u64, + } + } + + /// Instantiate an `Entry` from the reader `r`, providing the `pack_offset` to allow tracking the start of the entry data section. + pub fn from_read( + mut r: impl io::Read, + pack_offset: data::Offset, + hash_len: usize, + ) -> Result<data::Entry, io::Error> { + let (type_id, size, mut consumed) = streaming_parse_header_info(&mut r)?; + + use crate::data::entry::Header::*; + let object = match type_id { + OFS_DELTA => { + let (distance, leb_bytes) = leb64_from_read(&mut r)?; + let delta = OfsDelta { + base_distance: distance, + }; + consumed += leb_bytes; + delta + } + REF_DELTA => { + let mut buf = gix_hash::Kind::buf(); + let hash = &mut buf[..hash_len]; + r.read_exact(hash)?; + #[allow(clippy::redundant_slicing)] + let delta = RefDelta { + base_id: gix_hash::ObjectId::from(&hash[..]), + }; + consumed += hash_len; + delta + } + BLOB => Blob, + TREE => Tree, + COMMIT => Commit, + TAG => Tag, + _ => panic!("We currently don't support any V3 features or extensions"), + }; + Ok(data::Entry { + header: object, + decompressed_size: size, + data_offset: pack_offset + consumed as u64, + }) + } +} + +#[inline] +fn streaming_parse_header_info(mut read: impl io::Read) -> Result<(u8, u64, usize), io::Error> { + let mut byte = [0u8; 1]; + read.read_exact(&mut byte)?; + let mut c = byte[0]; + let mut i = 1; + let type_id = (c >> 4) & 0b0000_0111; + let mut size = c as u64 & 0b0000_1111; + let mut s = 4; + while c & 0b1000_0000 != 0 { + read.read_exact(&mut byte)?; + c = byte[0]; + i += 1; + size += ((c & 0b0111_1111) as u64) << s; + s += 7 + } + Ok((type_id, size, i)) +} + +/// Parses the header of a pack-entry, yielding object type id, decompressed object size, and consumed bytes +#[inline] +fn parse_header_info(data: &[u8]) -> (u8, u64, usize) { + let mut c = data[0]; + let mut i = 1; + let type_id = (c >> 4) & 0b0000_0111; + let mut size = c as u64 & 0b0000_1111; + let mut s = 4; + while c & 0b1000_0000 != 0 { + c = data[i]; + i += 1; + size += ((c & 0b0111_1111) as u64) << s; + s += 7 + } + (type_id, size, i) +} diff --git a/vendor/gix-pack/src/data/entry/header.rs b/vendor/gix-pack/src/data/entry/header.rs new file mode 100644 index 000000000..83983eab0 --- /dev/null +++ b/vendor/gix-pack/src/data/entry/header.rs @@ -0,0 +1,150 @@ +use std::io; + +use super::{BLOB, COMMIT, OFS_DELTA, REF_DELTA, TAG, TREE}; +use crate::data; + +/// The header portion of a pack data entry, identifying the kind of stored object. 
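As a quick sanity check of the bit layout parsed above (type id in bits 4..=6 of the first byte, four size bits, then 7-bit continuation groups), a hypothetical test could decode a blob header announcing 300 decompressed bytes; the module paths assume the crate's usual re-exports:

    #[test]
    fn parse_blob_header_of_size_300() {
        // 300 = 0b1_0010_1100: the low nibble 0b1100 lives in the first byte,
        // the remaining 0b1_0010 (18) in the continuation byte.
        let bytes = [0b1011_1100u8, 0b0001_0010]; // MSB set, type BLOB (3), size bits
        let entry = gix_pack::data::Entry::from_bytes(&bytes, 0, gix_hash::Kind::Sha1.len_in_bytes());
        assert_eq!(entry.header, gix_pack::data::entry::Header::Blob);
        assert_eq!(entry.decompressed_size, 300);
        assert_eq!(entry.data_offset, 2, "the header occupied two bytes");
    }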
+#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +#[allow(missing_docs)] +pub enum Header { + /// The object is a commit + Commit, + /// The object is a tree + Tree, + /// The object is a blob + Blob, + /// The object is a tag + Tag, + /// Describes a delta-object which needs to be applied to a base. The base object is identified by the `base_id` field + /// which is found within the parent repository. + /// Most commonly used for **thin-packs** when receiving pack files from the server to refer to objects that are not + /// part of the pack but expected to be present in the receivers repository. + /// + /// # Note + /// This could also be an object within this pack if the LSB encoded offset would be larger than 20 bytes, which is unlikely to + /// happen. + /// + /// **The naming** is exactly the same as the canonical implementation uses, namely **REF_DELTA**. + RefDelta { base_id: gix_hash::ObjectId }, + /// Describes a delta-object present in this pack which acts as base for this object. + /// The base object is measured as a distance from this objects + /// pack offset, so that `base_pack_offset = this_objects_pack_offset - base_distance` + /// + /// # Note + /// + /// **The naming** is exactly the same as the canonical implementation uses, namely **OFS_DELTA**. + OfsDelta { base_distance: u64 }, +} + +impl Header { + /// Subtract `distance` from `pack_offset` safely without the chance for overflow or no-ops if `distance` is 0. + pub fn verified_base_pack_offset(pack_offset: data::Offset, distance: u64) -> Option<data::Offset> { + if distance == 0 { + return None; + } + pack_offset.checked_sub(distance) + } + /// Convert the header's object kind into [`gix_object::Kind`] if possible + pub fn as_kind(&self) -> Option<gix_object::Kind> { + use gix_object::Kind::*; + Some(match self { + Header::Tree => Tree, + Header::Blob => Blob, + Header::Commit => Commit, + Header::Tag => Tag, + Header::RefDelta { .. } | Header::OfsDelta { .. } => return None, + }) + } + /// Convert this header's object kind into the packs internal representation + pub fn as_type_id(&self) -> u8 { + use Header::*; + match self { + Blob => BLOB, + Tree => TREE, + Commit => COMMIT, + Tag => TAG, + OfsDelta { .. } => OFS_DELTA, + RefDelta { .. } => REF_DELTA, + } + } + /// Return's true if this is a delta object, i.e. not a full object. + pub fn is_delta(&self) -> bool { + matches!(self, Header::OfsDelta { .. } | Header::RefDelta { .. }) + } + /// Return's true if this is a base object, i.e. not a delta object. + pub fn is_base(&self) -> bool { + !self.is_delta() + } +} + +impl Header { + /// Encode this header along the given `decompressed_size_in_bytes` into the `out` write stream for use within a data pack. + /// + /// Returns the amount of bytes written to `out`. 
+ /// `decompressed_size_in_bytes` is the full size in bytes of the object that this header represents + pub fn write_to(&self, decompressed_size_in_bytes: u64, mut out: impl io::Write) -> io::Result<usize> { + let mut size = decompressed_size_in_bytes; + let mut written = 1; + let mut c: u8 = (self.as_type_id() << 4) | (size as u8 & 0b0000_1111); + size >>= 4; + while size != 0 { + out.write_all(&[c | 0b1000_0000])?; + written += 1; + c = size as u8 & 0b0111_1111; + size >>= 7; + } + out.write_all(&[c])?; + + use Header::*; + match self { + RefDelta { base_id: oid } => { + out.write_all(oid.as_slice())?; + written += oid.as_slice().len(); + } + OfsDelta { base_distance } => { + let mut buf = [0u8; 10]; + let buf = leb64_encode(*base_distance, &mut buf); + out.write_all(buf)?; + written += buf.len(); + } + Blob | Tree | Commit | Tag => {} + } + Ok(written) + } + + /// The size of the header in bytes when serialized + pub fn size(&self, decompressed_size: u64) -> usize { + self.write_to(decompressed_size, io::sink()) + .expect("io::sink() to never fail") + } +} + +#[inline] +fn leb64_encode(mut n: u64, buf: &mut [u8; 10]) -> &[u8] { + let mut bytes_written = 1; + buf[buf.len() - 1] = n as u8 & 0b0111_1111; + for out in buf.iter_mut().rev().skip(1) { + n >>= 7; + if n == 0 { + break; + } + n -= 1; + *out = 0b1000_0000 | (n as u8 & 0b0111_1111); + bytes_written += 1; + } + debug_assert_eq!(n, 0, "BUG: buffer must be large enough to hold a 64 bit integer"); + &buf[buf.len() - bytes_written..] +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn leb64_encode_max_int() { + let mut buf = [0u8; 10]; + let buf = leb64_encode(u64::MAX, &mut buf); + assert_eq!(buf.len(), 10, "10 bytes should be used when 64bits are encoded"); + } +} diff --git a/vendor/gix-pack/src/data/entry/mod.rs b/vendor/gix-pack/src/data/entry/mod.rs new file mode 100644 index 000000000..f11c39c5c --- /dev/null +++ b/vendor/gix-pack/src/data/entry/mod.rs @@ -0,0 +1,53 @@ +use crate::data::Entry; + +const _TYPE_EXT1: u8 = 0; +const COMMIT: u8 = 1; +const TREE: u8 = 2; +const BLOB: u8 = 3; +const TAG: u8 = 4; +const _TYPE_EXT2: u8 = 5; +const OFS_DELTA: u8 = 6; +const REF_DELTA: u8 = 7; + +/// A way to uniquely identify the location of an entry within a pack bundle +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Location { + /// The id of the pack containing the object. It's unique within its frame of reference which is the owning object database. + pub pack_id: u32, + /// The size of the entry of disk so that the range of bytes of the entry is `pack_offset..pack_offset + entry_size`. + pub entry_size: usize, + /// The start of the entry in the pack identified by `pack_id`. + pub pack_offset: data::Offset, +} + +impl Location { + /// Compute a range suitable for lookup in pack data using the [`entry_slice()`][crate::data::File::entry_slice()] method. + pub fn entry_range(&self, pack_offset: data::Offset) -> crate::data::EntryRange { + pack_offset..pack_offset + self.entry_size as u64 + } +} + +/// Access +impl Entry { + /// Compute the pack offset to the base entry of the object represented by this entry. 
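A sketch of a round trip through `write_to` above and `Entry::from_bytes` (the concrete numbers are arbitrary and the paths assume the crate's usual re-exports), also checking that `size()` agrees with the bytes actually written:

    #[test]
    fn ofs_delta_header_round_trip() {
        use gix_pack::data::entry::Header;

        let header = Header::OfsDelta { base_distance: 128 };
        let mut buf = Vec::new();
        let written = header.write_to(10, &mut buf).expect("writing to a Vec never fails");
        assert_eq!(written, buf.len());
        assert_eq!(written, header.size(10));

        let parsed = gix_pack::data::Entry::from_bytes(&buf, 0, gix_hash::Kind::Sha1.len_in_bytes());
        assert_eq!(parsed.header, header);
        assert_eq!(parsed.decompressed_size, 10);
    }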
+ pub fn base_pack_offset(&self, distance: u64) -> data::Offset { + let pack_offset = self.data_offset - self.header_size() as u64; + pack_offset.checked_sub(distance).expect("in-bound distance of deltas") + } + /// The pack offset at which this entry starts + pub fn pack_offset(&self) -> data::Offset { + self.data_offset - self.header_size() as u64 + } + /// The amount of bytes used to describe this entry in the pack. The header starts at [`Self::pack_offset()`] + pub fn header_size(&self) -> usize { + self.header.size(self.decompressed_size) + } +} + +mod decode; + +mod header; +pub use header::Header; + +use crate::data; diff --git a/vendor/gix-pack/src/data/file/decode/entry.rs b/vendor/gix-pack/src/data/file/decode/entry.rs new file mode 100644 index 000000000..60fefec0f --- /dev/null +++ b/vendor/gix-pack/src/data/file/decode/entry.rs @@ -0,0 +1,422 @@ +use std::{convert::TryInto, ops::Range}; + +use gix_features::zlib; +use smallvec::SmallVec; + +use crate::{ + cache, data, + data::{delta, file::decode::Error, File}, +}; + +/// A return value of a resolve function, which given an [`ObjectId`][gix_hash::ObjectId] determines where an object can be found. +#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum ResolvedBase { + /// Indicate an object is within this pack, at the given entry, and thus can be looked up locally. + InPack(data::Entry), + /// Indicates the object of `kind` was found outside of the pack, and its data was written into an output + /// vector which now has a length of `end`. + #[allow(missing_docs)] + OutOfPack { kind: gix_object::Kind, end: usize }, +} + +#[derive(Debug)] +struct Delta { + data: Range<usize>, + base_size: usize, + result_size: usize, + + decompressed_size: usize, + data_offset: data::Offset, +} + +/// Additional information and statistics about a successfully decoded object produced by [`File::decode_entry()`]. +/// +/// Useful to understand the effectiveness of the pack compression or the cost of decompression. +#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Outcome { + /// The kind of resolved object. + pub kind: gix_object::Kind, + /// The amount of deltas in the chain of objects that had to be resolved beforehand. + /// + /// This number is affected by the [`Cache`][cache::DecodeEntry] implementation, with cache hits shortening the + /// delta chain accordingly + pub num_deltas: u32, + /// The total decompressed size of all pack entries in the delta chain + pub decompressed_size: u64, + /// The total compressed size of all pack entries in the delta chain + pub compressed_size: usize, + /// The total size of the decoded object. + pub object_size: u64, +} + +impl Outcome { + pub(crate) fn default_from_kind(kind: gix_object::Kind) -> Self { + Self { + kind, + num_deltas: 0, + decompressed_size: 0, + compressed_size: 0, + object_size: 0, + } + } + fn from_object_entry(kind: gix_object::Kind, entry: &data::Entry, compressed_size: usize) -> Self { + Self { + kind, + num_deltas: 0, + decompressed_size: entry.decompressed_size, + compressed_size, + object_size: entry.decompressed_size, + } + } +} + +/// Decompression of objects +impl File { + /// Decompress the given `entry` into `out` and return the amount of bytes read from the pack data. 
+ /// + /// _Note_ that this method does not resolve deltified objects, but merely decompresses their content + /// `out` is expected to be large enough to hold `entry.size` bytes. + /// + /// # Panics + /// + /// If `out` isn't large enough to hold the decompressed `entry` + pub fn decompress_entry(&self, entry: &data::Entry, out: &mut [u8]) -> Result<usize, Error> { + assert!( + out.len() as u64 >= entry.decompressed_size, + "output buffer isn't large enough to hold decompressed result, want {}, have {}", + entry.decompressed_size, + out.len() + ); + + self.decompress_entry_from_data_offset(entry.data_offset, out) + .map_err(Into::into) + } + + fn assure_v2(&self) { + assert!( + matches!(self.version, crate::data::Version::V2), + "Only V2 is implemented" + ); + } + + /// Obtain the [`Entry`][crate::data::Entry] at the given `offset` into the pack. + /// + /// The `offset` is typically obtained from the pack index file. + pub fn entry(&self, offset: data::Offset) -> data::Entry { + self.assure_v2(); + let pack_offset: usize = offset.try_into().expect("offset representable by machine"); + assert!(pack_offset <= self.data.len(), "offset out of bounds"); + + let object_data = &self.data[pack_offset..]; + data::Entry::from_bytes(object_data, offset, self.hash_len) + } + + /// Decompress the object expected at the given data offset, sans pack header. This information is only + /// known after the pack header was parsed. + /// Note that this method does not resolve deltified objects, but merely decompresses their content + /// `out` is expected to be large enough to hold `entry.size` bytes. + /// Returns the amount of packed bytes there read from the pack data file. + pub(crate) fn decompress_entry_from_data_offset( + &self, + data_offset: data::Offset, + out: &mut [u8], + ) -> Result<usize, zlib::inflate::Error> { + let offset: usize = data_offset.try_into().expect("offset representable by machine"); + assert!(offset < self.data.len(), "entry offset out of bounds"); + + zlib::Inflate::default() + .once(&self.data[offset..], out) + .map(|(_status, consumed_in, _consumed_out)| consumed_in) + } + + /// Like `decompress_entry_from_data_offset`, but returns consumed input and output. + pub(crate) fn decompress_entry_from_data_offset_2( + &self, + data_offset: data::Offset, + out: &mut [u8], + ) -> Result<(usize, usize), zlib::inflate::Error> { + let offset: usize = data_offset.try_into().expect("offset representable by machine"); + assert!(offset < self.data.len(), "entry offset out of bounds"); + + zlib::Inflate::default() + .once(&self.data[offset..], out) + .map(|(_status, consumed_in, consumed_out)| (consumed_in, consumed_out)) + } + + /// Decode an entry, resolving delta's as needed, while growing the `out` vector if there is not enough + /// space to hold the result object. + /// + /// The `entry` determines which object to decode, and is commonly obtained with the help of a pack index file or through pack iteration. + /// + /// `resolve` is a function to lookup objects with the given [`ObjectId`][gix_hash::ObjectId], in case the full object id is used to refer to + /// a base object, instead of an in-pack offset. + /// + /// `delta_cache` is a mechanism to avoid looking up base objects multiple times when decompressing multiple objects in a row. + /// Use a [Noop-Cache][cache::Never] to disable caching all together at the cost of repeating work. 
+ pub fn decode_entry( + &self, + entry: data::Entry, + out: &mut Vec<u8>, + resolve: impl Fn(&gix_hash::oid, &mut Vec<u8>) -> Option<ResolvedBase>, + delta_cache: &mut impl cache::DecodeEntry, + ) -> Result<Outcome, Error> { + use crate::data::entry::Header::*; + match entry.header { + Tree | Blob | Commit | Tag => { + out.resize( + entry + .decompressed_size + .try_into() + .expect("size representable by machine"), + 0, + ); + self.decompress_entry(&entry, out.as_mut_slice()).map(|consumed_input| { + Outcome::from_object_entry( + entry.header.as_kind().expect("a non-delta entry"), + &entry, + consumed_input, + ) + }) + } + OfsDelta { .. } | RefDelta { .. } => self.resolve_deltas(entry, resolve, out, delta_cache), + } + } + + /// resolve: technically, this shouldn't ever be required as stored local packs don't refer to objects by id + /// that are outside of the pack. Unless, of course, the ref refers to an object within this pack, which means + /// it's very, very large as 20bytes are smaller than the corresponding MSB encoded number + fn resolve_deltas( + &self, + last: data::Entry, + resolve: impl Fn(&gix_hash::oid, &mut Vec<u8>) -> Option<ResolvedBase>, + out: &mut Vec<u8>, + cache: &mut impl cache::DecodeEntry, + ) -> Result<Outcome, Error> { + // all deltas, from the one that produces the desired object (first) to the oldest at the end of the chain + let mut chain = SmallVec::<[Delta; 10]>::default(); + let first_entry = last.clone(); + let mut cursor = last; + let mut base_buffer_size: Option<usize> = None; + let mut object_kind: Option<gix_object::Kind> = None; + let mut consumed_input: Option<usize> = None; + + // Find the first full base, either an undeltified object in the pack or a reference to another object. + let mut total_delta_data_size: u64 = 0; + while cursor.header.is_delta() { + if let Some((kind, packed_size)) = cache.get(self.id, cursor.data_offset, out) { + base_buffer_size = Some(out.len()); + object_kind = Some(kind); + // If the input entry is a cache hit, keep the packed size as it must be returned. + // Otherwise, the packed size will be determined later when decompressing the input delta + if total_delta_data_size == 0 { + consumed_input = Some(packed_size); + } + break; + } + total_delta_data_size += cursor.decompressed_size; + let decompressed_size = cursor + .decompressed_size + .try_into() + .expect("a single delta size small enough to fit a usize"); + chain.push(Delta { + data: Range { + start: 0, + end: decompressed_size, + }, + base_size: 0, + result_size: 0, + decompressed_size, + data_offset: cursor.data_offset, + }); + use crate::data::entry::Header; + cursor = match cursor.header { + Header::OfsDelta { base_distance } => self.entry(cursor.base_pack_offset(base_distance)), + Header::RefDelta { base_id } => match resolve(base_id.as_ref(), out) { + Some(ResolvedBase::InPack(entry)) => entry, + Some(ResolvedBase::OutOfPack { end, kind }) => { + base_buffer_size = Some(end); + object_kind = Some(kind); + break; + } + None => return Err(Error::DeltaBaseUnresolved(base_id)), + }, + _ => unreachable!("cursor.is_delta() only allows deltas here"), + }; + } + + // This can happen if the cache held the first entry itself + // We will just treat it as an object then, even though it's technically incorrect. 
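As a usage sketch of `decode_entry` above with caching disabled (the pack path and offset are hypothetical; offsets would normally come from a pack index):

    fn decode_one(pack_path: &std::path::Path, offset: u64) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
        let pack = gix_pack::data::File::at(pack_path, gix_hash::Kind::Sha1)?;
        let entry = pack.entry(offset);
        let mut out = Vec::new();
        let outcome = pack.decode_entry(
            entry,
            &mut out,
            |_id, _buf| None, // a ref-delta base outside of this pack would fail with `DeltaBaseUnresolved`
            &mut gix_pack::cache::Never,
        )?;
        assert_eq!(outcome.object_size, out.len() as u64);
        Ok(out)
    }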
+ if chain.is_empty() { + return Ok(Outcome::from_object_entry( + object_kind.expect("object kind as set by cache"), + &first_entry, + consumed_input.expect("consumed bytes as set by cache"), + )); + }; + + // First pass will decompress all delta data and keep it in our output buffer + // [<possibly resolved base object>]<delta-1..delta-n>... + // so that we can find the biggest result size. + let total_delta_data_size: usize = total_delta_data_size.try_into().expect("delta data to fit in memory"); + + let chain_len = chain.len(); + let (first_buffer_end, second_buffer_end) = { + let delta_start = base_buffer_size.unwrap_or(0); + out.resize(delta_start + total_delta_data_size, 0); + + let delta_range = Range { + start: delta_start, + end: delta_start + total_delta_data_size, + }; + let mut instructions = &mut out[delta_range.clone()]; + let mut relative_delta_start = 0; + let mut biggest_result_size = 0; + for (delta_idx, delta) in chain.iter_mut().rev().enumerate() { + let consumed_from_data_offset = self.decompress_entry_from_data_offset( + delta.data_offset, + &mut instructions[..delta.decompressed_size], + )?; + let is_last_delta_to_be_applied = delta_idx + 1 == chain_len; + if is_last_delta_to_be_applied { + consumed_input = Some(consumed_from_data_offset); + } + + let (base_size, offset) = delta::decode_header_size(instructions); + let mut bytes_consumed_by_header = offset; + biggest_result_size = biggest_result_size.max(base_size); + delta.base_size = base_size.try_into().expect("base size fits into usize"); + + let (result_size, offset) = delta::decode_header_size(&instructions[offset..]); + bytes_consumed_by_header += offset; + biggest_result_size = biggest_result_size.max(result_size); + delta.result_size = result_size.try_into().expect("result size fits into usize"); + + // the absolute location into the instructions buffer, so we keep track of the end point of the last + delta.data.start = relative_delta_start + bytes_consumed_by_header; + relative_delta_start += delta.decompressed_size; + delta.data.end = relative_delta_start; + + instructions = &mut instructions[delta.decompressed_size..]; + } + + // Now we can produce a buffer like this + // [<biggest-result-buffer, possibly filled with resolved base object data>]<biggest-result-buffer><delta-1..delta-n> + // from [<possibly resolved base object>]<delta-1..delta-n>... + let biggest_result_size: usize = biggest_result_size + .try_into() + .expect("biggest result size small enough to fit into usize"); + let first_buffer_size = biggest_result_size; + let second_buffer_size = first_buffer_size; + out.resize(first_buffer_size + second_buffer_size + total_delta_data_size, 0); + + // Now 'rescue' the deltas, because in the next step we possibly overwrite that portion + // of memory with the base object (in the majority of cases) + let second_buffer_end = { + let end = first_buffer_size + second_buffer_size; + if delta_range.start < end { + // …this means that the delta size is even larger than two uncompressed worst-case + // intermediate results combined. It would already be undesirable to have it bigger + // then the target size (as you could just store the object in whole). + // However, this just means that it reuses existing deltas smartly, which as we rightfully + // remember stand for an object each. However, this means a lot of data is read to restore + // a single object sometimes. Fair enough - package size is minimized that way. 
+ out.copy_within(delta_range, end); + } else { + let (buffers, instructions) = out.split_at_mut(end); + instructions.copy_from_slice(&buffers[delta_range]); + } + end + }; + + // If we don't have a out-of-pack object already, fill the base-buffer by decompressing the full object + // at which the cursor is left after the iteration + if base_buffer_size.is_none() { + let base_entry = cursor; + debug_assert!(!base_entry.header.is_delta()); + object_kind = base_entry.header.as_kind(); + self.decompress_entry_from_data_offset(base_entry.data_offset, out)?; + } + + (first_buffer_size, second_buffer_end) + }; + + // From oldest to most recent, apply all deltas, swapping the buffer back and forth + // TODO: once we have more tests, we could optimize this memory-intensive work to + // analyse the delta-chains to only copy data once - after all, with 'copy-from-base' deltas, + // all data originates from one base at some point. + // `out` is: [source-buffer][target-buffer][max-delta-instructions-buffer] + let (buffers, instructions) = out.split_at_mut(second_buffer_end); + let (mut source_buf, mut target_buf) = buffers.split_at_mut(first_buffer_end); + + let mut last_result_size = None; + for ( + delta_idx, + Delta { + data, + base_size, + result_size, + .. + }, + ) in chain.into_iter().rev().enumerate() + { + let data = &mut instructions[data]; + if delta_idx + 1 == chain_len { + last_result_size = Some(result_size); + } + delta::apply(&source_buf[..base_size], &mut target_buf[..result_size], data); + // use the target as source for the next delta + std::mem::swap(&mut source_buf, &mut target_buf); + } + + let last_result_size = last_result_size.expect("at least one delta chain item"); + // uneven chains leave the target buffer after the source buffer + // FIXME(Performance) If delta-chains are uneven, we know we will have to copy bytes over here + // Instead we could use a different start buffer, to naturally end up with the result in the + // right one. + // However, this is a bit more complicated than just that - you have to deal with the base + // object, which should also be placed in the second buffer right away. You don't have that + // control/knowledge for out-of-pack bases, so this is a special case to deal with, too. + // Maybe these invariants can be represented in the type system though. + if chain_len % 2 == 1 { + // this seems inverted, but remember: we swapped the buffers on the last iteration + target_buf[..last_result_size].copy_from_slice(&source_buf[..last_result_size]); + } + out.resize(last_result_size, 0); + + let object_kind = object_kind.expect("a base object as root of any delta chain that we are here to resolve"); + let consumed_input = consumed_input.expect("at least one decompressed delta object"); + cache.put( + self.id, + first_entry.data_offset, + out.as_slice(), + object_kind, + consumed_input, + ); + Ok(Outcome { + kind: object_kind, + // technically depending on the cache, the chain size is not correct as it might + // have been cut short by a cache hit. 
The caller must deactivate the cache to get + // actual results + num_deltas: chain_len as u32, + decompressed_size: first_entry.decompressed_size, + compressed_size: consumed_input, + object_size: last_result_size as u64, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn size_of_decode_entry_outcome() { + assert_eq!( + std::mem::size_of::<Outcome>(), + 32, + "this shouldn't change without use noticing as it's returned a lot" + ); + } +} diff --git a/vendor/gix-pack/src/data/file/decode/header.rs b/vendor/gix-pack/src/data/file/decode/header.rs new file mode 100644 index 000000000..1f4b1de0a --- /dev/null +++ b/vendor/gix-pack/src/data/file/decode/header.rs @@ -0,0 +1,114 @@ +use crate::{ + data, + data::{delta, file::decode::Error, File}, +}; + +/// A return value of a resolve function, which given an [`ObjectId`][gix_hash::ObjectId] determines where an object can be found. +#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum ResolvedBase { + /// Indicate an object is within this pack, at the given entry, and thus can be looked up locally. + InPack(data::Entry), + /// Indicates the object of `kind` was found outside of the pack. + OutOfPack { + /// The kind of object we found when reading the header of the out-of-pack base. + kind: gix_object::Kind, + /// The amount of deltas encountered if the object was packed as well. + num_deltas: Option<u32>, + }, +} + +/// Additional information and statistics about a successfully decoded object produced by [`File::decode_header()`]. +/// +/// Useful to understand the effectiveness of the pack compression or the cost of decompression. +#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Outcome { + /// The kind of resolved object. + pub kind: gix_object::Kind, + /// The decompressed size of the object. + pub object_size: u64, + /// The amount of deltas in the chain of objects that had to be resolved beforehand. + pub num_deltas: u32, +} + +/// Obtain object information quickly. +impl File { + /// Resolve the object header information starting at `entry`, following the chain of entries as needed. + /// + /// The `entry` determines which object to decode, and is commonly obtained with the help of a pack index file or through pack iteration. + /// + /// `resolve` is a function to lookup objects with the given [`ObjectId`][gix_hash::ObjectId], in case the full object id + /// is used to refer to a base object, instead of an in-pack offset. 
+ pub fn decode_header( + &self, + mut entry: data::Entry, + resolve: impl Fn(&gix_hash::oid) -> Option<ResolvedBase>, + ) -> Result<Outcome, Error> { + use crate::data::entry::Header::*; + let mut num_deltas = 0; + let mut first_delta_decompressed_size = None::<u64>; + loop { + match entry.header { + Tree | Blob | Commit | Tag => { + return Ok(Outcome { + kind: entry.header.as_kind().expect("always valid for non-refs"), + object_size: first_delta_decompressed_size.unwrap_or(entry.decompressed_size), + num_deltas, + }); + } + OfsDelta { base_distance } => { + num_deltas += 1; + if first_delta_decompressed_size.is_none() { + first_delta_decompressed_size = Some(self.decode_delta_object_size(&entry)?); + } + entry = self.entry(entry.base_pack_offset(base_distance)) + } + RefDelta { base_id } => { + num_deltas += 1; + if first_delta_decompressed_size.is_none() { + first_delta_decompressed_size = Some(self.decode_delta_object_size(&entry)?); + } + match resolve(base_id.as_ref()) { + Some(ResolvedBase::InPack(base_entry)) => entry = base_entry, + Some(ResolvedBase::OutOfPack { + kind, + num_deltas: origin_num_deltas, + }) => { + return Ok(Outcome { + kind, + object_size: first_delta_decompressed_size.unwrap_or(entry.decompressed_size), + num_deltas: origin_num_deltas.unwrap_or_default() + num_deltas, + }) + } + None => return Err(Error::DeltaBaseUnresolved(base_id)), + } + } + }; + } + } + + #[inline] + fn decode_delta_object_size(&self, entry: &data::Entry) -> Result<u64, Error> { + let mut buf = [0_u8; 32]; + let used = self.decompress_entry_from_data_offset_2(entry.data_offset, &mut buf)?.1; + let buf = &buf[..used]; + let (_base_size, offset) = delta::decode_header_size(buf); + let (result_size, _offset) = delta::decode_header_size(&buf[offset..]); + Ok(result_size) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn size_of_decode_entry_outcome() { + assert_eq!( + std::mem::size_of::<Outcome>(), + 16, + "this shouldn't change without use noticing as it's returned a lot" + ); + } +} diff --git a/vendor/gix-pack/src/data/file/decode/mod.rs b/vendor/gix-pack/src/data/file/decode/mod.rs new file mode 100644 index 000000000..10bb7f19b --- /dev/null +++ b/vendor/gix-pack/src/data/file/decode/mod.rs @@ -0,0 +1,16 @@ +/// +pub mod entry; +/// +pub mod header; + +/// Returned by [`File::decode_header()`][crate::data::File::decode_header()], +/// [`File::decode_entry()`][crate::data::File::decode_entry()] and . +/// [`File::decompress_entry()`][crate::data::File::decompress_entry()] +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("Failed to decompress pack entry")] + ZlibInflate(#[from] gix_features::zlib::inflate::Error), + #[error("A delta chain could not be followed as the ref base with id {0} could not be found")] + DeltaBaseUnresolved(gix_hash::ObjectId), +} diff --git a/vendor/gix-pack/src/data/file/init.rs b/vendor/gix-pack/src/data/file/init.rs new file mode 100644 index 000000000..b16072417 --- /dev/null +++ b/vendor/gix-pack/src/data/file/init.rs @@ -0,0 +1,41 @@ +use std::{convert::TryInto, path::Path}; + +use crate::data; + +/// Instantiation +impl data::File { + /// Try opening a data file at the given `path`. + /// + /// The `object_hash` is a way to read (and write) the same file format with different hashes, as the hash kind + /// isn't stored within the file format itself. 
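A sketch of using `decode_header` above to learn an object's kind, size and delta-chain length without decompressing its data (pack and offset are placeholders, as before):

    fn peek_object(pack: &gix_pack::data::File, offset: u64) -> Result<(), Box<dyn std::error::Error>> {
        let entry = pack.entry(offset);
        let info = pack.decode_header(entry, |_id| None)?;
        println!("{:?}, {} bytes, {} delta(s)", info.kind, info.object_size, info.num_deltas);
        Ok(())
    }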
+ pub fn at(path: impl AsRef<Path>, object_hash: gix_hash::Kind) -> Result<data::File, data::header::decode::Error> { + Self::at_inner(path.as_ref(), object_hash) + } + + fn at_inner(path: &Path, object_hash: gix_hash::Kind) -> Result<data::File, data::header::decode::Error> { + use crate::data::header::N32_SIZE; + let hash_len = object_hash.len_in_bytes(); + + let data = crate::mmap::read_only(path).map_err(|e| data::header::decode::Error::Io { + source: e, + path: path.to_owned(), + })?; + let pack_len = data.len(); + if pack_len < N32_SIZE * 3 + hash_len { + return Err(data::header::decode::Error::Corrupt(format!( + "Pack data of size {pack_len} is too small for even an empty pack with shortest hash" + ))); + } + let (kind, num_objects) = + data::header::decode(&data[..12].try_into().expect("enough data after previous check"))?; + Ok(data::File { + data, + path: path.to_owned(), + id: gix_features::hash::crc32(path.as_os_str().to_string_lossy().as_bytes()), + version: kind, + num_objects, + hash_len, + object_hash, + }) + } +} diff --git a/vendor/gix-pack/src/data/file/mod.rs b/vendor/gix-pack/src/data/file/mod.rs new file mode 100644 index 000000000..6bfe0e272 --- /dev/null +++ b/vendor/gix-pack/src/data/file/mod.rs @@ -0,0 +1,9 @@ +mod init; +/// +pub mod verify; + +/// +pub mod decode; + +/// The bytes used as header in a pack data file. +pub type Header = [u8; 12]; diff --git a/vendor/gix-pack/src/data/file/verify.rs b/vendor/gix-pack/src/data/file/verify.rs new file mode 100644 index 000000000..afec20826 --- /dev/null +++ b/vendor/gix-pack/src/data/file/verify.rs @@ -0,0 +1,42 @@ +use std::sync::atomic::AtomicBool; + +use gix_features::progress::Progress; + +use crate::data::File; + +/// +pub mod checksum { + /// Returned by [`data::File::verify_checksum()`][crate::data::File::verify_checksum()]. + pub type Error = crate::verify::checksum::Error; +} + +/// Checksums and verify checksums +impl File { + /// The checksum in the trailer of this pack data file + pub fn checksum(&self) -> gix_hash::ObjectId { + gix_hash::ObjectId::from(&self.data[self.data.len() - self.hash_len..]) + } + + /// Verifies that the checksum of the packfile over all bytes preceding it indeed matches the actual checksum, + /// returning the actual checksum equivalent to the return value of [`checksum()`][File::checksum()] if there + /// is no mismatch. + /// + /// Note that if no `progress` is desired, one can pass [`gix_features::progress::Discard`]. + /// + /// Have a look at [`index::File::verify_integrity(…)`][crate::index::File::verify_integrity()] for an + /// even more thorough integrity check. + pub fn verify_checksum( + &self, + progress: impl Progress, + should_interrupt: &AtomicBool, + ) -> Result<gix_hash::ObjectId, checksum::Error> { + crate::verify::checksum_on_disk_or_mmap( + self.path(), + &self.data, + self.checksum(), + self.object_hash, + progress, + should_interrupt, + ) + } +} diff --git a/vendor/gix-pack/src/data/header.rs b/vendor/gix-pack/src/data/header.rs new file mode 100644 index 000000000..348a4ca24 --- /dev/null +++ b/vendor/gix-pack/src/data/header.rs @@ -0,0 +1,55 @@ +use crate::data; + +pub(crate) const N32_SIZE: usize = std::mem::size_of::<u32>(); + +/// Parses the first 12 bytes of a pack file, returning the pack version as well as the number of objects contained in the pack. 
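A sketch of opening a pack data file and verifying its trailing checksum as shown above, with progress reporting discarded via `gix_features::progress::Discard` (the path is a placeholder):

    use std::sync::atomic::AtomicBool;

    fn verify_pack(path: &std::path::Path) -> Result<gix_hash::ObjectId, Box<dyn std::error::Error>> {
        let pack = gix_pack::data::File::at(path, gix_hash::Kind::Sha1)?;
        let should_interrupt = AtomicBool::new(false);
        let actual = pack.verify_checksum(gix_features::progress::Discard, &should_interrupt)?;
        assert_eq!(actual, pack.checksum());
        Ok(actual)
    }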
+pub fn decode(data: &[u8; 12]) -> Result<(data::Version, u32), decode::Error> { + let mut ofs = 0; + if &data[ofs..ofs + b"PACK".len()] != b"PACK" { + return Err(decode::Error::Corrupt("Pack data type not recognized".into())); + } + ofs += N32_SIZE; + let kind = match crate::read_u32(&data[ofs..ofs + N32_SIZE]) { + 2 => data::Version::V2, + 3 => data::Version::V3, + v => return Err(decode::Error::UnsupportedVersion(v)), + }; + ofs += N32_SIZE; + let num_objects = crate::read_u32(&data[ofs..ofs + N32_SIZE]); + + Ok((kind, num_objects)) +} + +/// Write a pack data header at `version` with `num_objects` and return a buffer. +pub fn encode(version: data::Version, num_objects: u32) -> [u8; 12] { + use crate::data::Version::*; + let mut buf = [0u8; 12]; + buf[..4].copy_from_slice(b"PACK"); + buf[4..8].copy_from_slice( + &match version { + V2 => 2u32, + V3 => 3, + } + .to_be_bytes()[..], + ); + buf[8..].copy_from_slice(&num_objects.to_be_bytes()[..]); + buf +} + +/// +pub mod decode { + /// Returned by [`decode()`][super::decode()]. + #[derive(thiserror::Error, Debug)] + #[allow(missing_docs)] + pub enum Error { + #[error("Could not open pack file at '{path}'")] + Io { + source: std::io::Error, + path: std::path::PathBuf, + }, + #[error("{0}")] + Corrupt(String), + #[error("Unsupported pack version: {0}")] + UnsupportedVersion(u32), + } +} diff --git a/vendor/gix-pack/src/data/input/bytes_to_entries.rs b/vendor/gix-pack/src/data/input/bytes_to_entries.rs new file mode 100644 index 000000000..cf20d5fbf --- /dev/null +++ b/vendor/gix-pack/src/data/input/bytes_to_entries.rs @@ -0,0 +1,295 @@ +use std::{fs, io}; + +use gix_features::{ + hash, + hash::Sha1, + zlib::{stream::inflate::ReadBoxed, Decompress}, +}; +use gix_hash::ObjectId; + +use crate::data::input; + +/// An iterator over [`Entries`][input::Entry] in a byte stream. +/// +/// The iterator used as part of [Bundle::write_to_directory(…)][crate::Bundle::write_to_directory()]. +pub struct BytesToEntriesIter<BR> { + read: BR, + decompressor: Option<Box<Decompress>>, + offset: u64, + had_error: bool, + version: crate::data::Version, + objects_left: u32, + hash: Option<Sha1>, + mode: input::Mode, + compressed: input::EntryDataMode, + compressed_buf: Option<Vec<u8>>, + hash_len: usize, + object_hash: gix_hash::Kind, +} + +/// Access +impl<BR> BytesToEntriesIter<BR> { + /// The pack version currently being iterated + pub fn version(&self) -> crate::data::Version { + self.version + } + + /// The kind of iteration + pub fn mode(&self) -> input::Mode { + self.mode + } +} + +/// Initialization +impl<BR> BytesToEntriesIter<BR> +where + BR: io::BufRead, +{ + /// Obtain an iterator from a `read` stream to a pack data file and configure it using `mode` and `compressed`. + /// `object_hash` specifies which hash is used for objects in ref-delta entries. + /// + /// Note that `read` is expected at the beginning of a valid pack data file with a header, entries and a trailer. 
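A round trip through `encode` and `decode` above makes the 12-byte layout (magic bytes, version, object count) easy to see; a hypothetical test, assuming the crate's usual re-exports:

    #[test]
    fn pack_header_round_trip() {
        let bytes = gix_pack::data::header::encode(gix_pack::data::Version::V2, 42);
        let (version, num_objects) = gix_pack::data::header::decode(&bytes).expect("a valid header");
        assert_eq!(version, gix_pack::data::Version::V2);
        assert_eq!(num_objects, 42);
    }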
+ pub fn new_from_header( + mut read: BR, + mode: input::Mode, + compressed: input::EntryDataMode, + object_hash: gix_hash::Kind, + ) -> Result<BytesToEntriesIter<BR>, input::Error> { + let mut header_data = [0u8; 12]; + read.read_exact(&mut header_data)?; + + let (version, num_objects) = crate::data::header::decode(&header_data)?; + assert_eq!( + version, + crate::data::Version::V2, + "let's stop here if we see undocumented pack formats" + ); + Ok(BytesToEntriesIter { + read, + decompressor: None, + compressed, + offset: 12, + had_error: false, + version, + objects_left: num_objects, + hash: (mode != input::Mode::AsIs).then(|| { + let mut hash = gix_features::hash::hasher(object_hash); + hash.update(&header_data); + hash + }), + mode, + compressed_buf: None, + hash_len: object_hash.len_in_bytes(), + object_hash, + }) + } + + fn next_inner(&mut self) -> Result<input::Entry, input::Error> { + self.objects_left -= 1; // even an error counts as objects + + // Read header + let entry = match self.hash.take() { + Some(hash) => { + let mut read = read_and_pass_to( + &mut self.read, + hash::Write { + inner: io::sink(), + hash, + }, + ); + let res = crate::data::Entry::from_read(&mut read, self.offset, self.hash_len); + self.hash = Some(read.write.hash); + res + } + None => crate::data::Entry::from_read(&mut self.read, self.offset, self.hash_len), + } + .map_err(input::Error::from)?; + + // Decompress object to learn its compressed bytes + let mut decompressor = self + .decompressor + .take() + .unwrap_or_else(|| Box::new(Decompress::new(true))); + let compressed_buf = self.compressed_buf.take().unwrap_or_else(|| Vec::with_capacity(4096)); + decompressor.reset(true); + let mut decompressed_reader = ReadBoxed { + inner: read_and_pass_to( + &mut self.read, + if self.compressed.keep() { + Vec::with_capacity(entry.decompressed_size as usize) + } else { + compressed_buf + }, + ), + decompressor, + }; + + let bytes_copied = io::copy(&mut decompressed_reader, &mut io::sink())?; + if bytes_copied != entry.decompressed_size { + return Err(input::Error::IncompletePack { + actual: bytes_copied, + expected: entry.decompressed_size, + }); + } + + let pack_offset = self.offset; + let compressed_size = decompressed_reader.decompressor.total_in(); + self.offset += entry.header_size() as u64 + compressed_size; + self.decompressor = Some(decompressed_reader.decompressor); + + let mut compressed = decompressed_reader.inner.write; + debug_assert_eq!( + compressed_size, + compressed.len() as u64, + "we must track exactly the same amount of bytes as read by the decompressor" + ); + if let Some(hash) = self.hash.as_mut() { + hash.update(&compressed); + } + + let crc32 = if self.compressed.crc32() { + let mut header_buf = [0u8; 12 + gix_hash::Kind::longest().len_in_bytes()]; + let header_len = entry.header.write_to(bytes_copied, header_buf.as_mut())?; + let state = gix_features::hash::crc32_update(0, &header_buf[..header_len]); + Some(gix_features::hash::crc32_update(state, &compressed)) + } else { + None + }; + + let compressed = if self.compressed.keep() { + Some(compressed) + } else { + compressed.clear(); + self.compressed_buf = Some(compressed); + None + }; + + // Last objects gets trailer (which is potentially verified) + let trailer = self.try_read_trailer()?; + Ok(input::Entry { + header: entry.header, + header_size: entry.header_size() as u16, + compressed, + compressed_size, + crc32, + pack_offset, + decompressed_size: bytes_copied, + trailer, + }) + } + + fn try_read_trailer(&mut self) -> 
Result<Option<ObjectId>, input::Error> { + Ok(if self.objects_left == 0 { + let mut id = gix_hash::ObjectId::null(self.object_hash); + if let Err(err) = self.read.read_exact(id.as_mut_slice()) { + if self.mode != input::Mode::Restore { + return Err(err.into()); + } + } + + if let Some(hash) = self.hash.take() { + let actual_id = gix_hash::ObjectId::from(hash.digest()); + if self.mode == input::Mode::Restore { + id = actual_id; + } + if id != actual_id { + return Err(input::Error::ChecksumMismatch { + actual: actual_id, + expected: id, + }); + } + } + Some(id) + } else if self.mode == input::Mode::Restore { + let hash = self.hash.clone().expect("in restore mode a hash is set"); + Some(gix_hash::ObjectId::from(hash.digest())) + } else { + None + }) + } +} + +fn read_and_pass_to<R: io::Read, W: io::Write>(read: &mut R, to: W) -> PassThrough<&mut R, W> { + PassThrough { read, write: to } +} + +impl<R> Iterator for BytesToEntriesIter<R> +where + R: io::BufRead, +{ + type Item = Result<input::Entry, input::Error>; + + fn next(&mut self) -> Option<Self::Item> { + if self.had_error || self.objects_left == 0 { + return None; + } + let result = self.next_inner(); + self.had_error = result.is_err(); + if self.had_error { + self.objects_left = 0; + } + if self.mode == input::Mode::Restore && self.had_error { + None + } else { + Some(result) + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + (self.objects_left as usize, Some(self.objects_left as usize)) + } +} + +impl<R> std::iter::ExactSizeIterator for BytesToEntriesIter<R> where R: io::BufRead {} + +struct PassThrough<R, W> { + read: R, + write: W, +} + +impl<R, W> io::BufRead for PassThrough<R, W> +where + Self: io::Read, + R: io::BufRead, + W: io::Write, +{ + fn fill_buf(&mut self) -> io::Result<&[u8]> { + self.read.fill_buf() + } + + fn consume(&mut self, amt: usize) { + let buf = self + .read + .fill_buf() + .expect("never fail as we called fill-buf before and this does nothing"); + self.write + .write_all(&buf[..amt]) + .expect("a write to never fail - should be a memory buffer"); + self.read.consume(amt) + } +} + +impl<R, W> io::Read for PassThrough<R, W> +where + W: io::Write, + R: io::Read, +{ + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + let bytes_read = self.read.read(buf)?; + self.write.write_all(&buf[..bytes_read])?; + Ok(bytes_read) + } +} + +impl crate::data::File { + /// Returns an iterator over [`Entries`][crate::data::input::Entry], without making use of the memory mapping. + pub fn streaming_iter(&self) -> Result<BytesToEntriesIter<impl io::BufRead>, input::Error> { + let reader = io::BufReader::with_capacity(4096 * 8, fs::File::open(&self.path)?); + BytesToEntriesIter::new_from_header( + reader, + input::Mode::Verify, + input::EntryDataMode::KeepAndCrc32, + self.object_hash, + ) + } +} diff --git a/vendor/gix-pack/src/data/input/entries_to_bytes.rs b/vendor/gix-pack/src/data/input/entries_to_bytes.rs new file mode 100644 index 000000000..a8c21e653 --- /dev/null +++ b/vendor/gix-pack/src/data/input/entries_to_bytes.rs @@ -0,0 +1,155 @@ +use std::iter::Peekable; + +use gix_features::hash; + +use crate::data::input; + +/// An implementation of [`Iterator`] to write [encoded entries][input::Entry] to an inner implementation each time +/// `next()` is called. +/// +/// It is able to deal with an unknown amount of objects as it will rewrite the pack header once the entries iterator +/// is depleted and compute the hash in one go by re-reading the whole file. 
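A sketch of consuming `streaming_iter()` above to count delta versus base entries without touching the memory map (error handling is simplified to a boxed error):

    fn count_entries(pack: &gix_pack::data::File) -> Result<(usize, usize), Box<dyn std::error::Error>> {
        let (mut bases, mut deltas) = (0usize, 0usize);
        for entry in pack.streaming_iter()? {
            let entry = entry?;
            if entry.header.is_delta() {
                deltas += 1;
            } else {
                bases += 1;
            }
        }
        Ok((bases, deltas))
    }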
+pub struct EntriesToBytesIter<I: Iterator, W> { + /// An iterator for input [`input::Entry`] instances + pub input: Peekable<I>, + /// A way of writing encoded bytes. + output: W, + /// Our trailing hash when done writing all input entries + trailer: Option<gix_hash::ObjectId>, + /// The amount of objects in the iteration and the version of the packfile to be written. + /// Will be `None` to signal the header was written already. + data_version: crate::data::Version, + /// The amount of entries seen so far + num_entries: u32, + /// If we are done, no additional writes will occur + is_done: bool, + /// The kind of hash to use for the digest + object_hash: gix_hash::Kind, +} + +impl<I, W> EntriesToBytesIter<I, W> +where + I: Iterator<Item = Result<input::Entry, input::Error>>, + W: std::io::Read + std::io::Write + std::io::Seek, +{ + /// Create a new instance reading [entries][input::Entry] from an `input` iterator and write pack data bytes to + /// `output` writer, resembling a pack of `version`. The amount of entries will be dynamically determined and + /// the pack is completed once the last entry was written. + /// `object_hash` is the kind of hash to use for the pack checksum and maybe other places, depending on the version. + /// + /// # Panics + /// + /// Not all combinations of `object_hash` and `version` are supported currently triggering assertion errors. + pub fn new(input: I, output: W, version: crate::data::Version, object_hash: gix_hash::Kind) -> Self { + assert!( + matches!(version, crate::data::Version::V2), + "currently only pack version 2 can be written", + ); + assert!( + matches!(object_hash, gix_hash::Kind::Sha1), + "currently only Sha1 is supported, right now we don't know how other hashes are encoded", + ); + EntriesToBytesIter { + input: input.peekable(), + output, + object_hash, + num_entries: 0, + trailer: None, + data_version: version, + is_done: false, + } + } + + /// Returns the trailing hash over all ~ entries once done. + /// It's `None` if we are not yet done writing. + pub fn digest(&self) -> Option<gix_hash::ObjectId> { + self.trailer + } + + fn next_inner(&mut self, entry: input::Entry) -> Result<input::Entry, input::Error> { + if self.num_entries == 0 { + let header_bytes = crate::data::header::encode(self.data_version, 0); + self.output.write_all(&header_bytes[..])?; + } + self.num_entries += 1; + entry.header.write_to(entry.decompressed_size, &mut self.output)?; + std::io::copy( + &mut entry + .compressed + .as_deref() + .expect("caller must configure generator to keep compressed bytes"), + &mut self.output, + )?; + Ok(entry) + } + + fn write_header_and_digest(&mut self, last_entry: Option<&mut input::Entry>) -> Result<(), input::Error> { + let header_bytes = crate::data::header::encode(self.data_version, self.num_entries); + let num_bytes_written = if last_entry.is_some() { + self.output.stream_position()? 
+ } else { + header_bytes.len() as u64 + }; + self.output.rewind()?; + self.output.write_all(&header_bytes[..])?; + self.output.flush()?; + + self.output.rewind()?; + let interrupt_never = std::sync::atomic::AtomicBool::new(false); + let digest = hash::bytes( + &mut self.output, + num_bytes_written as usize, + self.object_hash, + &mut gix_features::progress::Discard, + &interrupt_never, + )?; + self.output.write_all(digest.as_slice())?; + self.output.flush()?; + + self.is_done = true; + if let Some(last_entry) = last_entry { + last_entry.trailer = Some(digest); + } + self.trailer = Some(digest); + Ok(()) + } +} + +impl<I, W> Iterator for EntriesToBytesIter<I, W> +where + I: Iterator<Item = Result<input::Entry, input::Error>>, + W: std::io::Read + std::io::Write + std::io::Seek, +{ + /// The amount of bytes written to `out` if `Ok` or the error `E` received from the input. + type Item = Result<input::Entry, input::Error>; + + fn next(&mut self) -> Option<Self::Item> { + if self.is_done { + return None; + } + + match self.input.next() { + Some(res) => Some(match res { + Ok(entry) => self.next_inner(entry).and_then(|mut entry| { + if self.input.peek().is_none() { + self.write_header_and_digest(Some(&mut entry)).map(|_| entry) + } else { + Ok(entry) + } + }), + Err(err) => { + self.is_done = true; + Err(err) + } + }), + None => match self.write_header_and_digest(None) { + Ok(_) => None, + Err(err) => Some(Err(err)), + }, + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.input.size_hint() + } +} diff --git a/vendor/gix-pack/src/data/input/entry.rs b/vendor/gix-pack/src/data/input/entry.rs new file mode 100644 index 000000000..74d4800a0 --- /dev/null +++ b/vendor/gix-pack/src/data/input/entry.rs @@ -0,0 +1,65 @@ +use std::io::Write; + +use crate::data::{entry::Header, input}; + +impl input::Entry { + /// Create a new input entry from a given data `obj` set to be placed at the given `pack_offset`. + /// + /// This method is useful when arbitrary base entries are created + pub fn from_data_obj(obj: &gix_object::Data<'_>, pack_offset: u64) -> Result<Self, input::Error> { + let header = to_header(obj.kind); + let compressed = compress_data(obj)?; + let compressed_size = compressed.len() as u64; + let mut entry = input::Entry { + header, + header_size: header.size(obj.data.len() as u64) as u16, + pack_offset, + compressed: Some(compressed), + compressed_size, + crc32: None, + decompressed_size: obj.data.len() as u64, + trailer: None, + }; + entry.crc32 = Some(entry.compute_crc32()); + Ok(entry) + } + /// The amount of bytes this entry may consume in a pack data file + pub fn bytes_in_pack(&self) -> u64 { + self.header_size as u64 + self.compressed_size + } + + /// Update our CRC value by recalculating it from our header and compressed data. 
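A sketch of chaining the two iterators above to re-write a pack file, yielding a fresh trailer hash; file names are placeholders and the `data::input` re-exports are assumed to follow this crate's module layout:

    fn copy_pack(src: &std::path::Path, dst: &std::path::Path) -> Result<gix_hash::ObjectId, Box<dyn std::error::Error>> {
        use gix_pack::data::input;

        let entries = input::BytesToEntriesIter::new_from_header(
            std::io::BufReader::new(std::fs::File::open(src)?),
            input::Mode::Verify,
            input::EntryDataMode::KeepAndCrc32, // compressed bytes must be kept for re-writing
            gix_hash::Kind::Sha1,
        )?;
        let output = std::fs::OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(dst)?;
        let mut writer = input::EntriesToBytesIter::new(entries, output, gix_pack::data::Version::V2, gix_hash::Kind::Sha1);
        for entry in &mut writer {
            entry?;
        }
        Ok(writer.digest().expect("the digest is set once the input iterator is exhausted"))
    }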
+ pub fn compute_crc32(&self) -> u32 { + let mut header_buf = [0u8; 12 + gix_hash::Kind::longest().len_in_bytes()]; + let header_len = self + .header + .write_to(self.decompressed_size, header_buf.as_mut()) + .expect("write to memory will not fail"); + let state = gix_features::hash::crc32_update(0, &header_buf[..header_len]); + gix_features::hash::crc32_update(state, self.compressed.as_ref().expect("we always set it")) + } +} + +fn to_header(kind: gix_object::Kind) -> Header { + use gix_object::Kind::*; + match kind { + Tree => Header::Tree, + Blob => Header::Blob, + Commit => Header::Commit, + Tag => Header::Tag, + } +} + +fn compress_data(obj: &gix_object::Data<'_>) -> Result<Vec<u8>, input::Error> { + let mut out = gix_features::zlib::stream::deflate::Write::new(Vec::new()); + if let Err(err) = std::io::copy(&mut &*obj.data, &mut out) { + match err.kind() { + std::io::ErrorKind::Other => return Err(input::Error::Io(err)), + err => { + unreachable!("Should never see other errors than zlib, but got {:?}", err,) + } + } + }; + out.flush().expect("zlib flush should never fail"); + Ok(out.into_inner()) +} diff --git a/vendor/gix-pack/src/data/input/lookup_ref_delta_objects.rs b/vendor/gix-pack/src/data/input/lookup_ref_delta_objects.rs new file mode 100644 index 000000000..f52c645f8 --- /dev/null +++ b/vendor/gix-pack/src/data/input/lookup_ref_delta_objects.rs @@ -0,0 +1,211 @@ +use std::convert::TryInto; + +use gix_hash::ObjectId; + +use crate::data::{entry::Header, input}; + +/// An iterator to resolve thin packs on the fly. +pub struct LookupRefDeltaObjectsIter<I, LFn> { + /// The inner iterator whose entries we will resolve. + pub inner: I, + lookup: LFn, + /// The cached delta to provide next time we are called, it's the delta to go with the base we just resolved in its place. + next_delta: Option<input::Entry>, + /// Fuse to stop iteration after first missing object. + error: bool, + /// The overall pack-offset we accumulated thus far. Each inserted entry offsets all following + /// objects by its length. We need to determine exactly where the object was inserted to see if its affected at all. + inserted_entry_length_at_offset: Vec<Change>, + /// The sum of all entries added so far, as a cache to avoid recomputation + inserted_entries_length_in_bytes: i64, + buf: Vec<u8>, +} + +impl<I, LFn> LookupRefDeltaObjectsIter<I, LFn> +where + I: Iterator<Item = Result<input::Entry, input::Error>>, + LFn: for<'a> FnMut(ObjectId, &'a mut Vec<u8>) -> Option<gix_object::Data<'a>>, +{ + /// Create a new instance wrapping `iter` and using `lookup` as function to retrieve objects that will serve as bases + /// for ref deltas seen while traversing `iter`. + pub fn new(iter: I, lookup: LFn) -> Self { + LookupRefDeltaObjectsIter { + inner: iter, + lookup, + error: false, + inserted_entry_length_at_offset: Vec::new(), + inserted_entries_length_in_bytes: 0, + next_delta: None, + buf: Vec::new(), + } + } + + fn shifted_pack_offset(&self, pack_offset: u64) -> u64 { + let new_ofs = pack_offset as i64 + self.inserted_entries_length_in_bytes; + new_ofs.try_into().expect("offset value is never becomes negative") + } + + /// positive `size_change` values mean an object grew or was more commonly, was inserted. Negative values + /// mean the object shrunk, usually because there header changed from ref-deltas to ofs deltas. 
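A sketch of wrapping an entries iterator for thin-pack resolution as provided above: ref-delta bases are looked up via the supplied function; this stub finds nothing, so any ref delta would surface as `input::Error::NotFound`, while a real lookup would consult an object database (the re-export path is assumed):

    fn no_lookup<'a>(_id: gix_hash::ObjectId, _buf: &'a mut Vec<u8>) -> Option<gix_object::Data<'a>> {
        None // a real implementation would fetch the base object and return its data
    }

    fn resolve_thin_pack<I>(
        entries: I,
    ) -> impl Iterator<Item = Result<gix_pack::data::input::Entry, gix_pack::data::input::Error>>
    where
        I: Iterator<Item = Result<gix_pack::data::input::Entry, gix_pack::data::input::Error>>,
    {
        gix_pack::data::input::LookupRefDeltaObjectsIter::new(entries, no_lookup)
    }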
+ fn track_change( + &mut self, + shifted_pack_offset: u64, + pack_offset: u64, + size_change: i64, + oid: impl Into<Option<ObjectId>>, + ) { + if size_change == 0 { + return; + } + self.inserted_entry_length_at_offset.push(Change { + shifted_pack_offset, + pack_offset, + size_change_in_bytes: size_change, + oid: oid.into().unwrap_or_else(|| + // NOTE: this value acts as sentinel and the actual hash kind doesn't matter. + gix_hash::Kind::Sha1.null()), + }); + self.inserted_entries_length_in_bytes += size_change; + } + + fn shift_entry_and_point_to_base_by_offset(&mut self, entry: &mut input::Entry, base_distance: u64) { + let pack_offset = entry.pack_offset; + entry.pack_offset = self.shifted_pack_offset(pack_offset); + entry.header = Header::OfsDelta { base_distance }; + let previous_header_size = entry.header_size; + entry.header_size = entry.header.size(entry.decompressed_size) as u16; + + let change = entry.header_size as i64 - previous_header_size as i64; + entry.crc32 = Some(entry.compute_crc32()); + self.track_change(entry.pack_offset, pack_offset, change, None); + } +} + +impl<I, LFn> Iterator for LookupRefDeltaObjectsIter<I, LFn> +where + I: Iterator<Item = Result<input::Entry, input::Error>>, + LFn: for<'a> FnMut(ObjectId, &'a mut Vec<u8>) -> Option<gix_object::Data<'a>>, +{ + type Item = Result<input::Entry, input::Error>; + + fn next(&mut self) -> Option<Self::Item> { + if self.error { + return None; + } + if let Some(delta) = self.next_delta.take() { + return Some(Ok(delta)); + } + match self.inner.next() { + Some(Ok(mut entry)) => match entry.header { + Header::RefDelta { base_id } => { + match self.inserted_entry_length_at_offset.iter().rfind(|e| e.oid == base_id) { + None => { + let base_entry = match (self.lookup)(base_id, &mut self.buf) { + Some(obj) => { + let current_pack_offset = entry.pack_offset; + let mut entry = match input::Entry::from_data_obj(&obj, 0) { + Ok(e) => e, + Err(err) => return Some(Err(err)), + }; + entry.pack_offset = self.shifted_pack_offset(current_pack_offset); + self.track_change( + entry.pack_offset, + current_pack_offset, + entry.bytes_in_pack() as i64, + base_id, + ); + entry + } + None => { + self.error = true; + return Some(Err(input::Error::NotFound { object_id: base_id })); + } + }; + + { + self.shift_entry_and_point_to_base_by_offset(&mut entry, base_entry.bytes_in_pack()); + self.next_delta = Some(entry); + } + Some(Ok(base_entry)) + } + Some(base_entry) => { + let base_distance = + self.shifted_pack_offset(entry.pack_offset) - base_entry.shifted_pack_offset; + self.shift_entry_and_point_to_base_by_offset(&mut entry, base_distance); + Some(Ok(entry)) + } + } + } + _ => { + if self.inserted_entries_length_in_bytes != 0 { + if let Header::OfsDelta { base_distance } = entry.header { + // We have to find the new distance based on the previous distance to the base, using the absolute + // pack offset computed from it as stored in `base_pack_offset`. 
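+ // `inserted_entry_length_at_offset` is ordered by original pack offset, so the binary search either finds
+ // a change recorded at the base's own offset, or the insertion point past it; in the latter case the summed
+ // size changes recorded between the base and this entry are added to the original distance.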
+ let base_pack_offset = entry + .pack_offset + .checked_sub(base_distance) + .expect("distance to be in range of pack"); + match self + .inserted_entry_length_at_offset + .binary_search_by_key(&base_pack_offset, |c| c.pack_offset) + { + Ok(index) => { + let index = { + let maybe_index_of_actual_entry = index + 1; + self.inserted_entry_length_at_offset + .get(maybe_index_of_actual_entry) + .and_then(|c| { + (c.pack_offset == base_pack_offset) + .then_some(maybe_index_of_actual_entry) + }) + .unwrap_or(index) + }; + let new_distance = self + .shifted_pack_offset(entry.pack_offset) + .checked_sub(self.inserted_entry_length_at_offset[index].shifted_pack_offset) + .expect("a base that is behind us in the pack"); + self.shift_entry_and_point_to_base_by_offset(&mut entry, new_distance); + } + Err(index) => { + let change_since_offset = self.inserted_entry_length_at_offset[index..] + .iter() + .map(|c| c.size_change_in_bytes) + .sum::<i64>(); + let new_distance: u64 = { + (base_distance as i64 + change_since_offset) + .try_into() + .expect("it still points behind us") + }; + self.shift_entry_and_point_to_base_by_offset(&mut entry, new_distance); + } + } + } else { + // Offset this entry by all changes (positive or negative) that we saw thus far. + entry.pack_offset = self.shifted_pack_offset(entry.pack_offset); + } + } + Some(Ok(entry)) + } + }, + other => other, + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let (min, max) = self.inner.size_hint(); + max.map(|max| (min, Some(max * 2))).unwrap_or_else(|| (min * 2, None)) + } +} + +#[derive(Debug)] +struct Change { + /// The original pack offset as mentioned in the entry we saw. This is used to find this as base object if deltas refer to it by + /// old offset. + pack_offset: u64, + /// The new pack offset that is the shifted location of the pack entry in the pack. + shifted_pack_offset: u64, + /// The size change of the entry header, negative values denote shrinking, positive denote growing. + size_change_in_bytes: i64, + /// The object id of the entry responsible for the change, or null if it's an entry just for tracking an insertion. + oid: ObjectId, +} diff --git a/vendor/gix-pack/src/data/input/mod.rs b/vendor/gix-pack/src/data/input/mod.rs new file mode 100644 index 000000000..df191de67 --- /dev/null +++ b/vendor/gix-pack/src/data/input/mod.rs @@ -0,0 +1,41 @@ +/// An item of the iteration produced by [`BytesToEntriesIter`] +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Entry { + /// The header of a pack entry + pub header: crate::data::entry::Header, + /// The amount of bytes used to encode the `header`. `pack_offset + header_size` is the beginning of + /// the compressed data in the pack. + pub header_size: u16, + /// The first byte of the entry at which the `header` can be read. + pub pack_offset: u64, + /// The bytes consumed while producing `decompressed` + /// These do not contain the header, which makes it possible to easily replace a RefDelta with offset deltas + /// when resolving thin packs. + /// Depends on `CompressionMode` when the iterator is initialized. + pub compressed: Option<Vec<u8>>, + /// The amount of bytes the compressed portion of the entry takes, i.e. the portion behind behind the header. + pub compressed_size: u64, + /// The CRC32 over the complete entry, that is encoded header and compressed object data. 
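+ /// This matches the per-entry CRC32 that version 2 pack index files store.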
+ /// Depends on `CompressionMode` when the iterator is initialized + pub crc32: Option<u32>, + /// The amount of decompressed bytes of the entry. + pub decompressed_size: u64, + /// Set for the last object in the iteration, providing the hash over all bytes of the iteration + /// for use as trailer in a pack or to verify it matches the trailer. + pub trailer: Option<gix_hash::ObjectId>, +} + +mod entry; + +mod types; +pub use types::{EntryDataMode, Error, Mode}; + +mod bytes_to_entries; +pub use bytes_to_entries::BytesToEntriesIter; + +mod lookup_ref_delta_objects; +pub use lookup_ref_delta_objects::LookupRefDeltaObjectsIter; + +mod entries_to_bytes; +pub use entries_to_bytes::EntriesToBytesIter; diff --git a/vendor/gix-pack/src/data/input/types.rs b/vendor/gix-pack/src/data/input/types.rs new file mode 100644 index 000000000..6fcd459e2 --- /dev/null +++ b/vendor/gix-pack/src/data/input/types.rs @@ -0,0 +1,73 @@ +use std::io; + +/// Returned by [`BytesToEntriesIter::new_from_header()`][crate::data::input::BytesToEntriesIter::new_from_header()] and as part +/// of `Item` of [`BytesToEntriesIter`][crate::data::input::BytesToEntriesIter]. +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("An IO operation failed while streaming an entry")] + Io(#[from] io::Error), + #[error(transparent)] + PackParse(#[from] crate::data::header::decode::Error), + #[error("pack checksum in trailer was {expected}, but actual checksum was {actual}")] + ChecksumMismatch { + expected: gix_hash::ObjectId, + actual: gix_hash::ObjectId, + }, + #[error("pack is incomplete: it was decompressed into {actual} bytes but {expected} bytes where expected.")] + IncompletePack { actual: u64, expected: u64 }, + #[error("The object {object_id} could not be decoded or wasn't found")] + NotFound { object_id: gix_hash::ObjectId }, +} + +/// Iteration Mode +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum Mode { + /// Provide the trailer as read from the pack + AsIs, + /// Generate an own hash and trigger an error on the last iterated object + /// if it does not match the hash provided with the pack. + /// + /// This way the one iterating the data cannot miss corruption as long as + /// the iteration is continued through to the end. + Verify, + /// Generate an own hash and if there was an error or the objects are depleted early + /// due to partial packs, return the last valid entry and with our own hash thus far. + /// Note that the existing pack hash, if present, will be ignored. + /// As we won't know which objects fails, every object will have the hash obtained thus far. + /// This also means that algorithms must know about this possibility, or else might wrongfully + /// assume the pack is finished. 
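+ /// This mode is useful for recovering as many objects as possible from a truncated or partially transferred pack.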
+ Restore, +} + +/// Define what to do with the compressed bytes portion of a pack [`Entry`][super::Entry] +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum EntryDataMode { + /// Do nothing with the compressed bytes we read + Ignore, + /// Only create a CRC32 of the entry, otherwise similar to `Ignore` + Crc32, + /// Keep them and pass them along in a newly allocated buffer + Keep, + /// As above, but also compute a CRC32 + KeepAndCrc32, +} + +impl EntryDataMode { + /// Returns true if a crc32 should be computed + pub fn crc32(&self) -> bool { + match self { + EntryDataMode::KeepAndCrc32 | EntryDataMode::Crc32 => true, + EntryDataMode::Keep | EntryDataMode::Ignore => false, + } + } + /// Returns true if compressed bytes should be kept + pub fn keep(&self) -> bool { + match self { + EntryDataMode::Keep | EntryDataMode::KeepAndCrc32 => true, + EntryDataMode::Ignore | EntryDataMode::Crc32 => false, + } + } +} diff --git a/vendor/gix-pack/src/data/mod.rs b/vendor/gix-pack/src/data/mod.rs new file mode 100644 index 000000000..da717fc1a --- /dev/null +++ b/vendor/gix-pack/src/data/mod.rs @@ -0,0 +1,134 @@ +//! a pack data file +use std::{convert::TryInto, path::Path}; + +/// The offset to an entry into the pack data file, relative to its beginning. +pub type Offset = u64; + +/// An identifier to uniquely identify all packs loaded within a known context or namespace. +pub type Id = u32; + +use memmap2::Mmap; + +/// An representing an full- or delta-object within a pack +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Entry { + /// The entry's header + pub header: entry::Header, + /// The decompressed size of the entry in bytes. + /// + /// Note that for non-delta entries this will be the size of the object itself. + pub decompressed_size: u64, + /// absolute offset to compressed object data in the pack, just behind the entry's header + pub data_offset: Offset, +} + +mod file; +pub use file::{decode, verify, Header}; +/// +pub mod header; + +/// +pub mod init { + pub use super::header::decode::Error; +} + +/// +pub mod entry; + +/// +pub mod input; + +/// Utilities to encode pack data entries and write them to a `Write` implementation to resemble a pack data file. +pub mod output; + +/// A slice into a pack file denoting a pack entry. +/// +/// An entry can be decoded into an object. +pub type EntryRange = std::ops::Range<Offset>; + +/// Supported versions of a pack data file +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +#[allow(missing_docs)] +pub enum Version { + V2, + V3, +} + +impl Default for Version { + fn default() -> Self { + Version::V2 + } +} + +/// A pack data file +pub struct File { + data: Mmap, + path: std::path::PathBuf, + /// A value to represent this pack uniquely when used with cache lookup, or a way to identify this pack by its location on disk. + /// The same location on disk should yield the same id. + /// + /// These must be unique per pack and must be stable, that is they don't change if the pack doesn't change. + /// If the same id is assigned (or reassigned) to different packs, pack creation or cache access will fail in hard-to-debug ways. 
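+ /// Decode caches typically key their entries by `(pack id, pack offset)`, which is why this value must be unique and stable.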
+ /// + /// This value is controlled by the owning object store, which can use it in whichever way it wants as long as the above constraints are met. + pub id: Id, + version: Version, + num_objects: u32, + /// The size of the hash contained within. This is entirely determined by the caller, and repositories have to know which hash to use + /// based on their configuration. + hash_len: usize, + object_hash: gix_hash::Kind, +} + +/// Information about the pack data file itself +impl File { + /// The pack data version of this file + pub fn version(&self) -> Version { + self.version + } + /// The number of objects stored in this pack data file + pub fn num_objects(&self) -> u32 { + self.num_objects + } + /// The length of all mapped data, including the pack header and the pack trailer + pub fn data_len(&self) -> usize { + self.data.len() + } + /// The kind of hash we use internally. + pub fn object_hash(&self) -> gix_hash::Kind { + self.object_hash + } + /// The position of the byte one past the last pack entry, or in other terms, the first byte of the trailing hash. + pub fn pack_end(&self) -> usize { + self.data.len() - self.hash_len + } + + /// The path to the pack data file on disk + pub fn path(&self) -> &Path { + &self.path + } + + /// Returns the pack data at the given slice if its range is contained in the mapped pack data + pub fn entry_slice(&self, slice: EntryRange) -> Option<&[u8]> { + let entry_end: usize = slice.end.try_into().expect("end of pack fits into usize"); + let entry_start = slice.start as usize; + self.data.get(entry_start..entry_end) + } + + /// Returns the CRC32 of the pack data indicated by `pack_offset` and the `size` of the mapped data. + /// + /// _Note:_ finding the right size is only possible by decompressing + /// the pack entry beforehand, or by using the (to be sorted) offsets stored in an index file. + /// + /// # Panics + /// + /// If `pack_offset` or `size` are pointing to a range outside of the mapped pack data. + pub fn entry_crc32(&self, pack_offset: Offset, size: usize) -> u32 { + let pack_offset: usize = pack_offset.try_into().expect("pack_size fits into usize"); + gix_features::hash::crc32(&self.data[pack_offset..pack_offset + size]) + } +} + +pub(crate) mod delta; diff --git a/vendor/gix-pack/src/data/output/bytes.rs b/vendor/gix-pack/src/data/output/bytes.rs new file mode 100644 index 000000000..ec219db9d --- /dev/null +++ b/vendor/gix-pack/src/data/output/bytes.rs @@ -0,0 +1,156 @@ +use std::io::Write; + +use gix_features::hash; + +use crate::data::output; + +/// The error returned by `next()` in the [`FromEntriesIter`] iterator. +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error<E> +where + E: std::error::Error + 'static, +{ + #[error(transparent)] + Io(#[from] std::io::Error), + #[error(transparent)] + Input(E), +} + +/// An implementation of [`Iterator`] to write [encoded entries][output::Entry] to an inner implementation each time +/// `next()` is called. +pub struct FromEntriesIter<I, W> { + /// An iterator for input [`output::Entry`] instances + pub input: I, + /// A way of writing encoded bytes. + output: hash::Write<W>, + /// Our trailing hash when done writing all input entries + trailer: Option<gix_hash::ObjectId>, + /// The amount of objects in the iteration and the version of the packfile to be written. + /// Will be `None` to signal the header was written already. + header_info: Option<(crate::data::Version, u32)>, + /// The pack data version with which pack entries should be written. 
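+ /// Currently this is always `Version::V2`, as asserted in `new()`.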
+ entry_version: crate::data::Version, + /// The amount of written bytes thus far + written: u64, + /// Required to quickly find offsets by object IDs, as future objects may refer to those in the past to become a delta offset base. + /// It stores the pack offsets at which objects begin. + /// Additionally we store if an object was invalid, and if so we will not write it nor will we allow delta objects to it. + pack_offsets_and_validity: Vec<(u64, bool)>, + /// If we are done, no additional writes will occur + is_done: bool, +} + +impl<I, W, E> FromEntriesIter<I, W> +where + I: Iterator<Item = Result<Vec<output::Entry>, E>>, + W: std::io::Write, + E: std::error::Error + 'static, +{ + /// Create a new instance reading [entries][output::Entry] from an `input` iterator and write pack data bytes to + /// `output` writer, resembling a pack of `version` with exactly `num_entries` amount of objects contained in it. + /// `object_hash` is the kind of hash to use for the pack checksum and maybe other places, depending on the version. + /// + /// The input chunks are expected to be sorted already. You can use the [InOrderIter][gix_features::parallel::InOrderIter] to assure + /// this happens on the fly holding entire chunks in memory as long as needed for them to be dispensed in order. + /// + /// # Panics + /// + /// Not all combinations of `object_hash` and `version` are supported currently triggering assertion errors. + pub fn new( + input: I, + output: W, + num_entries: u32, + version: crate::data::Version, + object_hash: gix_hash::Kind, + ) -> Self { + assert!( + matches!(version, crate::data::Version::V2), + "currently only pack version 2 can be written", + ); + FromEntriesIter { + input, + output: hash::Write::new(output, object_hash), + trailer: None, + entry_version: version, + pack_offsets_and_validity: Vec::with_capacity(num_entries as usize), + written: 0, + header_info: Some((version, num_entries)), + is_done: false, + } + } + + /// Consume this instance and return the `output` implementation. + /// + /// _Note_ that the `input` iterator can be moved out of this instance beforehand. + pub fn into_write(self) -> W { + self.output.inner + } + + /// Returns the trailing hash over all written entries once done. + /// It's `None` if we are not yet done writing. + pub fn digest(&self) -> Option<gix_hash::ObjectId> { + self.trailer + } + + fn next_inner(&mut self) -> Result<u64, Error<E>> { + let previous_written = self.written; + if let Some((version, num_entries)) = self.header_info.take() { + let header_bytes = crate::data::header::encode(version, num_entries); + self.output.write_all(&header_bytes[..])?; + self.written += header_bytes.len() as u64; + } + match self.input.next() { + Some(entries) => { + for entry in entries.map_err(Error::Input)? { + if entry.is_invalid() { + self.pack_offsets_and_validity.push((0, false)); + continue; + }; + self.pack_offsets_and_validity.push((self.written, true)); + let header = entry.to_entry_header(self.entry_version, |index| { + let (base_offset, is_valid_object) = self.pack_offsets_and_validity[index]; + if !is_valid_object { + unreachable!("if you see this the object database is correct as a delta refers to a non-existing object") + } + self.written - base_offset + }); + self.written += header.write_to(entry.decompressed_size as u64, &mut self.output)? 
as u64; + self.written += std::io::copy(&mut &*entry.compressed_data, &mut self.output)?; + } + } + None => { + let digest = self.output.hash.clone().digest(); + self.output.inner.write_all(&digest[..])?; + self.written += digest.len() as u64; + self.output.inner.flush()?; + self.is_done = true; + self.trailer = Some(gix_hash::ObjectId::from(digest)); + } + }; + Ok(self.written - previous_written) + } +} + +impl<I, W, E> Iterator for FromEntriesIter<I, W> +where + I: Iterator<Item = Result<Vec<output::Entry>, E>>, + W: std::io::Write, + E: std::error::Error + 'static, +{ + /// The amount of bytes written to `out` if `Ok` or the error `E` received from the input. + type Item = Result<u64, Error<E>>; + + fn next(&mut self) -> Option<Self::Item> { + if self.is_done { + return None; + } + Some(match self.next_inner() { + Err(err) => { + self.is_done = true; + Err(err) + } + Ok(written) => Ok(written), + }) + } +} diff --git a/vendor/gix-pack/src/data/output/count/mod.rs b/vendor/gix-pack/src/data/output/count/mod.rs new file mode 100644 index 000000000..e7ee767de --- /dev/null +++ b/vendor/gix-pack/src/data/output/count/mod.rs @@ -0,0 +1,49 @@ +use gix_hash::ObjectId; + +use crate::data::output::Count; + +/// Specifies how the pack location was handled during counting +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum PackLocation { + /// We did not lookup this object + NotLookedUp, + /// The object was looked up and there may be a location in a pack, along with entry information + LookedUp(Option<crate::data::entry::Location>), +} + +impl PackLocation { + /// Directly go through to LookedUp variant, panic otherwise + pub fn is_none(&self) -> bool { + match self { + PackLocation::LookedUp(opt) => opt.is_none(), + PackLocation::NotLookedUp => unreachable!("must have been resolved"), + } + } + /// Directly go through to LookedUp variant, panic otherwise + pub fn as_ref(&self) -> Option<&crate::data::entry::Location> { + match self { + PackLocation::LookedUp(opt) => opt.as_ref(), + PackLocation::NotLookedUp => unreachable!("must have been resolved"), + } + } +} + +impl Count { + /// Create a new instance from the given `oid` and its corresponding git `obj`ect data. + pub fn from_data(oid: impl Into<ObjectId>, location: Option<crate::data::entry::Location>) -> Self { + Count { + id: oid.into(), + entry_pack_location: PackLocation::LookedUp(location), + } + } +} + +#[path = "objects/mod.rs"] +mod objects_impl; +pub use objects_impl::{objects, objects_unthreaded}; + +/// +pub mod objects { + pub use super::objects_impl::{Error, ObjectExpansion, Options, Outcome, Result}; +} diff --git a/vendor/gix-pack/src/data/output/count/objects/mod.rs b/vendor/gix-pack/src/data/output/count/objects/mod.rs new file mode 100644 index 000000000..d56bc9a5f --- /dev/null +++ b/vendor/gix-pack/src/data/output/count/objects/mod.rs @@ -0,0 +1,405 @@ +use std::{ + cell::RefCell, + sync::{atomic::AtomicBool, Arc}, +}; + +use gix_features::{parallel, progress::Progress}; +use gix_hash::ObjectId; + +use crate::{data::output, find}; + +pub(in crate::data::output::count::objects_impl) mod reduce; +mod util; + +mod types; +pub use types::{Error, ObjectExpansion, Options, Outcome}; + +mod tree; + +/// The return type used by [`objects()`]. 
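+/// On success it carries the produced counts along with statistics describing how they were obtained.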
+pub type Result<E1, E2> = std::result::Result<(Vec<output::Count>, Outcome), Error<E1, E2>>; + +/// Generate [`Count`][output::Count]s from input `objects` with object expansion based on [`options`][Options] +/// to learn which objects would would constitute a pack. This step is required to know exactly how many objects would +/// be in a pack while keeping data around to avoid minimize object database access. +/// +/// A [`Count`][output::Count] object maintains enough state to greatly accelerate future access of packed objects. +/// +/// * `db` - the object store to use for accessing objects. +/// * `objects_ids` +/// * A list of objects ids to add to the pack. Duplication checks are performed so no object is ever added to a pack twice. +/// * Objects may be expanded based on the provided [`options`][Options] +/// * `progress` +/// * a way to obtain progress information +/// * `should_interrupt` +/// * A flag that is set to true if the operation should stop +/// * `options` +/// * more configuration +pub fn objects<Find, Iter, IterErr, Oid>( + db: Find, + objects_ids: Iter, + progress: impl Progress, + should_interrupt: &AtomicBool, + Options { + thread_limit, + input_object_expansion, + chunk_size, + }: Options, +) -> Result<find::existing::Error<Find::Error>, IterErr> +where + Find: crate::Find + Send + Clone, + <Find as crate::Find>::Error: Send, + Iter: Iterator<Item = std::result::Result<Oid, IterErr>> + Send, + Oid: Into<ObjectId> + Send, + IterErr: std::error::Error + Send, +{ + let lower_bound = objects_ids.size_hint().0; + let (chunk_size, thread_limit, _) = parallel::optimize_chunk_size_and_thread_limit( + chunk_size, + if lower_bound == 0 { None } else { Some(lower_bound) }, + thread_limit, + None, + ); + let chunks = gix_features::iter::Chunks { + inner: objects_ids, + size: chunk_size, + }; + let seen_objs = gix_hashtable::sync::ObjectIdMap::default(); + let progress = Arc::new(parking_lot::Mutex::new(progress)); + + parallel::in_parallel( + chunks, + thread_limit, + { + let progress = Arc::clone(&progress); + move |n| { + ( + Vec::new(), // object data buffer + Vec::new(), // object data buffer 2 to hold two objects at a time + { + let mut p = progress + .lock() + .add_child_with_id(format!("thread {n}"), gix_features::progress::UNKNOWN); + p.init(None, gix_features::progress::count("objects")); + p + }, + ) + } + }, + { + let seen_objs = &seen_objs; + move |oids: Vec<std::result::Result<Oid, IterErr>>, (buf1, buf2, progress)| { + expand::this( + &db, + input_object_expansion, + seen_objs, + oids, + buf1, + buf2, + progress, + should_interrupt, + true, /*allow pack lookups*/ + ) + } + }, + reduce::Statistics::new(progress), + ) +} + +/// Like [`objects()`] but using a single thread only to mostly save on the otherwise required overhead. 
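+/// Its output is deterministic, which the threaded variant only guarantees when limited to a single thread.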
+pub fn objects_unthreaded<Find, IterErr, Oid>( + db: Find, + object_ids: impl Iterator<Item = std::result::Result<Oid, IterErr>>, + mut progress: impl Progress, + should_interrupt: &AtomicBool, + input_object_expansion: ObjectExpansion, +) -> Result<find::existing::Error<Find::Error>, IterErr> +where + Find: crate::Find, + Oid: Into<ObjectId>, + IterErr: std::error::Error, +{ + let seen_objs = RefCell::new(gix_hashtable::HashSet::default()); + + let (mut buf1, mut buf2) = (Vec::new(), Vec::new()); + expand::this( + &db, + input_object_expansion, + &seen_objs, + object_ids, + &mut buf1, + &mut buf2, + &mut progress, + should_interrupt, + false, /*allow pack lookups*/ + ) +} + +mod expand { + use std::sync::atomic::{AtomicBool, Ordering}; + + use gix_features::progress::Progress; + use gix_hash::{oid, ObjectId}; + use gix_object::{CommitRefIter, TagRefIter}; + + use super::{ + tree, + types::{Error, ObjectExpansion, Outcome}, + util, + }; + use crate::{ + data::{output, output::count::PackLocation}, + find, FindExt, + }; + + #[allow(clippy::too_many_arguments)] + pub fn this<Find, IterErr, Oid>( + db: &Find, + input_object_expansion: ObjectExpansion, + seen_objs: &impl util::InsertImmutable, + oids: impl IntoIterator<Item = std::result::Result<Oid, IterErr>>, + buf1: &mut Vec<u8>, + #[allow(clippy::ptr_arg)] buf2: &mut Vec<u8>, + progress: &mut impl Progress, + should_interrupt: &AtomicBool, + allow_pack_lookups: bool, + ) -> super::Result<find::existing::Error<Find::Error>, IterErr> + where + Find: crate::Find, + Oid: Into<ObjectId>, + IterErr: std::error::Error, + { + use ObjectExpansion::*; + + let mut out = Vec::new(); + let mut tree_traversal_state = gix_traverse::tree::breadthfirst::State::default(); + let mut tree_diff_state = gix_diff::tree::State::default(); + let mut parent_commit_ids = Vec::new(); + let mut traverse_delegate = tree::traverse::AllUnseen::new(seen_objs); + let mut changes_delegate = tree::changes::AllNew::new(seen_objs); + let mut outcome = Outcome::default(); + + let stats = &mut outcome; + for id in oids.into_iter() { + if should_interrupt.load(Ordering::Relaxed) { + return Err(Error::Interrupted); + } + + let id = id.map(|oid| oid.into()).map_err(Error::InputIteration)?; + let (obj, location) = db.find(id, buf1)?; + stats.input_objects += 1; + match input_object_expansion { + TreeAdditionsComparedToAncestor => { + use gix_object::Kind::*; + let mut obj = obj; + let mut location = location; + let mut id = id.to_owned(); + + loop { + push_obj_count_unique(&mut out, seen_objs, &id, location, progress, stats, false); + match obj.kind { + Tree | Blob => break, + Tag => { + id = TagRefIter::from_bytes(obj.data) + .target_id() + .expect("every tag has a target"); + let tmp = db.find(id, buf1)?; + + obj = tmp.0; + location = tmp.1; + + stats.expanded_objects += 1; + continue; + } + Commit => { + let current_tree_iter = { + let mut commit_iter = CommitRefIter::from_bytes(obj.data); + let tree_id = commit_iter.tree_id().expect("every commit has a tree"); + parent_commit_ids.clear(); + for token in commit_iter { + match token { + Ok(gix_object::commit::ref_iter::Token::Parent { id }) => { + parent_commit_ids.push(id) + } + Ok(_) => break, + Err(err) => return Err(Error::CommitDecode(err)), + } + } + let (obj, location) = db.find(tree_id, buf1)?; + push_obj_count_unique( + &mut out, seen_objs, &tree_id, location, progress, stats, true, + ); + gix_object::TreeRefIter::from_bytes(obj.data) + }; + + let objects = if parent_commit_ids.is_empty() { + traverse_delegate.clear(); + 
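+ // Without parent commits there is nothing to diff against: walk the commit's tree breadth-first
+ // and count every object that was not seen before.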
gix_traverse::tree::breadthfirst( + current_tree_iter, + &mut tree_traversal_state, + |oid, buf| { + stats.decoded_objects += 1; + match db.find(oid, buf).ok() { + Some((obj, location)) => { + progress.inc(); + stats.expanded_objects += 1; + out.push(output::Count::from_data(oid, location)); + obj.try_into_tree_iter() + } + None => None, + } + }, + &mut traverse_delegate, + ) + .map_err(Error::TreeTraverse)?; + &traverse_delegate.non_trees + } else { + for commit_id in &parent_commit_ids { + let parent_tree_id = { + let (parent_commit_obj, location) = db.find(commit_id, buf2)?; + + push_obj_count_unique( + &mut out, seen_objs, commit_id, location, progress, stats, true, + ); + CommitRefIter::from_bytes(parent_commit_obj.data) + .tree_id() + .expect("every commit has a tree") + }; + let parent_tree = { + let (parent_tree_obj, location) = db.find(parent_tree_id, buf2)?; + push_obj_count_unique( + &mut out, + seen_objs, + &parent_tree_id, + location, + progress, + stats, + true, + ); + gix_object::TreeRefIter::from_bytes(parent_tree_obj.data) + }; + + changes_delegate.clear(); + gix_diff::tree::Changes::from(Some(parent_tree)) + .needed_to_obtain( + current_tree_iter.clone(), + &mut tree_diff_state, + |oid, buf| { + stats.decoded_objects += 1; + db.find_tree_iter(oid, buf).map(|t| t.0) + }, + &mut changes_delegate, + ) + .map_err(Error::TreeChanges)?; + } + &changes_delegate.objects + }; + for id in objects.iter() { + out.push(id_to_count(db, buf2, id, progress, stats, allow_pack_lookups)); + } + break; + } + } + } + } + TreeContents => { + use gix_object::Kind::*; + let mut id = id; + let mut obj = (obj, location); + loop { + push_obj_count_unique(&mut out, seen_objs, &id, obj.1.clone(), progress, stats, false); + match obj.0.kind { + Tree => { + traverse_delegate.clear(); + gix_traverse::tree::breadthfirst( + gix_object::TreeRefIter::from_bytes(obj.0.data), + &mut tree_traversal_state, + |oid, buf| { + stats.decoded_objects += 1; + match db.find(oid, buf).ok() { + Some((obj, location)) => { + progress.inc(); + stats.expanded_objects += 1; + out.push(output::Count::from_data(oid, location)); + obj.try_into_tree_iter() + } + None => None, + } + }, + &mut traverse_delegate, + ) + .map_err(Error::TreeTraverse)?; + for id in traverse_delegate.non_trees.iter() { + out.push(id_to_count(db, buf1, id, progress, stats, allow_pack_lookups)); + } + break; + } + Commit => { + id = CommitRefIter::from_bytes(obj.0.data) + .tree_id() + .expect("every commit has a tree"); + stats.expanded_objects += 1; + obj = db.find(id, buf1)?; + continue; + } + Blob => break, + Tag => { + id = TagRefIter::from_bytes(obj.0.data) + .target_id() + .expect("every tag has a target"); + stats.expanded_objects += 1; + obj = db.find(id, buf1)?; + continue; + } + } + } + } + AsIs => push_obj_count_unique(&mut out, seen_objs, &id, location, progress, stats, false), + } + } + outcome.total_objects = out.len(); + Ok((out, outcome)) + } + + #[inline] + fn push_obj_count_unique( + out: &mut Vec<output::Count>, + all_seen: &impl util::InsertImmutable, + id: &oid, + location: Option<crate::data::entry::Location>, + progress: &mut impl Progress, + statistics: &mut Outcome, + count_expanded: bool, + ) { + let inserted = all_seen.insert(id.to_owned()); + if inserted { + progress.inc(); + statistics.decoded_objects += 1; + if count_expanded { + statistics.expanded_objects += 1; + } + out.push(output::Count::from_data(id, location)); + } + } + + #[inline] + fn id_to_count<Find: crate::Find>( + db: &Find, + buf: &mut Vec<u8>, + id: &oid, + 
progress: &mut impl Progress, + statistics: &mut Outcome, + allow_pack_lookups: bool, + ) -> output::Count { + progress.inc(); + statistics.expanded_objects += 1; + output::Count { + id: id.to_owned(), + entry_pack_location: if allow_pack_lookups { + PackLocation::LookedUp(db.location_by_oid(id, buf)) + } else { + PackLocation::NotLookedUp + }, + } + } +} diff --git a/vendor/gix-pack/src/data/output/count/objects/reduce.rs b/vendor/gix-pack/src/data/output/count/objects/reduce.rs new file mode 100644 index 000000000..c6a61d467 --- /dev/null +++ b/vendor/gix-pack/src/data/output/count/objects/reduce.rs @@ -0,0 +1,49 @@ +use std::{marker::PhantomData, sync::Arc}; + +use gix_features::{parallel, progress::Progress}; + +use super::Outcome; +use crate::data::output; + +pub struct Statistics<E, P> { + total: Outcome, + counts: Vec<output::Count>, + progress: Arc<parking_lot::Mutex<P>>, + _err: PhantomData<E>, +} + +impl<E, P> Statistics<E, P> +where + P: Progress, +{ + pub fn new(progress: Arc<parking_lot::Mutex<P>>) -> Self { + Statistics { + total: Default::default(), + counts: Default::default(), + progress, + _err: PhantomData::default(), + } + } +} + +impl<E, P> parallel::Reduce for Statistics<E, P> +where + P: Progress, +{ + type Input = Result<(Vec<output::Count>, Outcome), E>; + type FeedProduce = (); + type Output = (Vec<output::Count>, Outcome); + type Error = E; + + fn feed(&mut self, item: Self::Input) -> Result<Self::FeedProduce, Self::Error> { + let (counts, stats) = item?; + self.total.aggregate(stats); + self.progress.lock().inc_by(counts.len()); + self.counts.extend(counts); + Ok(()) + } + + fn finalize(self) -> Result<Self::Output, Self::Error> { + Ok((self.counts, self.total)) + } +} diff --git a/vendor/gix-pack/src/data/output/count/objects/tree.rs b/vendor/gix-pack/src/data/output/count/objects/tree.rs new file mode 100644 index 000000000..d3f4f6b9a --- /dev/null +++ b/vendor/gix-pack/src/data/output/count/objects/tree.rs @@ -0,0 +1,124 @@ +pub mod changes { + use gix_diff::tree::{ + visit::{Action, Change}, + Visit, + }; + use gix_hash::ObjectId; + use gix_object::{bstr::BStr, tree::EntryMode}; + + use crate::data::output::count::objects_impl::util::InsertImmutable; + + pub struct AllNew<'a, H> { + pub objects: Vec<ObjectId>, + all_seen: &'a H, + } + + impl<'a, H> AllNew<'a, H> + where + H: InsertImmutable, + { + pub fn new(all_seen: &'a H) -> Self { + AllNew { + objects: Default::default(), + all_seen, + } + } + pub fn clear(&mut self) { + self.objects.clear(); + } + } + + impl<'a, H> Visit for AllNew<'a, H> + where + H: InsertImmutable, + { + fn pop_front_tracked_path_and_set_current(&mut self) {} + + fn push_back_tracked_path_component(&mut self, _component: &BStr) {} + + fn push_path_component(&mut self, _component: &BStr) {} + + fn pop_path_component(&mut self) {} + + fn visit(&mut self, change: Change) -> Action { + match change { + Change::Addition { oid, entry_mode } | Change::Modification { oid, entry_mode, .. } => { + if entry_mode == EntryMode::Commit { + return Action::Continue; + } + let inserted = self.all_seen.insert(oid); + if inserted { + self.objects.push(oid); + } + } + Change::Deletion { .. 
} => {} + }; + Action::Continue + } + } +} + +pub mod traverse { + use gix_hash::ObjectId; + use gix_object::{ + bstr::BStr, + tree::{EntryMode, EntryRef}, + }; + use gix_traverse::tree::{visit::Action, Visit}; + + use crate::data::output::count::objects_impl::util::InsertImmutable; + + pub struct AllUnseen<'a, H> { + pub non_trees: Vec<ObjectId>, + all_seen: &'a H, + } + + impl<'a, H> AllUnseen<'a, H> + where + H: InsertImmutable, + { + pub fn new(all_seen: &'a H) -> Self { + AllUnseen { + non_trees: Default::default(), + all_seen, + } + } + pub fn clear(&mut self) { + self.non_trees.clear(); + } + } + + impl<'a, H> Visit for AllUnseen<'a, H> + where + H: InsertImmutable, + { + fn pop_front_tracked_path_and_set_current(&mut self) {} + + fn push_back_tracked_path_component(&mut self, _component: &BStr) {} + + fn push_path_component(&mut self, _component: &BStr) {} + + fn pop_path_component(&mut self) {} + + fn visit_tree(&mut self, entry: &EntryRef<'_>) -> Action { + let inserted = self.all_seen.insert(entry.oid.to_owned()); + if inserted { + Action::Continue + } else { + Action::Skip + } + } + + fn visit_nontree(&mut self, entry: &EntryRef<'_>) -> Action { + if entry.mode == EntryMode::Commit { + // links don't have a representation + return Action::Continue; + } + let inserted = self.all_seen.insert(entry.oid.to_owned()); + if inserted { + self.non_trees.push(entry.oid.to_owned()); + } + Action::Continue + } + } +} diff --git a/vendor/gix-pack/src/data/output/count/objects/types.rs b/vendor/gix-pack/src/data/output/count/objects/types.rs new file mode 100644 index 000000000..8c8c939df --- /dev/null +++ b/vendor/gix-pack/src/data/output/count/objects/types.rs @@ -0,0 +1,105 @@ +/// Information gathered during the run of [`iter_from_objects()`][super::objects()]. +#[derive(Default, PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Outcome { + /// The amount of objects provided to start the iteration. + pub input_objects: usize, + /// The amount of objects that have been expanded from the input source. + /// It's desirable to do that as expansion happens on multiple threads, allowing the amount of input objects to be small. + /// `expanded_objects - decoded_objects` is the 'cheap' object we found without decoding the object itself. + pub expanded_objects: usize, + /// The amount of fully decoded objects. These are the most expensive as they are fully decoded + pub decoded_objects: usize, + /// The total amount of encountered objects. Should be `expanded_objects + input_objects`. + pub total_objects: usize, +} + +impl Outcome { + pub(in crate::data::output::count) fn aggregate( + &mut self, + Outcome { + input_objects, + decoded_objects, + expanded_objects, + total_objects, + }: Self, + ) { + self.input_objects += input_objects; + self.decoded_objects += decoded_objects; + self.expanded_objects += expanded_objects; + self.total_objects += total_objects; + } +} + +/// The way input objects are handled +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum ObjectExpansion { + /// Don't do anything with the input objects except for transforming them into pack entries + AsIs, + /// If the input object is a Commit then turn it into a pack entry. 
Additionally obtain its tree, turn it into a pack entry + /// along with all of its contents, that is nested trees, and any other objects reachable from it. + /// Otherwise, the same as [`AsIs`][ObjectExpansion::AsIs]. + /// + /// This mode is useful if all reachable objects should be added, as in cloning a repository. + TreeContents, + /// If the input is a commit, obtain its ancestors and turn them into pack entries. Obtain the ancestor trees along with the commits + /// tree and turn them into pack entries. Finally obtain the added/changed objects when comparing the ancestor trees with the + /// current tree and turn them into entries as well. + /// Otherwise, the same as [`AsIs`][ObjectExpansion::AsIs]. + /// + /// This mode is useful to build a pack containing only new objects compared to a previous state. + TreeAdditionsComparedToAncestor, +} + +impl Default for ObjectExpansion { + fn default() -> Self { + ObjectExpansion::AsIs + } +} + +/// Configuration options for the pack generation functions provided in [this module][crate::data::output]. +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Options { + /// The amount of threads to use at most when resolving the pack. If `None`, all logical cores are used. + /// If more than one thread is used, the order of returned [counts][crate::data::output::Count] is not deterministic anymore + /// especially when tree traversal is involved. Thus deterministic ordering requires `Some(1)` to be set. + pub thread_limit: Option<usize>, + /// The amount of objects per chunk or unit of work to be sent to threads for processing + pub chunk_size: usize, + /// The way input objects are handled + pub input_object_expansion: ObjectExpansion, +} + +impl Default for Options { + fn default() -> Self { + Options { + thread_limit: None, + chunk_size: 10, + input_object_expansion: Default::default(), + } + } +} + +/// The error returned by the pack generation iterator [bytes::FromEntriesIter][crate::data::output::bytes::FromEntriesIter]. 
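+/// It is also the error type produced by `objects()` and `objects_unthreaded()` when counting objects.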
+#[derive(Debug, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error<FindErr, IterErr> +where + FindErr: std::error::Error + 'static, + IterErr: std::error::Error + 'static, +{ + #[error(transparent)] + CommitDecode(gix_object::decode::Error), + #[error(transparent)] + FindExisting(#[from] FindErr), + #[error(transparent)] + InputIteration(IterErr), + #[error(transparent)] + TreeTraverse(gix_traverse::tree::breadthfirst::Error), + #[error(transparent)] + TreeChanges(gix_diff::tree::changes::Error), + #[error("Operation interrupted")] + Interrupted, +} diff --git a/vendor/gix-pack/src/data/output/count/objects/util.rs b/vendor/gix-pack/src/data/output/count/objects/util.rs new file mode 100644 index 000000000..a80841313 --- /dev/null +++ b/vendor/gix-pack/src/data/output/count/objects/util.rs @@ -0,0 +1,24 @@ +pub trait InsertImmutable { + fn insert(&self, id: gix_hash::ObjectId) -> bool; +} + +mod trait_impls { + use gix_hash::ObjectId; + use std::cell::RefCell; + + use gix_hashtable::HashSet; + + use super::InsertImmutable; + + impl InsertImmutable for gix_hashtable::sync::ObjectIdMap<()> { + fn insert(&self, id: ObjectId) -> bool { + self.insert(id, ()).is_none() + } + } + + impl InsertImmutable for RefCell<HashSet<ObjectId>> { + fn insert(&self, item: ObjectId) -> bool { + self.borrow_mut().insert(item) + } + } +} diff --git a/vendor/gix-pack/src/data/output/entry/iter_from_counts.rs b/vendor/gix-pack/src/data/output/entry/iter_from_counts.rs new file mode 100644 index 000000000..25e256d5c --- /dev/null +++ b/vendor/gix-pack/src/data/output/entry/iter_from_counts.rs @@ -0,0 +1,428 @@ +pub(crate) mod function { + use std::{cmp::Ordering, sync::Arc}; + + use gix_features::{parallel, parallel::SequenceId, progress::Progress}; + + use super::{reduce, util, Error, Mode, Options, Outcome, ProgressId}; + use crate::data::output; + + /// Given a known list of object `counts`, calculate entries ready to be put into a data pack. + /// + /// This allows objects to be written quite soon without having to wait for the entire pack to be built in memory. + /// A chunk of objects is held in memory and compressed using DEFLATE, and serve the output of this iterator. + /// That way slow writers will naturally apply back pressure, and communicate to the implementation that more time can be + /// spent compressing objects. + /// + /// * `counts` + /// * A list of previously counted objects to add to the pack. Duplication checks are not performed, no object is expected to be duplicated. + /// * `progress` + /// * a way to obtain progress information + /// * `options` + /// * more configuration + /// + /// _Returns_ the checksum of the pack + /// + /// ## Discussion + /// + /// ### Advantages + /// + /// * Begins writing immediately and supports back-pressure. + /// * Abstract over object databases and how input is provided. + /// + /// ### Disadvantages + /// + /// * ~~currently there is no way to easily write the pack index, even though the state here is uniquely positioned to do + /// so with minimal overhead (especially compared to `gix index-from-pack`)~~ Probably works now by chaining Iterators + /// or keeping enough state to write a pack and then generate an index with recorded data. 
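+ /// The returned iterator yields chunks of entries tagged with a `SequenceId`, allowing consumers to restore
+ /// the input order, for example with `gix_features::parallel::InOrderIter`.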
+ /// + pub fn iter_from_counts<Find>( + mut counts: Vec<output::Count>, + db: Find, + mut progress: impl Progress + 'static, + Options { + version, + mode, + allow_thin_pack, + thread_limit, + chunk_size, + }: Options, + ) -> impl Iterator<Item = Result<(SequenceId, Vec<output::Entry>), Error<Find::Error>>> + + parallel::reduce::Finalize<Reduce = reduce::Statistics<Error<Find::Error>>> + where + Find: crate::Find + Send + Clone + 'static, + <Find as crate::Find>::Error: Send, + { + assert!( + matches!(version, crate::data::Version::V2), + "currently we can only write version 2" + ); + let (chunk_size, thread_limit, _) = + parallel::optimize_chunk_size_and_thread_limit(chunk_size, Some(counts.len()), thread_limit, None); + { + let progress = Arc::new(parking_lot::Mutex::new( + progress.add_child_with_id("resolving", ProgressId::ResolveCounts.into()), + )); + progress.lock().init(None, gix_features::progress::count("counts")); + let enough_counts_present = counts.len() > 4_000; + let start = std::time::Instant::now(); + parallel::in_parallel_if( + || enough_counts_present, + counts.chunks_mut(chunk_size), + thread_limit, + |_n| Vec::<u8>::new(), + { + let progress = Arc::clone(&progress); + let db = db.clone(); + move |chunk, buf| { + let chunk_size = chunk.len(); + for count in chunk { + use crate::data::output::count::PackLocation::*; + match count.entry_pack_location { + LookedUp(_) => continue, + NotLookedUp => count.entry_pack_location = LookedUp(db.location_by_oid(count.id, buf)), + } + } + progress.lock().inc_by(chunk_size); + Ok::<_, ()>(()) + } + }, + parallel::reduce::IdentityWithResult::<(), ()>::default(), + ) + .expect("infallible - we ignore none-existing objects"); + progress.lock().show_throughput(start); + } + let counts_range_by_pack_id = match mode { + Mode::PackCopyAndBaseObjects => { + let mut progress = progress.add_child_with_id("sorting", ProgressId::SortEntries.into()); + progress.init(Some(counts.len()), gix_features::progress::count("counts")); + let start = std::time::Instant::now(); + + use crate::data::output::count::PackLocation::*; + counts.sort_by(|lhs, rhs| match (&lhs.entry_pack_location, &rhs.entry_pack_location) { + (LookedUp(None), LookedUp(None)) => Ordering::Equal, + (LookedUp(Some(_)), LookedUp(None)) => Ordering::Greater, + (LookedUp(None), LookedUp(Some(_))) => Ordering::Less, + (LookedUp(Some(lhs)), LookedUp(Some(rhs))) => lhs + .pack_id + .cmp(&rhs.pack_id) + .then(lhs.pack_offset.cmp(&rhs.pack_offset)), + (_, _) => unreachable!("counts were resolved beforehand"), + }); + + let mut index: Vec<(u32, std::ops::Range<usize>)> = Vec::new(); + let mut chunks_pack_start = counts.partition_point(|e| e.entry_pack_location.is_none()); + let mut slice = &counts[chunks_pack_start..]; + while !slice.is_empty() { + let current_pack_id = slice[0].entry_pack_location.as_ref().expect("packed object").pack_id; + let pack_end = slice.partition_point(|e| { + e.entry_pack_location.as_ref().expect("packed object").pack_id == current_pack_id + }); + index.push((current_pack_id, chunks_pack_start..chunks_pack_start + pack_end)); + slice = &slice[pack_end..]; + chunks_pack_start += pack_end; + } + + progress.set(counts.len()); + progress.show_throughput(start); + + index + } + }; + + let counts = Arc::new(counts); + let progress = Arc::new(parking_lot::Mutex::new(progress)); + let chunks = util::ChunkRanges::new(chunk_size, counts.len()); + + parallel::reduce::Stepwise::new( + chunks.enumerate(), + thread_limit, + { + let progress = Arc::clone(&progress); + move |n| { + 
( + Vec::new(), // object data buffer + progress + .lock() + .add_child_with_id(format!("thread {n}"), gix_features::progress::UNKNOWN), + ) + } + }, + { + let counts = Arc::clone(&counts); + move |(chunk_id, chunk_range): (SequenceId, std::ops::Range<usize>), (buf, progress)| { + let mut out = Vec::new(); + let chunk = &counts[chunk_range]; + let mut stats = Outcome::default(); + let mut pack_offsets_to_id = None; + progress.init(Some(chunk.len()), gix_features::progress::count("objects")); + + for count in chunk.iter() { + out.push(match count + .entry_pack_location + .as_ref() + .and_then(|l| db.entry_by_location(l).map(|pe| (l, pe))) + { + Some((location, pack_entry)) => { + if let Some((cached_pack_id, _)) = &pack_offsets_to_id { + if *cached_pack_id != location.pack_id { + pack_offsets_to_id = None; + } + } + let pack_range = counts_range_by_pack_id[counts_range_by_pack_id + .binary_search_by_key(&location.pack_id, |e| e.0) + .expect("pack-id always present")] + .1 + .clone(); + let base_index_offset = pack_range.start; + let counts_in_pack = &counts[pack_range]; + match output::Entry::from_pack_entry( + pack_entry, + count, + counts_in_pack, + base_index_offset, + allow_thin_pack.then_some({ + |pack_id, base_offset| { + let (cached_pack_id, cache) = pack_offsets_to_id.get_or_insert_with(|| { + db.pack_offsets_and_oid(pack_id) + .map(|mut v| { + v.sort_by_key(|e| e.0); + (pack_id, v) + }) + .expect("pack used for counts is still available") + }); + debug_assert_eq!(*cached_pack_id, pack_id); + stats.ref_delta_objects += 1; + cache + .binary_search_by_key(&base_offset, |e| e.0) + .ok() + .map(|idx| cache[idx].1) + } + }), + version, + ) { + Some(entry) => { + stats.objects_copied_from_pack += 1; + entry + } + None => match db.try_find(count.id, buf).map_err(Error::FindExisting)? { + Some((obj, _location)) => { + stats.decoded_and_recompressed_objects += 1; + output::Entry::from_data(count, &obj) + } + None => { + stats.missing_objects += 1; + Ok(output::Entry::invalid()) + } + }, + } + } + None => match db.try_find(count.id, buf).map_err(Error::FindExisting)? 
{ + Some((obj, _location)) => { + stats.decoded_and_recompressed_objects += 1; + output::Entry::from_data(count, &obj) + } + None => { + stats.missing_objects += 1; + Ok(output::Entry::invalid()) + } + }, + }?); + progress.inc(); + } + Ok((chunk_id, out, stats)) + } + }, + reduce::Statistics::default(), + ) + } +} + +mod util { + #[derive(Clone)] + pub struct ChunkRanges { + cursor: usize, + size: usize, + len: usize, + } + + impl ChunkRanges { + pub fn new(size: usize, total: usize) -> Self { + ChunkRanges { + cursor: 0, + size, + len: total, + } + } + } + + impl Iterator for ChunkRanges { + type Item = std::ops::Range<usize>; + + fn next(&mut self) -> Option<Self::Item> { + if self.cursor >= self.len { + None + } else { + let upper = (self.cursor + self.size).min(self.len); + let range = self.cursor..upper; + self.cursor = upper; + Some(range) + } + } + } +} + +mod reduce { + use std::marker::PhantomData; + + use gix_features::{parallel, parallel::SequenceId}; + + use super::Outcome; + use crate::data::output; + + pub struct Statistics<E> { + total: Outcome, + _err: PhantomData<E>, + } + + impl<E> Default for Statistics<E> { + fn default() -> Self { + Statistics { + total: Default::default(), + _err: PhantomData::default(), + } + } + } + + impl<Error> parallel::Reduce for Statistics<Error> { + type Input = Result<(SequenceId, Vec<output::Entry>, Outcome), Error>; + type FeedProduce = (SequenceId, Vec<output::Entry>); + type Output = Outcome; + type Error = Error; + + fn feed(&mut self, item: Self::Input) -> Result<Self::FeedProduce, Self::Error> { + item.map(|(cid, entries, stats)| { + self.total.aggregate(stats); + (cid, entries) + }) + } + + fn finalize(self) -> Result<Self::Output, Self::Error> { + Ok(self.total) + } + } +} + +mod types { + use crate::data::output::entry; + + /// Information gathered during the run of [`iter_from_counts()`][crate::data::output::entry::iter_from_counts()]. + #[derive(Default, PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] + #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] + pub struct Outcome { + /// The amount of fully decoded objects. These are the most expensive as they are fully decoded. + pub decoded_and_recompressed_objects: usize, + /// The amount of objects that could not be located despite them being mentioned during iteration + pub missing_objects: usize, + /// The amount of base or delta objects that could be copied directly from the pack. These are cheapest as they + /// only cost a memory copy for the most part. + pub objects_copied_from_pack: usize, + /// The amount of objects that ref to their base as ref-delta, an indication for a thin back being created. + pub ref_delta_objects: usize, + } + + impl Outcome { + pub(in crate::data::output::entry) fn aggregate( + &mut self, + Outcome { + decoded_and_recompressed_objects: decoded_objects, + missing_objects, + objects_copied_from_pack, + ref_delta_objects, + }: Self, + ) { + self.decoded_and_recompressed_objects += decoded_objects; + self.missing_objects += missing_objects; + self.objects_copied_from_pack += objects_copied_from_pack; + self.ref_delta_objects += ref_delta_objects; + } + } + + /// The way the iterator operates. + #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] + #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] + pub enum Mode { + /// Copy base objects and deltas from packs, while non-packed objects will be treated as base objects + /// (i.e. without trying to delta compress them). 
This is a fast way of obtaining a back while benefiting + /// from existing pack compression and spending the smallest possible time on compressing unpacked objects at + /// the cost of bandwidth. + PackCopyAndBaseObjects, + } + + /// Configuration options for the pack generation functions provided in [`iter_from_counts()`][crate::data::output::entry::iter_from_counts()]. + #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] + #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] + pub struct Options { + /// The amount of threads to use at most when resolving the pack. If `None`, all logical cores are used. + pub thread_limit: Option<usize>, + /// The algorithm to produce a pack + pub mode: Mode, + /// If set, the resulting back can have deltas that refer to an object which is not in the pack. This can happen + /// if the initial counted objects do not contain an object that an existing packed delta refers to, for example, because + /// it wasn't part of the iteration, for instance when the iteration was performed on tree deltas or only a part of the + /// commit graph. Please note that thin packs are not valid packs at rest, thus they are only valid for packs in transit. + /// + /// If set to false, delta objects will be decompressed and recompressed as base objects. + pub allow_thin_pack: bool, + /// The amount of objects per chunk or unit of work to be sent to threads for processing + /// TODO: could this become the window size? + pub chunk_size: usize, + /// The pack data version to produce for each entry + pub version: crate::data::Version, + } + + impl Default for Options { + fn default() -> Self { + Options { + thread_limit: None, + mode: Mode::PackCopyAndBaseObjects, + allow_thin_pack: false, + chunk_size: 10, + version: Default::default(), + } + } + } + + /// The error returned by the pack generation function [`iter_from_counts()`][crate::data::output::entry::iter_from_counts()]. + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error<FindErr> + where + FindErr: std::error::Error + 'static, + { + #[error(transparent)] + FindExisting(FindErr), + #[error(transparent)] + NewEntry(#[from] entry::Error), + } + + /// The progress ids used in [`write_to_directory()`][crate::Bundle::write_to_directory()]. + /// + /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. + #[derive(Debug, Copy, Clone)] + pub enum ProgressId { + /// The amount of [`Count`][crate::data::output::Count] objects which are resolved to their pack location. + ResolveCounts, + /// Layout pack entries for placement into a pack (by pack-id and by offset). 
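+ /// Counts without a pack location sort first, followed by packed entries grouped by pack id and ascending offset.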
+ SortEntries, + } + + impl From<ProgressId> for gix_features::progress::Id { + fn from(v: ProgressId) -> Self { + match v { + ProgressId::ResolveCounts => *b"ECRC", + ProgressId::SortEntries => *b"ECSE", + } + } + } +} +pub use types::{Error, Mode, Options, Outcome, ProgressId}; diff --git a/vendor/gix-pack/src/data/output/entry/mod.rs b/vendor/gix-pack/src/data/output/entry/mod.rs new file mode 100644 index 000000000..401d2f24c --- /dev/null +++ b/vendor/gix-pack/src/data/output/entry/mod.rs @@ -0,0 +1,181 @@ +use std::{convert::TryFrom, io::Write}; + +use gix_hash::ObjectId; + +use crate::{data, data::output, find}; + +/// +pub mod iter_from_counts; +pub use iter_from_counts::function::iter_from_counts; + +/// The kind of pack entry to be written +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum Kind { + /// A complete base object, including its kind + Base(gix_object::Kind), + /// A delta against the object with the given index. It's always an index that was already encountered to refer only + /// to object we have written already. + DeltaRef { + /// The absolute index to the object to serve as base. It's up to the writer to maintain enough state to allow producing + /// a packed delta object from it. + object_index: usize, + }, + /// A delta against the given object as identified by its `ObjectId`. + /// This is the case for thin packs only, i.e. those that are sent over the wire. + /// Note that there is the option of the `ObjectId` being used to refer to an object within + /// the same pack, but it's a discontinued practice which won't be encountered here. + DeltaOid { + /// The object serving as base for this delta + id: ObjectId, + }, +} + +/// The error returned by [`output::Entry::from_data()`]. +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("{0}")] + ZlibDeflate(#[from] std::io::Error), +} + +impl output::Entry { + /// An object which can be identified as invalid easily which happens if objects didn't exist even if they were referred to. + pub fn invalid() -> output::Entry { + output::Entry { + id: gix_hash::Kind::Sha1.null(), // NOTE: the actual object hash used in the repo doesn't matter here, this is a sentinel value. + kind: Kind::Base(gix_object::Kind::Blob), + decompressed_size: 0, + compressed_data: vec![], + } + } + + /// Returns true if this object doesn't really exist but still has to be handled responsibly + /// + /// Note that this is true for tree entries that are commits/git submodules, or for objects which aren't present in our local clone + /// due to shallow clones. + pub fn is_invalid(&self) -> bool { + self.id.is_null() + } + + /// Create an Entry from a previously counted object which is located in a pack. It's `entry` is provided here. + /// The `version` specifies what kind of target `Entry` version the caller desires. 
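+ /// Returns `None` if the entry cannot be reused as-is, for instance if its pack version differs from
+ /// `target_version` or if it is a ref-delta; such entries are recompressed as base objects by the caller instead.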
+ pub fn from_pack_entry( + mut entry: find::Entry, + count: &output::Count, + potential_bases: &[output::Count], + bases_index_offset: usize, + pack_offset_to_oid: Option<impl FnMut(u32, u64) -> Option<ObjectId>>, + target_version: crate::data::Version, + ) -> Option<Result<Self, Error>> { + if entry.version != target_version { + return None; + }; + + let pack_offset_must_be_zero = 0; + let pack_entry = + crate::data::Entry::from_bytes(&entry.data, pack_offset_must_be_zero, count.id.as_slice().len()); + + use crate::data::entry::Header::*; + match pack_entry.header { + Commit => Some(output::entry::Kind::Base(gix_object::Kind::Commit)), + Tree => Some(output::entry::Kind::Base(gix_object::Kind::Tree)), + Blob => Some(output::entry::Kind::Base(gix_object::Kind::Blob)), + Tag => Some(output::entry::Kind::Base(gix_object::Kind::Tag)), + OfsDelta { base_distance } => { + let pack_location = count.entry_pack_location.as_ref().expect("packed"); + let base_offset = pack_location + .pack_offset + .checked_sub(base_distance) + .expect("pack-offset - distance is firmly within the pack"); + potential_bases + .binary_search_by(|e| { + e.entry_pack_location + .as_ref() + .expect("packed") + .pack_offset + .cmp(&base_offset) + }) + .ok() + .map(|idx| output::entry::Kind::DeltaRef { + object_index: idx + bases_index_offset, + }) + .or_else(|| { + pack_offset_to_oid + .and_then(|mut f| f(pack_location.pack_id, base_offset)) + .map(|id| output::entry::Kind::DeltaOid { id }) + }) + } + RefDelta { base_id: _ } => None, // ref deltas are for thin packs or legacy, repack them as base objects + } + .map(|kind| { + Ok(output::Entry { + id: count.id.to_owned(), + kind, + decompressed_size: pack_entry.decompressed_size as usize, + compressed_data: { + entry.data.copy_within(pack_entry.data_offset as usize.., 0); + entry.data.resize( + entry.data.len() + - usize::try_from(pack_entry.data_offset).expect("offset representable as usize"), + 0, + ); + entry.data + }, + }) + }) + } + + /// Create a new instance from the given `oid` and its corresponding git `obj`ect data. + pub fn from_data(count: &output::Count, obj: &gix_object::Data<'_>) -> Result<Self, Error> { + Ok(output::Entry { + id: count.id.to_owned(), + kind: Kind::Base(obj.kind), + decompressed_size: obj.data.len(), + compressed_data: { + let mut out = gix_features::zlib::stream::deflate::Write::new(Vec::new()); + if let Err(err) = std::io::copy(&mut &*obj.data, &mut out) { + match err.kind() { + std::io::ErrorKind::Other => return Err(Error::ZlibDeflate(err)), + err => unreachable!("Should never see other errors than zlib, but got {:?}", err,), + } + }; + out.flush()?; + out.into_inner() + }, + }) + } + + /// Transform ourselves into pack entry header of `version` which can be written into a pack. + /// + /// `index_to_pack(object_index) -> pack_offset` is a function to convert the base object's index into + /// the input object array (if each object is numbered) to an offset into the pack. + /// This information is known to the one calling the method. 
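+ ///
+ /// A minimal sketch of providing the closure, assuming the caller kept a hypothetical `offset_by_index` table with
+ /// the pack offset at which each already-written object landed, plus the offset `current_offset` of the entry that is
+ /// being written right now; the distance is simply the difference between the two offsets:
+ ///
+ /// ```ignore
+ /// let header = entry.to_entry_header(gix_pack::data::Version::V2, |base_index| {
+ ///     // the base was written earlier in the same pack, so the header stores how far back it lies
+ ///     current_offset - offset_by_index[base_index]
+ /// });
+ /// ```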
+ pub fn to_entry_header( + &self, + version: crate::data::Version, + index_to_base_distance: impl FnOnce(usize) -> u64, + ) -> crate::data::entry::Header { + assert!( + matches!(version, data::Version::V2), + "we can only write V2 pack entries for now" + ); + + use Kind::*; + match self.kind { + Base(kind) => { + use gix_object::Kind::*; + match kind { + Tree => data::entry::Header::Tree, + Blob => data::entry::Header::Blob, + Commit => data::entry::Header::Commit, + Tag => data::entry::Header::Tag, + } + } + DeltaOid { id } => data::entry::Header::RefDelta { base_id: id.to_owned() }, + DeltaRef { object_index } => data::entry::Header::OfsDelta { + base_distance: index_to_base_distance(object_index), + }, + } + } +} diff --git a/vendor/gix-pack/src/data/output/mod.rs b/vendor/gix-pack/src/data/output/mod.rs new file mode 100644 index 000000000..f94d32e8e --- /dev/null +++ b/vendor/gix-pack/src/data/output/mod.rs @@ -0,0 +1,41 @@ +use gix_hash::ObjectId; + +/// +pub mod count; + +/// An item representing a future Entry in the leanest way possible. +/// +/// One can expect to have one of these in memory when building big objects, so smaller is better here. +/// They should contain everything of importance to generate a pack as fast as possible. +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Count { + /// The hash of the object to write + pub id: ObjectId, + /// A way to locate a pack entry in the object database, only available if the object is in a pack. + pub entry_pack_location: count::PackLocation, +} + +/// An entry to be written to a file. +/// +/// Some of these will be in-flight and in memory while waiting to be written. Memory requirements depend on the amount of compressed +/// data they hold. +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Entry { + /// The hash of the object to write + pub id: ObjectId, + /// The kind of entry represented by `data`. It's used alongside with it to complete the pack entry + /// at rest or in transit. + pub kind: entry::Kind, + /// The size in bytes needed once `data` gets decompressed + pub decompressed_size: usize, + /// The compressed data right behind the header + pub compressed_data: Vec<u8>, +} + +/// +pub mod entry; + +/// +pub mod bytes; diff --git a/vendor/gix-pack/src/find.rs b/vendor/gix-pack/src/find.rs new file mode 100644 index 000000000..8143692e7 --- /dev/null +++ b/vendor/gix-pack/src/find.rs @@ -0,0 +1,63 @@ +/// +pub mod existing { + use gix_hash::ObjectId; + + /// The error returned by the [`find(…)`][crate::FindExt::find()] trait methods. + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error<T: std::error::Error + 'static> { + #[error(transparent)] + Find(T), + #[error("An object with id {} could not be found", .oid)] + NotFound { oid: ObjectId }, + } +} + +/// +pub mod existing_object { + use gix_hash::ObjectId; + + /// The error returned by the various [`find_*`][crate::FindExt::find_commit()] trait methods. 
+ #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error<T: std::error::Error + 'static> { + #[error(transparent)] + Find(T), + #[error(transparent)] + Decode(gix_object::decode::Error), + #[error("An object with id {} could not be found", .oid)] + NotFound { oid: ObjectId }, + #[error("Expected object of kind {} something else", .expected)] + ObjectKind { expected: gix_object::Kind }, + } +} + +/// +pub mod existing_iter { + use gix_hash::ObjectId; + + /// The error returned by the various [`find_*`][crate::FindExt::find_commit()] trait methods. + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error<T: std::error::Error + 'static> { + #[error(transparent)] + Find(T), + #[error("An object with id {} could not be found", .oid)] + NotFound { oid: ObjectId }, + #[error("Expected object of kind {} something else", .expected)] + ObjectKind { expected: gix_object::Kind }, + } +} + +/// An Entry in a pack providing access to its data. +/// +/// Its commonly retrieved by reading from a pack index file followed by a read from a pack data file. +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +#[allow(missing_docs)] +pub struct Entry { + /// The pack-data encoded bytes of the pack data entry as present in the pack file, including the header followed by compressed data. + pub data: Vec<u8>, + /// The version of the pack file containing `data` + pub version: crate::data::Version, +} diff --git a/vendor/gix-pack/src/find_traits.rs b/vendor/gix-pack/src/find_traits.rs new file mode 100644 index 000000000..6f828afbf --- /dev/null +++ b/vendor/gix-pack/src/find_traits.rs @@ -0,0 +1,295 @@ +use crate::{data, find}; + +/// Describe how object can be located in an object store with built-in facilities to supports packs specifically. +/// +/// ## Notes +/// +/// Find effectively needs [generic associated types][issue] to allow a trait for the returned object type. +/// Until then, we will have to make due with explicit types and give them the potentially added features we want. +/// +/// Furthermore, despite this trait being in `gix-pack`, it leaks knowledge about objects potentially not being packed. +/// This is a necessary trade-off to allow this trait to live in `gix-pack` where it is used in functions to create a pack. +/// +/// [issue]: https://github.com/rust-lang/rust/issues/44265 +pub trait Find { + /// The error returned by [`try_find()`][Find::try_find()] + type Error: std::error::Error + Send + Sync + 'static; + + /// Returns true if the object exists in the database. + fn contains(&self, id: impl AsRef<gix_hash::oid>) -> bool; + + /// Find an object matching `id` in the database while placing its raw, decoded data into `buffer`. + /// A `pack_cache` can be used to speed up subsequent lookups, set it to [`crate::cache::Never`] if the + /// workload isn't suitable for caching. + /// + /// Returns `Some((<object data>, <pack location if packed>))` if it was present in the database, + /// or the error that occurred during lookup or object retrieval. + fn try_find<'a>( + &self, + id: impl AsRef<gix_hash::oid>, + buffer: &'a mut Vec<u8>, + ) -> Result<Option<(gix_object::Data<'a>, Option<data::entry::Location>)>, Self::Error> { + self.try_find_cached(id, buffer, &mut crate::cache::Never) + } + + /// Like [`Find::try_find()`], but with support for controlling the pack cache. 
+ /// A `pack_cache` can be used to speed up subsequent lookups, set it to [`crate::cache::Never`] if the + /// workload isn't suitable for caching. + /// + /// Returns `Some((<object data>, <pack location if packed>))` if it was present in the database, + /// or the error that occurred during lookup or object retrieval. + fn try_find_cached<'a>( + &self, + id: impl AsRef<gix_hash::oid>, + buffer: &'a mut Vec<u8>, + pack_cache: &mut impl crate::cache::DecodeEntry, + ) -> Result<Option<(gix_object::Data<'a>, Option<data::entry::Location>)>, Self::Error>; + + /// Find the packs location where an object with `id` can be found in the database, or `None` if there is no pack + /// holding the object. + /// + /// _Note_ that this is always None if the object isn't packed even though it exists as loose object. + fn location_by_oid(&self, id: impl AsRef<gix_hash::oid>, buf: &mut Vec<u8>) -> Option<data::entry::Location>; + + /// Obtain a vector of all offsets, in index order, along with their object id. + fn pack_offsets_and_oid(&self, pack_id: u32) -> Option<Vec<(data::Offset, gix_hash::ObjectId)>>; + + /// Return the [`find::Entry`] for `location` if it is backed by a pack. + /// + /// Note that this is only in the interest of avoiding duplicate work during pack generation. + /// Pack locations can be obtained from [`Find::try_find()`]. + /// + /// # Notes + /// + /// Custom implementations might be interested in providing their own meta-data with `object`, + /// which currently isn't possible as the `Locate` trait requires GATs to work like that. + fn entry_by_location(&self, location: &data::entry::Location) -> Option<find::Entry>; +} + +mod ext { + use gix_object::{BlobRef, CommitRef, CommitRefIter, Kind, ObjectRef, TagRef, TagRefIter, TreeRef, TreeRefIter}; + + use crate::find; + + macro_rules! make_obj_lookup { + ($method:ident, $object_variant:path, $object_kind:path, $object_type:ty) => { + /// Like [`find(…)`][Self::find()], but flattens the `Result<Option<_>>` into a single `Result` making a non-existing object an error + /// while returning the desired object type. + fn $method<'a>( + &self, + id: impl AsRef<gix_hash::oid>, + buffer: &'a mut Vec<u8>, + ) -> Result<($object_type, Option<crate::data::entry::Location>), find::existing_object::Error<Self::Error>> + { + let id = id.as_ref(); + self.try_find(id, buffer) + .map_err(find::existing_object::Error::Find)? + .ok_or_else(|| find::existing_object::Error::NotFound { + oid: id.as_ref().to_owned(), + }) + .and_then(|(o, l)| { + o.decode() + .map_err(find::existing_object::Error::Decode) + .map(|o| (o, l)) + }) + .and_then(|(o, l)| match o { + $object_variant(o) => return Ok((o, l)), + _other => Err(find::existing_object::Error::ObjectKind { + expected: $object_kind, + }), + }) + } + }; + } + + macro_rules! make_iter_lookup { + ($method:ident, $object_kind:path, $object_type:ty, $into_iter:tt) => { + /// Like [`find(…)`][Self::find()], but flattens the `Result<Option<_>>` into a single `Result` making a non-existing object an error + /// while returning the desired iterator type. + fn $method<'a>( + &self, + id: impl AsRef<gix_hash::oid>, + buffer: &'a mut Vec<u8>, + ) -> Result<($object_type, Option<crate::data::entry::Location>), find::existing_iter::Error<Self::Error>> { + let id = id.as_ref(); + self.try_find(id, buffer) + .map_err(find::existing_iter::Error::Find)? 
+ .ok_or_else(|| find::existing_iter::Error::NotFound { + oid: id.as_ref().to_owned(), + }) + .and_then(|(o, l)| { + o.$into_iter() + .ok_or_else(|| find::existing_iter::Error::ObjectKind { + expected: $object_kind, + }) + .map(|i| (i, l)) + }) + } + }; + } + + /// An extension trait with convenience functions. + pub trait FindExt: super::Find { + /// Like [`try_find(…)`][super::Find::try_find()], but flattens the `Result<Option<_>>` into a single `Result` making a non-existing object an error. + fn find<'a>( + &self, + id: impl AsRef<gix_hash::oid>, + buffer: &'a mut Vec<u8>, + ) -> Result<(gix_object::Data<'a>, Option<crate::data::entry::Location>), find::existing::Error<Self::Error>> + { + let id = id.as_ref(); + self.try_find(id, buffer) + .map_err(find::existing::Error::Find)? + .ok_or_else(|| find::existing::Error::NotFound { + oid: id.as_ref().to_owned(), + }) + } + + make_obj_lookup!(find_commit, ObjectRef::Commit, Kind::Commit, CommitRef<'a>); + make_obj_lookup!(find_tree, ObjectRef::Tree, Kind::Tree, TreeRef<'a>); + make_obj_lookup!(find_tag, ObjectRef::Tag, Kind::Tag, TagRef<'a>); + make_obj_lookup!(find_blob, ObjectRef::Blob, Kind::Blob, BlobRef<'a>); + make_iter_lookup!(find_commit_iter, Kind::Blob, CommitRefIter<'a>, try_into_commit_iter); + make_iter_lookup!(find_tree_iter, Kind::Tree, TreeRefIter<'a>, try_into_tree_iter); + make_iter_lookup!(find_tag_iter, Kind::Tag, TagRefIter<'a>, try_into_tag_iter); + } + + impl<T: super::Find> FindExt for T {} +} +pub use ext::FindExt; + +mod find_impls { + use std::{ops::Deref, rc::Rc}; + + use gix_hash::oid; + + use crate::{data, find}; + + impl<T> crate::Find for &T + where + T: crate::Find, + { + type Error = T::Error; + + fn contains(&self, id: impl AsRef<oid>) -> bool { + (*self).contains(id) + } + + fn try_find_cached<'a>( + &self, + id: impl AsRef<oid>, + buffer: &'a mut Vec<u8>, + pack_cache: &mut impl crate::cache::DecodeEntry, + ) -> Result<Option<(gix_object::Data<'a>, Option<data::entry::Location>)>, Self::Error> { + (*self).try_find_cached(id, buffer, pack_cache) + } + + fn location_by_oid(&self, id: impl AsRef<oid>, buf: &mut Vec<u8>) -> Option<data::entry::Location> { + (*self).location_by_oid(id, buf) + } + + fn pack_offsets_and_oid(&self, pack_id: u32) -> Option<Vec<(data::Offset, gix_hash::ObjectId)>> { + (*self).pack_offsets_and_oid(pack_id) + } + + fn entry_by_location(&self, location: &data::entry::Location) -> Option<find::Entry> { + (*self).entry_by_location(location) + } + } + + impl<T> super::Find for std::sync::Arc<T> + where + T: super::Find, + { + type Error = T::Error; + + fn contains(&self, id: impl AsRef<oid>) -> bool { + self.deref().contains(id) + } + + fn try_find_cached<'a>( + &self, + id: impl AsRef<oid>, + buffer: &'a mut Vec<u8>, + pack_cache: &mut impl crate::cache::DecodeEntry, + ) -> Result<Option<(gix_object::Data<'a>, Option<data::entry::Location>)>, Self::Error> { + self.deref().try_find_cached(id, buffer, pack_cache) + } + + fn location_by_oid(&self, id: impl AsRef<oid>, buf: &mut Vec<u8>) -> Option<data::entry::Location> { + self.deref().location_by_oid(id, buf) + } + + fn pack_offsets_and_oid(&self, pack_id: u32) -> Option<Vec<(data::Offset, gix_hash::ObjectId)>> { + self.deref().pack_offsets_and_oid(pack_id) + } + + fn entry_by_location(&self, object: &data::entry::Location) -> Option<find::Entry> { + self.deref().entry_by_location(object) + } + } + + impl<T> super::Find for Rc<T> + where + T: super::Find, + { + type Error = T::Error; + + fn contains(&self, id: impl AsRef<oid>) -> bool { 
+ self.deref().contains(id) + } + + fn try_find_cached<'a>( + &self, + id: impl AsRef<oid>, + buffer: &'a mut Vec<u8>, + pack_cache: &mut impl crate::cache::DecodeEntry, + ) -> Result<Option<(gix_object::Data<'a>, Option<data::entry::Location>)>, Self::Error> { + self.deref().try_find_cached(id, buffer, pack_cache) + } + + fn location_by_oid(&self, id: impl AsRef<oid>, buf: &mut Vec<u8>) -> Option<data::entry::Location> { + self.deref().location_by_oid(id, buf) + } + + fn pack_offsets_and_oid(&self, pack_id: u32) -> Option<Vec<(data::Offset, gix_hash::ObjectId)>> { + self.deref().pack_offsets_and_oid(pack_id) + } + + fn entry_by_location(&self, location: &data::entry::Location) -> Option<find::Entry> { + self.deref().entry_by_location(location) + } + } + + impl<T> super::Find for Box<T> + where + T: super::Find, + { + type Error = T::Error; + + fn contains(&self, id: impl AsRef<oid>) -> bool { + self.deref().contains(id) + } + + fn try_find_cached<'a>( + &self, + id: impl AsRef<oid>, + buffer: &'a mut Vec<u8>, + pack_cache: &mut impl crate::cache::DecodeEntry, + ) -> Result<Option<(gix_object::Data<'a>, Option<data::entry::Location>)>, Self::Error> { + self.deref().try_find_cached(id, buffer, pack_cache) + } + + fn location_by_oid(&self, id: impl AsRef<oid>, buf: &mut Vec<u8>) -> Option<data::entry::Location> { + self.deref().location_by_oid(id, buf) + } + + fn pack_offsets_and_oid(&self, pack_id: u32) -> Option<Vec<(data::Offset, gix_hash::ObjectId)>> { + self.deref().pack_offsets_and_oid(pack_id) + } + + fn entry_by_location(&self, location: &data::entry::Location) -> Option<find::Entry> { + self.deref().entry_by_location(location) + } + } +} diff --git a/vendor/gix-pack/src/index/access.rs b/vendor/gix-pack/src/index/access.rs new file mode 100644 index 000000000..0ac85dff7 --- /dev/null +++ b/vendor/gix-pack/src/index/access.rs @@ -0,0 +1,290 @@ +use std::{mem::size_of, ops::Range}; + +use crate::{ + data, + index::{self, EntryIndex, PrefixLookupResult, FAN_LEN}, +}; + +const N32_SIZE: usize = size_of::<u32>(); +const N64_SIZE: usize = size_of::<u64>(); +const V1_HEADER_SIZE: usize = FAN_LEN * N32_SIZE; +const V2_HEADER_SIZE: usize = N32_SIZE * 2 + FAN_LEN * N32_SIZE; +const N32_HIGH_BIT: u32 = 1 << 31; + +/// Represents an entry within a pack index file, effectively mapping object [`IDs`][gix_hash::ObjectId] to pack data file locations. +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Entry { + /// The ID of the object + pub oid: gix_hash::ObjectId, + /// The offset to the object's header in the pack data file + pub pack_offset: data::Offset, + /// The CRC32 hash over all bytes of the pack data entry. + /// + /// This can be useful for direct copies of pack data entries from one pack to another with insurance there was no bit rot. + /// _Note_: Only available in index version 2 or newer + pub crc32: Option<u32>, +} + +/// Iteration and access +impl index::File { + fn iter_v1(&self) -> impl Iterator<Item = Entry> + '_ { + match self.version { + index::Version::V1 => self.data[V1_HEADER_SIZE..] 
+ .chunks(N32_SIZE + self.hash_len) + .take(self.num_objects as usize) + .map(|c| { + let (ofs, oid) = c.split_at(N32_SIZE); + Entry { + oid: gix_hash::ObjectId::from(oid), + pack_offset: crate::read_u32(ofs) as u64, + crc32: None, + } + }), + _ => panic!("Cannot use iter_v1() on index of type {:?}", self.version), + } + } + + fn iter_v2(&self) -> impl Iterator<Item = Entry> + '_ { + let pack64_offset = self.offset_pack_offset64_v2(); + match self.version { + index::Version::V2 => izip!( + self.data[V2_HEADER_SIZE..].chunks(self.hash_len), + self.data[self.offset_crc32_v2()..].chunks(N32_SIZE), + self.data[self.offset_pack_offset_v2()..].chunks(N32_SIZE) + ) + .take(self.num_objects as usize) + .map(move |(oid, crc32, ofs32)| Entry { + oid: gix_hash::ObjectId::from(oid), + pack_offset: self.pack_offset_from_offset_v2(ofs32, pack64_offset), + crc32: Some(crate::read_u32(crc32)), + }), + _ => panic!("Cannot use iter_v2() on index of type {:?}", self.version), + } + } + + /// Returns the object hash at the given index in our list of (sorted) sha1 hashes. + /// The index ranges from 0 to self.num_objects() + /// + /// # Panics + /// + /// If `index` is out of bounds. + pub fn oid_at_index(&self, index: EntryIndex) -> &gix_hash::oid { + let index = index as usize; + let start = match self.version { + index::Version::V2 => V2_HEADER_SIZE + index * self.hash_len, + index::Version::V1 => V1_HEADER_SIZE + index * (N32_SIZE + self.hash_len) + N32_SIZE, + }; + gix_hash::oid::from_bytes_unchecked(&self.data[start..][..self.hash_len]) + } + + /// Returns the offset into our pack data file at which to start reading the object at `index`. + /// + /// # Panics + /// + /// If `index` is out of bounds. + pub fn pack_offset_at_index(&self, index: EntryIndex) -> data::Offset { + let index = index as usize; + match self.version { + index::Version::V2 => { + let start = self.offset_pack_offset_v2() + index * N32_SIZE; + self.pack_offset_from_offset_v2(&self.data[start..][..N32_SIZE], self.offset_pack_offset64_v2()) + } + index::Version::V1 => { + let start = V1_HEADER_SIZE + index * (N32_SIZE + self.hash_len); + crate::read_u32(&self.data[start..][..N32_SIZE]) as u64 + } + } + } + + /// Returns the CRC32 of the object at the given `index`. + /// + /// _Note_: These are always present for index version 2 or higher. + /// # Panics + /// + /// If `index` is out of bounds. + pub fn crc32_at_index(&self, index: EntryIndex) -> Option<u32> { + let index = index as usize; + match self.version { + index::Version::V2 => { + let start = self.offset_crc32_v2() + index * N32_SIZE; + Some(crate::read_u32(&self.data[start..start + N32_SIZE])) + } + index::Version::V1 => None, + } + } + + /// Returns the `index` of the given hash for use with the [`oid_at_index()`][index::File::oid_at_index()], + /// [`pack_offset_at_index()`][index::File::pack_offset_at_index()] or [`crc32_at_index()`][index::File::crc32_at_index()]. + // NOTE: pretty much the same things as in `multi_index::File::lookup`, change things there + // as well. + pub fn lookup(&self, id: impl AsRef<gix_hash::oid>) -> Option<EntryIndex> { + lookup(id, &self.fan, |idx| self.oid_at_index(idx)) + } + + /// Given a `prefix`, find an object that matches it uniquely within this index and return `Some(Ok(entry_index))`. + /// If there is more than one object matching the object `Some(Err(())` is returned. + /// + /// Finally, if no object matches the index, the return value is `None`. 
+ /// + /// Pass `candidates` to obtain the set of entry-indices matching `prefix`, with the same return value as + /// one would have received if it remained `None`. It will be empty if no object matched the `prefix`. + /// + // NOTE: pretty much the same things as in `index::File::lookup`, change things there + // as well. + pub fn lookup_prefix( + &self, + prefix: gix_hash::Prefix, + candidates: Option<&mut Range<EntryIndex>>, + ) -> Option<PrefixLookupResult> { + lookup_prefix( + prefix, + candidates, + &self.fan, + |idx| self.oid_at_index(idx), + self.num_objects, + ) + } + + /// An iterator over all [`Entries`][Entry] of this index file. + pub fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = Entry> + 'a> { + match self.version { + index::Version::V2 => Box::new(self.iter_v2()), + index::Version::V1 => Box::new(self.iter_v1()), + } + } + + /// Return a vector of ascending offsets into our respective pack data file. + /// + /// Useful to control an iteration over all pack entries in a cache-friendly way. + pub fn sorted_offsets(&self) -> Vec<data::Offset> { + let mut ofs: Vec<_> = match self.version { + index::Version::V1 => self.iter().map(|e| e.pack_offset).collect(), + index::Version::V2 => { + let offset32_start = &self.data[self.offset_pack_offset_v2()..]; + let pack_offset_64_start = self.offset_pack_offset64_v2(); + offset32_start + .chunks(N32_SIZE) + .take(self.num_objects as usize) + .map(|offset| self.pack_offset_from_offset_v2(offset, pack_offset_64_start)) + .collect() + } + }; + ofs.sort_unstable(); + ofs + } + + #[inline] + fn offset_crc32_v2(&self) -> usize { + V2_HEADER_SIZE + self.num_objects as usize * self.hash_len + } + + #[inline] + fn offset_pack_offset_v2(&self) -> usize { + self.offset_crc32_v2() + self.num_objects as usize * N32_SIZE + } + + #[inline] + fn offset_pack_offset64_v2(&self) -> usize { + self.offset_pack_offset_v2() + self.num_objects as usize * N32_SIZE + } + + #[inline] + fn pack_offset_from_offset_v2(&self, offset: &[u8], pack64_offset: usize) -> data::Offset { + debug_assert_eq!(self.version, index::Version::V2); + let ofs32 = crate::read_u32(offset); + if (ofs32 & N32_HIGH_BIT) == N32_HIGH_BIT { + let from = pack64_offset + (ofs32 ^ N32_HIGH_BIT) as usize * N64_SIZE; + crate::read_u64(&self.data[from..][..N64_SIZE]) + } else { + ofs32 as u64 + } + } +} + +pub(crate) fn lookup_prefix<'a>( + prefix: gix_hash::Prefix, + candidates: Option<&mut Range<EntryIndex>>, + fan: &[u32; FAN_LEN], + oid_at_index: impl Fn(EntryIndex) -> &'a gix_hash::oid, + num_objects: u32, +) -> Option<PrefixLookupResult> { + let first_byte = prefix.as_oid().first_byte() as usize; + let mut upper_bound = fan[first_byte]; + let mut lower_bound = if first_byte != 0 { fan[first_byte - 1] } else { 0 }; + + // Bisect using indices + while lower_bound < upper_bound { + let mid = (lower_bound + upper_bound) / 2; + let mid_sha = oid_at_index(mid); + + use std::cmp::Ordering::*; + match prefix.cmp_oid(mid_sha) { + Less => upper_bound = mid, + Equal => match candidates { + Some(candidates) => { + let first_past_entry = ((0..mid).rev()) + .take_while(|prev| prefix.cmp_oid(oid_at_index(*prev)) == Equal) + .last(); + + let last_future_entry = ((mid + 1)..num_objects) + .take_while(|next| prefix.cmp_oid(oid_at_index(*next)) == Equal) + .last(); + + *candidates = match (first_past_entry, last_future_entry) { + (Some(first), Some(last)) => first..last + 1, + (Some(first), None) => first..mid + 1, + (None, Some(last)) => mid..last + 1, + (None, None) => mid..mid + 1, + }; + + return if 
candidates.len() > 1 { + Some(Err(())) + } else { + Some(Ok(mid)) + }; + } + None => { + let next = mid + 1; + if next < num_objects && prefix.cmp_oid(oid_at_index(next)) == Equal { + return Some(Err(())); + } + if mid != 0 && prefix.cmp_oid(oid_at_index(mid - 1)) == Equal { + return Some(Err(())); + } + return Some(Ok(mid)); + } + }, + Greater => lower_bound = mid + 1, + } + } + + if let Some(candidates) = candidates { + *candidates = 0..0; + } + None +} + +pub(crate) fn lookup<'a>( + id: impl AsRef<gix_hash::oid>, + fan: &[u32; FAN_LEN], + oid_at_index: impl Fn(EntryIndex) -> &'a gix_hash::oid, +) -> Option<EntryIndex> { + let id = id.as_ref(); + let first_byte = id.first_byte() as usize; + let mut upper_bound = fan[first_byte]; + let mut lower_bound = if first_byte != 0 { fan[first_byte - 1] } else { 0 }; + + while lower_bound < upper_bound { + let mid = (lower_bound + upper_bound) / 2; + let mid_sha = oid_at_index(mid); + + use std::cmp::Ordering::*; + match id.cmp(mid_sha) { + Less => upper_bound = mid, + Equal => return Some(mid), + Greater => lower_bound = mid + 1, + } + } + None +} diff --git a/vendor/gix-pack/src/index/init.rs b/vendor/gix-pack/src/index/init.rs new file mode 100644 index 000000000..13eecdbda --- /dev/null +++ b/vendor/gix-pack/src/index/init.rs @@ -0,0 +1,91 @@ +use std::{mem::size_of, path::Path}; + +use crate::index::{self, Version, FAN_LEN, V2_SIGNATURE}; + +/// Returned by [`index::File::at()`]. +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("Could not open pack index file at '{path}'")] + Io { + source: std::io::Error, + path: std::path::PathBuf, + }, + #[error("{message}")] + Corrupt { message: String }, + #[error("Unsupported index version: {version})")] + UnsupportedVersion { version: u32 }, +} + +const N32_SIZE: usize = size_of::<u32>(); + +/// Instantiation +impl index::File { + /// Open the pack index file at the given `path`. + /// + /// The `object_hash` is a way to read (and write) the same file format with different hashes, as the hash kind + /// isn't stored within the file format itself. 
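+ ///
+ /// A minimal sketch of opening an index; the path is hypothetical and errors are unwrapped for brevity:
+ ///
+ /// ```no_run
+ /// let index = gix_pack::index::File::at("objects/pack/pack-1234.idx", gix_hash::Kind::Sha1)
+ ///     .expect("a readable, well-formed index file");
+ /// println!("index version {:?} with {} objects", index.version(), index.num_objects());
+ /// ```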
+ pub fn at(path: impl AsRef<Path>, object_hash: gix_hash::Kind) -> Result<index::File, Error> { + Self::at_inner(path.as_ref(), object_hash) + } + + fn at_inner(path: &Path, object_hash: gix_hash::Kind) -> Result<index::File, Error> { + let data = crate::mmap::read_only(path).map_err(|source| Error::Io { + source, + path: path.to_owned(), + })?; + let idx_len = data.len(); + let hash_len = object_hash.len_in_bytes(); + + let footer_size = hash_len * 2; + if idx_len < FAN_LEN * N32_SIZE + footer_size { + return Err(Error::Corrupt { + message: format!("Pack index of size {idx_len} is too small for even an empty index"), + }); + } + let (kind, fan, num_objects) = { + let (kind, d) = { + let (sig, d) = data.split_at(V2_SIGNATURE.len()); + if sig == V2_SIGNATURE { + (Version::V2, d) + } else { + (Version::V1, &data[..]) + } + }; + let d = { + if let Version::V2 = kind { + let (vd, dr) = d.split_at(N32_SIZE); + let version = crate::read_u32(vd); + if version != Version::V2 as u32 { + return Err(Error::UnsupportedVersion { version }); + } + dr + } else { + d + } + }; + let (fan, bytes_read) = read_fan(d); + let (_, _d) = d.split_at(bytes_read); + let num_objects = fan[FAN_LEN - 1]; + + (kind, fan, num_objects) + }; + Ok(index::File { + data, + path: path.to_owned(), + version: kind, + num_objects, + fan, + hash_len, + object_hash, + }) + } +} + +fn read_fan(d: &[u8]) -> ([u32; FAN_LEN], usize) { + let mut fan = [0; FAN_LEN]; + for (c, f) in d.chunks(N32_SIZE).zip(fan.iter_mut()) { + *f = crate::read_u32(c); + } + (fan, FAN_LEN * N32_SIZE) +} diff --git a/vendor/gix-pack/src/index/mod.rs b/vendor/gix-pack/src/index/mod.rs new file mode 100644 index 000000000..341322f7d --- /dev/null +++ b/vendor/gix-pack/src/index/mod.rs @@ -0,0 +1,155 @@ +//! an index into the pack file +//! +/// From itertools +/// Create an iterator running multiple iterators in lockstep. +/// +/// The `izip!` iterator yields elements until any subiterator +/// returns `None`. +/// +/// This is a version of the standard ``.zip()`` that's supporting more than +/// two iterators. The iterator element type is a tuple with one element +/// from each of the input iterators. Just like ``.zip()``, the iteration stops +/// when the shortest of the inputs reaches its end. +/// +/// **Note:** The result of this macro is in the general case an iterator +/// composed of repeated `.zip()` and a `.map()`; it has an anonymous type. +/// The special cases of one and two arguments produce the equivalent of +/// `$a.into_iter()` and `$a.into_iter().zip($b)` respectively. +/// +/// Prefer this macro `izip!()` over [`multizip`] for the performance benefits +/// of using the standard library `.zip()`. +/// +/// [`multizip`]: fn.multizip.html +/// +/// ``` +/// # use itertools::izip; +/// # +/// # fn main() { +/// +/// // iterate over three sequences side-by-side +/// let mut results = [0, 0, 0, 0]; +/// let inputs = [3, 7, 9, 6]; +/// +/// for (r, index, input) in izip!(&mut results, 0..10, &inputs) { +/// *r = index * 10 + input; +/// } +/// +/// assert_eq!(results, [0 + 3, 10 + 7, 29, 36]); +/// # } +/// ``` +macro_rules! izip { + // @closure creates a tuple-flattening closure for .map() call. usage: + // @closure partial_pattern => partial_tuple , rest , of , iterators + // eg. izip!( @closure ((a, b), c) => (a, b, c) , dd , ee ) + ( @closure $p:pat => $tup:expr ) => { + |$p| $tup + }; + + // The "b" identifier is a different identifier on each recursion level thanks to hygiene. 
+ ( @closure $p:pat => ( $($tup:tt)* ) , $_iter:expr $( , $tail:expr )* ) => { + izip!(@closure ($p, b) => ( $($tup)*, b ) $( , $tail )*) + }; + + // unary + ($first:expr $(,)*) => { + std::iter::IntoIterator::into_iter($first) + }; + + // binary + ($first:expr, $second:expr $(,)*) => { + izip!($first) + .zip($second) + }; + + // n-ary where n > 2 + ( $first:expr $( , $rest:expr )* $(,)* ) => { + izip!($first) + $( + .zip($rest) + )* + .map( + izip!(@closure a => (a) $( , $rest )*) + ) + }; +} + +use memmap2::Mmap; + +/// The version of an index file +#[derive(PartialEq, Eq, Ord, PartialOrd, Debug, Hash, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +#[allow(missing_docs)] +pub enum Version { + V1 = 1, + V2 = 2, +} + +impl Default for Version { + fn default() -> Self { + Version::V2 + } +} + +impl Version { + /// The kind of hash to produce to be compatible to this kind of index + pub fn hash(&self) -> gix_hash::Kind { + match self { + Version::V1 | Version::V2 => gix_hash::Kind::Sha1, + } + } +} + +/// A way to indicate if a lookup, despite successful, was ambiguous or yielded exactly +/// one result in the particular index. +pub type PrefixLookupResult = Result<EntryIndex, ()>; + +/// The type for referring to indices of an entry within the index file. +pub type EntryIndex = u32; + +const FAN_LEN: usize = 256; + +/// A representation of a pack index file +pub struct File { + data: Mmap, + path: std::path::PathBuf, + version: Version, + num_objects: u32, + fan: [u32; FAN_LEN], + hash_len: usize, + object_hash: gix_hash::Kind, +} + +/// Basic file information +impl File { + /// The version of the pack index + pub fn version(&self) -> Version { + self.version + } + /// The path of the opened index file + pub fn path(&self) -> &std::path::Path { + &self.path + } + /// The amount of objects stored in the pack and index, as one past the highest entry index. 
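+ ///
+ /// A sketch of using it as the exclusive upper bound when addressing entries by index (the path is hypothetical):
+ ///
+ /// ```no_run
+ /// let index = gix_pack::index::File::at("objects/pack/pack-1234.idx", gix_hash::Kind::Sha1)
+ ///     .expect("a readable, well-formed index file");
+ /// for entry_index in 0..index.num_objects() {
+ ///     let oid = index.oid_at_index(entry_index);
+ ///     let pack_offset = index.pack_offset_at_index(entry_index);
+ ///     println!("{oid} occupies pack offset {pack_offset}");
+ /// }
+ /// ```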
+ pub fn num_objects(&self) -> EntryIndex { + self.num_objects + } + /// The kind of hash we assume + pub fn object_hash(&self) -> gix_hash::Kind { + self.object_hash + } +} + +const V2_SIGNATURE: &[u8] = b"\xfftOc"; +/// +pub mod init; + +pub(crate) mod access; +pub use access::Entry; + +/// +pub mod traverse; +mod util; +/// +pub mod verify; +/// +pub mod write; diff --git a/vendor/gix-pack/src/index/traverse/error.rs b/vendor/gix-pack/src/index/traverse/error.rs new file mode 100644 index 000000000..2310c3bab --- /dev/null +++ b/vendor/gix-pack/src/index/traverse/error.rs @@ -0,0 +1,44 @@ +use crate::index; + +/// Returned by [`index::File::traverse_with_index()`] and [`index::File::traverse_with_lookup`] +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error<E: std::error::Error + Send + Sync + 'static> { + #[error("One of the traversal processors failed")] + Processor(#[source] E), + #[error("Index file, pack file or object verification failed")] + VerifyChecksum(#[from] index::verify::checksum::Error), + #[error("The pack delta tree index could not be built")] + Tree(#[from] crate::cache::delta::from_offsets::Error), + #[error("The tree traversal failed")] + TreeTraversal(#[from] crate::cache::delta::traverse::Error), + #[error("Object {id} at offset {offset} could not be decoded")] + PackDecode { + id: gix_hash::ObjectId, + offset: u64, + source: crate::data::decode::Error, + }, + #[error("The packfiles checksum didn't match the index file checksum: expected {expected}, got {actual}")] + PackMismatch { + expected: gix_hash::ObjectId, + actual: gix_hash::ObjectId, + }, + #[error("The hash of {kind} object at offset {offset} didn't match the checksum in the index file: expected {expected}, got {actual}")] + PackObjectMismatch { + expected: gix_hash::ObjectId, + actual: gix_hash::ObjectId, + offset: u64, + kind: gix_object::Kind, + }, + #[error( + "The CRC32 of {kind} object at offset {offset} didn't match the checksum in the index file: expected {expected}, got {actual}" + )] + Crc32Mismatch { + expected: u32, + actual: u32, + offset: u64, + kind: gix_object::Kind, + }, + #[error("Interrupted")] + Interrupted, +} diff --git a/vendor/gix-pack/src/index/traverse/mod.rs b/vendor/gix-pack/src/index/traverse/mod.rs new file mode 100644 index 000000000..42c820b0e --- /dev/null +++ b/vendor/gix-pack/src/index/traverse/mod.rs @@ -0,0 +1,245 @@ +use std::sync::atomic::AtomicBool; + +use gix_features::{parallel, progress::Progress}; + +use crate::index; + +mod reduce; +/// +pub mod with_index; +/// +pub mod with_lookup; +use reduce::Reducer; + +mod error; +pub use error::Error; + +mod types; +pub use types::{Algorithm, ProgressId, SafetyCheck, Statistics}; + +/// Traversal options for [`index::File::traverse()`]. +#[derive(Debug, Clone)] +pub struct Options<F> { + /// The algorithm to employ. + pub traversal: Algorithm, + /// If `Some`, only use the given amount of threads. Otherwise, the amount of threads to use will be selected based on + /// the amount of available logical cores. + pub thread_limit: Option<usize>, + /// The kinds of safety checks to perform. + pub check: SafetyCheck, + /// A function to create a pack cache + pub make_pack_lookup_cache: F, +} + +impl Default for Options<fn() -> crate::cache::Never> { + fn default() -> Self { + Options { + check: Default::default(), + traversal: Default::default(), + thread_limit: None, + make_pack_lookup_cache: || crate::cache::Never, + } + } +} + +/// The outcome of the [`traverse()`][index::File::traverse()] method. 
+pub struct Outcome<P> { + /// The checksum obtained when hashing the file, which matched the checksum contained within the file. + pub actual_index_checksum: gix_hash::ObjectId, + /// The statistics obtained during traversal. + pub statistics: Statistics, + /// The input progress to allow reuse. + pub progress: P, +} + +/// Traversal of pack data files using an index file +impl index::File { + /// Iterate through all _decoded objects_ in the given `pack` and handle them with a `Processor`. + /// The return value is (pack-checksum, [`Outcome`], `progress`), thus the pack traversal will always verify + /// the whole packs checksum to assure it was correct. In case of bit-rod, the operation will abort early without + /// verifying all objects using the [interrupt mechanism][gix_features::interrupt] mechanism. + /// + /// # Algorithms + /// + /// Using the [`Options::traversal`] field one can chose between two algorithms providing different tradeoffs. Both invoke + /// `new_processor()` to create functions receiving decoded objects, their object kind, index entry and a progress instance to provide + /// progress information. + /// + /// * [`Algorithm::DeltaTreeLookup`] builds an index to avoid any unnecessary computation while resolving objects, avoiding + /// the need for a cache entirely, rendering `new_cache()` unused. + /// One could also call [`traverse_with_index()`][index::File::traverse_with_index()] directly. + /// * [`Algorithm::Lookup`] uses a cache created by `new_cache()` to avoid having to re-compute all bases of a delta-chain while + /// decoding objects. + /// One could also call [`traverse_with_lookup()`][index::File::traverse_with_lookup()] directly. + /// + /// Use [`thread_limit`][Options::thread_limit] to further control parallelism and [`check`][SafetyCheck] to define how much the passed + /// objects shall be verified beforehand. + pub fn traverse<P, C, Processor, E, F>( + &self, + pack: &crate::data::File, + progress: P, + should_interrupt: &AtomicBool, + new_processor: impl Fn() -> Processor + Send + Clone, + Options { + traversal, + thread_limit, + check, + make_pack_lookup_cache, + }: Options<F>, + ) -> Result<Outcome<P>, Error<E>> + where + P: Progress, + C: crate::cache::DecodeEntry, + E: std::error::Error + Send + Sync + 'static, + Processor: FnMut( + gix_object::Kind, + &[u8], + &index::Entry, + &mut <P::SubProgress as Progress>::SubProgress, + ) -> Result<(), E>, + F: Fn() -> C + Send + Clone, + { + match traversal { + Algorithm::Lookup => self.traverse_with_lookup( + new_processor, + pack, + progress, + should_interrupt, + with_lookup::Options { + thread_limit, + check, + make_pack_lookup_cache, + }, + ), + Algorithm::DeltaTreeLookup => self.traverse_with_index( + pack, + new_processor, + progress, + should_interrupt, + crate::index::traverse::with_index::Options { check, thread_limit }, + ), + } + } + + fn possibly_verify<E>( + &self, + pack: &crate::data::File, + check: SafetyCheck, + pack_progress: impl Progress, + index_progress: impl Progress, + should_interrupt: &AtomicBool, + ) -> Result<gix_hash::ObjectId, Error<E>> + where + E: std::error::Error + Send + Sync + 'static, + { + Ok(if check.file_checksum() { + if self.pack_checksum() != pack.checksum() { + return Err(Error::PackMismatch { + actual: pack.checksum(), + expected: self.pack_checksum(), + }); + } + let (pack_res, id) = parallel::join( + move || pack.verify_checksum(pack_progress, should_interrupt), + move || self.verify_checksum(index_progress, should_interrupt), + ); + pack_res?; + id? 
+ } else { + self.index_checksum() + }) + } + + #[allow(clippy::too_many_arguments)] + fn decode_and_process_entry<C, P, E>( + &self, + check: SafetyCheck, + pack: &crate::data::File, + cache: &mut C, + buf: &mut Vec<u8>, + progress: &mut P, + index_entry: &crate::index::Entry, + processor: &mut impl FnMut(gix_object::Kind, &[u8], &index::Entry, &mut P) -> Result<(), E>, + ) -> Result<crate::data::decode::entry::Outcome, Error<E>> + where + C: crate::cache::DecodeEntry, + P: Progress, + E: std::error::Error + Send + Sync + 'static, + { + let pack_entry = pack.entry(index_entry.pack_offset); + let pack_entry_data_offset = pack_entry.data_offset; + let entry_stats = pack + .decode_entry( + pack_entry, + buf, + |id, _| { + self.lookup(id).map(|index| { + crate::data::decode::entry::ResolvedBase::InPack(pack.entry(self.pack_offset_at_index(index))) + }) + }, + cache, + ) + .map_err(|e| Error::PackDecode { + source: e, + id: index_entry.oid, + offset: index_entry.pack_offset, + })?; + let object_kind = entry_stats.kind; + let header_size = (pack_entry_data_offset - index_entry.pack_offset) as usize; + let entry_len = header_size + entry_stats.compressed_size; + + process_entry( + check, + object_kind, + buf, + progress, + index_entry, + || pack.entry_crc32(index_entry.pack_offset, entry_len), + processor, + )?; + Ok(entry_stats) + } +} + +#[allow(clippy::too_many_arguments)] +fn process_entry<P, E>( + check: SafetyCheck, + object_kind: gix_object::Kind, + decompressed: &[u8], + progress: &mut P, + index_entry: &crate::index::Entry, + pack_entry_crc32: impl FnOnce() -> u32, + processor: &mut impl FnMut(gix_object::Kind, &[u8], &index::Entry, &mut P) -> Result<(), E>, +) -> Result<(), Error<E>> +where + P: Progress, + E: std::error::Error + Send + Sync + 'static, +{ + if check.object_checksum() { + let mut hasher = gix_features::hash::hasher(index_entry.oid.kind()); + hasher.update(&gix_object::encode::loose_header(object_kind, decompressed.len())); + hasher.update(decompressed); + + let actual_oid = gix_hash::ObjectId::from(hasher.digest()); + if actual_oid != index_entry.oid { + return Err(Error::PackObjectMismatch { + actual: actual_oid, + expected: index_entry.oid, + offset: index_entry.pack_offset, + kind: object_kind, + }); + } + if let Some(desired_crc32) = index_entry.crc32 { + let actual_crc32 = pack_entry_crc32(); + if actual_crc32 != desired_crc32 { + return Err(Error::Crc32Mismatch { + actual: actual_crc32, + expected: desired_crc32, + offset: index_entry.pack_offset, + kind: object_kind, + }); + } + } + } + processor(object_kind, decompressed, index_entry, progress).map_err(Error::Processor) +} diff --git a/vendor/gix-pack/src/index/traverse/reduce.rs b/vendor/gix-pack/src/index/traverse/reduce.rs new file mode 100644 index 000000000..e05341242 --- /dev/null +++ b/vendor/gix-pack/src/index/traverse/reduce.rs @@ -0,0 +1,129 @@ +use std::{ + sync::atomic::{AtomicBool, Ordering}, + time::Instant, +}; + +use gix_features::{ + parallel, + progress::Progress, + threading::{lock, Mutable, OwnShared}, +}; + +use crate::{data, index::traverse}; + +fn add_decode_result(lhs: &mut data::decode::entry::Outcome, rhs: data::decode::entry::Outcome) { + lhs.num_deltas += rhs.num_deltas; + lhs.decompressed_size += rhs.decompressed_size; + lhs.compressed_size += rhs.compressed_size; + lhs.object_size += rhs.object_size; +} + +fn div_decode_result(lhs: &mut data::decode::entry::Outcome, div: usize) { + if div != 0 { + lhs.num_deltas = (lhs.num_deltas as f32 / div as f32) as u32; + lhs.decompressed_size 
/= div as u64; + lhs.compressed_size /= div; + lhs.object_size /= div as u64; + } +} + +pub struct Reducer<'a, P, E> { + progress: OwnShared<Mutable<P>>, + check: traverse::SafetyCheck, + then: Instant, + entries_seen: usize, + stats: traverse::Statistics, + should_interrupt: &'a AtomicBool, + _error: std::marker::PhantomData<E>, +} + +impl<'a, P, E> Reducer<'a, P, E> +where + P: Progress, +{ + pub fn from_progress( + progress: OwnShared<Mutable<P>>, + pack_data_len_in_bytes: usize, + check: traverse::SafetyCheck, + should_interrupt: &'a AtomicBool, + ) -> Self { + let stats = traverse::Statistics { + pack_size: pack_data_len_in_bytes as u64, + ..Default::default() + }; + Reducer { + progress, + check, + then: Instant::now(), + entries_seen: 0, + should_interrupt, + stats, + _error: Default::default(), + } + } +} + +impl<'a, P, E> parallel::Reduce for Reducer<'a, P, E> +where + P: Progress, + E: std::error::Error + Send + Sync + 'static, +{ + type Input = Result<Vec<data::decode::entry::Outcome>, traverse::Error<E>>; + type FeedProduce = (); + type Output = traverse::Statistics; + type Error = traverse::Error<E>; + + fn feed(&mut self, input: Self::Input) -> Result<(), Self::Error> { + let chunk_stats: Vec<_> = match input { + Err(err @ traverse::Error::PackDecode { .. }) if !self.check.fatal_decode_error() => { + lock(&self.progress).info(format!("Ignoring decode error: {err}")); + return Ok(()); + } + res => res, + }?; + self.entries_seen += chunk_stats.len(); + + let chunk_total = chunk_stats.into_iter().fold( + data::decode::entry::Outcome::default_from_kind(gix_object::Kind::Tree), + |mut total, stats| { + *self.stats.objects_per_chain_length.entry(stats.num_deltas).or_insert(0) += 1; + self.stats.total_decompressed_entries_size += stats.decompressed_size; + self.stats.total_compressed_entries_size += stats.compressed_size as u64; + self.stats.total_object_size += stats.object_size; + use gix_object::Kind::*; + match stats.kind { + Commit => self.stats.num_commits += 1, + Tree => self.stats.num_trees += 1, + Blob => self.stats.num_blobs += 1, + Tag => self.stats.num_tags += 1, + } + add_decode_result(&mut total, stats); + total + }, + ); + + add_decode_result(&mut self.stats.average, chunk_total); + lock(&self.progress).set(self.entries_seen); + + if self.should_interrupt.load(Ordering::SeqCst) { + return Err(Self::Error::Interrupted); + } + Ok(()) + } + + fn finalize(mut self) -> Result<Self::Output, Self::Error> { + div_decode_result(&mut self.stats.average, self.entries_seen); + + let elapsed_s = self.then.elapsed().as_secs_f32(); + let objects_per_second = (self.entries_seen as f32 / elapsed_s) as u32; + + lock(&self.progress).info(format!( + "of {} objects done in {:.2}s ({} objects/s, ~{}/s)", + self.entries_seen, + elapsed_s, + objects_per_second, + gix_features::progress::bytesize::ByteSize(self.stats.average.object_size * objects_per_second as u64) + )); + Ok(self.stats) + } +} diff --git a/vendor/gix-pack/src/index/traverse/types.rs b/vendor/gix-pack/src/index/traverse/types.rs new file mode 100644 index 000000000..84ebc8932 --- /dev/null +++ b/vendor/gix-pack/src/index/traverse/types.rs @@ -0,0 +1,123 @@ +use std::{collections::BTreeMap, marker::PhantomData}; + +/// Statistics regarding object encountered during execution of the [`traverse()`][crate::index::File::traverse()] method. 
+#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Statistics { + /// The average over all decoded objects + pub average: crate::data::decode::entry::Outcome, + /// A mapping of the length of the chain to the amount of objects at that length. + /// + /// A length of 0 indicates full objects, and everything above that involves the given amount + /// of delta objects. + pub objects_per_chain_length: BTreeMap<u32, u32>, + /// The amount of bytes in all compressed streams, one per entry + pub total_compressed_entries_size: u64, + /// The amount of bytes in all decompressed streams, one per entry + pub total_decompressed_entries_size: u64, + /// The amount of bytes occupied by all undeltified, decompressed objects + pub total_object_size: u64, + /// The amount of bytes occupied by the pack itself, in bytes + pub pack_size: u64, + /// The amount of objects encountered that where commits + pub num_commits: u32, + /// The amount of objects encountered that where trees + pub num_trees: u32, + /// The amount of objects encountered that where tags + pub num_tags: u32, + /// The amount of objects encountered that where blobs + pub num_blobs: u32, +} + +impl Default for Statistics { + fn default() -> Self { + Statistics { + average: crate::data::decode::entry::Outcome::default_from_kind(gix_object::Kind::Tree), + objects_per_chain_length: Default::default(), + total_compressed_entries_size: 0, + total_decompressed_entries_size: 0, + total_object_size: 0, + pack_size: 0, + num_blobs: 0, + num_commits: 0, + num_trees: 0, + num_tags: 0, + } + } +} + +/// The ways to validate decoded objects before passing them to the processor. +#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum SafetyCheck { + /// Don't verify the validity of the checksums stored in the index and pack file + SkipFileChecksumVerification, + + /// All of the above, and also don't perform any object checksum verification + SkipFileAndObjectChecksumVerification, + + /// All of the above, and only log object decode errors. + /// + /// Useful if there is a damaged pack and you would like to traverse as many objects as possible. 
+ SkipFileAndObjectChecksumVerificationAndNoAbortOnDecodeError, + + /// Perform all available safety checks before operating on the pack and + /// abort if any of them fails + All, +} + +impl SafetyCheck { + pub(crate) fn file_checksum(&self) -> bool { + matches!(self, SafetyCheck::All) + } + pub(crate) fn object_checksum(&self) -> bool { + matches!(self, SafetyCheck::All | SafetyCheck::SkipFileChecksumVerification) + } + pub(crate) fn fatal_decode_error(&self) -> bool { + match self { + SafetyCheck::All + | SafetyCheck::SkipFileChecksumVerification + | SafetyCheck::SkipFileAndObjectChecksumVerification => true, + SafetyCheck::SkipFileAndObjectChecksumVerificationAndNoAbortOnDecodeError => false, + } + } +} + +impl Default for SafetyCheck { + fn default() -> Self { + SafetyCheck::All + } +} + +/// The way we verify the pack +#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum Algorithm { + /// Build an index to allow decoding each delta and base exactly once, saving a lot of computational + /// resource at the expense of resident memory, as we will use an additional `DeltaTree` to accelerate + /// delta chain resolution. + DeltaTreeLookup, + /// We lookup each object similarly to what would happen during normal repository use. + /// Uses more compute resources as it will resolve delta chains from back to front, but start right away + /// without indexing or investing any memory in indices. + /// + /// This option may be well suited for big packs in memory-starved system that support memory mapping. + Lookup, +} + +impl Default for Algorithm { + fn default() -> Self { + Algorithm::DeltaTreeLookup + } +} + +/// The progress ids used in [`traverse()`][crate::index::File::traverse()] . +/// +/// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. +#[derive(Debug, Copy, Clone)] +pub enum ProgressId { + /// A root progress which isn't actually used, but links to the `ProgressId` of the lookup version of the algorithm. + WithLookup(PhantomData<super::with_lookup::ProgressId>), + /// A root progress which isn't actually used, but links to the `ProgressId` of the indexed version of the algorithm. + WithIndex(PhantomData<super::with_index::ProgressId>), +} diff --git a/vendor/gix-pack/src/index/traverse/with_index.rs b/vendor/gix-pack/src/index/traverse/with_index.rs new file mode 100644 index 000000000..769bbd07f --- /dev/null +++ b/vendor/gix-pack/src/index/traverse/with_index.rs @@ -0,0 +1,230 @@ +use std::sync::atomic::{AtomicBool, Ordering}; + +use gix_features::{parallel, progress::Progress}; + +use super::Error; +use crate::{ + cache::delta::traverse, + index::{self, traverse::Outcome, util::index_entries_sorted_by_offset_ascending}, +}; + +/// Traversal options for [`traverse_with_index()`][index::File::traverse_with_index()] +#[derive(Default)] +pub struct Options { + /// If `Some`, only use the given amount of threads. Otherwise, the amount of threads to use will be selected based on + /// the amount of available logical cores. + pub thread_limit: Option<usize>, + /// The kinds of safety checks to perform. + pub check: crate::index::traverse::SafetyCheck, +} + +/// The progress ids used in [`index::File::traverse_with_index()`]. +/// +/// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. 
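+///
+/// A small sketch (assuming the module is reachable as `gix_pack::index::traverse::with_index`) of
+/// obtaining the stable 4-byte id of one of these sub-progresses for custom rendering:
+///
+/// ```
+/// use gix_pack::index::traverse::with_index::ProgressId;
+///
+/// let id: gix_features::progress::Id = ProgressId::DecodedObjects.into();
+/// assert_eq!(id, *b"PTRO");
+/// ```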
+#[derive(Debug, Copy, Clone)] +pub enum ProgressId { + /// The amount of bytes currently processed to generate a checksum of the *pack data file*. + HashPackDataBytes, + /// The amount of bytes currently processed to generate a checksum of the *pack index file*. + HashPackIndexBytes, + /// Collect all object hashes into a vector and sort it by their pack offset. + CollectSortedIndexEntries, + /// Count the objects processed when building a cache tree from all objects in a pack index. + TreeFromOffsetsObjects, + /// The amount of objects which were decoded. + DecodedObjects, + /// The amount of bytes that were decoded in total, as the sum of all bytes to represent all decoded objects. + DecodedBytes, +} + +impl From<ProgressId> for gix_features::progress::Id { + fn from(v: ProgressId) -> Self { + match v { + ProgressId::HashPackDataBytes => *b"PTHP", + ProgressId::HashPackIndexBytes => *b"PTHI", + ProgressId::CollectSortedIndexEntries => *b"PTCE", + ProgressId::TreeFromOffsetsObjects => *b"PTDI", + ProgressId::DecodedObjects => *b"PTRO", + ProgressId::DecodedBytes => *b"PTDB", + } + } +} + +/// Traversal with index +impl index::File { + /// Iterate through all _decoded objects_ in the given `pack` and handle them with a `Processor`, using an index to reduce waste + /// at the cost of memory. + /// + /// For more details, see the documentation on the [`traverse()`][index::File::traverse()] method. + pub fn traverse_with_index<P, Processor, E>( + &self, + pack: &crate::data::File, + new_processor: impl Fn() -> Processor + Send + Clone, + mut progress: P, + should_interrupt: &AtomicBool, + Options { check, thread_limit }: Options, + ) -> Result<Outcome<P>, Error<E>> + where + P: Progress, + Processor: FnMut( + gix_object::Kind, + &[u8], + &index::Entry, + &mut <P::SubProgress as Progress>::SubProgress, + ) -> Result<(), E>, + E: std::error::Error + Send + Sync + 'static, + { + let (verify_result, traversal_result) = parallel::join( + { + let pack_progress = progress.add_child_with_id( + format!( + "Hash of pack '{}'", + pack.path().file_name().expect("pack has filename").to_string_lossy() + ), + ProgressId::HashPackDataBytes.into(), + ); + let index_progress = progress.add_child_with_id( + format!( + "Hash of index '{}'", + self.path.file_name().expect("index has filename").to_string_lossy() + ), + ProgressId::HashPackIndexBytes.into(), + ); + move || { + let res = self.possibly_verify(pack, check, pack_progress, index_progress, should_interrupt); + if res.is_err() { + should_interrupt.store(true, Ordering::SeqCst); + } + res + } + }, + || -> Result<_, Error<_>> { + let sorted_entries = index_entries_sorted_by_offset_ascending( + self, + progress.add_child_with_id("collecting sorted index", ProgressId::CollectSortedIndexEntries.into()), + ); /* Pack Traverse Collect sorted Entries */ + let tree = crate::cache::delta::Tree::from_offsets_in_pack( + pack.path(), + sorted_entries.into_iter().map(Entry::from), + |e| e.index_entry.pack_offset, + |id| self.lookup(id).map(|idx| self.pack_offset_at_index(idx)), + progress.add_child_with_id("indexing", ProgressId::TreeFromOffsetsObjects.into()), + should_interrupt, + self.object_hash, + )?; + let mut outcome = digest_statistics(tree.traverse( + |slice, out| pack.entry_slice(slice).map(|entry| out.copy_from_slice(entry)), + pack.pack_end() as u64, + new_processor, + |data, + progress, + traverse::Context { + entry: pack_entry, + entry_end, + decompressed: bytes, + state: ref mut processor, + level, + }| { + let object_kind = 
pack_entry.header.as_kind().expect("non-delta object"); + data.level = level; + data.decompressed_size = pack_entry.decompressed_size; + data.object_kind = object_kind; + data.compressed_size = entry_end - pack_entry.data_offset; + data.object_size = bytes.len() as u64; + let result = crate::index::traverse::process_entry( + check, + object_kind, + bytes, + progress, + &data.index_entry, + || { + // TODO: Fix this - we overwrite the header of 'data' which also changes the computed entry size, + // causing index and pack to seemingly mismatch. This is surprising, and should be done differently. + // debug_assert_eq!(&data.index_entry.pack_offset, &pack_entry.pack_offset()); + gix_features::hash::crc32( + pack.entry_slice(data.index_entry.pack_offset..entry_end) + .expect("slice pointing into the pack (by now data is verified)"), + ) + }, + processor, + ); + match result { + Err(err @ Error::PackDecode { .. }) if !check.fatal_decode_error() => { + progress.info(format!("Ignoring decode error: {err}")); + Ok(()) + } + res => res, + } + }, + crate::cache::delta::traverse::Options { + object_progress: progress.add_child_with_id("Resolving", ProgressId::DecodedObjects.into()), + size_progress: progress.add_child_with_id("Decoding", ProgressId::DecodedBytes.into()), + thread_limit, + should_interrupt, + object_hash: self.object_hash, + }, + )?); + outcome.pack_size = pack.data_len() as u64; + Ok(outcome) + }, + ); + Ok(Outcome { + actual_index_checksum: verify_result?, + statistics: traversal_result?, + progress, + }) + } +} + +struct Entry { + index_entry: crate::index::Entry, + object_kind: gix_object::Kind, + object_size: u64, + decompressed_size: u64, + compressed_size: u64, + level: u16, +} + +impl From<crate::index::Entry> for Entry { + fn from(index_entry: crate::index::Entry) -> Self { + Entry { + index_entry, + level: 0, + object_kind: gix_object::Kind::Tree, + object_size: 0, + decompressed_size: 0, + compressed_size: 0, + } + } +} + +fn digest_statistics(traverse::Outcome { roots, children }: traverse::Outcome<Entry>) -> index::traverse::Statistics { + let mut res = index::traverse::Statistics::default(); + let average = &mut res.average; + for item in roots.iter().chain(children.iter()) { + res.total_compressed_entries_size += item.data.compressed_size; + res.total_decompressed_entries_size += item.data.decompressed_size; + res.total_object_size += item.data.object_size; + *res.objects_per_chain_length.entry(item.data.level as u32).or_insert(0) += 1; + + average.decompressed_size += item.data.decompressed_size; + average.compressed_size += item.data.compressed_size as usize; + average.object_size += item.data.object_size; + average.num_deltas += item.data.level as u32; + use gix_object::Kind::*; + match item.data.object_kind { + Blob => res.num_blobs += 1, + Tree => res.num_trees += 1, + Tag => res.num_tags += 1, + Commit => res.num_commits += 1, + }; + } + + let num_nodes = roots.len() + children.len(); + average.decompressed_size /= num_nodes as u64; + average.compressed_size /= num_nodes; + average.object_size /= num_nodes as u64; + average.num_deltas /= num_nodes as u32; + + res +} diff --git a/vendor/gix-pack/src/index/traverse/with_lookup.rs b/vendor/gix-pack/src/index/traverse/with_lookup.rs new file mode 100644 index 000000000..509ae4e4f --- /dev/null +++ b/vendor/gix-pack/src/index/traverse/with_lookup.rs @@ -0,0 +1,190 @@ +use std::sync::atomic::{AtomicBool, Ordering}; + +use gix_features::{ + parallel::{self, in_parallel_if}, + progress::{self, Progress}, + 
threading::{lock, Mutable, OwnShared}, +}; + +use super::{Error, Reducer}; +use crate::{ + data, index, + index::{traverse::Outcome, util}, +}; + +/// Traversal options for [`index::File::traverse_with_lookup()`] +pub struct Options<F> { + /// If `Some`, only use the given amount of threads. Otherwise, the amount of threads to use will be selected based on + /// the amount of available logical cores. + pub thread_limit: Option<usize>, + /// The kinds of safety checks to perform. + pub check: index::traverse::SafetyCheck, + /// A function to create a pack cache + pub make_pack_lookup_cache: F, +} + +impl Default for Options<fn() -> crate::cache::Never> { + fn default() -> Self { + Options { + check: Default::default(), + thread_limit: None, + make_pack_lookup_cache: || crate::cache::Never, + } + } +} + +/// The progress ids used in [`index::File::traverse_with_lookup()`]. +/// +/// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. +#[derive(Debug, Copy, Clone)] +pub enum ProgressId { + /// The amount of bytes currently processed to generate a checksum of the *pack data file*. + HashPackDataBytes, + /// The amount of bytes currently processed to generate a checksum of the *pack index file*. + HashPackIndexBytes, + /// Collect all object hashes into a vector and sort it by their pack offset. + CollectSortedIndexEntries, + /// The amount of objects which were decoded by brute-force. + DecodedObjects, +} + +impl From<ProgressId> for gix_features::progress::Id { + fn from(v: ProgressId) -> Self { + match v { + ProgressId::HashPackDataBytes => *b"PTHP", + ProgressId::HashPackIndexBytes => *b"PTHI", + ProgressId::CollectSortedIndexEntries => *b"PTCE", + ProgressId::DecodedObjects => *b"PTRO", + } + } +} + +/// Verify and validate the content of the index file +impl index::File { + /// Iterate through all _decoded objects_ in the given `pack` and handle them with a `Processor` using a cache to reduce the amount of + /// waste while decoding objects. + /// + /// For more details, see the documentation on the [`traverse()`][index::File::traverse()] method. 
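Before looking at the traversal function itself, here is a hedged sketch of how these options might be filled in by a caller. The public paths `gix_pack::index::traverse::with_lookup::Options`, `gix_pack::cache::Never` and the field values are assumptions for illustration only:

```rust
use gix_pack::{
    cache,
    index::traverse::{with_lookup::Options, SafetyCheck},
};

fn main() {
    // Default: all safety checks, auto-selected thread count, and the no-op cache factory.
    let defaults: Options<fn() -> cache::Never> = Options::default();

    // Cap the worker threads but keep the defaults otherwise; a real caller could
    // return one of the caches from `gix_pack::cache` from the factory instead.
    let limited = Options {
        thread_limit: Some(4),
        check: SafetyCheck::All,
        ..defaults
    };
    assert_eq!(limited.thread_limit, Some(4));
}
```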
+ pub fn traverse_with_lookup<P, C, Processor, E, F>( + &self, + new_processor: impl Fn() -> Processor + Send + Clone, + pack: &crate::data::File, + mut progress: P, + should_interrupt: &AtomicBool, + Options { + thread_limit, + check, + make_pack_lookup_cache, + }: Options<F>, + ) -> Result<Outcome<P>, Error<E>> + where + P: Progress, + C: crate::cache::DecodeEntry, + E: std::error::Error + Send + Sync + 'static, + Processor: FnMut( + gix_object::Kind, + &[u8], + &index::Entry, + &mut <P::SubProgress as Progress>::SubProgress, + ) -> Result<(), E>, + F: Fn() -> C + Send + Clone, + { + let (verify_result, traversal_result) = parallel::join( + { + let pack_progress = progress.add_child_with_id( + format!( + "Hash of pack '{}'", + pack.path().file_name().expect("pack has filename").to_string_lossy() + ), + ProgressId::HashPackDataBytes.into(), + ); + let index_progress = progress.add_child_with_id( + format!( + "Hash of index '{}'", + self.path.file_name().expect("index has filename").to_string_lossy() + ), + ProgressId::HashPackIndexBytes.into(), + ); + move || { + let res = self.possibly_verify(pack, check, pack_progress, index_progress, should_interrupt); + if res.is_err() { + should_interrupt.store(true, Ordering::SeqCst); + } + res + } + }, + || { + let index_entries = util::index_entries_sorted_by_offset_ascending( + self, + progress.add_child_with_id("collecting sorted index", ProgressId::CollectSortedIndexEntries.into()), + ); + + let (chunk_size, thread_limit, available_cores) = + parallel::optimize_chunk_size_and_thread_limit(1000, Some(index_entries.len()), thread_limit, None); + let there_are_enough_entries_to_process = || index_entries.len() > chunk_size * available_cores; + let input_chunks = index_entries.chunks(chunk_size.max(chunk_size)); + let reduce_progress = OwnShared::new(Mutable::new({ + let mut p = progress.add_child_with_id("Traversing", ProgressId::DecodedObjects.into()); + p.init(Some(self.num_objects() as usize), progress::count("objects")); + p + })); + let state_per_thread = { + let reduce_progress = reduce_progress.clone(); + move |index| { + ( + make_pack_lookup_cache(), + new_processor(), + Vec::with_capacity(2048), // decode buffer + lock(&reduce_progress) + .add_child_with_id(format!("thread {index}"), gix_features::progress::UNKNOWN), // per thread progress + ) + } + }; + + in_parallel_if( + there_are_enough_entries_to_process, + input_chunks, + thread_limit, + state_per_thread, + |entries: &[index::Entry], + (cache, ref mut processor, buf, progress)| + -> Result<Vec<data::decode::entry::Outcome>, Error<_>> { + progress.init( + Some(entries.len()), + gix_features::progress::count_with_decimals("objects", 2), + ); + let mut stats = Vec::with_capacity(entries.len()); + progress.set(0); + for index_entry in entries.iter() { + let result = self.decode_and_process_entry( + check, + pack, + cache, + buf, + progress, + index_entry, + processor, + ); + progress.inc(); + let stat = match result { + Err(err @ Error::PackDecode { .. 
}) if !check.fatal_decode_error() => { + progress.info(format!("Ignoring decode error: {err}")); + continue; + } + res => res, + }?; + stats.push(stat); + } + Ok(stats) + }, + Reducer::from_progress(reduce_progress, pack.data_len(), check, should_interrupt), + ) + }, + ); + Ok(Outcome { + actual_index_checksum: verify_result?, + statistics: traversal_result?, + progress, + }) + } +} diff --git a/vendor/gix-pack/src/index/util.rs b/vendor/gix-pack/src/index/util.rs new file mode 100644 index 000000000..284ee6158 --- /dev/null +++ b/vendor/gix-pack/src/index/util.rs @@ -0,0 +1,47 @@ +use std::{io, time::Instant}; + +use gix_features::progress::{self, Progress}; + +pub(crate) fn index_entries_sorted_by_offset_ascending( + idx: &crate::index::File, + mut progress: impl Progress, +) -> Vec<crate::index::Entry> { + progress.init(Some(idx.num_objects as usize), progress::count("entries")); + let start = Instant::now(); + + let mut v = Vec::with_capacity(idx.num_objects as usize); + for entry in idx.iter() { + v.push(entry); + progress.inc(); + } + v.sort_by_key(|e| e.pack_offset); + + progress.show_throughput(start); + v +} + +pub(crate) struct Count<W> { + pub bytes: u64, + pub inner: W, +} + +impl<W> Count<W> { + pub fn new(inner: W) -> Self { + Count { bytes: 0, inner } + } +} + +impl<W> io::Write for Count<W> +where + W: io::Write, +{ + fn write(&mut self, buf: &[u8]) -> io::Result<usize> { + let written = self.inner.write(buf)?; + self.bytes += written as u64; + Ok(written) + } + + fn flush(&mut self) -> io::Result<()> { + self.inner.flush() + } +} diff --git a/vendor/gix-pack/src/index/verify.rs b/vendor/gix-pack/src/index/verify.rs new file mode 100644 index 000000000..4a4852fb6 --- /dev/null +++ b/vendor/gix-pack/src/index/verify.rs @@ -0,0 +1,290 @@ +use std::sync::atomic::AtomicBool; + +use gix_features::progress::Progress; +use gix_object::{bstr::ByteSlice, WriteTo}; + +use crate::index; + +/// +pub mod integrity { + use std::marker::PhantomData; + + use gix_object::bstr::BString; + + /// Returned by [`index::File::verify_integrity()`][crate::index::File::verify_integrity()]. + #[derive(thiserror::Error, Debug)] + #[allow(missing_docs)] + pub enum Error { + #[error("The fan at index {index} is out of order as it's larger then the following value.")] + Fan { index: usize }, + #[error("{kind} object {id} could not be decoded")] + ObjectDecode { + source: gix_object::decode::Error, + kind: gix_object::Kind, + id: gix_hash::ObjectId, + }, + #[error("{kind} object {id} wasn't re-encoded without change, wanted\n{expected}\n\nGOT\n\n{actual}")] + ObjectEncodeMismatch { + kind: gix_object::Kind, + id: gix_hash::ObjectId, + expected: BString, + actual: BString, + }, + } + + /// Returned by [`index::File::verify_integrity()`][crate::index::File::verify_integrity()]. + pub struct Outcome<P> { + /// The computed checksum of the index which matched the stored one. + pub actual_index_checksum: gix_hash::ObjectId, + /// The packs traversal outcome, if one was provided + pub pack_traverse_statistics: Option<crate::index::traverse::Statistics>, + /// The provided progress instance. + pub progress: P, + } + + /// Additional options to define how the integrity should be verified. + #[derive(Clone)] + pub struct Options<F> { + /// The thoroughness of the verification + pub verify_mode: crate::index::verify::Mode, + /// The way to traverse packs + pub traversal: crate::index::traverse::Algorithm, + /// The amount of threads to use of `Some(N)`, with `None|Some(0)` using all available cores are used. 
+ pub thread_limit: Option<usize>, + /// A function to create a pack cache + pub make_pack_lookup_cache: F, + } + + impl Default for Options<fn() -> crate::cache::Never> { + fn default() -> Self { + Options { + verify_mode: Default::default(), + traversal: Default::default(), + thread_limit: None, + make_pack_lookup_cache: || crate::cache::Never, + } + } + } + + /// The progress ids used in [`index::File::verify_integrity()`][crate::index::File::verify_integrity()]. + /// + /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. + #[derive(Debug, Copy, Clone)] + pub enum ProgressId { + /// The amount of bytes read to verify the index checksum. + ChecksumBytes, + /// A root progress for traversal which isn't actually used directly, but here to link to the respective `ProgressId` types. + Traverse(PhantomData<crate::index::verify::index::traverse::ProgressId>), + } + + impl From<ProgressId> for gix_features::progress::Id { + fn from(v: ProgressId) -> Self { + match v { + ProgressId::ChecksumBytes => *b"PTHI", + ProgressId::Traverse(_) => gix_features::progress::UNKNOWN, + } + } + } +} + +/// +pub mod checksum { + /// Returned by [`index::File::verify_checksum()`][crate::index::File::verify_checksum()]. + pub type Error = crate::verify::checksum::Error; +} + +/// Various ways in which a pack and index can be verified +#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +pub enum Mode { + /// Validate the object hash and CRC32 + HashCrc32, + /// Validate hash and CRC32, and decode each non-Blob object. + /// Each object should be valid, i.e. be decodable. + HashCrc32Decode, + /// Validate hash and CRC32, and decode and encode each non-Blob object. + /// Each object should yield exactly the same hash when re-encoded. + HashCrc32DecodeEncode, +} + +impl Default for Mode { + fn default() -> Self { + Mode::HashCrc32DecodeEncode + } +} + +/// Information to allow verifying the integrity of an index with the help of its corresponding pack. +pub struct PackContext<'a, F> { + /// The pack data file itself. + pub data: &'a crate::data::File, + /// The options further configuring the pack traversal and verification + pub options: integrity::Options<F>, +} + +/// Verify and validate the content of the index file +impl index::File { + /// Returns the trailing hash stored at the end of this index file. + /// + /// It's a hash over all bytes of the index. + pub fn index_checksum(&self) -> gix_hash::ObjectId { + gix_hash::ObjectId::from(&self.data[self.data.len() - self.hash_len..]) + } + + /// Returns the hash of the pack data file that this index file corresponds to. + /// + /// It should [`crate::data::File::checksum()`] of the corresponding pack data file. + pub fn pack_checksum(&self) -> gix_hash::ObjectId { + let from = self.data.len() - self.hash_len * 2; + gix_hash::ObjectId::from(&self.data[from..][..self.hash_len]) + } + + /// Validate that our [`index_checksum()`][index::File::index_checksum()] matches the actual contents + /// of this index file, and return it if it does. + pub fn verify_checksum( + &self, + progress: impl Progress, + should_interrupt: &AtomicBool, + ) -> Result<gix_hash::ObjectId, checksum::Error> { + crate::verify::checksum_on_disk_or_mmap( + self.path(), + &self.data, + self.index_checksum(), + self.object_hash, + progress, + should_interrupt, + ) + } + + /// The most thorough validation of integrity of both index file and the corresponding pack data file, if provided. 
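The `index_checksum()` and `pack_checksum()` accessors above read fixed-width trailers from the end of the memory-mapped index: the index checksum occupies the final `hash_len` bytes, preceded by the checksum of the corresponding pack. A std-only sketch of that slicing, with an all-zero buffer and the 20-byte SHA-1 length standing in for real data:

```rust
/// Return (index_checksum, pack_checksum) slices from raw `.idx` file contents.
fn trailing_checksums(index_data: &[u8], hash_len: usize) -> (&[u8], &[u8]) {
    let index_checksum = &index_data[index_data.len() - hash_len..];
    let pack_checksum = &index_data[index_data.len() - hash_len * 2..][..hash_len];
    (index_checksum, pack_checksum)
}

fn main() {
    // Stand-in for a mapped index file whose last 40 bytes are the pack and index SHA-1 sums.
    let data = vec![0u8; 1024];
    let (index_sum, pack_sum) = trailing_checksums(&data, 20);
    assert_eq!(index_sum.len(), 20);
    assert_eq!(pack_sum.len(), 20);
}
```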
+ /// Returns the checksum of the index file, the traversal outcome and the given progress if the integrity check is successful. + /// + /// If `pack` is provided, it is expected (and validated to be) the pack belonging to this index. + /// It will be used to validate internal integrity of the pack before checking each objects integrity + /// is indeed as advertised via its SHA1 as stored in this index, as well as the CRC32 hash. + /// The last member of the Option is a function returning an implementation of [`crate::cache::DecodeEntry`] to be used if + /// the [`index::traverse::Algorithm`] is `Lookup`. + /// To set this to `None`, use `None::<(_, _, _, fn() -> crate::cache::Never)>`. + /// + /// The `thread_limit` optionally specifies the amount of threads to be used for the [pack traversal][index::File::traverse()]. + /// `make_cache` is only used in case a `pack` is specified, use existing implementations in the [`crate::cache`] module. + /// + /// # Tradeoffs + /// + /// The given `progress` is inevitably consumed if there is an error, which is a tradeoff chosen to easily allow using `?` in the + /// error case. + pub fn verify_integrity<P, C, F>( + &self, + pack: Option<PackContext<'_, F>>, + mut progress: P, + should_interrupt: &AtomicBool, + ) -> Result<integrity::Outcome<P>, index::traverse::Error<index::verify::integrity::Error>> + where + P: Progress, + C: crate::cache::DecodeEntry, + F: Fn() -> C + Send + Clone, + { + if let Some(first_invalid) = crate::verify::fan(&self.fan) { + return Err(index::traverse::Error::Processor(integrity::Error::Fan { + index: first_invalid, + })); + } + + match pack { + Some(PackContext { + data: pack, + options: + integrity::Options { + verify_mode, + traversal, + thread_limit, + make_pack_lookup_cache, + }, + }) => self + .traverse( + pack, + progress, + should_interrupt, + || { + let mut encode_buf = Vec::with_capacity(2048); + move |kind, data, index_entry, progress| { + Self::verify_entry(verify_mode, &mut encode_buf, kind, data, index_entry, progress) + } + }, + index::traverse::Options { + traversal, + thread_limit, + check: index::traverse::SafetyCheck::All, + make_pack_lookup_cache, + }, + ) + .map(|o| integrity::Outcome { + actual_index_checksum: o.actual_index_checksum, + pack_traverse_statistics: Some(o.statistics), + progress: o.progress, + }), + None => self + .verify_checksum( + progress.add_child_with_id("Sha1 of index", integrity::ProgressId::ChecksumBytes.into()), + should_interrupt, + ) + .map_err(Into::into) + .map(|id| integrity::Outcome { + actual_index_checksum: id, + pack_traverse_statistics: None, + progress, + }), + } + } + + #[allow(clippy::too_many_arguments)] + fn verify_entry<P>( + verify_mode: Mode, + encode_buf: &mut Vec<u8>, + object_kind: gix_object::Kind, + buf: &[u8], + index_entry: &index::Entry, + progress: &mut P, + ) -> Result<(), integrity::Error> + where + P: Progress, + { + if let Mode::HashCrc32Decode | Mode::HashCrc32DecodeEncode = verify_mode { + use gix_object::Kind::*; + match object_kind { + Tree | Commit | Tag => { + let object = gix_object::ObjectRef::from_bytes(object_kind, buf).map_err(|err| { + integrity::Error::ObjectDecode { + source: err, + kind: object_kind, + id: index_entry.oid, + } + })?; + if let Mode::HashCrc32DecodeEncode = verify_mode { + encode_buf.clear(); + object + .write_to(&mut *encode_buf) + .expect("writing to a memory buffer never fails"); + if encode_buf.as_slice() != buf { + let mut should_return_error = true; + if let gix_object::Kind::Tree = object_kind { + if 
buf.as_bstr().find(b"100664").is_some() || buf.as_bstr().find(b"100640").is_some() { + progress.info(format!("Tree object {} would be cleaned up during re-serialization, replacing mode '100664|100640' with '100644'", index_entry.oid)); + should_return_error = false + } + } + if should_return_error { + return Err(integrity::Error::ObjectEncodeMismatch { + kind: object_kind, + id: index_entry.oid, + expected: buf.into(), + actual: encode_buf.clone().into(), + }); + } + } + } + } + Blob => {} + }; + } + Ok(()) + } +} diff --git a/vendor/gix-pack/src/index/write/encode.rs b/vendor/gix-pack/src/index/write/encode.rs new file mode 100644 index 000000000..80f0cac61 --- /dev/null +++ b/vendor/gix-pack/src/index/write/encode.rs @@ -0,0 +1,127 @@ +use std::{cmp::Ordering, io}; + +pub(crate) const LARGE_OFFSET_THRESHOLD: u64 = 0x7fff_ffff; +pub(crate) const HIGH_BIT: u32 = 0x8000_0000; + +use gix_features::{ + hash, + progress::{self, Progress}, +}; + +use crate::index::{util::Count, V2_SIGNATURE}; + +pub(crate) fn write_to( + out: impl io::Write, + entries_sorted_by_oid: Vec<crate::cache::delta::Item<crate::index::write::TreeEntry>>, + pack_hash: &gix_hash::ObjectId, + kind: crate::index::Version, + mut progress: impl Progress, +) -> io::Result<gix_hash::ObjectId> { + use io::Write; + assert_eq!(kind, crate::index::Version::V2, "Can only write V2 packs right now"); + assert!( + entries_sorted_by_oid.len() <= u32::MAX as usize, + "a pack cannot have more than u32::MAX objects" + ); + + // Write header + let mut out = Count::new(std::io::BufWriter::with_capacity( + 8 * 4096, + hash::Write::new(out, kind.hash()), + )); + out.write_all(V2_SIGNATURE)?; + out.write_all(&(kind as u32).to_be_bytes())?; + + progress.init(Some(4), progress::steps()); + let start = std::time::Instant::now(); + let _info = progress.add_child_with_id("writing fan-out table", gix_features::progress::UNKNOWN); + let fan_out = fanout(entries_sorted_by_oid.iter().map(|e| e.data.id.first_byte())); + + for value in fan_out.iter() { + out.write_all(&value.to_be_bytes())?; + } + + progress.inc(); + let _info = progress.add_child_with_id("writing ids", gix_features::progress::UNKNOWN); + for entry in &entries_sorted_by_oid { + out.write_all(entry.data.id.as_slice())?; + } + + progress.inc(); + let _info = progress.add_child_with_id("writing crc32", gix_features::progress::UNKNOWN); + for entry in &entries_sorted_by_oid { + out.write_all(&entry.data.crc32.to_be_bytes())?; + } + + progress.inc(); + let _info = progress.add_child_with_id("writing offsets", gix_features::progress::UNKNOWN); + { + let mut offsets64 = Vec::<u64>::new(); + for entry in &entries_sorted_by_oid { + let offset: u32 = if entry.offset > LARGE_OFFSET_THRESHOLD { + assert!( + offsets64.len() < LARGE_OFFSET_THRESHOLD as usize, + "Encoding breakdown - way too many 64bit offsets" + ); + offsets64.push(entry.offset); + ((offsets64.len() - 1) as u32) | HIGH_BIT + } else { + entry.offset as u32 + }; + out.write_all(&offset.to_be_bytes())?; + } + for value in offsets64 { + out.write_all(&value.to_be_bytes())?; + } + } + + out.write_all(pack_hash.as_slice())?; + + let bytes_written_without_trailer = out.bytes; + let mut out = out.inner.into_inner()?; + let index_hash: gix_hash::ObjectId = out.hash.digest().into(); + out.inner.write_all(index_hash.as_slice())?; + out.inner.flush()?; + + progress.inc(); + progress.show_throughput_with( + start, + (bytes_written_without_trailer + 20) as usize, + progress::bytes().expect("unit always set"), + progress::MessageLevel::Success, + ); 
+ + Ok(index_hash) +} + +pub(crate) fn fanout(iter: impl ExactSizeIterator<Item = u8>) -> [u32; 256] { + let mut fan_out = [0u32; 256]; + let entries_len = iter.len() as u32; + let mut iter = iter.enumerate(); + let mut idx_and_entry = iter.next(); + let mut upper_bound = 0; + + for (offset_be, byte) in fan_out.iter_mut().zip(0u8..=255) { + *offset_be = match idx_and_entry.as_ref() { + Some((_idx, first_byte)) => match first_byte.cmp(&byte) { + Ordering::Less => unreachable!("ids should be ordered, and we make sure to keep ahead with them"), + Ordering::Greater => upper_bound, + Ordering::Equal => { + if byte == 255 { + entries_len + } else { + idx_and_entry = iter.find(|(_, first_byte)| *first_byte != byte); + upper_bound = idx_and_entry + .as_ref() + .map(|(idx, _)| *idx as u32) + .unwrap_or(entries_len); + upper_bound + } + } + }, + None => entries_len, + }; + } + + fan_out +} diff --git a/vendor/gix-pack/src/index/write/error.rs b/vendor/gix-pack/src/index/write/error.rs new file mode 100644 index 000000000..a5ef6ad67 --- /dev/null +++ b/vendor/gix-pack/src/index/write/error.rs @@ -0,0 +1,25 @@ +use std::io; + +/// Returned by [`crate::index::File::write_data_iter_to_stream()`] +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("An IO error occurred when reading the pack or creating a temporary file")] + Io(#[from] io::Error), + #[error("A pack entry could not be extracted")] + PackEntryDecode(#[from] crate::data::input::Error), + #[error("Indices of type {} cannot be written, only {} are supported", *.0 as usize, crate::index::Version::default() as usize)] + Unsupported(crate::index::Version), + #[error("Ref delta objects are not supported as there is no way to look them up. Resolve them beforehand.")] + IteratorInvariantNoRefDelta, + #[error("The iterator failed to set a trailing hash over all prior pack entries in the last provided entry")] + IteratorInvariantTrailer, + #[error("Only u32::MAX objects can be stored in a pack, found {0}")] + IteratorInvariantTooManyObjects(usize), + #[error("{pack_offset} is not a valid offset for pack offset {distance}")] + IteratorInvariantBaseOffset { pack_offset: u64, distance: u64 }, + #[error(transparent)] + Tree(#[from] crate::cache::delta::Error), + #[error(transparent)] + TreeTraversal(#[from] crate::cache::delta::traverse::Error), +} diff --git a/vendor/gix-pack/src/index/write/mod.rs b/vendor/gix-pack/src/index/write/mod.rs new file mode 100644 index 000000000..c8fdaa271 --- /dev/null +++ b/vendor/gix-pack/src/index/write/mod.rs @@ -0,0 +1,263 @@ +use std::{convert::TryInto, io, sync::atomic::AtomicBool}; + +pub use error::Error; +use gix_features::progress::{self, Progress}; + +use crate::cache::delta::{traverse, Tree}; + +pub(crate) mod encode; +mod error; + +pub(crate) struct TreeEntry { + pub id: gix_hash::ObjectId, + pub crc32: u32, +} + +/// Information gathered while executing [`write_data_iter_to_stream()`][crate::index::File::write_data_iter_to_stream] +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Outcome { + /// The version of the verified index + pub index_version: crate::index::Version, + /// The verified checksum of the verified index + pub index_hash: gix_hash::ObjectId, + + /// The hash of the '.pack' file, also found in its trailing bytes + pub data_hash: gix_hash::ObjectId, + /// The amount of objects that were verified, always the amount of objects in the pack. 
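The `fanout()` helper in `encode.rs` above derives the 256-entry fan-out table from the first byte of each object id in sorted order. A std-only equivalent that makes the cumulative-count semantics explicit (histogram first, then prefix sums); the sample first bytes are purely illustrative:

```rust
/// Entry `b` of the fan-out table is the number of objects whose first oid byte
/// is <= `b`, so entry 255 equals the total object count.
fn fanout(sorted_first_bytes: &[u8]) -> [u32; 256] {
    let mut fan = [0u32; 256];
    for &byte in sorted_first_bytes {
        fan[byte as usize] += 1; // histogram of first bytes …
    }
    let mut running = 0u32;
    for slot in fan.iter_mut() {
        running += *slot; // … turned into cumulative counts
        *slot = running;
    }
    fan
}

fn main() {
    let first_bytes: [u8; 5] = [0x00, 0x00, 0x01, 0xfe, 0xff];
    let fan = fanout(&first_bytes);
    assert_eq!(fan[0x00], 2); // two ids start with 0x00
    assert_eq!(fan[0x01], 3); // plus one starting with 0x01
    assert_eq!(fan[0xff], 5); // the total number of objects
}
```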
+ pub num_objects: u32, +} + +/// The progress ids used in [`write_data_iter_from_stream()`][crate::index::File::write_data_iter_to_stream()]. +/// +/// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. +#[derive(Debug, Copy, Clone)] +pub enum ProgressId { + /// Counts the amount of objects that were index thus far. + IndexObjects, + /// The amount of bytes that were decompressed while decoding pack entries. + /// + /// This is done to determine entry boundaries. + DecompressedBytes, + /// The amount of objects whose hashes were computed. + /// + /// This is done by decoding them, which typically involves decoding delta objects. + ResolveObjects, + /// The amount of bytes that were decoded in total, as the sum of all bytes to represent all resolved objects. + DecodedBytes, + /// The amount of bytes written to the index file. + IndexBytesWritten, +} + +impl From<ProgressId> for gix_features::progress::Id { + fn from(v: ProgressId) -> Self { + match v { + ProgressId::IndexObjects => *b"IWIO", + ProgressId::DecompressedBytes => *b"IWDB", + ProgressId::ResolveObjects => *b"IWRO", + ProgressId::DecodedBytes => *b"IWDB", + ProgressId::IndexBytesWritten => *b"IWBW", + } + } +} + +/// Various ways of writing an index file from pack entries +impl crate::index::File { + /// Write information about `entries` as obtained from a pack data file into a pack index file via the `out` stream. + /// The resolver produced by `make_resolver` must resolve pack entries from the same pack data file that produced the + /// `entries` iterator. + /// + /// * `kind` is the version of pack index to produce, use [`crate::index::Version::default()`] if in doubt. + /// * `tread_limit` is used for a parallel tree traversal for obtaining object hashes with optimal performance. + /// * `root_progress` is the top-level progress to stay informed about the progress of this potentially long-running + /// computation. + /// * `object_hash` defines what kind of object hash we write into the index file. + /// * `pack_version` is the version of the underlying pack for which `entries` are read. It's used in case none of these objects are provided + /// to compute a pack-hash. + /// + /// # Remarks + /// + /// * neither in-pack nor out-of-pack Ref Deltas are supported here, these must have been resolved beforehand. + /// * `make_resolver()` will only be called after the iterator stopped returning elements and produces a function that + /// provides all bytes belonging to a pack entry writing them to the given mutable output `Vec`. + /// It should return `None` if the entry cannot be resolved from the pack that produced the `entries` iterator, causing + /// the write operation to fail. 
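Offset deltas are the only delta form the writer below accepts: each one names its base by a backwards distance, which must stay within the pack. A simplified std-only stand-in for that validation (the real check lives in `data::entry::Header::verified_base_pack_offset`, whose exact edge cases may differ):

```rust
/// An OFS_DELTA base lives `distance` bytes before the delta's own entry, so the
/// distance must be non-zero and must not point before the start of the pack.
fn verified_base_pack_offset(pack_offset: u64, distance: u64) -> Option<u64> {
    if distance == 0 {
        return None; // an entry cannot be its own base
    }
    pack_offset.checked_sub(distance)
}

fn main() {
    assert_eq!(verified_base_pack_offset(1024, 512), Some(512));
    assert_eq!(verified_base_pack_offset(1024, 2048), None); // would point before the pack
    assert_eq!(verified_base_pack_offset(1024, 0), None);
}
```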
+ #[allow(clippy::too_many_arguments)] + pub fn write_data_iter_to_stream<F, F2>( + version: crate::index::Version, + make_resolver: F, + entries: impl Iterator<Item = Result<crate::data::input::Entry, crate::data::input::Error>>, + thread_limit: Option<usize>, + mut root_progress: impl Progress, + out: impl io::Write, + should_interrupt: &AtomicBool, + object_hash: gix_hash::Kind, + pack_version: crate::data::Version, + ) -> Result<Outcome, Error> + where + F: FnOnce() -> io::Result<F2>, + F2: for<'r> Fn(crate::data::EntryRange, &'r mut Vec<u8>) -> Option<()> + Send + Clone, + { + if version != crate::index::Version::default() { + return Err(Error::Unsupported(version)); + } + let mut num_objects: usize = 0; + let mut last_seen_trailer = None; + let (anticipated_num_objects, upper_bound) = entries.size_hint(); + let worst_case_num_objects_after_thin_pack_resolution = upper_bound.unwrap_or(anticipated_num_objects); + let mut tree = Tree::with_capacity(worst_case_num_objects_after_thin_pack_resolution)?; + let indexing_start = std::time::Instant::now(); + + root_progress.init(Some(4), progress::steps()); + let mut objects_progress = root_progress.add_child_with_id("indexing", ProgressId::IndexObjects.into()); + objects_progress.init(Some(anticipated_num_objects), progress::count("objects")); + let mut decompressed_progress = + root_progress.add_child_with_id("decompressing", ProgressId::DecompressedBytes.into()); + decompressed_progress.init(None, progress::bytes()); + let mut pack_entries_end: u64 = 0; + + for entry in entries { + let crate::data::input::Entry { + header, + pack_offset, + crc32, + header_size, + compressed: _, + compressed_size, + decompressed_size, + trailer, + } = entry?; + + decompressed_progress.inc_by(decompressed_size as usize); + + let entry_len = header_size as u64 + compressed_size; + pack_entries_end = pack_offset + entry_len; + + let crc32 = crc32.expect("crc32 to be computed by the iterator. Caller assures correct configuration."); + + use crate::data::entry::Header::*; + match header { + Tree | Blob | Commit | Tag => { + tree.add_root( + pack_offset, + TreeEntry { + id: object_hash.null(), + crc32, + }, + )?; + } + RefDelta { .. } => return Err(Error::IteratorInvariantNoRefDelta), + OfsDelta { base_distance } => { + let base_pack_offset = + crate::data::entry::Header::verified_base_pack_offset(pack_offset, base_distance).ok_or( + Error::IteratorInvariantBaseOffset { + pack_offset, + distance: base_distance, + }, + )?; + tree.add_child( + base_pack_offset, + pack_offset, + TreeEntry { + id: object_hash.null(), + crc32, + }, + )?; + } + }; + last_seen_trailer = trailer; + num_objects += 1; + objects_progress.inc(); + } + let num_objects: u32 = num_objects + .try_into() + .map_err(|_| Error::IteratorInvariantTooManyObjects(num_objects))?; + + objects_progress.show_throughput(indexing_start); + decompressed_progress.show_throughput(indexing_start); + drop(objects_progress); + drop(decompressed_progress); + + root_progress.inc(); + + let resolver = make_resolver()?; + let sorted_pack_offsets_by_oid = { + let traverse::Outcome { roots, children } = tree.traverse( + resolver, + pack_entries_end, + || (), + |data, + _progress, + traverse::Context { + entry, + decompressed: bytes, + .. 
+ }| { + modify_base(data, entry, bytes, version.hash()); + Ok::<_, Error>(()) + }, + traverse::Options { + object_progress: root_progress.add_child_with_id("Resolving", ProgressId::ResolveObjects.into()), + size_progress: root_progress.add_child_with_id("Decoding", ProgressId::DecodedBytes.into()), + thread_limit, + should_interrupt, + object_hash, + }, + )?; + root_progress.inc(); + + let mut items = roots; + items.extend(children); + { + let _progress = root_progress.add_child_with_id("sorting by id", gix_features::progress::UNKNOWN); + items.sort_by_key(|e| e.data.id); + } + + root_progress.inc(); + items + }; + + let pack_hash = match last_seen_trailer { + Some(ph) => ph, + None if num_objects == 0 => { + let header = crate::data::header::encode(pack_version, 0); + let mut hasher = gix_features::hash::hasher(object_hash); + hasher.update(&header); + gix_hash::ObjectId::from(hasher.digest()) + } + None => return Err(Error::IteratorInvariantTrailer), + }; + let index_hash = encode::write_to( + out, + sorted_pack_offsets_by_oid, + &pack_hash, + version, + root_progress.add_child_with_id("writing index file", ProgressId::IndexBytesWritten.into()), + )?; + root_progress.show_throughput_with( + indexing_start, + num_objects as usize, + progress::count("objects").expect("unit always set"), + progress::MessageLevel::Success, + ); + Ok(Outcome { + index_version: version, + index_hash, + data_hash: pack_hash, + num_objects, + }) + } +} + +fn modify_base(entry: &mut TreeEntry, pack_entry: &crate::data::Entry, decompressed: &[u8], hash: gix_hash::Kind) { + fn compute_hash(kind: gix_object::Kind, bytes: &[u8], object_hash: gix_hash::Kind) -> gix_hash::ObjectId { + let mut hasher = gix_features::hash::hasher(object_hash); + hasher.update(&gix_object::encode::loose_header(kind, bytes.len())); + hasher.update(bytes); + gix_hash::ObjectId::from(hasher.digest()) + } + + let object_kind = pack_entry.header.as_kind().expect("base object as source of iteration"); + let id = compute_hash(object_kind, decompressed, hash); + entry.id = id; +} diff --git a/vendor/gix-pack/src/lib.rs b/vendor/gix-pack/src/lib.rs new file mode 100755 index 000000000..200b87019 --- /dev/null +++ b/vendor/gix-pack/src/lib.rs @@ -0,0 +1,73 @@ +//! Git stores all of its data as _Objects_, which are data along with a hash over all data. Storing objects efficiently +//! is what git packs are concerned about. +//! +//! Packs consist of [data files][data::File] and [index files][index::File]. The latter can be generated from a data file +//! and make accessing objects within a pack feasible. +//! +//! A [Bundle] conveniently combines a data pack alongside its index to allow [finding][Find] objects or verifying the pack. +//! Objects returned by `.find(…)` are [objects][gix_object::Data] which know their pack location in order to speed up +//! various common operations like creating new packs from existing ones. +//! +//! When traversing all objects in a pack, a _delta tree acceleration structure_ can be built from pack data or an index +//! in order to decompress packs in parallel and without any waste. +//! 
## Feature Flags +#![cfg_attr( + feature = "document-features", + cfg_attr(doc, doc = ::document_features::document_features!()) +)] +#![cfg_attr(docsrs, feature(doc_cfg, doc_auto_cfg))] +#![deny(missing_docs, rust_2018_idioms, unsafe_code)] + +/// +pub mod bundle; +/// A bundle of pack data and the corresponding pack index +pub struct Bundle { + /// The pack file corresponding to `index` + pub pack: data::File, + /// The index file corresponding to `pack` + pub index: index::File, +} + +/// +pub mod find; + +/// +pub mod cache; +/// +pub mod data; + +mod find_traits; +pub use find_traits::{Find, FindExt}; + +/// +pub mod index; +/// +pub mod multi_index; + +/// +pub mod verify; + +mod mmap { + use std::path::Path; + + pub fn read_only(path: &Path) -> std::io::Result<memmap2::Mmap> { + let file = std::fs::File::open(path)?; + // SAFETY: we have to take the risk of somebody changing the file underneath. Git never writes into the same file. + #[allow(unsafe_code)] + unsafe { + memmap2::Mmap::map(&file) + } + } +} + +use std::convert::TryInto; + +#[inline] +fn read_u32(b: &[u8]) -> u32 { + u32::from_be_bytes(b.try_into().unwrap()) +} + +#[inline] +fn read_u64(b: &[u8]) -> u64 { + u64::from_be_bytes(b.try_into().unwrap()) +} diff --git a/vendor/gix-pack/src/multi_index/access.rs b/vendor/gix-pack/src/multi_index/access.rs new file mode 100644 index 000000000..d209cd0b9 --- /dev/null +++ b/vendor/gix-pack/src/multi_index/access.rs @@ -0,0 +1,143 @@ +use std::{ + ops::Range, + path::{Path, PathBuf}, +}; + +use crate::{ + data, + index::PrefixLookupResult, + multi_index::{EntryIndex, File, PackIndex, Version}, +}; + +/// Represents an entry within a multi index file, effectively mapping object [`IDs`][gix_hash::ObjectId] to pack data +/// files and the offset within. +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Entry { + /// The ID of the object. + pub oid: gix_hash::ObjectId, + /// The offset to the object's header in the pack data file. + pub pack_offset: data::Offset, + /// The index of the pack matching our [`File::index_names()`] slice. + pub pack_index: PackIndex, +} + +/// Access methods +impl File { + /// Returns the version of the multi-index file. + pub fn version(&self) -> Version { + self.version + } + /// Returns the path from which the multi-index file was loaded. + /// + /// Note that it might have changed in the mean time, or might have been removed as well. + pub fn path(&self) -> &Path { + &self.path + } + /// Returns the amount of indices stored in this multi-index file. It's the same as [File::index_names().len()][File::index_names()], + /// and returned as one past the highest known index. + pub fn num_indices(&self) -> PackIndex { + self.num_indices + } + /// Returns the total amount of objects available for lookup, and returned as one past the highest known entry index + pub fn num_objects(&self) -> EntryIndex { + self.num_objects + } + /// Returns the kind of hash function used for object ids available in this index. + pub fn object_hash(&self) -> gix_hash::Kind { + self.object_hash + } + /// Returns the checksum over the entire content of the file (excluding the checksum itself). + /// + /// It can be used to validate it didn't change after creation. + pub fn checksum(&self) -> gix_hash::ObjectId { + gix_hash::ObjectId::from(&self.data[self.data.len() - self.hash_len..]) + } + /// Return all names of index files (`*.idx`) whose objects we contain. 
+ /// + /// The corresponding pack can be found by replacing the `.idx` extension with `.pack`. + pub fn index_names(&self) -> &[PathBuf] { + &self.index_names + } +} + +impl File { + /// Return the object id at the given `index`, which ranges from 0 to [File::num_objects()]. + pub fn oid_at_index(&self, index: EntryIndex) -> &gix_hash::oid { + debug_assert!(index < self.num_objects, "index out of bounds"); + let index: usize = index as usize; + let start = self.lookup_ofs + index * self.hash_len; + gix_hash::oid::from_bytes_unchecked(&self.data[start..][..self.hash_len]) + } + + /// Given a `prefix`, find an object that matches it uniquely within this index and return `Some(Ok(entry_index))`. + /// If there is more than one object matching the object `Some(Err(())` is returned. + /// + /// Finally, if no object matches the index, the return value is `None`. + /// + /// Pass `candidates` to obtain the set of entry-indices matching `prefix`, with the same return value as + /// one would have received if it remained `None`. It will be empty if no object matched the `prefix`. + /// + // NOTE: pretty much the same things as in `index::File::lookup`, change things there + // as well. + pub fn lookup_prefix( + &self, + prefix: gix_hash::Prefix, + candidates: Option<&mut Range<EntryIndex>>, + ) -> Option<PrefixLookupResult> { + crate::index::access::lookup_prefix( + prefix, + candidates, + &self.fan, + |idx| self.oid_at_index(idx), + self.num_objects, + ) + } + + /// Find the index ranging from 0 to [File::num_objects()] that belongs to data associated with `id`, or `None` if it wasn't found. + /// + /// Use this index for finding additional information via [`File::pack_id_and_pack_offset_at_index()`]. + pub fn lookup(&self, id: impl AsRef<gix_hash::oid>) -> Option<EntryIndex> { + crate::index::access::lookup(id, &self.fan, |idx| self.oid_at_index(idx)) + } + + /// Given the `index` ranging from 0 to [File::num_objects()], return the pack index and its absolute offset into the pack. + /// + /// The pack-index refers to an entry in the [`index_names`][File::index_names()] list, from which the pack can be derived. + pub fn pack_id_and_pack_offset_at_index(&self, index: EntryIndex) -> (PackIndex, data::Offset) { + const OFFSET_ENTRY_SIZE: usize = 4 + 4; + let index = index as usize; + let start = self.offsets_ofs + index * OFFSET_ENTRY_SIZE; + + const HIGH_BIT: u32 = 1 << 31; + + let pack_index = crate::read_u32(&self.data[start..][..4]); + let offset = &self.data[start + 4..][..4]; + let ofs32 = crate::read_u32(offset); + let pack_offset = if (ofs32 & HIGH_BIT) == HIGH_BIT { + // We determine if large offsets are actually larger than 4GB and if not, we don't use the high-bit to signal anything + // but allow the presence of the large-offset chunk to signal what's happening. + if let Some(offsets_64) = self.large_offsets_ofs { + let from = offsets_64 + (ofs32 ^ HIGH_BIT) as usize * 8; + crate::read_u64(&self.data[from..][..8]) + } else { + ofs32 as u64 + } + } else { + ofs32 as u64 + }; + (pack_index, pack_offset) + } + + /// Return an iterator over all entries within this file. 
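Putting the accessors above together, a caller can map an object id to the pack that stores it and the offset of its entry in that pack. A hedged sketch, assuming an already opened `multi_index::File` and relying only on the accessors shown in this file:

```rust
use std::path::PathBuf;

use gix_pack::multi_index;

/// Map an object id to (pack data file path, offset of its entry in that pack).
fn locate(midx: &multi_index::File, id: &gix_hash::ObjectId) -> Option<(PathBuf, u64)> {
    let entry_index = midx.lookup(id)?;
    let (pack_index, pack_offset) = midx.pack_id_and_pack_offset_at_index(entry_index);
    // `pack_index` points into `index_names()`; the pack sits next to the `.idx` file.
    let pack_path = midx.index_names()[pack_index as usize].with_extension("pack");
    Some((pack_path, pack_offset))
}
```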
+ pub fn iter(&self) -> impl Iterator<Item = Entry> + '_ { + (0..self.num_objects).map(move |idx| { + let (pack_index, pack_offset) = self.pack_id_and_pack_offset_at_index(idx); + Entry { + oid: self.oid_at_index(idx).to_owned(), + pack_offset, + pack_index, + } + }) + } +} diff --git a/vendor/gix-pack/src/multi_index/chunk.rs b/vendor/gix-pack/src/multi_index/chunk.rs new file mode 100644 index 000000000..7ed8eebcb --- /dev/null +++ b/vendor/gix-pack/src/multi_index/chunk.rs @@ -0,0 +1,276 @@ +/// Information for the chunk about index names +pub mod index_names { + use std::path::{Path, PathBuf}; + + use gix_object::bstr::{BString, ByteSlice}; + + /// The ID used for the index-names chunk. + pub const ID: gix_chunk::Id = *b"PNAM"; + + /// + pub mod decode { + use gix_object::bstr::BString; + + /// The error returned by [from_bytes()][super::from_bytes()]. + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("The pack names were not ordered alphabetically.")] + NotOrderedAlphabetically, + #[error("Each pack path name must be terminated with a null byte")] + MissingNullByte, + #[error("Couldn't turn path '{path}' into OS path due to encoding issues")] + PathEncoding { path: BString }, + #[error("non-padding bytes found after all paths were read.")] + UnknownTrailerBytes, + } + } + + /// Parse null-separated index names from the given `chunk` of bytes and the expected number of packs and indices. + /// Ignore padding bytes which are typically \0. + pub fn from_bytes(mut chunk: &[u8], num_packs: u32) -> Result<Vec<PathBuf>, decode::Error> { + let mut out = Vec::new(); + for _ in 0..num_packs { + let null_byte_pos = chunk.find_byte(b'\0').ok_or(decode::Error::MissingNullByte)?; + + let path = &chunk[..null_byte_pos]; + let path = gix_path::try_from_byte_slice(path) + .map_err(|_| decode::Error::PathEncoding { + path: BString::from(path), + })? + .to_owned(); + + if let Some(previous) = out.last() { + if previous >= &path { + return Err(decode::Error::NotOrderedAlphabetically); + } + } + out.push(path); + + chunk = &chunk[null_byte_pos + 1..]; + } + + if !chunk.is_empty() && !chunk.iter().all(|b| *b == 0) { + return Err(decode::Error::UnknownTrailerBytes); + } + // NOTE: git writes garbage into this chunk, usually extra \0 bytes, which we simply ignore. If we were strict + // about it we couldn't read this chunk data at all. + Ok(out) + } + + /// Calculate the size on disk for our chunk with the given index paths. Note that these are expected to have been processed already + /// to actually be file names. + pub fn storage_size(paths: impl IntoIterator<Item = impl AsRef<Path>>) -> u64 { + let mut count = 0u64; + for path in paths { + let path = path.as_ref(); + let ascii_path = path.to_str().expect("UTF-8 compatible paths"); + assert!( + ascii_path.is_ascii(), + "must use ascii bytes for correct size computation" + ); + count += (ascii_path.as_bytes().len() + 1/* null byte */) as u64 + } + + let needed_alignment = CHUNK_ALIGNMENT - (count % CHUNK_ALIGNMENT); + if needed_alignment < CHUNK_ALIGNMENT { + count += needed_alignment; + } + count + } + + /// Write all `paths` in order to `out`, including padding. 
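The PNAM chunk handled above is simply the list of index file names, each null-terminated and padded with zero bytes to the 4-byte chunk alignment. A std-only sketch of that layout, using hypothetical pack names:

```rust
/// Encode index names the way the PNAM chunk stores them.
fn encode_index_names(paths: &[&str]) -> Vec<u8> {
    let mut out = Vec::new();
    for path in paths {
        out.extend_from_slice(path.as_bytes());
        out.push(0); // each name is null-terminated
    }
    while out.len() % 4 != 0 {
        out.push(0); // pad to the 4-byte chunk alignment
    }
    out
}

fn main() {
    let chunk = encode_index_names(&["pack-a.idx", "pack-b.idx"]);
    assert_eq!(chunk.len(), 24); // 2 * 11 name bytes, padded from 22 to 24
    assert_eq!(chunk.len() % 4, 0);
}
```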
+ pub fn write( + paths: impl IntoIterator<Item = impl AsRef<Path>>, + mut out: impl std::io::Write, + ) -> std::io::Result<()> { + let mut written_bytes = 0; + for path in paths { + let path = path.as_ref().to_str().expect("UTF-8 path"); + out.write_all(path.as_bytes())?; + out.write_all(&[0])?; + written_bytes += path.as_bytes().len() as u64 + 1; + } + + let needed_alignment = CHUNK_ALIGNMENT - (written_bytes % CHUNK_ALIGNMENT); + if needed_alignment < CHUNK_ALIGNMENT { + let padding = [0u8; CHUNK_ALIGNMENT as usize]; + out.write_all(&padding[..needed_alignment as usize])?; + } + Ok(()) + } + + const CHUNK_ALIGNMENT: u64 = 4; +} + +/// Information for the chunk with the fanout table +pub mod fanout { + use std::convert::TryInto; + + use crate::multi_index; + + /// The size of the fanout table + pub const SIZE: usize = 4 * 256; + + /// The id uniquely identifying the fanout table. + pub const ID: gix_chunk::Id = *b"OIDF"; + + /// Decode the fanout table contained in `chunk`, or return `None` if it didn't have the expected size. + pub fn from_bytes(chunk: &[u8]) -> Option<[u32; 256]> { + if chunk.len() != SIZE { + return None; + } + let mut out = [0; 256]; + for (c, f) in chunk.chunks(4).zip(out.iter_mut()) { + *f = u32::from_be_bytes(c.try_into().unwrap()); + } + out.into() + } + + /// Write the fanout for the given entries, which must be sorted by oid + pub(crate) fn write( + sorted_entries: &[multi_index::write::Entry], + mut out: impl std::io::Write, + ) -> std::io::Result<()> { + let fanout = crate::index::write::encode::fanout(sorted_entries.iter().map(|e| e.id.first_byte())); + + for value in fanout.iter() { + out.write_all(&value.to_be_bytes())?; + } + Ok(()) + } +} + +/// Information about the oid lookup table. +pub mod lookup { + use std::ops::Range; + + use crate::multi_index; + + /// The id uniquely identifying the oid lookup table. + pub const ID: gix_chunk::Id = *b"OIDL"; + + /// Return the amount of bytes needed to store the data on disk for the given amount of `entries` + pub fn storage_size(entries: usize, object_hash: gix_hash::Kind) -> u64 { + (entries * object_hash.len_in_bytes()) as u64 + } + + pub(crate) fn write( + sorted_entries: &[multi_index::write::Entry], + mut out: impl std::io::Write, + ) -> std::io::Result<()> { + for entry in sorted_entries { + out.write_all(entry.id.as_slice())?; + } + Ok(()) + } + + /// Return true if the size of the `offset` range seems to match for a `hash` of the given kind and the amount of objects. + pub fn is_valid(offset: &Range<usize>, hash: gix_hash::Kind, num_objects: u32) -> bool { + (offset.end - offset.start) / hash.len_in_bytes() == num_objects as usize + } +} + +/// Information about the offsets table. +pub mod offsets { + use std::{convert::TryInto, ops::Range}; + + use crate::multi_index; + + /// The id uniquely identifying the offsets table. + pub const ID: gix_chunk::Id = *b"OOFF"; + + /// Return the amount of bytes needed to offset data for `entries`. 
+ pub fn storage_size(entries: usize) -> u64 { + (entries * (4 /*pack-id*/ + 4/* pack offset */)) as u64 + } + + pub(crate) fn write( + sorted_entries: &[multi_index::write::Entry], + large_offsets_needed: bool, + mut out: impl std::io::Write, + ) -> std::io::Result<()> { + use crate::index::write::encode::{HIGH_BIT, LARGE_OFFSET_THRESHOLD}; + let mut num_large_offsets = 0u32; + + for entry in sorted_entries { + out.write_all(&entry.pack_index.to_be_bytes())?; + + let offset: u32 = if large_offsets_needed { + if entry.pack_offset > LARGE_OFFSET_THRESHOLD { + let res = num_large_offsets | HIGH_BIT; + num_large_offsets += 1; + res + } else { + entry.pack_offset as u32 + } + } else { + entry + .pack_offset + .try_into() + .expect("without large offsets, pack-offset fits u32") + }; + out.write_all(&offset.to_be_bytes())?; + } + Ok(()) + } + + /// Returns true if the `offset` range seems to match the size required for `num_objects`. + pub fn is_valid(offset: &Range<usize>, num_objects: u32) -> bool { + let entry_size = 4 /* pack-id */ + 4 /* pack-offset */; + ((offset.end - offset.start) / num_objects as usize) == entry_size + } +} + +/// Information about the large offsets table. +pub mod large_offsets { + use std::ops::Range; + + use crate::{index::write::encode::LARGE_OFFSET_THRESHOLD, multi_index}; + + /// The id uniquely identifying the large offsets table (with 64 bit offsets) + pub const ID: gix_chunk::Id = *b"LOFF"; + + /// Returns Some(num-large-offset) if there are offsets larger than u32. + pub(crate) fn num_large_offsets(entries: &[multi_index::write::Entry]) -> Option<usize> { + let mut num_large_offsets = 0; + let mut needs_large_offsets = false; + for entry in entries { + if entry.pack_offset > LARGE_OFFSET_THRESHOLD { + num_large_offsets += 1; + } + if entry.pack_offset > u32::MAX as crate::data::Offset { + needs_large_offsets = true; + } + } + + needs_large_offsets.then_some(num_large_offsets) + } + /// Returns true if the `offsets` range seems to be properly aligned for the data we expect. + pub fn is_valid(offset: &Range<usize>) -> bool { + (offset.end - offset.start) % 8 == 0 + } + + pub(crate) fn write( + sorted_entries: &[multi_index::write::Entry], + mut num_large_offsets: usize, + mut out: impl std::io::Write, + ) -> std::io::Result<()> { + for offset in sorted_entries + .iter() + .filter_map(|e| (e.pack_offset > LARGE_OFFSET_THRESHOLD).then_some(e.pack_offset)) + { + out.write_all(&offset.to_be_bytes())?; + num_large_offsets = num_large_offsets + .checked_sub(1) + .expect("BUG: wrote more offsets the previously found"); + } + assert_eq!(num_large_offsets, 0, "BUG: wrote less offsets than initially counted"); + Ok(()) + } + + /// Return the amount of bytes needed to store the given amount of `large_offsets` + pub(crate) fn storage_size(large_offsets: usize) -> u64 { + 8 * large_offsets as u64 + } +} diff --git a/vendor/gix-pack/src/multi_index/init.rs b/vendor/gix-pack/src/multi_index/init.rs new file mode 100644 index 000000000..190b40a7b --- /dev/null +++ b/vendor/gix-pack/src/multi_index/init.rs @@ -0,0 +1,157 @@ +use std::{convert::TryFrom, path::Path}; + +use crate::multi_index::{chunk, File, Version}; + +mod error { + use crate::multi_index::chunk; + + /// The error returned by [File::at()][super::File::at()]. 
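Both the pack index writer earlier and the OOFF/LOFF chunks above share one encoding: offsets up to 2^31 - 1 fit directly into the 4-byte column, while anything larger stores an index into a table of 64-bit offsets with the high bit set. A std-only round-trip sketch of that scheme (the constants mirror the ones defined in `index::write::encode`):

```rust
const HIGH_BIT: u32 = 1 << 31;
const LARGE_OFFSET_THRESHOLD: u64 = 0x7fff_ffff;

/// Encode a pack offset into the 4-byte offset column, spilling into `large_offsets` if needed.
fn encode_offset(pack_offset: u64, large_offsets: &mut Vec<u64>) -> u32 {
    if pack_offset > LARGE_OFFSET_THRESHOLD {
        large_offsets.push(pack_offset);
        (large_offsets.len() as u32 - 1) | HIGH_BIT // index into the 64-bit table
    } else {
        pack_offset as u32 // stored directly
    }
}

fn decode_offset(word: u32, large_offsets: &[u64]) -> u64 {
    if word & HIGH_BIT == HIGH_BIT {
        large_offsets[(word ^ HIGH_BIT) as usize]
    } else {
        word as u64
    }
}

fn main() {
    let mut large = Vec::new();
    let small = encode_offset(1_000, &mut large);
    let big = encode_offset(5_000_000_000, &mut large);
    assert_eq!(decode_offset(small, &large), 1_000);
    assert_eq!(decode_offset(big, &large), 5_000_000_000);
}
```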
+ #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Could not open multi-index file at '{path}'")] + Io { + source: std::io::Error, + path: std::path::PathBuf, + }, + #[error("{message}")] + Corrupt { message: &'static str }, + #[error("Unsupported multi-index version: {version})")] + UnsupportedVersion { version: u8 }, + #[error("Unsupported hash kind: {kind})")] + UnsupportedObjectHash { kind: u8 }, + #[error(transparent)] + ChunkFileDecode(#[from] gix_chunk::file::decode::Error), + #[error(transparent)] + MissingChunk(#[from] gix_chunk::file::index::offset_by_kind::Error), + #[error(transparent)] + FileTooLarge(#[from] gix_chunk::file::index::data_by_kind::Error), + #[error("The multi-pack fan doesn't have the correct size of 256 * 4 bytes")] + MultiPackFanSize, + #[error(transparent)] + PackNames(#[from] chunk::index_names::decode::Error), + #[error("multi-index chunk {:?} has invalid size: {message}", String::from_utf8_lossy(.id))] + InvalidChunkSize { id: gix_chunk::Id, message: &'static str }, + } +} + +pub use error::Error; + +/// Initialization +impl File { + /// Open the multi-index file at the given `path`. + pub fn at(path: impl AsRef<Path>) -> Result<Self, Error> { + Self::try_from(path.as_ref()) + } +} + +impl TryFrom<&Path> for File { + type Error = Error; + + fn try_from(path: &Path) -> Result<Self, Self::Error> { + let data = crate::mmap::read_only(path).map_err(|source| Error::Io { + source, + path: path.to_owned(), + })?; + + const TRAILER_LEN: usize = gix_hash::Kind::shortest().len_in_bytes(); /* trailing hash */ + if data.len() + < Self::HEADER_LEN + + gix_chunk::file::Index::size_for_entries(4 /*index names, fan, offsets, oids*/) + + chunk::fanout::SIZE + + TRAILER_LEN + { + return Err(Error::Corrupt { + message: "multi-index file is truncated and too short", + }); + } + + let (version, object_hash, num_chunks, num_indices) = { + let (signature, data) = data.split_at(4); + if signature != Self::SIGNATURE { + return Err(Error::Corrupt { + message: "Invalid signature", + }); + } + let (version, data) = data.split_at(1); + let version = match version[0] { + 1 => Version::V1, + version => return Err(Error::UnsupportedVersion { version }), + }; + + let (object_hash, data) = data.split_at(1); + let object_hash = gix_hash::Kind::try_from(object_hash[0]) + .map_err(|unknown| Error::UnsupportedObjectHash { kind: unknown })?; + let (num_chunks, data) = data.split_at(1); + let num_chunks = num_chunks[0]; + + let (_num_base_files, data) = data.split_at(1); // TODO: handle base files once it's clear what this does + + let (num_indices, _) = data.split_at(4); + let num_indices = crate::read_u32(num_indices); + + (version, object_hash, num_chunks, num_indices) + }; + + let chunks = gix_chunk::file::Index::from_bytes(&data, Self::HEADER_LEN, num_chunks as u32)?; + + let index_names = chunks.data_by_id(&data, chunk::index_names::ID)?; + let index_names = chunk::index_names::from_bytes(index_names, num_indices)?; + + let fan = chunks.data_by_id(&data, chunk::fanout::ID)?; + let fan = chunk::fanout::from_bytes(fan).ok_or(Error::MultiPackFanSize)?; + let num_objects = fan[255]; + + let lookup = chunks.validated_usize_offset_by_id(chunk::lookup::ID, |offset| { + chunk::lookup::is_valid(&offset, object_hash, num_objects) + .then_some(offset) + .ok_or(Error::InvalidChunkSize { + id: chunk::lookup::ID, + message: "The chunk with alphabetically ordered object ids doesn't have the correct size", + }) + })??; + let offsets = 
chunks.validated_usize_offset_by_id(chunk::offsets::ID, |offset| { + chunk::offsets::is_valid(&offset, num_objects) + .then_some(offset) + .ok_or(Error::InvalidChunkSize { + id: chunk::offsets::ID, + message: "The chunk with offsets into the pack doesn't have the correct size", + }) + })??; + let large_offsets = chunks + .validated_usize_offset_by_id(chunk::large_offsets::ID, |offset| { + chunk::large_offsets::is_valid(&offset) + .then_some(offset) + .ok_or(Error::InvalidChunkSize { + id: chunk::large_offsets::ID, + message: "The chunk with large offsets into the pack doesn't have the correct size", + }) + }) + .ok() + .transpose()?; + + let checksum_offset = chunks.highest_offset() as usize; + let trailer = &data[checksum_offset..]; + if trailer.len() != object_hash.len_in_bytes() { + return Err(Error::Corrupt { + message: + "Trailing checksum didn't have the expected size or there were unknown bytes after the checksum.", + }); + } + + Ok(File { + data, + path: path.to_owned(), + version, + hash_len: object_hash.len_in_bytes(), + object_hash, + fan, + index_names, + lookup_ofs: lookup.start, + offsets_ofs: offsets.start, + large_offsets_ofs: large_offsets.map(|r| r.start), + num_objects, + num_indices, + }) + } +} diff --git a/vendor/gix-pack/src/multi_index/mod.rs b/vendor/gix-pack/src/multi_index/mod.rs new file mode 100644 index 000000000..3f7ed0ff5 --- /dev/null +++ b/vendor/gix-pack/src/multi_index/mod.rs @@ -0,0 +1,57 @@ +use std::path::PathBuf; + +use memmap2::Mmap; + +/// Known multi-index file versions +#[derive(PartialEq, Eq, Ord, PartialOrd, Debug, Hash, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +#[allow(missing_docs)] +pub enum Version { + V1 = 1, +} + +impl Default for Version { + fn default() -> Self { + Version::V1 + } +} + +/// An index into our [`File::index_names()`] array yielding the name of the index and by implication, its pack file. +pub type PackIndex = u32; + +/// The type for referring to indices of an entry within the index file. +pub type EntryIndex = u32; + +/// A representation of an index file for multiple packs at the same time, typically stored in a file +/// named 'multi-pack-index'. +pub struct File { + data: Mmap, + path: std::path::PathBuf, + version: Version, + hash_len: usize, + object_hash: gix_hash::Kind, + /// The amount of pack files contained within + num_indices: u32, + num_objects: u32, + + fan: [u32; 256], + index_names: Vec<PathBuf>, + lookup_ofs: usize, + offsets_ofs: usize, + large_offsets_ofs: Option<usize>, +} + +/// +pub mod write; + +/// +mod access; + +/// +pub mod verify; + +/// +pub mod chunk; + +/// +pub mod init; diff --git a/vendor/gix-pack/src/multi_index/verify.rs b/vendor/gix-pack/src/multi_index/verify.rs new file mode 100644 index 000000000..856a48501 --- /dev/null +++ b/vendor/gix-pack/src/multi_index/verify.rs @@ -0,0 +1,337 @@ +use std::{cmp::Ordering, sync::atomic::AtomicBool, time::Instant}; + +use gix_features::progress::Progress; + +use crate::{index, multi_index::File}; + +/// +pub mod integrity { + use crate::multi_index::EntryIndex; + + /// Returned by [`multi_index::File::verify_integrity()`][crate::multi_index::File::verify_integrity()]. 
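The `TryFrom<&Path>` initialization above begins by walking a fixed 12-byte header before handing the rest of the file to the chunk index. A std-only sketch of that header layout; the `MIDX` signature and the value `1` for SHA-1 follow the multi-pack-index file format, and the buffer built in `main` is purely illustrative:

```rust
use std::convert::TryInto;

struct MidxHeader {
    version: u8,
    object_hash: u8,
    num_chunks: u8,
    num_indices: u32,
}

fn parse_header(data: &[u8]) -> Option<MidxHeader> {
    if data.len() < 12 || data[..4] != *b"MIDX" {
        return None;
    }
    Some(MidxHeader {
        version: data[4],
        object_hash: data[5],
        num_chunks: data[6],
        // data[7] holds the number of base multi-pack-index files, ignored above as well.
        num_indices: u32::from_be_bytes(data[8..12].try_into().ok()?),
    })
}

fn main() {
    let mut raw = b"MIDX".to_vec();
    raw.extend_from_slice(&[1, 1, 4, 0]); // version 1, SHA-1, 4 chunks, 0 base files
    raw.extend_from_slice(&2u32.to_be_bytes()); // two pack indices
    let header = parse_header(&raw).expect("valid header");
    assert_eq!(header.version, 1);
    assert_eq!(header.object_hash, 1);
    assert_eq!(header.num_chunks, 4);
    assert_eq!(header.num_indices, 2);
}
```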
+ #[derive(thiserror::Error, Debug)]
+ #[allow(missing_docs)]
+ pub enum Error {
+ #[error("Object {id} should be at pack-offset {expected_pack_offset} but was found at {actual_pack_offset}")]
+ PackOffsetMismatch {
+ id: gix_hash::ObjectId,
+ expected_pack_offset: u64,
+ actual_pack_offset: u64,
+ },
+ #[error(transparent)]
+ MultiIndexChecksum(#[from] crate::multi_index::verify::checksum::Error),
+ #[error(transparent)]
+ IndexIntegrity(#[from] crate::index::verify::integrity::Error),
+ #[error(transparent)]
+ BundleInit(#[from] crate::bundle::init::Error),
+ #[error("Counted {actual} objects, but expected {expected} as per multi-index")]
+ UnexpectedObjectCount { actual: usize, expected: usize },
+ #[error("{id} wasn't found in the index referenced in the multi-pack index")]
+ OidNotFound { id: gix_hash::ObjectId },
+ #[error("The object id at multi-index entry {index} wasn't in order")]
+ OutOfOrder { index: EntryIndex },
+ #[error("The fan at index {index} is out of order as it's larger than the following value.")]
+ Fan { index: usize },
+ #[error("The multi-index claims to have no objects")]
+ Empty,
+ #[error("Interrupted")]
+ Interrupted,
+ }
+
+ /// Returned by [`multi_index::File::verify_integrity()`][crate::multi_index::File::verify_integrity()].
+ pub struct Outcome<P> {
+ /// The computed checksum of the multi-index which matched the stored one.
+ pub actual_index_checksum: gix_hash::ObjectId,
+ /// For each entry in [`index_names()`][super::File::index_names()], the corresponding pack traversal outcome.
+ pub pack_traverse_statistics: Vec<crate::index::traverse::Statistics>,
+ /// The provided progress instance.
+ pub progress: P,
+ }
+
+ /// The progress ids used in [`multi_index::File::verify_integrity()`][crate::multi_index::File::verify_integrity()].
+ ///
+ /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization.
+ #[derive(Debug, Copy, Clone)]
+ pub enum ProgressId {
+ /// The amount of bytes read to verify the multi-index checksum.
+ ChecksumBytes,
+ /// The amount of objects whose offset has been checked.
+ ObjectOffsets,
+ }
+
+ impl From<ProgressId> for gix_features::progress::Id {
+ fn from(v: ProgressId) -> Self {
+ match v {
+ ProgressId::ChecksumBytes => *b"MVCK",
+ ProgressId::ObjectOffsets => *b"MVOF",
+ }
+ }
+ }
+}
+
+///
+pub mod checksum {
+ /// Returned by [`multi_index::File::verify_checksum()`][crate::multi_index::File::verify_checksum()].
+ pub type Error = crate::verify::checksum::Error;
+}
+
+impl File {
+ /// Validate that our [`checksum()`][File::checksum()] matches the actual contents
+ /// of this index file, and return it if it does.
+ pub fn verify_checksum(
+ &self,
+ progress: impl Progress,
+ should_interrupt: &AtomicBool,
+ ) -> Result<gix_hash::ObjectId, checksum::Error> {
+ crate::verify::checksum_on_disk_or_mmap(
+ self.path(),
+ &self.data,
+ self.checksum(),
+ self.object_hash,
+ progress,
+ should_interrupt,
+ )
+ }
+
+ /// Similar to [`verify_integrity()`][File::verify_integrity()] but without any deep inspection of objects.
+ ///
+ /// Instead we only validate the contents of the multi-index itself.
+ pub fn verify_integrity_fast<P>( + &self, + progress: P, + should_interrupt: &AtomicBool, + ) -> Result<(gix_hash::ObjectId, P), integrity::Error> + where + P: Progress, + { + self.verify_integrity_inner( + progress, + should_interrupt, + false, + index::verify::integrity::Options::default(), + ) + .map_err(|err| match err { + index::traverse::Error::Processor(err) => err, + _ => unreachable!("BUG: no other error type is possible"), + }) + .map(|o| (o.actual_index_checksum, o.progress)) + } + + /// Similar to [`crate::Bundle::verify_integrity()`] but checks all contained indices and their packs. + /// + /// Note that it's considered a failure if an index doesn't have a corresponding pack. + pub fn verify_integrity<C, P, F>( + &self, + progress: P, + should_interrupt: &AtomicBool, + options: index::verify::integrity::Options<F>, + ) -> Result<integrity::Outcome<P>, index::traverse::Error<integrity::Error>> + where + P: Progress, + C: crate::cache::DecodeEntry, + F: Fn() -> C + Send + Clone, + { + self.verify_integrity_inner(progress, should_interrupt, true, options) + } + + fn verify_integrity_inner<C, P, F>( + &self, + mut progress: P, + should_interrupt: &AtomicBool, + deep_check: bool, + options: index::verify::integrity::Options<F>, + ) -> Result<integrity::Outcome<P>, index::traverse::Error<integrity::Error>> + where + P: Progress, + C: crate::cache::DecodeEntry, + F: Fn() -> C + Send + Clone, + { + let parent = self.path.parent().expect("must be in a directory"); + + let actual_index_checksum = self + .verify_checksum( + progress.add_child_with_id( + format!("{}: checksum", self.path.display()), + integrity::ProgressId::ChecksumBytes.into(), + ), + should_interrupt, + ) + .map_err(integrity::Error::from) + .map_err(index::traverse::Error::Processor)?; + + if let Some(first_invalid) = crate::verify::fan(&self.fan) { + return Err(index::traverse::Error::Processor(integrity::Error::Fan { + index: first_invalid, + })); + } + + if self.num_objects == 0 { + return Err(index::traverse::Error::Processor(integrity::Error::Empty)); + } + + let mut pack_traverse_statistics = Vec::new(); + + let operation_start = Instant::now(); + let mut total_objects_checked = 0; + let mut pack_ids_and_offsets = Vec::with_capacity(self.num_objects as usize); + { + let order_start = Instant::now(); + let mut progress = progress.add_child_with_id("checking oid order", gix_features::progress::UNKNOWN); + progress.init( + Some(self.num_objects as usize), + gix_features::progress::count("objects"), + ); + + for entry_index in 0..(self.num_objects - 1) { + let lhs = self.oid_at_index(entry_index); + let rhs = self.oid_at_index(entry_index + 1); + + if rhs.cmp(lhs) != Ordering::Greater { + return Err(index::traverse::Error::Processor(integrity::Error::OutOfOrder { + index: entry_index, + })); + } + let (pack_id, _) = self.pack_id_and_pack_offset_at_index(entry_index); + pack_ids_and_offsets.push((pack_id, entry_index)); + progress.inc(); + } + { + let entry_index = self.num_objects - 1; + let (pack_id, _) = self.pack_id_and_pack_offset_at_index(entry_index); + pack_ids_and_offsets.push((pack_id, entry_index)); + } + // sort by pack-id to allow handling all indices matching a pack while its open. 
+ pack_ids_and_offsets.sort_by(|l, r| l.0.cmp(&r.0)); + progress.show_throughput(order_start); + }; + + progress.init( + Some(self.num_indices as usize), + gix_features::progress::count("indices"), + ); + + let mut pack_ids_slice = pack_ids_and_offsets.as_slice(); + + for (pack_id, index_file_name) in self.index_names.iter().enumerate() { + progress.set_name(index_file_name.display().to_string()); + progress.inc(); + + let mut bundle = None; + let index; + let index_path = parent.join(index_file_name); + let index = if deep_check { + bundle = crate::Bundle::at(index_path, self.object_hash) + .map_err(integrity::Error::from) + .map_err(index::traverse::Error::Processor)? + .into(); + bundle.as_ref().map(|b| &b.index).expect("just set") + } else { + index = Some( + index::File::at(index_path, self.object_hash) + .map_err(|err| integrity::Error::BundleInit(crate::bundle::init::Error::Index(err))) + .map_err(index::traverse::Error::Processor)?, + ); + index.as_ref().expect("just set") + }; + + let slice_end = pack_ids_slice.partition_point(|e| e.0 == pack_id as crate::data::Id); + let multi_index_entries_to_check = &pack_ids_slice[..slice_end]; + { + let offset_start = Instant::now(); + let mut offsets_progress = + progress.add_child_with_id("verify object offsets", integrity::ProgressId::ObjectOffsets.into()); + offsets_progress.init( + Some(pack_ids_and_offsets.len()), + gix_features::progress::count("objects"), + ); + pack_ids_slice = &pack_ids_slice[slice_end..]; + + for entry_id in multi_index_entries_to_check.iter().map(|e| e.1) { + let oid = self.oid_at_index(entry_id); + let (_, expected_pack_offset) = self.pack_id_and_pack_offset_at_index(entry_id); + let entry_in_bundle_index = index.lookup(oid).ok_or_else(|| { + index::traverse::Error::Processor(integrity::Error::OidNotFound { id: oid.to_owned() }) + })?; + let actual_pack_offset = index.pack_offset_at_index(entry_in_bundle_index); + if actual_pack_offset != expected_pack_offset { + return Err(index::traverse::Error::Processor( + integrity::Error::PackOffsetMismatch { + id: oid.to_owned(), + expected_pack_offset, + actual_pack_offset, + }, + )); + } + offsets_progress.inc(); + } + + if should_interrupt.load(std::sync::atomic::Ordering::Relaxed) { + return Err(index::traverse::Error::Processor(integrity::Error::Interrupted)); + } + offsets_progress.show_throughput(offset_start); + } + + total_objects_checked += multi_index_entries_to_check.len(); + + if let Some(bundle) = bundle { + progress.set_name(format!("Validating {}", index_file_name.display())); + let crate::bundle::verify::integrity::Outcome { + actual_index_checksum: _, + pack_traverse_outcome, + progress: returned_progress, + } = bundle + .verify_integrity(progress, should_interrupt, options.clone()) + .map_err(|err| { + use index::traverse::Error::*; + match err { + Processor(err) => Processor(integrity::Error::IndexIntegrity(err)), + VerifyChecksum(err) => VerifyChecksum(err), + Tree(err) => Tree(err), + TreeTraversal(err) => TreeTraversal(err), + PackDecode { id, offset, source } => PackDecode { id, offset, source }, + PackMismatch { expected, actual } => PackMismatch { expected, actual }, + PackObjectMismatch { + expected, + actual, + offset, + kind, + } => PackObjectMismatch { + expected, + actual, + offset, + kind, + }, + Crc32Mismatch { + expected, + actual, + offset, + kind, + } => Crc32Mismatch { + expected, + actual, + offset, + kind, + }, + Interrupted => Interrupted, + } + })?; + progress = returned_progress; + 
pack_traverse_statistics.push(pack_traverse_outcome); + } + } + + assert_eq!( + self.num_objects as usize, total_objects_checked, + "BUG: our slicing should allow to visit all objects" + ); + + progress.set_name("Validating multi-pack"); + progress.show_throughput(operation_start); + + Ok(integrity::Outcome { + actual_index_checksum, + pack_traverse_statistics, + progress, + }) + } +} diff --git a/vendor/gix-pack/src/multi_index/write.rs b/vendor/gix-pack/src/multi_index/write.rs new file mode 100644 index 000000000..314506401 --- /dev/null +++ b/vendor/gix-pack/src/multi_index/write.rs @@ -0,0 +1,244 @@ +use std::{ + convert::TryInto, + path::PathBuf, + sync::atomic::{AtomicBool, Ordering}, + time::{Instant, SystemTime}, +}; + +use gix_features::progress::Progress; + +use crate::multi_index; + +mod error { + /// The error returned by [multi_index::File::write_from_index_paths()][super::multi_index::File::write_from_index_paths()].. + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + Io(#[from] std::io::Error), + #[error("Interrupted")] + Interrupted, + #[error(transparent)] + OpenIndex(#[from] crate::index::init::Error), + } +} +pub use error::Error; + +/// An entry suitable for sorting and writing +pub(crate) struct Entry { + pub(crate) id: gix_hash::ObjectId, + pub(crate) pack_index: u32, + pub(crate) pack_offset: crate::data::Offset, + /// Used for sorting in case of duplicates + index_mtime: SystemTime, +} + +/// Options for use in [`multi_index::File::write_from_index_paths()`]. +pub struct Options { + /// The kind of hash to use for objects and to expect in the input files. + pub object_hash: gix_hash::Kind, +} + +/// The result of [`multi_index::File::write_from_index_paths()`]. +pub struct Outcome<P> { + /// The calculated multi-index checksum of the file at `multi_index_path`. + pub multi_index_checksum: gix_hash::ObjectId, + /// The input progress + pub progress: P, +} + +/// The progress ids used in [`write_from_index_paths()`][multi_index::File::write_from_index_paths()]. +/// +/// Use this information to selectively extract the progress of interest in case the parent application has custom visualization. +#[derive(Debug, Copy, Clone)] +pub enum ProgressId { + /// Counts each path in the input set whose entries we enumerate and write into the multi-index + FromPathsCollectingEntries, + /// The amount of bytes written as part of the multi-index. + BytesWritten, +} + +impl From<ProgressId> for gix_features::progress::Id { + fn from(v: ProgressId) -> Self { + match v { + ProgressId::FromPathsCollectingEntries => *b"MPCE", + ProgressId::BytesWritten => *b"MPBW", + } + } +} + +impl multi_index::File { + pub(crate) const SIGNATURE: &'static [u8] = b"MIDX"; + pub(crate) const HEADER_LEN: usize = 4 /*signature*/ + + 1 /*version*/ + + 1 /*object id version*/ + + 1 /*num chunks */ + + 1 /*num base files */ + + 4 /*num pack files*/; + + /// Create a new multi-index file for writing to `out` from the pack index files at `index_paths`. + /// + /// Progress is sent to `progress` and interruptions checked via `should_interrupt`. 
+ pub fn write_from_index_paths<P>( + mut index_paths: Vec<PathBuf>, + out: impl std::io::Write, + mut progress: P, + should_interrupt: &AtomicBool, + Options { object_hash }: Options, + ) -> Result<Outcome<P>, Error> + where + P: Progress, + { + let out = gix_features::hash::Write::new(out, object_hash); + let (index_paths_sorted, index_filenames_sorted) = { + index_paths.sort(); + let file_names = index_paths + .iter() + .map(|p| PathBuf::from(p.file_name().expect("file name present"))) + .collect::<Vec<_>>(); + (index_paths, file_names) + }; + + let entries = { + let mut entries = Vec::new(); + let start = Instant::now(); + let mut progress = + progress.add_child_with_id("Collecting entries", ProgressId::FromPathsCollectingEntries.into()); + progress.init(Some(index_paths_sorted.len()), gix_features::progress::count("indices")); + + // This could be parallelized… but it's probably not worth it unless you have 500mio objects. + for (index_id, index) in index_paths_sorted.iter().enumerate() { + let mtime = index + .metadata() + .and_then(|m| m.modified()) + .unwrap_or(SystemTime::UNIX_EPOCH); + let index = crate::index::File::at(index, object_hash)?; + + entries.reserve(index.num_objects() as usize); + entries.extend(index.iter().map(|e| Entry { + id: e.oid, + pack_index: index_id as u32, + pack_offset: e.pack_offset, + index_mtime: mtime, + })); + progress.inc(); + if should_interrupt.load(Ordering::Relaxed) { + return Err(Error::Interrupted); + } + } + progress.show_throughput(start); + + let start = Instant::now(); + progress.set_name("Deduplicate"); + progress.init(Some(entries.len()), gix_features::progress::count("entries")); + entries.sort_by(|l, r| { + l.id.cmp(&r.id) + .then_with(|| l.index_mtime.cmp(&r.index_mtime).reverse()) + .then_with(|| l.pack_index.cmp(&r.pack_index)) + }); + entries.dedup_by_key(|e| e.id); + progress.inc_by(entries.len()); + progress.show_throughput(start); + if should_interrupt.load(Ordering::Relaxed) { + return Err(Error::Interrupted); + } + entries + }; + + let mut cf = gix_chunk::file::Index::for_writing(); + cf.plan_chunk( + multi_index::chunk::index_names::ID, + multi_index::chunk::index_names::storage_size(&index_filenames_sorted), + ); + cf.plan_chunk(multi_index::chunk::fanout::ID, multi_index::chunk::fanout::SIZE as u64); + cf.plan_chunk( + multi_index::chunk::lookup::ID, + multi_index::chunk::lookup::storage_size(entries.len(), object_hash), + ); + cf.plan_chunk( + multi_index::chunk::offsets::ID, + multi_index::chunk::offsets::storage_size(entries.len()), + ); + + let num_large_offsets = multi_index::chunk::large_offsets::num_large_offsets(&entries); + if let Some(num_large_offsets) = num_large_offsets { + cf.plan_chunk( + multi_index::chunk::large_offsets::ID, + multi_index::chunk::large_offsets::storage_size(num_large_offsets), + ); + } + + let mut write_progress = progress.add_child_with_id("Writing multi-index", ProgressId::BytesWritten.into()); + let write_start = Instant::now(); + write_progress.init( + Some(cf.planned_storage_size() as usize + Self::HEADER_LEN), + gix_features::progress::bytes(), + ); + let mut out = gix_features::progress::Write { + inner: out, + progress: write_progress, + }; + + let bytes_written = Self::write_header( + &mut out, + cf.num_chunks().try_into().expect("BUG: wrote more than 256 chunks"), + index_paths_sorted.len() as u32, + object_hash, + )?; + + { + progress.set_name("Writing chunks"); + progress.init(Some(cf.num_chunks()), gix_features::progress::count("chunks")); + + let mut chunk_write = 
cf.into_write(&mut out, bytes_written)?;
+ while let Some(chunk_to_write) = chunk_write.next_chunk() {
+ match chunk_to_write {
+ multi_index::chunk::index_names::ID => {
+ multi_index::chunk::index_names::write(&index_filenames_sorted, &mut chunk_write)?
+ }
+ multi_index::chunk::fanout::ID => multi_index::chunk::fanout::write(&entries, &mut chunk_write)?,
+ multi_index::chunk::lookup::ID => multi_index::chunk::lookup::write(&entries, &mut chunk_write)?,
+ multi_index::chunk::offsets::ID => {
+ multi_index::chunk::offsets::write(&entries, num_large_offsets.is_some(), &mut chunk_write)?
+ }
+ multi_index::chunk::large_offsets::ID => multi_index::chunk::large_offsets::write(
+ &entries,
+ num_large_offsets.expect("available if planned"),
+ &mut chunk_write,
+ )?,
+ unknown => unreachable!("BUG: forgot to implement chunk {:?}", std::str::from_utf8(&unknown)),
+ }
+ progress.inc();
+ if should_interrupt.load(Ordering::Relaxed) {
+ return Err(Error::Interrupted);
+ }
+ }
+ }
+
+ // write trailing checksum
+ let multi_index_checksum: gix_hash::ObjectId = out.inner.hash.digest().into();
+ out.inner.inner.write_all(multi_index_checksum.as_slice())?;
+ out.progress.show_throughput(write_start);
+
+ Ok(Outcome {
+ multi_index_checksum,
+ progress,
+ })
+ }
+
+ fn write_header(
+ mut out: impl std::io::Write,
+ num_chunks: u8,
+ num_indices: u32,
+ object_hash: gix_hash::Kind,
+ ) -> std::io::Result<usize> {
+ out.write_all(Self::SIGNATURE)?;
+ out.write_all(&[crate::multi_index::Version::V1 as u8])?;
+ out.write_all(&[object_hash as u8])?;
+ out.write_all(&[num_chunks])?;
+ out.write_all(&[0])?; /* unused number of base files */
+ out.write_all(&num_indices.to_be_bytes())?;
+
+ Ok(Self::HEADER_LEN)
+ }
+}
diff --git a/vendor/gix-pack/src/verify.rs b/vendor/gix-pack/src/verify.rs
new file mode 100644
index 000000000..f985c8657
--- /dev/null
+++ b/vendor/gix-pack/src/verify.rs
@@ -0,0 +1,64 @@
+use std::{path::Path, sync::atomic::AtomicBool};
+
+use gix_features::progress::Progress;
+
+///
+pub mod checksum {
+ /// Returned by various methods to verify the checksum of a memory mapped file that might also exist on disk.
+ #[derive(thiserror::Error, Debug)]
+ #[allow(missing_docs)]
+ pub enum Error {
+ #[error("Interrupted by user")]
+ Interrupted,
+ #[error("index checksum mismatch: expected {expected}, got {actual}")]
+ Mismatch {
+ expected: gix_hash::ObjectId,
+ actual: gix_hash::ObjectId,
+ },
+ }
+}
+
+/// Returns the `index` at which the following `index + 1` value is not an increment over the value at `index`.
+pub fn fan(data: &[u32]) -> Option<usize> {
+ data.windows(2)
+ .enumerate()
+ .find_map(|(win_index, v)| (v[0] > v[1]).then_some(win_index))
+}
+
+/// Calculate the hash of the given kind by trying to read the file from disk at `data_path` or falling back on the mapped content in `data`.
+/// Returns `Ok(actual_hash)` when it matches `expected`, `Err(checksum::Error::Mismatch { .. })` when it doesn't,
+/// and `Err(checksum::Error::Interrupted)` if the operation was interrupted.
+pub fn checksum_on_disk_or_mmap(
+ data_path: &Path,
+ data: &[u8],
+ expected: gix_hash::ObjectId,
+ object_hash: gix_hash::Kind,
+ mut progress: impl Progress,
+ should_interrupt: &AtomicBool,
+) -> Result<gix_hash::ObjectId, checksum::Error> {
+ let data_len_without_trailer = data.len() - object_hash.len_in_bytes();
+ let actual = match gix_features::hash::bytes_of_file(
+ data_path,
+ data_len_without_trailer,
+ object_hash,
+ &mut progress,
+ should_interrupt,
+ ) {
+ Ok(id) => id,
+ Err(err) if err.kind() == std::io::ErrorKind::Interrupted => return Err(checksum::Error::Interrupted),
+ Err(_io_err) => {
+ let start = std::time::Instant::now();
+ let mut hasher = gix_features::hash::hasher(object_hash);
+ hasher.update(&data[..data_len_without_trailer]);
+ progress.inc_by(data_len_without_trailer);
+ progress.show_throughput(start);
+ gix_hash::ObjectId::from(hasher.digest())
+ }
+ };
+
+ if actual == expected {
+ Ok(actual)
+ } else {
+ Err(checksum::Error::Mismatch { actual, expected })
+ }
+}
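
The verification entry points added above are easiest to see in context. The sketch below is illustrative only and not part of the vendored diff: it opens an existing multi-pack-index and runs the cheap validation pass. The file path is hypothetical, the example assumes direct dependencies on the gix-pack and gix-features crates (applications would normally go through the higher-level gix crate), and it assumes gix_features::progress::Discard as the no-op progress implementation.

use std::sync::atomic::AtomicBool;

use gix_features::progress::Discard;
use gix_pack::multi_index::File;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical path; `git multi-pack-index write` places the file next to the packs.
    let midx = File::at(".git/objects/pack/multi-pack-index")?;
    println!("multi-index references {} pack index(es)", midx.index_names().len());

    // The fast variant checks the trailing checksum, the fanout table and the per-object
    // offsets against the referenced `.idx` files, but never decodes pack data.
    let should_interrupt = AtomicBool::new(false);
    let (checksum, _progress) = midx.verify_integrity_fast(Discard, &should_interrupt)?;
    println!("multi-index checksum: {checksum}");
    Ok(())
}

A deep check would instead go through verify_integrity(), which additionally traverses every referenced pack and therefore requires both the .idx and .pack files to be present.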