Diffstat (limited to 'vendor/gix-pack/src/index/write')
-rw-r--r--   vendor/gix-pack/src/index/write/encode.rs   127
-rw-r--r--   vendor/gix-pack/src/index/write/error.rs     25
-rw-r--r--   vendor/gix-pack/src/index/write/mod.rs      263
3 files changed, 415 insertions, 0 deletions
diff --git a/vendor/gix-pack/src/index/write/encode.rs b/vendor/gix-pack/src/index/write/encode.rs
new file mode 100644
index 000000000..80f0cac61
--- /dev/null
+++ b/vendor/gix-pack/src/index/write/encode.rs
@@ -0,0 +1,127 @@
+use std::{cmp::Ordering, io};
+
+pub(crate) const LARGE_OFFSET_THRESHOLD: u64 = 0x7fff_ffff;
+pub(crate) const HIGH_BIT: u32 = 0x8000_0000;
+
+use gix_features::{
+    hash,
+    progress::{self, Progress},
+};
+
+use crate::index::{util::Count, V2_SIGNATURE};
+
+pub(crate) fn write_to(
+    out: impl io::Write,
+    entries_sorted_by_oid: Vec<crate::cache::delta::Item<crate::index::write::TreeEntry>>,
+    pack_hash: &gix_hash::ObjectId,
+    kind: crate::index::Version,
+    mut progress: impl Progress,
+) -> io::Result<gix_hash::ObjectId> {
+    use io::Write;
+    assert_eq!(kind, crate::index::Version::V2, "Can only write V2 packs right now");
+    assert!(
+        entries_sorted_by_oid.len() <= u32::MAX as usize,
+        "a pack cannot have more than u32::MAX objects"
+    );
+
+    // Write header
+    let mut out = Count::new(std::io::BufWriter::with_capacity(
+        8 * 4096,
+        hash::Write::new(out, kind.hash()),
+    ));
+    out.write_all(V2_SIGNATURE)?;
+    out.write_all(&(kind as u32).to_be_bytes())?;
+
+    progress.init(Some(4), progress::steps());
+    let start = std::time::Instant::now();
+    let _info = progress.add_child_with_id("writing fan-out table", gix_features::progress::UNKNOWN);
+    let fan_out = fanout(entries_sorted_by_oid.iter().map(|e| e.data.id.first_byte()));
+
+    for value in fan_out.iter() {
+        out.write_all(&value.to_be_bytes())?;
+    }
+
+    progress.inc();
+    let _info = progress.add_child_with_id("writing ids", gix_features::progress::UNKNOWN);
+    for entry in &entries_sorted_by_oid {
+        out.write_all(entry.data.id.as_slice())?;
+    }
+
+    progress.inc();
+    let _info = progress.add_child_with_id("writing crc32", gix_features::progress::UNKNOWN);
+    for entry in &entries_sorted_by_oid {
+        out.write_all(&entry.data.crc32.to_be_bytes())?;
+    }
+
+    progress.inc();
+    let _info = progress.add_child_with_id("writing offsets", gix_features::progress::UNKNOWN);
+    {
+        let mut offsets64 = Vec::<u64>::new();
+        for entry in &entries_sorted_by_oid {
+            let offset: u32 = if entry.offset > LARGE_OFFSET_THRESHOLD {
+                assert!(
+                    offsets64.len() < LARGE_OFFSET_THRESHOLD as usize,
+                    "Encoding breakdown - way too many 64bit offsets"
+                );
+                offsets64.push(entry.offset);
+                ((offsets64.len() - 1) as u32) | HIGH_BIT
+            } else {
+                entry.offset as u32
+            };
+            out.write_all(&offset.to_be_bytes())?;
+        }
+        for value in offsets64 {
+            out.write_all(&value.to_be_bytes())?;
+        }
+    }
+
+    out.write_all(pack_hash.as_slice())?;
+
+    let bytes_written_without_trailer = out.bytes;
+    let mut out = out.inner.into_inner()?;
+    let index_hash: gix_hash::ObjectId = out.hash.digest().into();
+    out.inner.write_all(index_hash.as_slice())?;
+    out.inner.flush()?;
+
+    progress.inc();
+    progress.show_throughput_with(
+        start,
+        (bytes_written_without_trailer + 20) as usize,
+        progress::bytes().expect("unit always set"),
+        progress::MessageLevel::Success,
+    );
+
+    Ok(index_hash)
+}
+
+pub(crate) fn fanout(iter: impl ExactSizeIterator<Item = u8>) -> [u32; 256] {
+    let mut fan_out = [0u32; 256];
+    let entries_len = iter.len() as u32;
+    let mut iter = iter.enumerate();
+    let mut idx_and_entry = iter.next();
+    let mut upper_bound = 0;
+
+    for (offset_be, byte) in fan_out.iter_mut().zip(0u8..=255) {
+        *offset_be = match idx_and_entry.as_ref() {
+            Some((_idx, first_byte)) => match first_byte.cmp(&byte) {
+                Ordering::Less => unreachable!("ids should be ordered, and we make sure to keep ahead with them"),
+                Ordering::Greater => upper_bound,
+                Ordering::Equal => {
+                    if byte == 255 {
+                        entries_len
+                    } else {
+                        idx_and_entry = iter.find(|(_, first_byte)| *first_byte != byte);
+                        upper_bound = idx_and_entry
+                            .as_ref()
+                            .map(|(idx, _)| *idx as u32)
+                            .unwrap_or(entries_len);
+                        upper_bound
+                    }
+                }
+            },
+            None => entries_len,
+        };
+    }
+
+    fan_out
+}
diff --git a/vendor/gix-pack/src/index/write/error.rs b/vendor/gix-pack/src/index/write/error.rs
new file mode 100644
index 000000000..a5ef6ad67
--- /dev/null
+++ b/vendor/gix-pack/src/index/write/error.rs
@@ -0,0 +1,25 @@
+use std::io;
+
+/// Returned by [`crate::index::File::write_data_iter_to_stream()`]
+#[derive(thiserror::Error, Debug)]
+#[allow(missing_docs)]
+pub enum Error {
+    #[error("An IO error occurred when reading the pack or creating a temporary file")]
+    Io(#[from] io::Error),
+    #[error("A pack entry could not be extracted")]
+    PackEntryDecode(#[from] crate::data::input::Error),
+    #[error("Indices of type {} cannot be written, only {} are supported", *.0 as usize, crate::index::Version::default() as usize)]
+    Unsupported(crate::index::Version),
+    #[error("Ref delta objects are not supported as there is no way to look them up. Resolve them beforehand.")]
+    IteratorInvariantNoRefDelta,
+    #[error("The iterator failed to set a trailing hash over all prior pack entries in the last provided entry")]
+    IteratorInvariantTrailer,
+    #[error("Only u32::MAX objects can be stored in a pack, found {0}")]
+    IteratorInvariantTooManyObjects(usize),
+    #[error("{pack_offset} is not a valid offset for pack offset {distance}")]
+    IteratorInvariantBaseOffset { pack_offset: u64, distance: u64 },
+    #[error(transparent)]
+    Tree(#[from] crate::cache::delta::Error),
+    #[error(transparent)]
+    TreeTraversal(#[from] crate::cache::delta::traverse::Error),
+}
diff --git a/vendor/gix-pack/src/index/write/mod.rs b/vendor/gix-pack/src/index/write/mod.rs
new file mode 100644
index 000000000..c8fdaa271
--- /dev/null
+++ b/vendor/gix-pack/src/index/write/mod.rs
@@ -0,0 +1,263 @@
+use std::{convert::TryInto, io, sync::atomic::AtomicBool};
+
+pub use error::Error;
+use gix_features::progress::{self, Progress};
+
+use crate::cache::delta::{traverse, Tree};
+
+pub(crate) mod encode;
+mod error;
+
+pub(crate) struct TreeEntry {
+    pub id: gix_hash::ObjectId,
+    pub crc32: u32,
+}
+
+/// Information gathered while executing [`write_data_iter_to_stream()`][crate::index::File::write_data_iter_to_stream]
+#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)]
+#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
+pub struct Outcome {
+    /// The version of the verified index
+    pub index_version: crate::index::Version,
+    /// The verified checksum of the verified index
+    pub index_hash: gix_hash::ObjectId,
+
+    /// The hash of the '.pack' file, also found in its trailing bytes
+    pub data_hash: gix_hash::ObjectId,
+    /// The amount of objects that were verified, always the amount of objects in the pack.
+    pub num_objects: u32,
+}
+
+/// The progress ids used in [`write_data_iter_to_stream()`][crate::index::File::write_data_iter_to_stream()].
+///
+/// Use this information to selectively extract the progress of interest in case the parent application has custom visualization.
+#[derive(Debug, Copy, Clone)]
+pub enum ProgressId {
+    /// Counts the amount of objects that were indexed thus far.
+    IndexObjects,
+    /// The amount of bytes that were decompressed while decoding pack entries.
+    ///
+    /// This is done to determine entry boundaries.
+    DecompressedBytes,
+    /// The amount of objects whose hashes were computed.
+    ///
+    /// This is done by decoding them, which typically involves decoding delta objects.
+    ResolveObjects,
+    /// The amount of bytes that were decoded in total, as the sum of all bytes to represent all resolved objects.
+    DecodedBytes,
+    /// The amount of bytes written to the index file.
+    IndexBytesWritten,
+}
+
+impl From<ProgressId> for gix_features::progress::Id {
+    fn from(v: ProgressId) -> Self {
+        match v {
+            ProgressId::IndexObjects => *b"IWIO",
+            ProgressId::DecompressedBytes => *b"IWDB",
+            ProgressId::ResolveObjects => *b"IWRO",
+            ProgressId::DecodedBytes => *b"IWDB",
+            ProgressId::IndexBytesWritten => *b"IWBW",
+        }
+    }
+}
+
+/// Various ways of writing an index file from pack entries
+impl crate::index::File {
+    /// Write information about `entries` as obtained from a pack data file into a pack index file via the `out` stream.
+    /// The resolver produced by `make_resolver` must resolve pack entries from the same pack data file that produced the
+    /// `entries` iterator.
+    ///
+    /// * `version` is the version of pack index to produce, use [`crate::index::Version::default()`] if in doubt.
+    /// * `thread_limit` is used for a parallel tree traversal for obtaining object hashes with optimal performance.
+    /// * `root_progress` is the top-level progress to stay informed about the progress of this potentially long-running
+    ///   computation.
+    /// * `object_hash` defines what kind of object hash we write into the index file.
+    /// * `pack_version` is the version of the underlying pack for which `entries` are read. It's used to compute a
+    ///   pack-hash in case `entries` provides no objects.
+    ///
+    /// # Remarks
+    ///
+    /// * neither in-pack nor out-of-pack Ref Deltas are supported here, these must have been resolved beforehand.
+    /// * `make_resolver()` will only be called after the iterator stopped returning elements and produces a function that
+    ///   provides all bytes belonging to a pack entry writing them to the given mutable output `Vec`.
+    ///   It should return `None` if the entry cannot be resolved from the pack that produced the `entries` iterator, causing
+    ///   the write operation to fail.
+    #[allow(clippy::too_many_arguments)]
+    pub fn write_data_iter_to_stream<F, F2>(
+        version: crate::index::Version,
+        make_resolver: F,
+        entries: impl Iterator<Item = Result<crate::data::input::Entry, crate::data::input::Error>>,
+        thread_limit: Option<usize>,
+        mut root_progress: impl Progress,
+        out: impl io::Write,
+        should_interrupt: &AtomicBool,
+        object_hash: gix_hash::Kind,
+        pack_version: crate::data::Version,
+    ) -> Result<Outcome, Error>
+    where
+        F: FnOnce() -> io::Result<F2>,
+        F2: for<'r> Fn(crate::data::EntryRange, &'r mut Vec<u8>) -> Option<()> + Send + Clone,
+    {
+        if version != crate::index::Version::default() {
+            return Err(Error::Unsupported(version));
+        }
+        let mut num_objects: usize = 0;
+        let mut last_seen_trailer = None;
+        let (anticipated_num_objects, upper_bound) = entries.size_hint();
+        let worst_case_num_objects_after_thin_pack_resolution = upper_bound.unwrap_or(anticipated_num_objects);
+        let mut tree = Tree::with_capacity(worst_case_num_objects_after_thin_pack_resolution)?;
+        let indexing_start = std::time::Instant::now();
+
+        root_progress.init(Some(4), progress::steps());
+        let mut objects_progress = root_progress.add_child_with_id("indexing", ProgressId::IndexObjects.into());
+        objects_progress.init(Some(anticipated_num_objects), progress::count("objects"));
+        let mut decompressed_progress =
+            root_progress.add_child_with_id("decompressing", ProgressId::DecompressedBytes.into());
+        decompressed_progress.init(None, progress::bytes());
+        let mut pack_entries_end: u64 = 0;
+
+        for entry in entries {
+            let crate::data::input::Entry {
+                header,
+                pack_offset,
+                crc32,
+                header_size,
+                compressed: _,
+                compressed_size,
+                decompressed_size,
+                trailer,
+            } = entry?;
+
+            decompressed_progress.inc_by(decompressed_size as usize);
+
+            let entry_len = header_size as u64 + compressed_size;
+            pack_entries_end = pack_offset + entry_len;
+
+            let crc32 = crc32.expect("crc32 to be computed by the iterator. Caller assures correct configuration.");
+
+            use crate::data::entry::Header::*;
+            match header {
+                Tree | Blob | Commit | Tag => {
+                    tree.add_root(
+                        pack_offset,
+                        TreeEntry {
+                            id: object_hash.null(),
+                            crc32,
+                        },
+                    )?;
+                }
+                RefDelta { .. } => return Err(Error::IteratorInvariantNoRefDelta),
+                OfsDelta { base_distance } => {
+                    let base_pack_offset =
+                        crate::data::entry::Header::verified_base_pack_offset(pack_offset, base_distance).ok_or(
+                            Error::IteratorInvariantBaseOffset {
+                                pack_offset,
+                                distance: base_distance,
+                            },
+                        )?;
+                    tree.add_child(
+                        base_pack_offset,
+                        pack_offset,
+                        TreeEntry {
+                            id: object_hash.null(),
+                            crc32,
+                        },
+                    )?;
+                }
+            };
+            last_seen_trailer = trailer;
+            num_objects += 1;
+            objects_progress.inc();
+        }
+        let num_objects: u32 = num_objects
+            .try_into()
+            .map_err(|_| Error::IteratorInvariantTooManyObjects(num_objects))?;
+
+        objects_progress.show_throughput(indexing_start);
+        decompressed_progress.show_throughput(indexing_start);
+        drop(objects_progress);
+        drop(decompressed_progress);
+
+        root_progress.inc();
+
+        let resolver = make_resolver()?;
+        let sorted_pack_offsets_by_oid = {
+            let traverse::Outcome { roots, children } = tree.traverse(
+                resolver,
+                pack_entries_end,
+                || (),
+                |data,
+                 _progress,
+                 traverse::Context {
+                     entry,
+                     decompressed: bytes,
+                     ..
+                 }| {
+                    modify_base(data, entry, bytes, version.hash());
+                    Ok::<_, Error>(())
+                },
+                traverse::Options {
+                    object_progress: root_progress.add_child_with_id("Resolving", ProgressId::ResolveObjects.into()),
+                    size_progress: root_progress.add_child_with_id("Decoding", ProgressId::DecodedBytes.into()),
+                    thread_limit,
+                    should_interrupt,
+                    object_hash,
+                },
+            )?;
+            root_progress.inc();
+
+            let mut items = roots;
+            items.extend(children);
+            {
+                let _progress = root_progress.add_child_with_id("sorting by id", gix_features::progress::UNKNOWN);
+                items.sort_by_key(|e| e.data.id);
+            }
+
+            root_progress.inc();
+            items
+        };
+
+        let pack_hash = match last_seen_trailer {
+            Some(ph) => ph,
+            None if num_objects == 0 => {
+                let header = crate::data::header::encode(pack_version, 0);
+                let mut hasher = gix_features::hash::hasher(object_hash);
+                hasher.update(&header);
+                gix_hash::ObjectId::from(hasher.digest())
+            }
+            None => return Err(Error::IteratorInvariantTrailer),
+        };
+        let index_hash = encode::write_to(
+            out,
+            sorted_pack_offsets_by_oid,
+            &pack_hash,
+            version,
+            root_progress.add_child_with_id("writing index file", ProgressId::IndexBytesWritten.into()),
+        )?;
+        root_progress.show_throughput_with(
+            indexing_start,
+            num_objects as usize,
+            progress::count("objects").expect("unit always set"),
+            progress::MessageLevel::Success,
+        );
+        Ok(Outcome {
+            index_version: version,
+            index_hash,
+            data_hash: pack_hash,
+            num_objects,
+        })
+    }
+}
+
+fn modify_base(entry: &mut TreeEntry, pack_entry: &crate::data::Entry, decompressed: &[u8], hash: gix_hash::Kind) {
+    fn compute_hash(kind: gix_object::Kind, bytes: &[u8], object_hash: gix_hash::Kind) -> gix_hash::ObjectId {
+        let mut hasher = gix_features::hash::hasher(object_hash);
+        hasher.update(&gix_object::encode::loose_header(kind, bytes.len()));
+        hasher.update(bytes);
+        gix_hash::ObjectId::from(hasher.digest())
+    }
+
+    let object_kind = pack_entry.header.as_kind().expect("base object as source of iteration");
+    let id = compute_hash(object_kind, decompressed, hash);
+    entry.id = id;
+}