summaryrefslogtreecommitdiffstats
path: root/vendor/gix-pack/src/index/write
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:41:41 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:41:41 +0000
commit10ee2acdd26a7f1298c6f6d6b7af9b469fe29b87 (patch)
treebdffd5d80c26cf4a7a518281a204be1ace85b4c1 /vendor/gix-pack/src/index/write
parentReleasing progress-linux version 1.70.0+dfsg1-9~progress7.99u1. (diff)
downloadrustc-10ee2acdd26a7f1298c6f6d6b7af9b469fe29b87.tar.xz
rustc-10ee2acdd26a7f1298c6f6d6b7af9b469fe29b87.zip
Merging upstream version 1.70.0+dfsg2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/gix-pack/src/index/write')
-rw-r--r--vendor/gix-pack/src/index/write/encode.rs127
-rw-r--r--vendor/gix-pack/src/index/write/error.rs25
-rw-r--r--vendor/gix-pack/src/index/write/mod.rs263
3 files changed, 415 insertions, 0 deletions
diff --git a/vendor/gix-pack/src/index/write/encode.rs b/vendor/gix-pack/src/index/write/encode.rs
new file mode 100644
index 000000000..80f0cac61
--- /dev/null
+++ b/vendor/gix-pack/src/index/write/encode.rs
@@ -0,0 +1,127 @@
+use std::{cmp::Ordering, io};
+
// Pack offsets that fit into 31 bits are stored directly in the V2 offset table;
// anything above this threshold goes into the trailing 64-bit offset table instead.
pub(crate) const LARGE_OFFSET_THRESHOLD: u64 = 0x7fff_ffff;
// Set on a 32-bit offset entry to mark it as an index into the 64-bit offset table.
pub(crate) const HIGH_BIT: u32 = 0x8000_0000;
+
+use gix_features::{
+ hash,
+ progress::{self, Progress},
+};
+
+use crate::index::{util::Count, V2_SIGNATURE};
+
+pub(crate) fn write_to(
+ out: impl io::Write,
+ entries_sorted_by_oid: Vec<crate::cache::delta::Item<crate::index::write::TreeEntry>>,
+ pack_hash: &gix_hash::ObjectId,
+ kind: crate::index::Version,
+ mut progress: impl Progress,
+) -> io::Result<gix_hash::ObjectId> {
+ use io::Write;
+ assert_eq!(kind, crate::index::Version::V2, "Can only write V2 packs right now");
+ assert!(
+ entries_sorted_by_oid.len() <= u32::MAX as usize,
+ "a pack cannot have more than u32::MAX objects"
+ );
+
+ // Write header
+ let mut out = Count::new(std::io::BufWriter::with_capacity(
+ 8 * 4096,
+ hash::Write::new(out, kind.hash()),
+ ));
+ out.write_all(V2_SIGNATURE)?;
+ out.write_all(&(kind as u32).to_be_bytes())?;
+
+ progress.init(Some(4), progress::steps());
+ let start = std::time::Instant::now();
+ let _info = progress.add_child_with_id("writing fan-out table", gix_features::progress::UNKNOWN);
+ let fan_out = fanout(entries_sorted_by_oid.iter().map(|e| e.data.id.first_byte()));
+
+ for value in fan_out.iter() {
+ out.write_all(&value.to_be_bytes())?;
+ }
+
+ progress.inc();
+ let _info = progress.add_child_with_id("writing ids", gix_features::progress::UNKNOWN);
+ for entry in &entries_sorted_by_oid {
+ out.write_all(entry.data.id.as_slice())?;
+ }
+
+ progress.inc();
+ let _info = progress.add_child_with_id("writing crc32", gix_features::progress::UNKNOWN);
+ for entry in &entries_sorted_by_oid {
+ out.write_all(&entry.data.crc32.to_be_bytes())?;
+ }
+
+ progress.inc();
+ let _info = progress.add_child_with_id("writing offsets", gix_features::progress::UNKNOWN);
+ {
+ let mut offsets64 = Vec::<u64>::new();
+ for entry in &entries_sorted_by_oid {
+ let offset: u32 = if entry.offset > LARGE_OFFSET_THRESHOLD {
+ assert!(
+ offsets64.len() < LARGE_OFFSET_THRESHOLD as usize,
+ "Encoding breakdown - way too many 64bit offsets"
+ );
+ offsets64.push(entry.offset);
+ ((offsets64.len() - 1) as u32) | HIGH_BIT
+ } else {
+ entry.offset as u32
+ };
+ out.write_all(&offset.to_be_bytes())?;
+ }
+ for value in offsets64 {
+ out.write_all(&value.to_be_bytes())?;
+ }
+ }
+
+ out.write_all(pack_hash.as_slice())?;
+
+ let bytes_written_without_trailer = out.bytes;
+ let mut out = out.inner.into_inner()?;
+ let index_hash: gix_hash::ObjectId = out.hash.digest().into();
+ out.inner.write_all(index_hash.as_slice())?;
+ out.inner.flush()?;
+
+ progress.inc();
+ progress.show_throughput_with(
+ start,
+ (bytes_written_without_trailer + 20) as usize,
+ progress::bytes().expect("unit always set"),
+ progress::MessageLevel::Success,
+ );
+
+ Ok(index_hash)
+}
+
/// Compute the 256-entry fan-out table of a pack index from the first byte of each object id.
///
/// `iter` yields the first byte of each entry's object id, in ascending id order.
/// Slot `b` of the returned table holds the number of entries whose first byte is `<= b`,
/// hence `fan_out[255]` is the total amount of entries.
///
/// This counts occurrences and accumulates them, which produces the same table as a linear
/// scan over sorted input, but also remains correct (instead of panicking) should the input
/// ordering ever be violated.
pub(crate) fn fanout(iter: impl ExactSizeIterator<Item = u8>) -> [u32; 256] {
    let mut fan_out = [0u32; 256];
    // Histogram of first bytes…
    for first_byte in iter {
        fan_out[usize::from(first_byte)] += 1;
    }
    // …turned into cumulative counts as the index format requires.
    let mut cumulative = 0u32;
    for slot in fan_out.iter_mut() {
        cumulative += *slot;
        *slot = cumulative;
    }
    fan_out
}
diff --git a/vendor/gix-pack/src/index/write/error.rs b/vendor/gix-pack/src/index/write/error.rs
new file mode 100644
index 000000000..a5ef6ad67
--- /dev/null
+++ b/vendor/gix-pack/src/index/write/error.rs
@@ -0,0 +1,25 @@
+use std::io;
+
/// Returned by [`crate::index::File::write_data_iter_to_stream()`]
#[derive(thiserror::Error, Debug)]
#[allow(missing_docs)]
pub enum Error {
    /// Reading pack entries or writing the index failed on the I/O level.
    #[error("An IO error occurred when reading the pack or creating a temporary file")]
    Io(#[from] io::Error),
    /// The input iterator yielded an entry that could not be decoded.
    #[error("A pack entry could not be extracted")]
    PackEntryDecode(#[from] crate::data::input::Error),
    /// An index version other than the supported default was requested.
    #[error("Indices of type {} cannot be written, only {} are supported", *.0 as usize, crate::index::Version::default() as usize)]
    Unsupported(crate::index::Version),
    /// Ref-delta entries carry no in-pack base offset, so they must be resolved before indexing.
    #[error("Ref delta objects are not supported as there is no way to look them up. Resolve them beforehand.")]
    IteratorInvariantNoRefDelta,
    /// The last entry of the iterator must carry the pack's trailing hash.
    #[error("The iterator failed to set a trailing hash over all prior pack entries in the last provided entry")]
    IteratorInvariantTrailer,
    /// The index format addresses objects with 32-bit indices, limiting the object count.
    #[error("Only u32::MAX objects can be stored in a pack, found {0}")]
    IteratorInvariantTooManyObjects(usize),
    /// An ofs-delta's base distance pointed before the start of the pack.
    #[error("{pack_offset} is not a valid offset for pack offset {distance}")]
    IteratorInvariantBaseOffset { pack_offset: u64, distance: u64 },
    /// Building the delta-tree failed.
    #[error(transparent)]
    Tree(#[from] crate::cache::delta::Error),
    /// Traversing the delta-tree to resolve objects failed.
    #[error(transparent)]
    TreeTraversal(#[from] crate::cache::delta::traverse::Error),
}
diff --git a/vendor/gix-pack/src/index/write/mod.rs b/vendor/gix-pack/src/index/write/mod.rs
new file mode 100644
index 000000000..c8fdaa271
--- /dev/null
+++ b/vendor/gix-pack/src/index/write/mod.rs
@@ -0,0 +1,263 @@
+use std::{convert::TryInto, io, sync::atomic::AtomicBool};
+
+pub use error::Error;
+use gix_features::progress::{self, Progress};
+
+use crate::cache::delta::{traverse, Tree};
+
+pub(crate) mod encode;
+mod error;
+
/// Per-object data tracked in the delta-tree; everything needed to write one index record.
pub(crate) struct TreeEntry {
    // The object's id; starts out as the null-id and is filled in during delta resolution.
    pub id: gix_hash::ObjectId,
    // The CRC32 provided by the pack-entry iterator, written verbatim into the V2 index.
    pub crc32: u32,
}
+
/// Information gathered while executing [`write_data_iter_to_stream()`][crate::index::File::write_data_iter_to_stream]
#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)]
#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
pub struct Outcome {
    /// The version of the newly written index
    pub index_version: crate::index::Version,
    /// The checksum of the newly written index, which is also stored as its trailer
    pub index_hash: gix_hash::ObjectId,

    /// The hash of the '.pack' file, also found in its trailing bytes
    pub data_hash: gix_hash::ObjectId,
    /// The amount of objects contained in the pack, and thus written into the index.
    pub num_objects: u32,
}
+
/// The progress ids used in [`write_data_iter_to_stream()`][crate::index::File::write_data_iter_to_stream()].
///
/// Use this information to selectively extract the progress of interest in case the parent application has custom visualization.
#[derive(Debug, Copy, Clone)]
pub enum ProgressId {
    /// Counts the amount of objects that were indexed thus far.
    IndexObjects,
    /// The amount of bytes that were decompressed while decoding pack entries.
    ///
    /// This is done to determine entry boundaries.
    DecompressedBytes,
    /// The amount of objects whose hashes were computed.
    ///
    /// This is done by decoding them, which typically involves decoding delta objects.
    ResolveObjects,
    /// The amount of bytes that were decoded in total, as the sum of all bytes to represent all resolved objects.
    DecodedBytes,
    /// The amount of bytes written to the index file.
    IndexBytesWritten,
}
+
+impl From<ProgressId> for gix_features::progress::Id {
+ fn from(v: ProgressId) -> Self {
+ match v {
+ ProgressId::IndexObjects => *b"IWIO",
+ ProgressId::DecompressedBytes => *b"IWDB",
+ ProgressId::ResolveObjects => *b"IWRO",
+ ProgressId::DecodedBytes => *b"IWDB",
+ ProgressId::IndexBytesWritten => *b"IWBW",
+ }
+ }
+}
+
/// Various ways of writing an index file from pack entries
impl crate::index::File {
    /// Write information about `entries` as obtained from a pack data file into a pack index file via the `out` stream.
    /// The resolver produced by `make_resolver` must resolve pack entries from the same pack data file that produced the
    /// `entries` iterator.
    ///
    /// * `kind` is the version of pack index to produce, use [`crate::index::Version::default()`] if in doubt.
    /// * `thread_limit` is used for a parallel tree traversal for obtaining object hashes with optimal performance.
    /// * `root_progress` is the top-level progress to stay informed about the progress of this potentially long-running
    ///   computation.
    /// * `object_hash` defines what kind of object hash we write into the index file.
    /// * `pack_version` is the version of the underlying pack for which `entries` are read. It's used to compute a
    ///   pack-hash for the empty pack in case the `entries` iterator yields no entries at all.
    ///
    /// # Remarks
    ///
    /// * neither in-pack nor out-of-pack Ref Deltas are supported here, these must have been resolved beforehand.
    /// * `make_resolver()` will only be called after the iterator stopped returning elements and produces a function that
    /// provides all bytes belonging to a pack entry writing them to the given mutable output `Vec`.
    /// It should return `None` if the entry cannot be resolved from the pack that produced the `entries` iterator, causing
    /// the write operation to fail.
    #[allow(clippy::too_many_arguments)]
    pub fn write_data_iter_to_stream<F, F2>(
        version: crate::index::Version,
        make_resolver: F,
        entries: impl Iterator<Item = Result<crate::data::input::Entry, crate::data::input::Error>>,
        thread_limit: Option<usize>,
        mut root_progress: impl Progress,
        out: impl io::Write,
        should_interrupt: &AtomicBool,
        object_hash: gix_hash::Kind,
        pack_version: crate::data::Version,
    ) -> Result<Outcome, Error>
    where
        F: FnOnce() -> io::Result<F2>,
        F2: for<'r> Fn(crate::data::EntryRange, &'r mut Vec<u8>) -> Option<()> + Send + Clone,
    {
        if version != crate::index::Version::default() {
            return Err(Error::Unsupported(version));
        }
        let mut num_objects: usize = 0;
        let mut last_seen_trailer = None;
        let (anticipated_num_objects, upper_bound) = entries.size_hint();
        let worst_case_num_objects_after_thin_pack_resolution = upper_bound.unwrap_or(anticipated_num_objects);
        let mut tree = Tree::with_capacity(worst_case_num_objects_after_thin_pack_resolution)?;
        let indexing_start = std::time::Instant::now();

        root_progress.init(Some(4), progress::steps());
        let mut objects_progress = root_progress.add_child_with_id("indexing", ProgressId::IndexObjects.into());
        objects_progress.init(Some(anticipated_num_objects), progress::count("objects"));
        let mut decompressed_progress =
            root_progress.add_child_with_id("decompressing", ProgressId::DecompressedBytes.into());
        decompressed_progress.init(None, progress::bytes());
        let mut pack_entries_end: u64 = 0;

        // Pass 1: stream all pack entries once, recording each as a root (base object) or
        // child (ofs-delta) of the delta-tree. Object ids are not yet known here and are
        // filled in with null-ids, to be computed during traversal below.
        for entry in entries {
            let crate::data::input::Entry {
                header,
                pack_offset,
                crc32,
                header_size,
                compressed: _,
                compressed_size,
                decompressed_size,
                trailer,
            } = entry?;

            decompressed_progress.inc_by(decompressed_size as usize);

            let entry_len = header_size as u64 + compressed_size;
            pack_entries_end = pack_offset + entry_len;

            let crc32 = crc32.expect("crc32 to be computed by the iterator. Caller assures correct configuration.");

            use crate::data::entry::Header::*;
            match header {
                Tree | Blob | Commit | Tag => {
                    tree.add_root(
                        pack_offset,
                        TreeEntry {
                            id: object_hash.null(),
                            crc32,
                        },
                    )?;
                }
                RefDelta { .. } => return Err(Error::IteratorInvariantNoRefDelta),
                OfsDelta { base_distance } => {
                    let base_pack_offset =
                        crate::data::entry::Header::verified_base_pack_offset(pack_offset, base_distance).ok_or(
                            Error::IteratorInvariantBaseOffset {
                                pack_offset,
                                distance: base_distance,
                            },
                        )?;
                    tree.add_child(
                        base_pack_offset,
                        pack_offset,
                        TreeEntry {
                            id: object_hash.null(),
                            crc32,
                        },
                    )?;
                }
            };
            // The pack's trailing hash arrives with the last entry only.
            last_seen_trailer = trailer;
            num_objects += 1;
            objects_progress.inc();
        }
        let num_objects: u32 = num_objects
            .try_into()
            .map_err(|_| Error::IteratorInvariantTooManyObjects(num_objects))?;

        objects_progress.show_throughput(indexing_start);
        decompressed_progress.show_throughput(indexing_start);
        drop(objects_progress);
        drop(decompressed_progress);

        root_progress.inc();

        // Pass 2: traverse the delta-tree (possibly in parallel), resolving deltas and
        // computing the actual object id of every entry, then sort all entries by id
        // as the index format requires.
        let resolver = make_resolver()?;
        let sorted_pack_offsets_by_oid = {
            let traverse::Outcome { roots, children } = tree.traverse(
                resolver,
                pack_entries_end,
                || (),
                |data,
                 _progress,
                 traverse::Context {
                     entry,
                     decompressed: bytes,
                     ..
                 }| {
                    modify_base(data, entry, bytes, version.hash());
                    Ok::<_, Error>(())
                },
                traverse::Options {
                    object_progress: root_progress.add_child_with_id("Resolving", ProgressId::ResolveObjects.into()),
                    size_progress: root_progress.add_child_with_id("Decoding", ProgressId::DecodedBytes.into()),
                    thread_limit,
                    should_interrupt,
                    object_hash,
                },
            )?;
            root_progress.inc();

            let mut items = roots;
            items.extend(children);
            {
                let _progress = root_progress.add_child_with_id("sorting by id", gix_features::progress::UNKNOWN);
                items.sort_by_key(|e| e.data.id);
            }

            root_progress.inc();
            items
        };

        // An empty pack carries no trailer; derive its hash from a header-only pack instead.
        let pack_hash = match last_seen_trailer {
            Some(ph) => ph,
            None if num_objects == 0 => {
                let header = crate::data::header::encode(pack_version, 0);
                let mut hasher = gix_features::hash::hasher(object_hash);
                hasher.update(&header);
                gix_hash::ObjectId::from(hasher.digest())
            }
            None => return Err(Error::IteratorInvariantTrailer),
        };
        let index_hash = encode::write_to(
            out,
            sorted_pack_offsets_by_oid,
            &pack_hash,
            version,
            root_progress.add_child_with_id("writing index file", ProgressId::IndexBytesWritten.into()),
        )?;
        root_progress.show_throughput_with(
            indexing_start,
            num_objects as usize,
            progress::count("objects").expect("unit always set"),
            progress::MessageLevel::Success,
        );
        Ok(Outcome {
            index_version: version,
            index_hash,
            data_hash: pack_hash,
            num_objects,
        })
    }
}
+
+fn modify_base(entry: &mut TreeEntry, pack_entry: &crate::data::Entry, decompressed: &[u8], hash: gix_hash::Kind) {
+ fn compute_hash(kind: gix_object::Kind, bytes: &[u8], object_hash: gix_hash::Kind) -> gix_hash::ObjectId {
+ let mut hasher = gix_features::hash::hasher(object_hash);
+ hasher.update(&gix_object::encode::loose_header(kind, bytes.len()));
+ hasher.update(bytes);
+ gix_hash::ObjectId::from(hasher.digest())
+ }
+
+ let object_kind = pack_entry.header.as_kind().expect("base object as source of iteration");
+ let id = compute_hash(object_kind, decompressed, hash);
+ entry.id = id;
+}