Diffstat (limited to 'vendor/gix-pack/src/data')
28 files changed, 3633 insertions, 0 deletions
diff --git a/vendor/gix-pack/src/data/delta.rs b/vendor/gix-pack/src/data/delta.rs new file mode 100644 index 000000000..a898e4aaf --- /dev/null +++ b/vendor/gix-pack/src/data/delta.rs @@ -0,0 +1,70 @@ +/// Given the decompressed pack delta `d`, decode a size in bytes (either the base object size or the result object size) +/// Equivalent to [this canonical git function](https://github.com/git/git/blob/311531c9de557d25ac087c1637818bd2aad6eb3a/delta.h#L89) +pub fn decode_header_size(d: &[u8]) -> (u64, usize) { + let mut i = 0; + let mut size = 0u64; + let mut consumed = 0; + for cmd in d.iter() { + consumed += 1; + size |= (*cmd as u64 & 0x7f) << i; + i += 7; + if *cmd & 0x80 == 0 { + break; + } + } + (size, consumed) +} + +pub fn apply(base: &[u8], mut target: &mut [u8], data: &[u8]) { + let mut i = 0; + while let Some(cmd) = data.get(i) { + i += 1; + match cmd { + cmd if cmd & 0b1000_0000 != 0 => { + let (mut ofs, mut size): (u32, u32) = (0, 0); + if cmd & 0b0000_0001 != 0 { + ofs = data[i] as u32; + i += 1; + } + if cmd & 0b0000_0010 != 0 { + ofs |= (data[i] as u32) << 8; + i += 1; + } + if cmd & 0b0000_0100 != 0 { + ofs |= (data[i] as u32) << 16; + i += 1; + } + if cmd & 0b0000_1000 != 0 { + ofs |= (data[i] as u32) << 24; + i += 1; + } + if cmd & 0b0001_0000 != 0 { + size = data[i] as u32; + i += 1; + } + if cmd & 0b0010_0000 != 0 { + size |= (data[i] as u32) << 8; + i += 1; + } + if cmd & 0b0100_0000 != 0 { + size |= (data[i] as u32) << 16; + i += 1; + } + if size == 0 { + size = 0x10000; // 65536 + } + let ofs = ofs as usize; + std::io::Write::write(&mut target, &base[ofs..ofs + size as usize]) + .expect("delta copy from base: byte slices must match"); + } + 0 => panic!("encountered unsupported command code: 0"), + size => { + std::io::Write::write(&mut target, &data[i..i + *size as usize]) + .expect("delta copy data: slice sizes to match up"); + i += *size as usize; + } + } + } + assert_eq!(i, data.len()); + assert_eq!(target.len(), 0); +} diff --git a/vendor/gix-pack/src/data/entry/decode.rs b/vendor/gix-pack/src/data/entry/decode.rs new file mode 100644 index 000000000..79d7aecff --- /dev/null +++ b/vendor/gix-pack/src/data/entry/decode.rs @@ -0,0 +1,125 @@ +use std::io; + +use gix_features::decode::{leb64, leb64_from_read}; + +use super::{BLOB, COMMIT, OFS_DELTA, REF_DELTA, TAG, TREE}; +use crate::data; + +/// Decoding +impl data::Entry { + /// Decode an entry from the given entry data `d`, providing the `pack_offset` to allow tracking the start of the entry data section. + /// + /// # Panics + /// + /// If we cannot understand the header, garbage data is likely to trigger this. 
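A minimal sketch of how the two delta functions above fit together: a raw, decompressed delta begins with two varint-encoded sizes (base size, then result size), and `apply()` expects only the command stream that follows them. The module path is assumed to be reachable as `gix_pack::data::delta` (inside the crate it is `crate::data::delta`); `resolve_delta` itself is a hypothetical helper.

fn resolve_delta(base: &[u8], raw_delta: &[u8]) -> Vec<u8> {
    use gix_pack::data::delta;

    // The two size headers at the front of every delta.
    let (base_size, first) = delta::decode_header_size(raw_delta);
    assert_eq!(base_size as usize, base.len(), "delta must match its base");
    let (result_size, second) = delta::decode_header_size(&raw_delta[first..]);

    // Apply the copy/insert commands into a buffer of exactly the advertised result size.
    let mut out = vec![0u8; result_size as usize];
    delta::apply(base, &mut out[..], &raw_delta[first + second..]);
    out
}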
+ pub fn from_bytes(d: &[u8], pack_offset: data::Offset, hash_len: usize) -> data::Entry { + let (type_id, size, mut consumed) = parse_header_info(d); + + use crate::data::entry::Header::*; + let object = match type_id { + OFS_DELTA => { + let (distance, leb_bytes) = leb64(&d[consumed..]); + let delta = OfsDelta { + base_distance: distance, + }; + consumed += leb_bytes; + delta + } + REF_DELTA => { + let delta = RefDelta { + base_id: gix_hash::ObjectId::from(&d[consumed..][..hash_len]), + }; + consumed += hash_len; + delta + } + BLOB => Blob, + TREE => Tree, + COMMIT => Commit, + TAG => Tag, + _ => panic!("We currently don't support any V3 features or extensions"), + }; + data::Entry { + header: object, + decompressed_size: size, + data_offset: pack_offset + consumed as u64, + } + } + + /// Instantiate an `Entry` from the reader `r`, providing the `pack_offset` to allow tracking the start of the entry data section. + pub fn from_read( + mut r: impl io::Read, + pack_offset: data::Offset, + hash_len: usize, + ) -> Result<data::Entry, io::Error> { + let (type_id, size, mut consumed) = streaming_parse_header_info(&mut r)?; + + use crate::data::entry::Header::*; + let object = match type_id { + OFS_DELTA => { + let (distance, leb_bytes) = leb64_from_read(&mut r)?; + let delta = OfsDelta { + base_distance: distance, + }; + consumed += leb_bytes; + delta + } + REF_DELTA => { + let mut buf = gix_hash::Kind::buf(); + let hash = &mut buf[..hash_len]; + r.read_exact(hash)?; + #[allow(clippy::redundant_slicing)] + let delta = RefDelta { + base_id: gix_hash::ObjectId::from(&hash[..]), + }; + consumed += hash_len; + delta + } + BLOB => Blob, + TREE => Tree, + COMMIT => Commit, + TAG => Tag, + _ => panic!("We currently don't support any V3 features or extensions"), + }; + Ok(data::Entry { + header: object, + decompressed_size: size, + data_offset: pack_offset + consumed as u64, + }) + } +} + +#[inline] +fn streaming_parse_header_info(mut read: impl io::Read) -> Result<(u8, u64, usize), io::Error> { + let mut byte = [0u8; 1]; + read.read_exact(&mut byte)?; + let mut c = byte[0]; + let mut i = 1; + let type_id = (c >> 4) & 0b0000_0111; + let mut size = c as u64 & 0b0000_1111; + let mut s = 4; + while c & 0b1000_0000 != 0 { + read.read_exact(&mut byte)?; + c = byte[0]; + i += 1; + size += ((c & 0b0111_1111) as u64) << s; + s += 7 + } + Ok((type_id, size, i)) +} + +/// Parses the header of a pack-entry, yielding object type id, decompressed object size, and consumed bytes +#[inline] +fn parse_header_info(data: &[u8]) -> (u8, u64, usize) { + let mut c = data[0]; + let mut i = 1; + let type_id = (c >> 4) & 0b0000_0111; + let mut size = c as u64 & 0b0000_1111; + let mut s = 4; + while c & 0b1000_0000 != 0 { + c = data[i]; + i += 1; + size += ((c & 0b0111_1111) as u64) << s; + s += 7 + } + (type_id, size, i) +} diff --git a/vendor/gix-pack/src/data/entry/header.rs b/vendor/gix-pack/src/data/entry/header.rs new file mode 100644 index 000000000..83983eab0 --- /dev/null +++ b/vendor/gix-pack/src/data/entry/header.rs @@ -0,0 +1,150 @@ +use std::io; + +use super::{BLOB, COMMIT, OFS_DELTA, REF_DELTA, TAG, TREE}; +use crate::data; + +/// The header portion of a pack data entry, identifying the kind of stored object. 
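A small illustration of the bit layout decoded by `parse_header_info()` above, using the public `from_bytes()` entry point: 0xB5 is 0b1_011_0101, i.e. continuation bit set, type 0b011 (blob) and low size nibble 5; the following byte 0x02 contributes 2 << 4, giving a decompressed size of 37. The pack offset 1234 is arbitrary.

fn demo_entry_header() {
    use gix_pack::data::entry::Header;

    let entry = gix_pack::data::Entry::from_bytes(&[0xB5, 0x02], 1234, gix_hash::Kind::Sha1.len_in_bytes());
    assert_eq!(entry.header, Header::Blob);
    assert_eq!(entry.decompressed_size, 37);
    // Two header bytes were consumed, so the compressed data starts right after them.
    assert_eq!(entry.data_offset, 1234 + 2);
}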
+#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +#[allow(missing_docs)] +pub enum Header { + /// The object is a commit + Commit, + /// The object is a tree + Tree, + /// The object is a blob + Blob, + /// The object is a tag + Tag, + /// Describes a delta-object which needs to be applied to a base. The base object is identified by the `base_id` field + /// which is found within the parent repository. + /// Most commonly used for **thin-packs** when receiving pack files from the server to refer to objects that are not + /// part of the pack but expected to be present in the receivers repository. + /// + /// # Note + /// This could also be an object within this pack if the LSB encoded offset would be larger than 20 bytes, which is unlikely to + /// happen. + /// + /// **The naming** is exactly the same as the canonical implementation uses, namely **REF_DELTA**. + RefDelta { base_id: gix_hash::ObjectId }, + /// Describes a delta-object present in this pack which acts as base for this object. + /// The base object is measured as a distance from this objects + /// pack offset, so that `base_pack_offset = this_objects_pack_offset - base_distance` + /// + /// # Note + /// + /// **The naming** is exactly the same as the canonical implementation uses, namely **OFS_DELTA**. + OfsDelta { base_distance: u64 }, +} + +impl Header { + /// Subtract `distance` from `pack_offset` safely without the chance for overflow or no-ops if `distance` is 0. + pub fn verified_base_pack_offset(pack_offset: data::Offset, distance: u64) -> Option<data::Offset> { + if distance == 0 { + return None; + } + pack_offset.checked_sub(distance) + } + /// Convert the header's object kind into [`gix_object::Kind`] if possible + pub fn as_kind(&self) -> Option<gix_object::Kind> { + use gix_object::Kind::*; + Some(match self { + Header::Tree => Tree, + Header::Blob => Blob, + Header::Commit => Commit, + Header::Tag => Tag, + Header::RefDelta { .. } | Header::OfsDelta { .. } => return None, + }) + } + /// Convert this header's object kind into the packs internal representation + pub fn as_type_id(&self) -> u8 { + use Header::*; + match self { + Blob => BLOB, + Tree => TREE, + Commit => COMMIT, + Tag => TAG, + OfsDelta { .. } => OFS_DELTA, + RefDelta { .. } => REF_DELTA, + } + } + /// Return's true if this is a delta object, i.e. not a full object. + pub fn is_delta(&self) -> bool { + matches!(self, Header::OfsDelta { .. } | Header::RefDelta { .. }) + } + /// Return's true if this is a base object, i.e. not a delta object. + pub fn is_base(&self) -> bool { + !self.is_delta() + } +} + +impl Header { + /// Encode this header along the given `decompressed_size_in_bytes` into the `out` write stream for use within a data pack. + /// + /// Returns the amount of bytes written to `out`. 
+ /// `decompressed_size_in_bytes` is the full size in bytes of the object that this header represents + pub fn write_to(&self, decompressed_size_in_bytes: u64, mut out: impl io::Write) -> io::Result<usize> { + let mut size = decompressed_size_in_bytes; + let mut written = 1; + let mut c: u8 = (self.as_type_id() << 4) | (size as u8 & 0b0000_1111); + size >>= 4; + while size != 0 { + out.write_all(&[c | 0b1000_0000])?; + written += 1; + c = size as u8 & 0b0111_1111; + size >>= 7; + } + out.write_all(&[c])?; + + use Header::*; + match self { + RefDelta { base_id: oid } => { + out.write_all(oid.as_slice())?; + written += oid.as_slice().len(); + } + OfsDelta { base_distance } => { + let mut buf = [0u8; 10]; + let buf = leb64_encode(*base_distance, &mut buf); + out.write_all(buf)?; + written += buf.len(); + } + Blob | Tree | Commit | Tag => {} + } + Ok(written) + } + + /// The size of the header in bytes when serialized + pub fn size(&self, decompressed_size: u64) -> usize { + self.write_to(decompressed_size, io::sink()) + .expect("io::sink() to never fail") + } +} + +#[inline] +fn leb64_encode(mut n: u64, buf: &mut [u8; 10]) -> &[u8] { + let mut bytes_written = 1; + buf[buf.len() - 1] = n as u8 & 0b0111_1111; + for out in buf.iter_mut().rev().skip(1) { + n >>= 7; + if n == 0 { + break; + } + n -= 1; + *out = 0b1000_0000 | (n as u8 & 0b0111_1111); + bytes_written += 1; + } + debug_assert_eq!(n, 0, "BUG: buffer must be large enough to hold a 64 bit integer"); + &buf[buf.len() - bytes_written..] +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn leb64_encode_max_int() { + let mut buf = [0u8; 10]; + let buf = leb64_encode(u64::MAX, &mut buf); + assert_eq!(buf.len(), 10, "10 bytes should be used when 64bits are encoded"); + } +} diff --git a/vendor/gix-pack/src/data/entry/mod.rs b/vendor/gix-pack/src/data/entry/mod.rs new file mode 100644 index 000000000..f11c39c5c --- /dev/null +++ b/vendor/gix-pack/src/data/entry/mod.rs @@ -0,0 +1,53 @@ +use crate::data::Entry; + +const _TYPE_EXT1: u8 = 0; +const COMMIT: u8 = 1; +const TREE: u8 = 2; +const BLOB: u8 = 3; +const TAG: u8 = 4; +const _TYPE_EXT2: u8 = 5; +const OFS_DELTA: u8 = 6; +const REF_DELTA: u8 = 7; + +/// A way to uniquely identify the location of an entry within a pack bundle +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Location { + /// The id of the pack containing the object. It's unique within its frame of reference which is the owning object database. + pub pack_id: u32, + /// The size of the entry of disk so that the range of bytes of the entry is `pack_offset..pack_offset + entry_size`. + pub entry_size: usize, + /// The start of the entry in the pack identified by `pack_id`. + pub pack_offset: data::Offset, +} + +impl Location { + /// Compute a range suitable for lookup in pack data using the [`entry_slice()`][crate::data::File::entry_slice()] method. + pub fn entry_range(&self, pack_offset: data::Offset) -> crate::data::EntryRange { + pack_offset..pack_offset + self.entry_size as u64 + } +} + +/// Access +impl Entry { + /// Compute the pack offset to the base entry of the object represented by this entry. 
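A sketch of the round trip between `Header::write_to()` above and `Entry::from_bytes()` from decode.rs: an ofs-delta header carrying a base distance of 300 and a decompressed size of 42 is written to a buffer and parsed back.

fn demo_header_roundtrip() {
    use gix_pack::data::entry::Header;

    let mut buf = Vec::new();
    let written = Header::OfsDelta { base_distance: 300 }
        .write_to(42, &mut buf)
        .expect("writing to a Vec never fails");
    assert_eq!(written, buf.len());

    let entry = gix_pack::data::Entry::from_bytes(&buf, 0, gix_hash::Kind::Sha1.len_in_bytes());
    assert_eq!(entry.header, Header::OfsDelta { base_distance: 300 });
    assert_eq!(entry.decompressed_size, 42);
    assert_eq!(entry.data_offset, written as u64);
}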
+ pub fn base_pack_offset(&self, distance: u64) -> data::Offset { + let pack_offset = self.data_offset - self.header_size() as u64; + pack_offset.checked_sub(distance).expect("in-bound distance of deltas") + } + /// The pack offset at which this entry starts + pub fn pack_offset(&self) -> data::Offset { + self.data_offset - self.header_size() as u64 + } + /// The amount of bytes used to describe this entry in the pack. The header starts at [`Self::pack_offset()`] + pub fn header_size(&self) -> usize { + self.header.size(self.decompressed_size) + } +} + +mod decode; + +mod header; +pub use header::Header; + +use crate::data; diff --git a/vendor/gix-pack/src/data/file/decode/entry.rs b/vendor/gix-pack/src/data/file/decode/entry.rs new file mode 100644 index 000000000..60fefec0f --- /dev/null +++ b/vendor/gix-pack/src/data/file/decode/entry.rs @@ -0,0 +1,422 @@ +use std::{convert::TryInto, ops::Range}; + +use gix_features::zlib; +use smallvec::SmallVec; + +use crate::{ + cache, data, + data::{delta, file::decode::Error, File}, +}; + +/// A return value of a resolve function, which given an [`ObjectId`][gix_hash::ObjectId] determines where an object can be found. +#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum ResolvedBase { + /// Indicate an object is within this pack, at the given entry, and thus can be looked up locally. + InPack(data::Entry), + /// Indicates the object of `kind` was found outside of the pack, and its data was written into an output + /// vector which now has a length of `end`. + #[allow(missing_docs)] + OutOfPack { kind: gix_object::Kind, end: usize }, +} + +#[derive(Debug)] +struct Delta { + data: Range<usize>, + base_size: usize, + result_size: usize, + + decompressed_size: usize, + data_offset: data::Offset, +} + +/// Additional information and statistics about a successfully decoded object produced by [`File::decode_entry()`]. +/// +/// Useful to understand the effectiveness of the pack compression or the cost of decompression. +#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Outcome { + /// The kind of resolved object. + pub kind: gix_object::Kind, + /// The amount of deltas in the chain of objects that had to be resolved beforehand. + /// + /// This number is affected by the [`Cache`][cache::DecodeEntry] implementation, with cache hits shortening the + /// delta chain accordingly + pub num_deltas: u32, + /// The total decompressed size of all pack entries in the delta chain + pub decompressed_size: u64, + /// The total compressed size of all pack entries in the delta chain + pub compressed_size: usize, + /// The total size of the decoded object. + pub object_size: u64, +} + +impl Outcome { + pub(crate) fn default_from_kind(kind: gix_object::Kind) -> Self { + Self { + kind, + num_deltas: 0, + decompressed_size: 0, + compressed_size: 0, + object_size: 0, + } + } + fn from_object_entry(kind: gix_object::Kind, entry: &data::Entry, compressed_size: usize) -> Self { + Self { + kind, + num_deltas: 0, + decompressed_size: entry.decompressed_size, + compressed_size, + object_size: entry.decompressed_size, + } + } +} + +/// Decompression of objects +impl File { + /// Decompress the given `entry` into `out` and return the amount of bytes read from the pack data. 
+ /// + /// _Note_ that this method does not resolve deltified objects, but merely decompresses their content + /// `out` is expected to be large enough to hold `entry.size` bytes. + /// + /// # Panics + /// + /// If `out` isn't large enough to hold the decompressed `entry` + pub fn decompress_entry(&self, entry: &data::Entry, out: &mut [u8]) -> Result<usize, Error> { + assert!( + out.len() as u64 >= entry.decompressed_size, + "output buffer isn't large enough to hold decompressed result, want {}, have {}", + entry.decompressed_size, + out.len() + ); + + self.decompress_entry_from_data_offset(entry.data_offset, out) + .map_err(Into::into) + } + + fn assure_v2(&self) { + assert!( + matches!(self.version, crate::data::Version::V2), + "Only V2 is implemented" + ); + } + + /// Obtain the [`Entry`][crate::data::Entry] at the given `offset` into the pack. + /// + /// The `offset` is typically obtained from the pack index file. + pub fn entry(&self, offset: data::Offset) -> data::Entry { + self.assure_v2(); + let pack_offset: usize = offset.try_into().expect("offset representable by machine"); + assert!(pack_offset <= self.data.len(), "offset out of bounds"); + + let object_data = &self.data[pack_offset..]; + data::Entry::from_bytes(object_data, offset, self.hash_len) + } + + /// Decompress the object expected at the given data offset, sans pack header. This information is only + /// known after the pack header was parsed. + /// Note that this method does not resolve deltified objects, but merely decompresses their content + /// `out` is expected to be large enough to hold `entry.size` bytes. + /// Returns the amount of packed bytes there read from the pack data file. + pub(crate) fn decompress_entry_from_data_offset( + &self, + data_offset: data::Offset, + out: &mut [u8], + ) -> Result<usize, zlib::inflate::Error> { + let offset: usize = data_offset.try_into().expect("offset representable by machine"); + assert!(offset < self.data.len(), "entry offset out of bounds"); + + zlib::Inflate::default() + .once(&self.data[offset..], out) + .map(|(_status, consumed_in, _consumed_out)| consumed_in) + } + + /// Like `decompress_entry_from_data_offset`, but returns consumed input and output. + pub(crate) fn decompress_entry_from_data_offset_2( + &self, + data_offset: data::Offset, + out: &mut [u8], + ) -> Result<(usize, usize), zlib::inflate::Error> { + let offset: usize = data_offset.try_into().expect("offset representable by machine"); + assert!(offset < self.data.len(), "entry offset out of bounds"); + + zlib::Inflate::default() + .once(&self.data[offset..], out) + .map(|(_status, consumed_in, consumed_out)| (consumed_in, consumed_out)) + } + + /// Decode an entry, resolving delta's as needed, while growing the `out` vector if there is not enough + /// space to hold the result object. + /// + /// The `entry` determines which object to decode, and is commonly obtained with the help of a pack index file or through pack iteration. + /// + /// `resolve` is a function to lookup objects with the given [`ObjectId`][gix_hash::ObjectId], in case the full object id is used to refer to + /// a base object, instead of an in-pack offset. + /// + /// `delta_cache` is a mechanism to avoid looking up base objects multiple times when decompressing multiple objects in a row. + /// Use a [Noop-Cache][cache::Never] to disable caching all together at the cost of repeating work. 
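A hedged sketch of calling `decode_entry()` as documented just above, with caching disabled via `cache::Never` and without out-of-pack resolution; `offset` would normally come from a pack index lookup, and the helper itself is hypothetical.

fn decode_at(pack: &gix_pack::data::File, offset: u64) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
    let mut out = Vec::new();
    let outcome = pack.decode_entry(
        pack.entry(offset),
        &mut out,
        |_id, _buf| None, // no out-of-pack lookup: ref-deltas pointing outside this pack will fail
        &mut gix_pack::cache::Never,
    )?;
    assert_eq!(out.len() as u64, outcome.object_size);
    Ok(out)
}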
+ pub fn decode_entry( + &self, + entry: data::Entry, + out: &mut Vec<u8>, + resolve: impl Fn(&gix_hash::oid, &mut Vec<u8>) -> Option<ResolvedBase>, + delta_cache: &mut impl cache::DecodeEntry, + ) -> Result<Outcome, Error> { + use crate::data::entry::Header::*; + match entry.header { + Tree | Blob | Commit | Tag => { + out.resize( + entry + .decompressed_size + .try_into() + .expect("size representable by machine"), + 0, + ); + self.decompress_entry(&entry, out.as_mut_slice()).map(|consumed_input| { + Outcome::from_object_entry( + entry.header.as_kind().expect("a non-delta entry"), + &entry, + consumed_input, + ) + }) + } + OfsDelta { .. } | RefDelta { .. } => self.resolve_deltas(entry, resolve, out, delta_cache), + } + } + + /// resolve: technically, this shouldn't ever be required as stored local packs don't refer to objects by id + /// that are outside of the pack. Unless, of course, the ref refers to an object within this pack, which means + /// it's very, very large as 20bytes are smaller than the corresponding MSB encoded number + fn resolve_deltas( + &self, + last: data::Entry, + resolve: impl Fn(&gix_hash::oid, &mut Vec<u8>) -> Option<ResolvedBase>, + out: &mut Vec<u8>, + cache: &mut impl cache::DecodeEntry, + ) -> Result<Outcome, Error> { + // all deltas, from the one that produces the desired object (first) to the oldest at the end of the chain + let mut chain = SmallVec::<[Delta; 10]>::default(); + let first_entry = last.clone(); + let mut cursor = last; + let mut base_buffer_size: Option<usize> = None; + let mut object_kind: Option<gix_object::Kind> = None; + let mut consumed_input: Option<usize> = None; + + // Find the first full base, either an undeltified object in the pack or a reference to another object. + let mut total_delta_data_size: u64 = 0; + while cursor.header.is_delta() { + if let Some((kind, packed_size)) = cache.get(self.id, cursor.data_offset, out) { + base_buffer_size = Some(out.len()); + object_kind = Some(kind); + // If the input entry is a cache hit, keep the packed size as it must be returned. + // Otherwise, the packed size will be determined later when decompressing the input delta + if total_delta_data_size == 0 { + consumed_input = Some(packed_size); + } + break; + } + total_delta_data_size += cursor.decompressed_size; + let decompressed_size = cursor + .decompressed_size + .try_into() + .expect("a single delta size small enough to fit a usize"); + chain.push(Delta { + data: Range { + start: 0, + end: decompressed_size, + }, + base_size: 0, + result_size: 0, + decompressed_size, + data_offset: cursor.data_offset, + }); + use crate::data::entry::Header; + cursor = match cursor.header { + Header::OfsDelta { base_distance } => self.entry(cursor.base_pack_offset(base_distance)), + Header::RefDelta { base_id } => match resolve(base_id.as_ref(), out) { + Some(ResolvedBase::InPack(entry)) => entry, + Some(ResolvedBase::OutOfPack { end, kind }) => { + base_buffer_size = Some(end); + object_kind = Some(kind); + break; + } + None => return Err(Error::DeltaBaseUnresolved(base_id)), + }, + _ => unreachable!("cursor.is_delta() only allows deltas here"), + }; + } + + // This can happen if the cache held the first entry itself + // We will just treat it as an object then, even though it's technically incorrect. 
+ if chain.is_empty() { + return Ok(Outcome::from_object_entry( + object_kind.expect("object kind as set by cache"), + &first_entry, + consumed_input.expect("consumed bytes as set by cache"), + )); + }; + + // First pass will decompress all delta data and keep it in our output buffer + // [<possibly resolved base object>]<delta-1..delta-n>... + // so that we can find the biggest result size. + let total_delta_data_size: usize = total_delta_data_size.try_into().expect("delta data to fit in memory"); + + let chain_len = chain.len(); + let (first_buffer_end, second_buffer_end) = { + let delta_start = base_buffer_size.unwrap_or(0); + out.resize(delta_start + total_delta_data_size, 0); + + let delta_range = Range { + start: delta_start, + end: delta_start + total_delta_data_size, + }; + let mut instructions = &mut out[delta_range.clone()]; + let mut relative_delta_start = 0; + let mut biggest_result_size = 0; + for (delta_idx, delta) in chain.iter_mut().rev().enumerate() { + let consumed_from_data_offset = self.decompress_entry_from_data_offset( + delta.data_offset, + &mut instructions[..delta.decompressed_size], + )?; + let is_last_delta_to_be_applied = delta_idx + 1 == chain_len; + if is_last_delta_to_be_applied { + consumed_input = Some(consumed_from_data_offset); + } + + let (base_size, offset) = delta::decode_header_size(instructions); + let mut bytes_consumed_by_header = offset; + biggest_result_size = biggest_result_size.max(base_size); + delta.base_size = base_size.try_into().expect("base size fits into usize"); + + let (result_size, offset) = delta::decode_header_size(&instructions[offset..]); + bytes_consumed_by_header += offset; + biggest_result_size = biggest_result_size.max(result_size); + delta.result_size = result_size.try_into().expect("result size fits into usize"); + + // the absolute location into the instructions buffer, so we keep track of the end point of the last + delta.data.start = relative_delta_start + bytes_consumed_by_header; + relative_delta_start += delta.decompressed_size; + delta.data.end = relative_delta_start; + + instructions = &mut instructions[delta.decompressed_size..]; + } + + // Now we can produce a buffer like this + // [<biggest-result-buffer, possibly filled with resolved base object data>]<biggest-result-buffer><delta-1..delta-n> + // from [<possibly resolved base object>]<delta-1..delta-n>... + let biggest_result_size: usize = biggest_result_size + .try_into() + .expect("biggest result size small enough to fit into usize"); + let first_buffer_size = biggest_result_size; + let second_buffer_size = first_buffer_size; + out.resize(first_buffer_size + second_buffer_size + total_delta_data_size, 0); + + // Now 'rescue' the deltas, because in the next step we possibly overwrite that portion + // of memory with the base object (in the majority of cases) + let second_buffer_end = { + let end = first_buffer_size + second_buffer_size; + if delta_range.start < end { + // …this means that the delta size is even larger than two uncompressed worst-case + // intermediate results combined. It would already be undesirable to have it bigger + // then the target size (as you could just store the object in whole). + // However, this just means that it reuses existing deltas smartly, which as we rightfully + // remember stand for an object each. However, this means a lot of data is read to restore + // a single object sometimes. Fair enough - package size is minimized that way. 
+ out.copy_within(delta_range, end); + } else { + let (buffers, instructions) = out.split_at_mut(end); + instructions.copy_from_slice(&buffers[delta_range]); + } + end + }; + + // If we don't have a out-of-pack object already, fill the base-buffer by decompressing the full object + // at which the cursor is left after the iteration + if base_buffer_size.is_none() { + let base_entry = cursor; + debug_assert!(!base_entry.header.is_delta()); + object_kind = base_entry.header.as_kind(); + self.decompress_entry_from_data_offset(base_entry.data_offset, out)?; + } + + (first_buffer_size, second_buffer_end) + }; + + // From oldest to most recent, apply all deltas, swapping the buffer back and forth + // TODO: once we have more tests, we could optimize this memory-intensive work to + // analyse the delta-chains to only copy data once - after all, with 'copy-from-base' deltas, + // all data originates from one base at some point. + // `out` is: [source-buffer][target-buffer][max-delta-instructions-buffer] + let (buffers, instructions) = out.split_at_mut(second_buffer_end); + let (mut source_buf, mut target_buf) = buffers.split_at_mut(first_buffer_end); + + let mut last_result_size = None; + for ( + delta_idx, + Delta { + data, + base_size, + result_size, + .. + }, + ) in chain.into_iter().rev().enumerate() + { + let data = &mut instructions[data]; + if delta_idx + 1 == chain_len { + last_result_size = Some(result_size); + } + delta::apply(&source_buf[..base_size], &mut target_buf[..result_size], data); + // use the target as source for the next delta + std::mem::swap(&mut source_buf, &mut target_buf); + } + + let last_result_size = last_result_size.expect("at least one delta chain item"); + // uneven chains leave the target buffer after the source buffer + // FIXME(Performance) If delta-chains are uneven, we know we will have to copy bytes over here + // Instead we could use a different start buffer, to naturally end up with the result in the + // right one. + // However, this is a bit more complicated than just that - you have to deal with the base + // object, which should also be placed in the second buffer right away. You don't have that + // control/knowledge for out-of-pack bases, so this is a special case to deal with, too. + // Maybe these invariants can be represented in the type system though. + if chain_len % 2 == 1 { + // this seems inverted, but remember: we swapped the buffers on the last iteration + target_buf[..last_result_size].copy_from_slice(&source_buf[..last_result_size]); + } + out.resize(last_result_size, 0); + + let object_kind = object_kind.expect("a base object as root of any delta chain that we are here to resolve"); + let consumed_input = consumed_input.expect("at least one decompressed delta object"); + cache.put( + self.id, + first_entry.data_offset, + out.as_slice(), + object_kind, + consumed_input, + ); + Ok(Outcome { + kind: object_kind, + // technically depending on the cache, the chain size is not correct as it might + // have been cut short by a cache hit. 
The caller must deactivate the cache to get + // actual results + num_deltas: chain_len as u32, + decompressed_size: first_entry.decompressed_size, + compressed_size: consumed_input, + object_size: last_result_size as u64, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn size_of_decode_entry_outcome() { + assert_eq!( + std::mem::size_of::<Outcome>(), + 32, + "this shouldn't change without use noticing as it's returned a lot" + ); + } +} diff --git a/vendor/gix-pack/src/data/file/decode/header.rs b/vendor/gix-pack/src/data/file/decode/header.rs new file mode 100644 index 000000000..1f4b1de0a --- /dev/null +++ b/vendor/gix-pack/src/data/file/decode/header.rs @@ -0,0 +1,114 @@ +use crate::{ + data, + data::{delta, file::decode::Error, File}, +}; + +/// A return value of a resolve function, which given an [`ObjectId`][gix_hash::ObjectId] determines where an object can be found. +#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum ResolvedBase { + /// Indicate an object is within this pack, at the given entry, and thus can be looked up locally. + InPack(data::Entry), + /// Indicates the object of `kind` was found outside of the pack. + OutOfPack { + /// The kind of object we found when reading the header of the out-of-pack base. + kind: gix_object::Kind, + /// The amount of deltas encountered if the object was packed as well. + num_deltas: Option<u32>, + }, +} + +/// Additional information and statistics about a successfully decoded object produced by [`File::decode_header()`]. +/// +/// Useful to understand the effectiveness of the pack compression or the cost of decompression. +#[derive(Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Outcome { + /// The kind of resolved object. + pub kind: gix_object::Kind, + /// The decompressed size of the object. + pub object_size: u64, + /// The amount of deltas in the chain of objects that had to be resolved beforehand. + pub num_deltas: u32, +} + +/// Obtain object information quickly. +impl File { + /// Resolve the object header information starting at `entry`, following the chain of entries as needed. + /// + /// The `entry` determines which object to decode, and is commonly obtained with the help of a pack index file or through pack iteration. + /// + /// `resolve` is a function to lookup objects with the given [`ObjectId`][gix_hash::ObjectId], in case the full object id + /// is used to refer to a base object, instead of an in-pack offset. 
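A sketch of using `decode_header()` documented just above to learn an object's kind, size and delta-chain length without decompressing its full content; the resolve closure simply gives up on out-of-pack bases, and the helper is illustrative only.

fn object_info(pack: &gix_pack::data::File, offset: u64) -> Result<(), Box<dyn std::error::Error>> {
    let outcome = pack.decode_header(pack.entry(offset), |_base_id| None)?;
    println!(
        "kind: {:?}, size: {}, deltas: {}",
        outcome.kind, outcome.object_size, outcome.num_deltas
    );
    Ok(())
}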
+ pub fn decode_header( + &self, + mut entry: data::Entry, + resolve: impl Fn(&gix_hash::oid) -> Option<ResolvedBase>, + ) -> Result<Outcome, Error> { + use crate::data::entry::Header::*; + let mut num_deltas = 0; + let mut first_delta_decompressed_size = None::<u64>; + loop { + match entry.header { + Tree | Blob | Commit | Tag => { + return Ok(Outcome { + kind: entry.header.as_kind().expect("always valid for non-refs"), + object_size: first_delta_decompressed_size.unwrap_or(entry.decompressed_size), + num_deltas, + }); + } + OfsDelta { base_distance } => { + num_deltas += 1; + if first_delta_decompressed_size.is_none() { + first_delta_decompressed_size = Some(self.decode_delta_object_size(&entry)?); + } + entry = self.entry(entry.base_pack_offset(base_distance)) + } + RefDelta { base_id } => { + num_deltas += 1; + if first_delta_decompressed_size.is_none() { + first_delta_decompressed_size = Some(self.decode_delta_object_size(&entry)?); + } + match resolve(base_id.as_ref()) { + Some(ResolvedBase::InPack(base_entry)) => entry = base_entry, + Some(ResolvedBase::OutOfPack { + kind, + num_deltas: origin_num_deltas, + }) => { + return Ok(Outcome { + kind, + object_size: first_delta_decompressed_size.unwrap_or(entry.decompressed_size), + num_deltas: origin_num_deltas.unwrap_or_default() + num_deltas, + }) + } + None => return Err(Error::DeltaBaseUnresolved(base_id)), + } + } + }; + } + } + + #[inline] + fn decode_delta_object_size(&self, entry: &data::Entry) -> Result<u64, Error> { + let mut buf = [0_u8; 32]; + let used = self.decompress_entry_from_data_offset_2(entry.data_offset, &mut buf)?.1; + let buf = &buf[..used]; + let (_base_size, offset) = delta::decode_header_size(buf); + let (result_size, _offset) = delta::decode_header_size(&buf[offset..]); + Ok(result_size) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn size_of_decode_entry_outcome() { + assert_eq!( + std::mem::size_of::<Outcome>(), + 16, + "this shouldn't change without use noticing as it's returned a lot" + ); + } +} diff --git a/vendor/gix-pack/src/data/file/decode/mod.rs b/vendor/gix-pack/src/data/file/decode/mod.rs new file mode 100644 index 000000000..10bb7f19b --- /dev/null +++ b/vendor/gix-pack/src/data/file/decode/mod.rs @@ -0,0 +1,16 @@ +/// +pub mod entry; +/// +pub mod header; + +/// Returned by [`File::decode_header()`][crate::data::File::decode_header()], +/// [`File::decode_entry()`][crate::data::File::decode_entry()] and . +/// [`File::decompress_entry()`][crate::data::File::decompress_entry()] +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("Failed to decompress pack entry")] + ZlibInflate(#[from] gix_features::zlib::inflate::Error), + #[error("A delta chain could not be followed as the ref base with id {0} could not be found")] + DeltaBaseUnresolved(gix_hash::ObjectId), +} diff --git a/vendor/gix-pack/src/data/file/init.rs b/vendor/gix-pack/src/data/file/init.rs new file mode 100644 index 000000000..b16072417 --- /dev/null +++ b/vendor/gix-pack/src/data/file/init.rs @@ -0,0 +1,41 @@ +use std::{convert::TryInto, path::Path}; + +use crate::data; + +/// Instantiation +impl data::File { + /// Try opening a data file at the given `path`. + /// + /// The `object_hash` is a way to read (and write) the same file format with different hashes, as the hash kind + /// isn't stored within the file format itself. 
+ pub fn at(path: impl AsRef<Path>, object_hash: gix_hash::Kind) -> Result<data::File, data::header::decode::Error> { + Self::at_inner(path.as_ref(), object_hash) + } + + fn at_inner(path: &Path, object_hash: gix_hash::Kind) -> Result<data::File, data::header::decode::Error> { + use crate::data::header::N32_SIZE; + let hash_len = object_hash.len_in_bytes(); + + let data = crate::mmap::read_only(path).map_err(|e| data::header::decode::Error::Io { + source: e, + path: path.to_owned(), + })?; + let pack_len = data.len(); + if pack_len < N32_SIZE * 3 + hash_len { + return Err(data::header::decode::Error::Corrupt(format!( + "Pack data of size {pack_len} is too small for even an empty pack with shortest hash" + ))); + } + let (kind, num_objects) = + data::header::decode(&data[..12].try_into().expect("enough data after previous check"))?; + Ok(data::File { + data, + path: path.to_owned(), + id: gix_features::hash::crc32(path.as_os_str().to_string_lossy().as_bytes()), + version: kind, + num_objects, + hash_len, + object_hash, + }) + } +} diff --git a/vendor/gix-pack/src/data/file/mod.rs b/vendor/gix-pack/src/data/file/mod.rs new file mode 100644 index 000000000..6bfe0e272 --- /dev/null +++ b/vendor/gix-pack/src/data/file/mod.rs @@ -0,0 +1,9 @@ +mod init; +/// +pub mod verify; + +/// +pub mod decode; + +/// The bytes used as header in a pack data file. +pub type Header = [u8; 12]; diff --git a/vendor/gix-pack/src/data/file/verify.rs b/vendor/gix-pack/src/data/file/verify.rs new file mode 100644 index 000000000..afec20826 --- /dev/null +++ b/vendor/gix-pack/src/data/file/verify.rs @@ -0,0 +1,42 @@ +use std::sync::atomic::AtomicBool; + +use gix_features::progress::Progress; + +use crate::data::File; + +/// +pub mod checksum { + /// Returned by [`data::File::verify_checksum()`][crate::data::File::verify_checksum()]. + pub type Error = crate::verify::checksum::Error; +} + +/// Checksums and verify checksums +impl File { + /// The checksum in the trailer of this pack data file + pub fn checksum(&self) -> gix_hash::ObjectId { + gix_hash::ObjectId::from(&self.data[self.data.len() - self.hash_len..]) + } + + /// Verifies that the checksum of the packfile over all bytes preceding it indeed matches the actual checksum, + /// returning the actual checksum equivalent to the return value of [`checksum()`][File::checksum()] if there + /// is no mismatch. + /// + /// Note that if no `progress` is desired, one can pass [`gix_features::progress::Discard`]. + /// + /// Have a look at [`index::File::verify_integrity(…)`][crate::index::File::verify_integrity()] for an + /// even more thorough integrity check. + pub fn verify_checksum( + &self, + progress: impl Progress, + should_interrupt: &AtomicBool, + ) -> Result<gix_hash::ObjectId, checksum::Error> { + crate::verify::checksum_on_disk_or_mmap( + self.path(), + &self.data, + self.checksum(), + self.object_hash, + progress, + should_interrupt, + ) + } +} diff --git a/vendor/gix-pack/src/data/header.rs b/vendor/gix-pack/src/data/header.rs new file mode 100644 index 000000000..348a4ca24 --- /dev/null +++ b/vendor/gix-pack/src/data/header.rs @@ -0,0 +1,55 @@ +use crate::data; + +pub(crate) const N32_SIZE: usize = std::mem::size_of::<u32>(); + +/// Parses the first 12 bytes of a pack file, returning the pack version as well as the number of objects contained in the pack. 
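A sketch of the 12-byte pack data header round-tripping through the `encode()` and `decode()` functions defined just below: the magic `b"PACK"`, a big-endian version, then a big-endian object count.

fn demo_pack_header() {
    let bytes = gix_pack::data::header::encode(gix_pack::data::Version::V2, 42);
    assert_eq!(&bytes[..4], b"PACK");

    let (version, num_objects) = gix_pack::data::header::decode(&bytes).expect("a just-encoded header is valid");
    assert!(matches!(version, gix_pack::data::Version::V2));
    assert_eq!(num_objects, 42);
}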
+pub fn decode(data: &[u8; 12]) -> Result<(data::Version, u32), decode::Error> { + let mut ofs = 0; + if &data[ofs..ofs + b"PACK".len()] != b"PACK" { + return Err(decode::Error::Corrupt("Pack data type not recognized".into())); + } + ofs += N32_SIZE; + let kind = match crate::read_u32(&data[ofs..ofs + N32_SIZE]) { + 2 => data::Version::V2, + 3 => data::Version::V3, + v => return Err(decode::Error::UnsupportedVersion(v)), + }; + ofs += N32_SIZE; + let num_objects = crate::read_u32(&data[ofs..ofs + N32_SIZE]); + + Ok((kind, num_objects)) +} + +/// Write a pack data header at `version` with `num_objects` and return a buffer. +pub fn encode(version: data::Version, num_objects: u32) -> [u8; 12] { + use crate::data::Version::*; + let mut buf = [0u8; 12]; + buf[..4].copy_from_slice(b"PACK"); + buf[4..8].copy_from_slice( + &match version { + V2 => 2u32, + V3 => 3, + } + .to_be_bytes()[..], + ); + buf[8..].copy_from_slice(&num_objects.to_be_bytes()[..]); + buf +} + +/// +pub mod decode { + /// Returned by [`decode()`][super::decode()]. + #[derive(thiserror::Error, Debug)] + #[allow(missing_docs)] + pub enum Error { + #[error("Could not open pack file at '{path}'")] + Io { + source: std::io::Error, + path: std::path::PathBuf, + }, + #[error("{0}")] + Corrupt(String), + #[error("Unsupported pack version: {0}")] + UnsupportedVersion(u32), + } +} diff --git a/vendor/gix-pack/src/data/input/bytes_to_entries.rs b/vendor/gix-pack/src/data/input/bytes_to_entries.rs new file mode 100644 index 000000000..cf20d5fbf --- /dev/null +++ b/vendor/gix-pack/src/data/input/bytes_to_entries.rs @@ -0,0 +1,295 @@ +use std::{fs, io}; + +use gix_features::{ + hash, + hash::Sha1, + zlib::{stream::inflate::ReadBoxed, Decompress}, +}; +use gix_hash::ObjectId; + +use crate::data::input; + +/// An iterator over [`Entries`][input::Entry] in a byte stream. +/// +/// The iterator used as part of [Bundle::write_to_directory(…)][crate::Bundle::write_to_directory()]. +pub struct BytesToEntriesIter<BR> { + read: BR, + decompressor: Option<Box<Decompress>>, + offset: u64, + had_error: bool, + version: crate::data::Version, + objects_left: u32, + hash: Option<Sha1>, + mode: input::Mode, + compressed: input::EntryDataMode, + compressed_buf: Option<Vec<u8>>, + hash_len: usize, + object_hash: gix_hash::Kind, +} + +/// Access +impl<BR> BytesToEntriesIter<BR> { + /// The pack version currently being iterated + pub fn version(&self) -> crate::data::Version { + self.version + } + + /// The kind of iteration + pub fn mode(&self) -> input::Mode { + self.mode + } +} + +/// Initialization +impl<BR> BytesToEntriesIter<BR> +where + BR: io::BufRead, +{ + /// Obtain an iterator from a `read` stream to a pack data file and configure it using `mode` and `compressed`. + /// `object_hash` specifies which hash is used for objects in ref-delta entries. + /// + /// Note that `read` is expected at the beginning of a valid pack data file with a header, entries and a trailer. 
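A sketch of driving this iterator through `File::streaming_iter()` (defined further below in this file); the pack path is made up for illustration.

fn list_entries() -> Result<(), Box<dyn std::error::Error>> {
    let pack = gix_pack::data::File::at("objects/pack/pack-1234.pack", gix_hash::Kind::Sha1)?;
    for entry in pack.streaming_iter()? {
        let entry = entry?;
        println!(
            "{:?} at offset {} ({} compressed bytes)",
            entry.header, entry.pack_offset, entry.compressed_size
        );
    }
    Ok(())
}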
+ pub fn new_from_header( + mut read: BR, + mode: input::Mode, + compressed: input::EntryDataMode, + object_hash: gix_hash::Kind, + ) -> Result<BytesToEntriesIter<BR>, input::Error> { + let mut header_data = [0u8; 12]; + read.read_exact(&mut header_data)?; + + let (version, num_objects) = crate::data::header::decode(&header_data)?; + assert_eq!( + version, + crate::data::Version::V2, + "let's stop here if we see undocumented pack formats" + ); + Ok(BytesToEntriesIter { + read, + decompressor: None, + compressed, + offset: 12, + had_error: false, + version, + objects_left: num_objects, + hash: (mode != input::Mode::AsIs).then(|| { + let mut hash = gix_features::hash::hasher(object_hash); + hash.update(&header_data); + hash + }), + mode, + compressed_buf: None, + hash_len: object_hash.len_in_bytes(), + object_hash, + }) + } + + fn next_inner(&mut self) -> Result<input::Entry, input::Error> { + self.objects_left -= 1; // even an error counts as objects + + // Read header + let entry = match self.hash.take() { + Some(hash) => { + let mut read = read_and_pass_to( + &mut self.read, + hash::Write { + inner: io::sink(), + hash, + }, + ); + let res = crate::data::Entry::from_read(&mut read, self.offset, self.hash_len); + self.hash = Some(read.write.hash); + res + } + None => crate::data::Entry::from_read(&mut self.read, self.offset, self.hash_len), + } + .map_err(input::Error::from)?; + + // Decompress object to learn its compressed bytes + let mut decompressor = self + .decompressor + .take() + .unwrap_or_else(|| Box::new(Decompress::new(true))); + let compressed_buf = self.compressed_buf.take().unwrap_or_else(|| Vec::with_capacity(4096)); + decompressor.reset(true); + let mut decompressed_reader = ReadBoxed { + inner: read_and_pass_to( + &mut self.read, + if self.compressed.keep() { + Vec::with_capacity(entry.decompressed_size as usize) + } else { + compressed_buf + }, + ), + decompressor, + }; + + let bytes_copied = io::copy(&mut decompressed_reader, &mut io::sink())?; + if bytes_copied != entry.decompressed_size { + return Err(input::Error::IncompletePack { + actual: bytes_copied, + expected: entry.decompressed_size, + }); + } + + let pack_offset = self.offset; + let compressed_size = decompressed_reader.decompressor.total_in(); + self.offset += entry.header_size() as u64 + compressed_size; + self.decompressor = Some(decompressed_reader.decompressor); + + let mut compressed = decompressed_reader.inner.write; + debug_assert_eq!( + compressed_size, + compressed.len() as u64, + "we must track exactly the same amount of bytes as read by the decompressor" + ); + if let Some(hash) = self.hash.as_mut() { + hash.update(&compressed); + } + + let crc32 = if self.compressed.crc32() { + let mut header_buf = [0u8; 12 + gix_hash::Kind::longest().len_in_bytes()]; + let header_len = entry.header.write_to(bytes_copied, header_buf.as_mut())?; + let state = gix_features::hash::crc32_update(0, &header_buf[..header_len]); + Some(gix_features::hash::crc32_update(state, &compressed)) + } else { + None + }; + + let compressed = if self.compressed.keep() { + Some(compressed) + } else { + compressed.clear(); + self.compressed_buf = Some(compressed); + None + }; + + // Last objects gets trailer (which is potentially verified) + let trailer = self.try_read_trailer()?; + Ok(input::Entry { + header: entry.header, + header_size: entry.header_size() as u16, + compressed, + compressed_size, + crc32, + pack_offset, + decompressed_size: bytes_copied, + trailer, + }) + } + + fn try_read_trailer(&mut self) -> 
Result<Option<ObjectId>, input::Error> { + Ok(if self.objects_left == 0 { + let mut id = gix_hash::ObjectId::null(self.object_hash); + if let Err(err) = self.read.read_exact(id.as_mut_slice()) { + if self.mode != input::Mode::Restore { + return Err(err.into()); + } + } + + if let Some(hash) = self.hash.take() { + let actual_id = gix_hash::ObjectId::from(hash.digest()); + if self.mode == input::Mode::Restore { + id = actual_id; + } + if id != actual_id { + return Err(input::Error::ChecksumMismatch { + actual: actual_id, + expected: id, + }); + } + } + Some(id) + } else if self.mode == input::Mode::Restore { + let hash = self.hash.clone().expect("in restore mode a hash is set"); + Some(gix_hash::ObjectId::from(hash.digest())) + } else { + None + }) + } +} + +fn read_and_pass_to<R: io::Read, W: io::Write>(read: &mut R, to: W) -> PassThrough<&mut R, W> { + PassThrough { read, write: to } +} + +impl<R> Iterator for BytesToEntriesIter<R> +where + R: io::BufRead, +{ + type Item = Result<input::Entry, input::Error>; + + fn next(&mut self) -> Option<Self::Item> { + if self.had_error || self.objects_left == 0 { + return None; + } + let result = self.next_inner(); + self.had_error = result.is_err(); + if self.had_error { + self.objects_left = 0; + } + if self.mode == input::Mode::Restore && self.had_error { + None + } else { + Some(result) + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + (self.objects_left as usize, Some(self.objects_left as usize)) + } +} + +impl<R> std::iter::ExactSizeIterator for BytesToEntriesIter<R> where R: io::BufRead {} + +struct PassThrough<R, W> { + read: R, + write: W, +} + +impl<R, W> io::BufRead for PassThrough<R, W> +where + Self: io::Read, + R: io::BufRead, + W: io::Write, +{ + fn fill_buf(&mut self) -> io::Result<&[u8]> { + self.read.fill_buf() + } + + fn consume(&mut self, amt: usize) { + let buf = self + .read + .fill_buf() + .expect("never fail as we called fill-buf before and this does nothing"); + self.write + .write_all(&buf[..amt]) + .expect("a write to never fail - should be a memory buffer"); + self.read.consume(amt) + } +} + +impl<R, W> io::Read for PassThrough<R, W> +where + W: io::Write, + R: io::Read, +{ + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + let bytes_read = self.read.read(buf)?; + self.write.write_all(&buf[..bytes_read])?; + Ok(bytes_read) + } +} + +impl crate::data::File { + /// Returns an iterator over [`Entries`][crate::data::input::Entry], without making use of the memory mapping. + pub fn streaming_iter(&self) -> Result<BytesToEntriesIter<impl io::BufRead>, input::Error> { + let reader = io::BufReader::with_capacity(4096 * 8, fs::File::open(&self.path)?); + BytesToEntriesIter::new_from_header( + reader, + input::Mode::Verify, + input::EntryDataMode::KeepAndCrc32, + self.object_hash, + ) + } +} diff --git a/vendor/gix-pack/src/data/input/entries_to_bytes.rs b/vendor/gix-pack/src/data/input/entries_to_bytes.rs new file mode 100644 index 000000000..a8c21e653 --- /dev/null +++ b/vendor/gix-pack/src/data/input/entries_to_bytes.rs @@ -0,0 +1,155 @@ +use std::iter::Peekable; + +use gix_features::hash; + +use crate::data::input; + +/// An implementation of [`Iterator`] to write [encoded entries][input::Entry] to an inner implementation each time +/// `next()` is called. +/// +/// It is able to deal with an unknown amount of objects as it will rewrite the pack header once the entries iterator +/// is depleted and compute the hash in one go by re-reading the whole file. 
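A sketch of chaining the reader above into the `EntriesToBytesIter` defined just below to re-serialize a pack: `streaming_iter()` keeps the compressed bytes, which this writer requires. The file names are illustrative and the usual re-exports under `gix_pack::data::input` are assumed.

fn copy_pack() -> Result<(), Box<dyn std::error::Error>> {
    let pack = gix_pack::data::File::at("in.pack", gix_hash::Kind::Sha1)?;
    let out = std::fs::OpenOptions::new()
        .create(true)
        .read(true)
        .write(true)
        .open("out.pack")?;

    let mut rewrite = gix_pack::data::input::EntriesToBytesIter::new(
        pack.streaming_iter()?,
        out,
        gix_pack::data::Version::V2,
        gix_hash::Kind::Sha1,
    );
    for entry in &mut rewrite {
        entry?; // driving the iterator performs the actual writes
    }
    println!("pack checksum: {:?}", rewrite.digest());
    Ok(())
}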
+pub struct EntriesToBytesIter<I: Iterator, W> { + /// An iterator for input [`input::Entry`] instances + pub input: Peekable<I>, + /// A way of writing encoded bytes. + output: W, + /// Our trailing hash when done writing all input entries + trailer: Option<gix_hash::ObjectId>, + /// The amount of objects in the iteration and the version of the packfile to be written. + /// Will be `None` to signal the header was written already. + data_version: crate::data::Version, + /// The amount of entries seen so far + num_entries: u32, + /// If we are done, no additional writes will occur + is_done: bool, + /// The kind of hash to use for the digest + object_hash: gix_hash::Kind, +} + +impl<I, W> EntriesToBytesIter<I, W> +where + I: Iterator<Item = Result<input::Entry, input::Error>>, + W: std::io::Read + std::io::Write + std::io::Seek, +{ + /// Create a new instance reading [entries][input::Entry] from an `input` iterator and write pack data bytes to + /// `output` writer, resembling a pack of `version`. The amount of entries will be dynamically determined and + /// the pack is completed once the last entry was written. + /// `object_hash` is the kind of hash to use for the pack checksum and maybe other places, depending on the version. + /// + /// # Panics + /// + /// Not all combinations of `object_hash` and `version` are supported currently triggering assertion errors. + pub fn new(input: I, output: W, version: crate::data::Version, object_hash: gix_hash::Kind) -> Self { + assert!( + matches!(version, crate::data::Version::V2), + "currently only pack version 2 can be written", + ); + assert!( + matches!(object_hash, gix_hash::Kind::Sha1), + "currently only Sha1 is supported, right now we don't know how other hashes are encoded", + ); + EntriesToBytesIter { + input: input.peekable(), + output, + object_hash, + num_entries: 0, + trailer: None, + data_version: version, + is_done: false, + } + } + + /// Returns the trailing hash over all ~ entries once done. + /// It's `None` if we are not yet done writing. + pub fn digest(&self) -> Option<gix_hash::ObjectId> { + self.trailer + } + + fn next_inner(&mut self, entry: input::Entry) -> Result<input::Entry, input::Error> { + if self.num_entries == 0 { + let header_bytes = crate::data::header::encode(self.data_version, 0); + self.output.write_all(&header_bytes[..])?; + } + self.num_entries += 1; + entry.header.write_to(entry.decompressed_size, &mut self.output)?; + std::io::copy( + &mut entry + .compressed + .as_deref() + .expect("caller must configure generator to keep compressed bytes"), + &mut self.output, + )?; + Ok(entry) + } + + fn write_header_and_digest(&mut self, last_entry: Option<&mut input::Entry>) -> Result<(), input::Error> { + let header_bytes = crate::data::header::encode(self.data_version, self.num_entries); + let num_bytes_written = if last_entry.is_some() { + self.output.stream_position()? 
+ } else { + header_bytes.len() as u64 + }; + self.output.rewind()?; + self.output.write_all(&header_bytes[..])?; + self.output.flush()?; + + self.output.rewind()?; + let interrupt_never = std::sync::atomic::AtomicBool::new(false); + let digest = hash::bytes( + &mut self.output, + num_bytes_written as usize, + self.object_hash, + &mut gix_features::progress::Discard, + &interrupt_never, + )?; + self.output.write_all(digest.as_slice())?; + self.output.flush()?; + + self.is_done = true; + if let Some(last_entry) = last_entry { + last_entry.trailer = Some(digest); + } + self.trailer = Some(digest); + Ok(()) + } +} + +impl<I, W> Iterator for EntriesToBytesIter<I, W> +where + I: Iterator<Item = Result<input::Entry, input::Error>>, + W: std::io::Read + std::io::Write + std::io::Seek, +{ + /// The amount of bytes written to `out` if `Ok` or the error `E` received from the input. + type Item = Result<input::Entry, input::Error>; + + fn next(&mut self) -> Option<Self::Item> { + if self.is_done { + return None; + } + + match self.input.next() { + Some(res) => Some(match res { + Ok(entry) => self.next_inner(entry).and_then(|mut entry| { + if self.input.peek().is_none() { + self.write_header_and_digest(Some(&mut entry)).map(|_| entry) + } else { + Ok(entry) + } + }), + Err(err) => { + self.is_done = true; + Err(err) + } + }), + None => match self.write_header_and_digest(None) { + Ok(_) => None, + Err(err) => Some(Err(err)), + }, + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.input.size_hint() + } +} diff --git a/vendor/gix-pack/src/data/input/entry.rs b/vendor/gix-pack/src/data/input/entry.rs new file mode 100644 index 000000000..74d4800a0 --- /dev/null +++ b/vendor/gix-pack/src/data/input/entry.rs @@ -0,0 +1,65 @@ +use std::io::Write; + +use crate::data::{entry::Header, input}; + +impl input::Entry { + /// Create a new input entry from a given data `obj` set to be placed at the given `pack_offset`. + /// + /// This method is useful when arbitrary base entries are created + pub fn from_data_obj(obj: &gix_object::Data<'_>, pack_offset: u64) -> Result<Self, input::Error> { + let header = to_header(obj.kind); + let compressed = compress_data(obj)?; + let compressed_size = compressed.len() as u64; + let mut entry = input::Entry { + header, + header_size: header.size(obj.data.len() as u64) as u16, + pack_offset, + compressed: Some(compressed), + compressed_size, + crc32: None, + decompressed_size: obj.data.len() as u64, + trailer: None, + }; + entry.crc32 = Some(entry.compute_crc32()); + Ok(entry) + } + /// The amount of bytes this entry may consume in a pack data file + pub fn bytes_in_pack(&self) -> u64 { + self.header_size as u64 + self.compressed_size + } + + /// Update our CRC value by recalculating it from our header and compressed data. 
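A sketch of `from_data_obj()` above as it would be used when resolving thin packs: an object fetched from the object database is wrapped into a synthetic input entry. The field layout of `gix_object::Data` is assumed here, and the helper itself is hypothetical.

fn synthetic_blob_entry(
    bytes: &[u8],
    pack_offset: u64,
) -> Result<gix_pack::data::input::Entry, gix_pack::data::input::Error> {
    let obj = gix_object::Data {
        kind: gix_object::Kind::Blob,
        data: bytes,
    };
    gix_pack::data::input::Entry::from_data_obj(&obj, pack_offset)
}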
+ pub fn compute_crc32(&self) -> u32 { + let mut header_buf = [0u8; 12 + gix_hash::Kind::longest().len_in_bytes()]; + let header_len = self + .header + .write_to(self.decompressed_size, header_buf.as_mut()) + .expect("write to memory will not fail"); + let state = gix_features::hash::crc32_update(0, &header_buf[..header_len]); + gix_features::hash::crc32_update(state, self.compressed.as_ref().expect("we always set it")) + } +} + +fn to_header(kind: gix_object::Kind) -> Header { + use gix_object::Kind::*; + match kind { + Tree => Header::Tree, + Blob => Header::Blob, + Commit => Header::Commit, + Tag => Header::Tag, + } +} + +fn compress_data(obj: &gix_object::Data<'_>) -> Result<Vec<u8>, input::Error> { + let mut out = gix_features::zlib::stream::deflate::Write::new(Vec::new()); + if let Err(err) = std::io::copy(&mut &*obj.data, &mut out) { + match err.kind() { + std::io::ErrorKind::Other => return Err(input::Error::Io(err)), + err => { + unreachable!("Should never see other errors than zlib, but got {:?}", err,) + } + } + }; + out.flush().expect("zlib flush should never fail"); + Ok(out.into_inner()) +} diff --git a/vendor/gix-pack/src/data/input/lookup_ref_delta_objects.rs b/vendor/gix-pack/src/data/input/lookup_ref_delta_objects.rs new file mode 100644 index 000000000..f52c645f8 --- /dev/null +++ b/vendor/gix-pack/src/data/input/lookup_ref_delta_objects.rs @@ -0,0 +1,211 @@ +use std::convert::TryInto; + +use gix_hash::ObjectId; + +use crate::data::{entry::Header, input}; + +/// An iterator to resolve thin packs on the fly. +pub struct LookupRefDeltaObjectsIter<I, LFn> { + /// The inner iterator whose entries we will resolve. + pub inner: I, + lookup: LFn, + /// The cached delta to provide next time we are called, it's the delta to go with the base we just resolved in its place. + next_delta: Option<input::Entry>, + /// Fuse to stop iteration after first missing object. + error: bool, + /// The overall pack-offset we accumulated thus far. Each inserted entry offsets all following + /// objects by its length. We need to determine exactly where the object was inserted to see if its affected at all. + inserted_entry_length_at_offset: Vec<Change>, + /// The sum of all entries added so far, as a cache to avoid recomputation + inserted_entries_length_in_bytes: i64, + buf: Vec<u8>, +} + +impl<I, LFn> LookupRefDeltaObjectsIter<I, LFn> +where + I: Iterator<Item = Result<input::Entry, input::Error>>, + LFn: for<'a> FnMut(ObjectId, &'a mut Vec<u8>) -> Option<gix_object::Data<'a>>, +{ + /// Create a new instance wrapping `iter` and using `lookup` as function to retrieve objects that will serve as bases + /// for ref deltas seen while traversing `iter`. + pub fn new(iter: I, lookup: LFn) -> Self { + LookupRefDeltaObjectsIter { + inner: iter, + lookup, + error: false, + inserted_entry_length_at_offset: Vec::new(), + inserted_entries_length_in_bytes: 0, + next_delta: None, + buf: Vec::new(), + } + } + + fn shifted_pack_offset(&self, pack_offset: u64) -> u64 { + let new_ofs = pack_offset as i64 + self.inserted_entries_length_in_bytes; + new_ofs.try_into().expect("offset value is never becomes negative") + } + + /// positive `size_change` values mean an object grew or was more commonly, was inserted. Negative values + /// mean the object shrunk, usually because there header changed from ref-deltas to ofs deltas. 
+ fn track_change( + &mut self, + shifted_pack_offset: u64, + pack_offset: u64, + size_change: i64, + oid: impl Into<Option<ObjectId>>, + ) { + if size_change == 0 { + return; + } + self.inserted_entry_length_at_offset.push(Change { + shifted_pack_offset, + pack_offset, + size_change_in_bytes: size_change, + oid: oid.into().unwrap_or_else(|| + // NOTE: this value acts as sentinel and the actual hash kind doesn't matter. + gix_hash::Kind::Sha1.null()), + }); + self.inserted_entries_length_in_bytes += size_change; + } + + fn shift_entry_and_point_to_base_by_offset(&mut self, entry: &mut input::Entry, base_distance: u64) { + let pack_offset = entry.pack_offset; + entry.pack_offset = self.shifted_pack_offset(pack_offset); + entry.header = Header::OfsDelta { base_distance }; + let previous_header_size = entry.header_size; + entry.header_size = entry.header.size(entry.decompressed_size) as u16; + + let change = entry.header_size as i64 - previous_header_size as i64; + entry.crc32 = Some(entry.compute_crc32()); + self.track_change(entry.pack_offset, pack_offset, change, None); + } +} + +impl<I, LFn> Iterator for LookupRefDeltaObjectsIter<I, LFn> +where + I: Iterator<Item = Result<input::Entry, input::Error>>, + LFn: for<'a> FnMut(ObjectId, &'a mut Vec<u8>) -> Option<gix_object::Data<'a>>, +{ + type Item = Result<input::Entry, input::Error>; + + fn next(&mut self) -> Option<Self::Item> { + if self.error { + return None; + } + if let Some(delta) = self.next_delta.take() { + return Some(Ok(delta)); + } + match self.inner.next() { + Some(Ok(mut entry)) => match entry.header { + Header::RefDelta { base_id } => { + match self.inserted_entry_length_at_offset.iter().rfind(|e| e.oid == base_id) { + None => { + let base_entry = match (self.lookup)(base_id, &mut self.buf) { + Some(obj) => { + let current_pack_offset = entry.pack_offset; + let mut entry = match input::Entry::from_data_obj(&obj, 0) { + Ok(e) => e, + Err(err) => return Some(Err(err)), + }; + entry.pack_offset = self.shifted_pack_offset(current_pack_offset); + self.track_change( + entry.pack_offset, + current_pack_offset, + entry.bytes_in_pack() as i64, + base_id, + ); + entry + } + None => { + self.error = true; + return Some(Err(input::Error::NotFound { object_id: base_id })); + } + }; + + { + self.shift_entry_and_point_to_base_by_offset(&mut entry, base_entry.bytes_in_pack()); + self.next_delta = Some(entry); + } + Some(Ok(base_entry)) + } + Some(base_entry) => { + let base_distance = + self.shifted_pack_offset(entry.pack_offset) - base_entry.shifted_pack_offset; + self.shift_entry_and_point_to_base_by_offset(&mut entry, base_distance); + Some(Ok(entry)) + } + } + } + _ => { + if self.inserted_entries_length_in_bytes != 0 { + if let Header::OfsDelta { base_distance } = entry.header { + // We have to find the new distance based on the previous distance to the base, using the absolute + // pack offset computed from it as stored in `base_pack_offset`. 
+ let base_pack_offset = entry + .pack_offset + .checked_sub(base_distance) + .expect("distance to be in range of pack"); + match self + .inserted_entry_length_at_offset + .binary_search_by_key(&base_pack_offset, |c| c.pack_offset) + { + Ok(index) => { + let index = { + let maybe_index_of_actual_entry = index + 1; + self.inserted_entry_length_at_offset + .get(maybe_index_of_actual_entry) + .and_then(|c| { + (c.pack_offset == base_pack_offset) + .then_some(maybe_index_of_actual_entry) + }) + .unwrap_or(index) + }; + let new_distance = self + .shifted_pack_offset(entry.pack_offset) + .checked_sub(self.inserted_entry_length_at_offset[index].shifted_pack_offset) + .expect("a base that is behind us in the pack"); + self.shift_entry_and_point_to_base_by_offset(&mut entry, new_distance); + } + Err(index) => { + let change_since_offset = self.inserted_entry_length_at_offset[index..] + .iter() + .map(|c| c.size_change_in_bytes) + .sum::<i64>(); + let new_distance: u64 = { + (base_distance as i64 + change_since_offset) + .try_into() + .expect("it still points behind us") + }; + self.shift_entry_and_point_to_base_by_offset(&mut entry, new_distance); + } + } + } else { + // Offset this entry by all changes (positive or negative) that we saw thus far. + entry.pack_offset = self.shifted_pack_offset(entry.pack_offset); + } + } + Some(Ok(entry)) + } + }, + other => other, + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let (min, max) = self.inner.size_hint(); + max.map(|max| (min, Some(max * 2))).unwrap_or_else(|| (min * 2, None)) + } +} + +#[derive(Debug)] +struct Change { + /// The original pack offset as mentioned in the entry we saw. This is used to find this as base object if deltas refer to it by + /// old offset. + pack_offset: u64, + /// The new pack offset that is the shifted location of the pack entry in the pack. + shifted_pack_offset: u64, + /// The size change of the entry header, negative values denote shrinking, positive denote growing. + size_change_in_bytes: i64, + /// The object id of the entry responsible for the change, or null if it's an entry just for tracking an insertion. + oid: ObjectId, +} diff --git a/vendor/gix-pack/src/data/input/mod.rs b/vendor/gix-pack/src/data/input/mod.rs new file mode 100644 index 000000000..df191de67 --- /dev/null +++ b/vendor/gix-pack/src/data/input/mod.rs @@ -0,0 +1,41 @@ +/// An item of the iteration produced by [`BytesToEntriesIter`] +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Entry { + /// The header of a pack entry + pub header: crate::data::entry::Header, + /// The amount of bytes used to encode the `header`. `pack_offset + header_size` is the beginning of + /// the compressed data in the pack. + pub header_size: u16, + /// The first byte of the entry at which the `header` can be read. + pub pack_offset: u64, + /// The bytes consumed while producing `decompressed` + /// These do not contain the header, which makes it possible to easily replace a RefDelta with offset deltas + /// when resolving thin packs. + /// Depends on `CompressionMode` when the iterator is initialized. + pub compressed: Option<Vec<u8>>, + /// The amount of bytes the compressed portion of the entry takes, i.e. the portion behind behind the header. + pub compressed_size: u64, + /// The CRC32 over the complete entry, that is encoded header and compressed object data. 
+ /// Depends on `CompressionMode` when the iterator is initialized
+ pub crc32: Option<u32>,
+ /// The amount of decompressed bytes of the entry.
+ pub decompressed_size: u64,
+ /// Set for the last object in the iteration, providing the hash over all bytes of the iteration
+ /// for use as trailer in a pack or to verify it matches the trailer.
+ pub trailer: Option<gix_hash::ObjectId>,
+}
+
+mod entry;
+
+mod types;
+pub use types::{EntryDataMode, Error, Mode};
+
+mod bytes_to_entries;
+pub use bytes_to_entries::BytesToEntriesIter;
+
+mod lookup_ref_delta_objects;
+pub use lookup_ref_delta_objects::LookupRefDeltaObjectsIter;
+
+mod entries_to_bytes;
+pub use entries_to_bytes::EntriesToBytesIter; diff --git a/vendor/gix-pack/src/data/input/types.rs b/vendor/gix-pack/src/data/input/types.rs new file mode 100644 index 000000000..6fcd459e2 --- /dev/null +++ b/vendor/gix-pack/src/data/input/types.rs @@ -0,0 +1,73 @@ +use std::io;
+
+/// Returned by [`BytesToEntriesIter::new_from_header()`][crate::data::input::BytesToEntriesIter::new_from_header()] and as part
+/// of `Item` of [`BytesToEntriesIter`][crate::data::input::BytesToEntriesIter].
+#[derive(thiserror::Error, Debug)]
+#[allow(missing_docs)]
+pub enum Error {
+ #[error("An IO operation failed while streaming an entry")]
+ Io(#[from] io::Error),
+ #[error(transparent)]
+ PackParse(#[from] crate::data::header::decode::Error),
+ #[error("pack checksum in trailer was {expected}, but actual checksum was {actual}")]
+ ChecksumMismatch {
+ expected: gix_hash::ObjectId,
+ actual: gix_hash::ObjectId,
+ },
+ #[error("pack is incomplete: it was decompressed into {actual} bytes but {expected} bytes were expected.")]
+ IncompletePack { actual: u64, expected: u64 },
+ #[error("The object {object_id} could not be decoded or wasn't found")]
+ NotFound { object_id: gix_hash::ObjectId },
+}
+
+/// Iteration Mode
+#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)]
+#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
+pub enum Mode {
+ /// Provide the trailer as read from the pack
+ AsIs,
+ /// Generate our own hash and trigger an error on the last iterated object
+ /// if it does not match the hash provided with the pack.
+ ///
+ /// This way the one iterating the data cannot miss corruption as long as
+ /// the iteration is continued through to the end.
+ Verify,
+ /// Generate our own hash and, if there was an error or the objects were depleted early
+ /// due to a partial pack, return the last valid entry with our own hash thus far.
+ /// Note that the existing pack hash, if present, will be ignored.
+ /// As we won't know which object fails, every object will carry the hash obtained thus far.
+ /// This also means that algorithms must know about this possibility, or else they might wrongfully
+ /// assume the pack is finished.
+ Restore, +} + +/// Define what to do with the compressed bytes portion of a pack [`Entry`][super::Entry] +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum EntryDataMode { + /// Do nothing with the compressed bytes we read + Ignore, + /// Only create a CRC32 of the entry, otherwise similar to `Ignore` + Crc32, + /// Keep them and pass them along in a newly allocated buffer + Keep, + /// As above, but also compute a CRC32 + KeepAndCrc32, +} + +impl EntryDataMode { + /// Returns true if a crc32 should be computed + pub fn crc32(&self) -> bool { + match self { + EntryDataMode::KeepAndCrc32 | EntryDataMode::Crc32 => true, + EntryDataMode::Keep | EntryDataMode::Ignore => false, + } + } + /// Returns true if compressed bytes should be kept + pub fn keep(&self) -> bool { + match self { + EntryDataMode::Keep | EntryDataMode::KeepAndCrc32 => true, + EntryDataMode::Ignore | EntryDataMode::Crc32 => false, + } + } +} diff --git a/vendor/gix-pack/src/data/mod.rs b/vendor/gix-pack/src/data/mod.rs new file mode 100644 index 000000000..da717fc1a --- /dev/null +++ b/vendor/gix-pack/src/data/mod.rs @@ -0,0 +1,134 @@ +//! a pack data file +use std::{convert::TryInto, path::Path}; + +/// The offset to an entry into the pack data file, relative to its beginning. +pub type Offset = u64; + +/// An identifier to uniquely identify all packs loaded within a known context or namespace. +pub type Id = u32; + +use memmap2::Mmap; + +/// An representing an full- or delta-object within a pack +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Entry { + /// The entry's header + pub header: entry::Header, + /// The decompressed size of the entry in bytes. + /// + /// Note that for non-delta entries this will be the size of the object itself. + pub decompressed_size: u64, + /// absolute offset to compressed object data in the pack, just behind the entry's header + pub data_offset: Offset, +} + +mod file; +pub use file::{decode, verify, Header}; +/// +pub mod header; + +/// +pub mod init { + pub use super::header::decode::Error; +} + +/// +pub mod entry; + +/// +pub mod input; + +/// Utilities to encode pack data entries and write them to a `Write` implementation to resemble a pack data file. +pub mod output; + +/// A slice into a pack file denoting a pack entry. +/// +/// An entry can be decoded into an object. +pub type EntryRange = std::ops::Range<Offset>; + +/// Supported versions of a pack data file +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +#[allow(missing_docs)] +pub enum Version { + V2, + V3, +} + +impl Default for Version { + fn default() -> Self { + Version::V2 + } +} + +/// A pack data file +pub struct File { + data: Mmap, + path: std::path::PathBuf, + /// A value to represent this pack uniquely when used with cache lookup, or a way to identify this pack by its location on disk. + /// The same location on disk should yield the same id. + /// + /// These must be unique per pack and must be stable, that is they don't change if the pack doesn't change. + /// If the same id is assigned (or reassigned) to different packs, pack creation or cache access will fail in hard-to-debug ways. 
+ /// + /// This value is controlled by the owning object store, which can use it in whichever way it wants as long as the above constraints are met. + pub id: Id, + version: Version, + num_objects: u32, + /// The size of the hash contained within. This is entirely determined by the caller, and repositories have to know which hash to use + /// based on their configuration. + hash_len: usize, + object_hash: gix_hash::Kind, +} + +/// Information about the pack data file itself +impl File { + /// The pack data version of this file + pub fn version(&self) -> Version { + self.version + } + /// The number of objects stored in this pack data file + pub fn num_objects(&self) -> u32 { + self.num_objects + } + /// The length of all mapped data, including the pack header and the pack trailer + pub fn data_len(&self) -> usize { + self.data.len() + } + /// The kind of hash we use internally. + pub fn object_hash(&self) -> gix_hash::Kind { + self.object_hash + } + /// The position of the byte one past the last pack entry, or in other terms, the first byte of the trailing hash. + pub fn pack_end(&self) -> usize { + self.data.len() - self.hash_len + } + + /// The path to the pack data file on disk + pub fn path(&self) -> &Path { + &self.path + } + + /// Returns the pack data at the given slice if its range is contained in the mapped pack data + pub fn entry_slice(&self, slice: EntryRange) -> Option<&[u8]> { + let entry_end: usize = slice.end.try_into().expect("end of pack fits into usize"); + let entry_start = slice.start as usize; + self.data.get(entry_start..entry_end) + } + + /// Returns the CRC32 of the pack data indicated by `pack_offset` and the `size` of the mapped data. + /// + /// _Note:_ finding the right size is only possible by decompressing + /// the pack entry beforehand, or by using the (to be sorted) offsets stored in an index file. + /// + /// # Panics + /// + /// If `pack_offset` or `size` are pointing to a range outside of the mapped pack data. + pub fn entry_crc32(&self, pack_offset: Offset, size: usize) -> u32 { + let pack_offset: usize = pack_offset.try_into().expect("pack_size fits into usize"); + gix_features::hash::crc32(&self.data[pack_offset..pack_offset + size]) + } +} + +pub(crate) mod delta; diff --git a/vendor/gix-pack/src/data/output/bytes.rs b/vendor/gix-pack/src/data/output/bytes.rs new file mode 100644 index 000000000..ec219db9d --- /dev/null +++ b/vendor/gix-pack/src/data/output/bytes.rs @@ -0,0 +1,156 @@ +use std::io::Write; + +use gix_features::hash; + +use crate::data::output; + +/// The error returned by `next()` in the [`FromEntriesIter`] iterator. +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error<E> +where + E: std::error::Error + 'static, +{ + #[error(transparent)] + Io(#[from] std::io::Error), + #[error(transparent)] + Input(E), +} + +/// An implementation of [`Iterator`] to write [encoded entries][output::Entry] to an inner implementation each time +/// `next()` is called. +pub struct FromEntriesIter<I, W> { + /// An iterator for input [`output::Entry`] instances + pub input: I, + /// A way of writing encoded bytes. + output: hash::Write<W>, + /// Our trailing hash when done writing all input entries + trailer: Option<gix_hash::ObjectId>, + /// The amount of objects in the iteration and the version of the packfile to be written. + /// Will be `None` to signal the header was written already. + header_info: Option<(crate::data::Version, u32)>, + /// The pack data version with which pack entries should be written. 
+ entry_version: crate::data::Version, + /// The amount of written bytes thus far + written: u64, + /// Required to quickly find offsets by object IDs, as future objects may refer to those in the past to become a delta offset base. + /// It stores the pack offsets at which objects begin. + /// Additionally we store if an object was invalid, and if so we will not write it nor will we allow delta objects to it. + pack_offsets_and_validity: Vec<(u64, bool)>, + /// If we are done, no additional writes will occur + is_done: bool, +} + +impl<I, W, E> FromEntriesIter<I, W> +where + I: Iterator<Item = Result<Vec<output::Entry>, E>>, + W: std::io::Write, + E: std::error::Error + 'static, +{ + /// Create a new instance reading [entries][output::Entry] from an `input` iterator and write pack data bytes to + /// `output` writer, resembling a pack of `version` with exactly `num_entries` amount of objects contained in it. + /// `object_hash` is the kind of hash to use for the pack checksum and maybe other places, depending on the version. + /// + /// The input chunks are expected to be sorted already. You can use the [InOrderIter][gix_features::parallel::InOrderIter] to assure + /// this happens on the fly holding entire chunks in memory as long as needed for them to be dispensed in order. + /// + /// # Panics + /// + /// Not all combinations of `object_hash` and `version` are supported currently triggering assertion errors. + pub fn new( + input: I, + output: W, + num_entries: u32, + version: crate::data::Version, + object_hash: gix_hash::Kind, + ) -> Self { + assert!( + matches!(version, crate::data::Version::V2), + "currently only pack version 2 can be written", + ); + FromEntriesIter { + input, + output: hash::Write::new(output, object_hash), + trailer: None, + entry_version: version, + pack_offsets_and_validity: Vec::with_capacity(num_entries as usize), + written: 0, + header_info: Some((version, num_entries)), + is_done: false, + } + } + + /// Consume this instance and return the `output` implementation. + /// + /// _Note_ that the `input` iterator can be moved out of this instance beforehand. + pub fn into_write(self) -> W { + self.output.inner + } + + /// Returns the trailing hash over all written entries once done. + /// It's `None` if we are not yet done writing. + pub fn digest(&self) -> Option<gix_hash::ObjectId> { + self.trailer + } + + fn next_inner(&mut self) -> Result<u64, Error<E>> { + let previous_written = self.written; + if let Some((version, num_entries)) = self.header_info.take() { + let header_bytes = crate::data::header::encode(version, num_entries); + self.output.write_all(&header_bytes[..])?; + self.written += header_bytes.len() as u64; + } + match self.input.next() { + Some(entries) => { + for entry in entries.map_err(Error::Input)? { + if entry.is_invalid() { + self.pack_offsets_and_validity.push((0, false)); + continue; + }; + self.pack_offsets_and_validity.push((self.written, true)); + let header = entry.to_entry_header(self.entry_version, |index| { + let (base_offset, is_valid_object) = self.pack_offsets_and_validity[index]; + if !is_valid_object { + unreachable!("if you see this the object database is correct as a delta refers to a non-existing object") + } + self.written - base_offset + }); + self.written += header.write_to(entry.decompressed_size as u64, &mut self.output)? 
as u64; + self.written += std::io::copy(&mut &*entry.compressed_data, &mut self.output)?; + } + } + None => { + let digest = self.output.hash.clone().digest(); + self.output.inner.write_all(&digest[..])?; + self.written += digest.len() as u64; + self.output.inner.flush()?; + self.is_done = true; + self.trailer = Some(gix_hash::ObjectId::from(digest)); + } + }; + Ok(self.written - previous_written) + } +} + +impl<I, W, E> Iterator for FromEntriesIter<I, W> +where + I: Iterator<Item = Result<Vec<output::Entry>, E>>, + W: std::io::Write, + E: std::error::Error + 'static, +{ + /// The amount of bytes written to `out` if `Ok` or the error `E` received from the input. + type Item = Result<u64, Error<E>>; + + fn next(&mut self) -> Option<Self::Item> { + if self.is_done { + return None; + } + Some(match self.next_inner() { + Err(err) => { + self.is_done = true; + Err(err) + } + Ok(written) => Ok(written), + }) + } +} diff --git a/vendor/gix-pack/src/data/output/count/mod.rs b/vendor/gix-pack/src/data/output/count/mod.rs new file mode 100644 index 000000000..e7ee767de --- /dev/null +++ b/vendor/gix-pack/src/data/output/count/mod.rs @@ -0,0 +1,49 @@ +use gix_hash::ObjectId; + +use crate::data::output::Count; + +/// Specifies how the pack location was handled during counting +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum PackLocation { + /// We did not lookup this object + NotLookedUp, + /// The object was looked up and there may be a location in a pack, along with entry information + LookedUp(Option<crate::data::entry::Location>), +} + +impl PackLocation { + /// Directly go through to LookedUp variant, panic otherwise + pub fn is_none(&self) -> bool { + match self { + PackLocation::LookedUp(opt) => opt.is_none(), + PackLocation::NotLookedUp => unreachable!("must have been resolved"), + } + } + /// Directly go through to LookedUp variant, panic otherwise + pub fn as_ref(&self) -> Option<&crate::data::entry::Location> { + match self { + PackLocation::LookedUp(opt) => opt.as_ref(), + PackLocation::NotLookedUp => unreachable!("must have been resolved"), + } + } +} + +impl Count { + /// Create a new instance from the given `oid` and its corresponding git `obj`ect data. + pub fn from_data(oid: impl Into<ObjectId>, location: Option<crate::data::entry::Location>) -> Self { + Count { + id: oid.into(), + entry_pack_location: PackLocation::LookedUp(location), + } + } +} + +#[path = "objects/mod.rs"] +mod objects_impl; +pub use objects_impl::{objects, objects_unthreaded}; + +/// +pub mod objects { + pub use super::objects_impl::{Error, ObjectExpansion, Options, Outcome, Result}; +} diff --git a/vendor/gix-pack/src/data/output/count/objects/mod.rs b/vendor/gix-pack/src/data/output/count/objects/mod.rs new file mode 100644 index 000000000..d56bc9a5f --- /dev/null +++ b/vendor/gix-pack/src/data/output/count/objects/mod.rs @@ -0,0 +1,405 @@ +use std::{ + cell::RefCell, + sync::{atomic::AtomicBool, Arc}, +}; + +use gix_features::{parallel, progress::Progress}; +use gix_hash::ObjectId; + +use crate::{data::output, find}; + +pub(in crate::data::output::count::objects_impl) mod reduce; +mod util; + +mod types; +pub use types::{Error, ObjectExpansion, Options, Outcome}; + +mod tree; + +/// The return type used by [`objects()`]. 
+pub type Result<E1, E2> = std::result::Result<(Vec<output::Count>, Outcome), Error<E1, E2>>; + +/// Generate [`Count`][output::Count]s from input `objects` with object expansion based on [`options`][Options] +/// to learn which objects would would constitute a pack. This step is required to know exactly how many objects would +/// be in a pack while keeping data around to avoid minimize object database access. +/// +/// A [`Count`][output::Count] object maintains enough state to greatly accelerate future access of packed objects. +/// +/// * `db` - the object store to use for accessing objects. +/// * `objects_ids` +/// * A list of objects ids to add to the pack. Duplication checks are performed so no object is ever added to a pack twice. +/// * Objects may be expanded based on the provided [`options`][Options] +/// * `progress` +/// * a way to obtain progress information +/// * `should_interrupt` +/// * A flag that is set to true if the operation should stop +/// * `options` +/// * more configuration +pub fn objects<Find, Iter, IterErr, Oid>( + db: Find, + objects_ids: Iter, + progress: impl Progress, + should_interrupt: &AtomicBool, + Options { + thread_limit, + input_object_expansion, + chunk_size, + }: Options, +) -> Result<find::existing::Error<Find::Error>, IterErr> +where + Find: crate::Find + Send + Clone, + <Find as crate::Find>::Error: Send, + Iter: Iterator<Item = std::result::Result<Oid, IterErr>> + Send, + Oid: Into<ObjectId> + Send, + IterErr: std::error::Error + Send, +{ + let lower_bound = objects_ids.size_hint().0; + let (chunk_size, thread_limit, _) = parallel::optimize_chunk_size_and_thread_limit( + chunk_size, + if lower_bound == 0 { None } else { Some(lower_bound) }, + thread_limit, + None, + ); + let chunks = gix_features::iter::Chunks { + inner: objects_ids, + size: chunk_size, + }; + let seen_objs = gix_hashtable::sync::ObjectIdMap::default(); + let progress = Arc::new(parking_lot::Mutex::new(progress)); + + parallel::in_parallel( + chunks, + thread_limit, + { + let progress = Arc::clone(&progress); + move |n| { + ( + Vec::new(), // object data buffer + Vec::new(), // object data buffer 2 to hold two objects at a time + { + let mut p = progress + .lock() + .add_child_with_id(format!("thread {n}"), gix_features::progress::UNKNOWN); + p.init(None, gix_features::progress::count("objects")); + p + }, + ) + } + }, + { + let seen_objs = &seen_objs; + move |oids: Vec<std::result::Result<Oid, IterErr>>, (buf1, buf2, progress)| { + expand::this( + &db, + input_object_expansion, + seen_objs, + oids, + buf1, + buf2, + progress, + should_interrupt, + true, /*allow pack lookups*/ + ) + } + }, + reduce::Statistics::new(progress), + ) +} + +/// Like [`objects()`] but using a single thread only to mostly save on the otherwise required overhead. 
+pub fn objects_unthreaded<Find, IterErr, Oid>( + db: Find, + object_ids: impl Iterator<Item = std::result::Result<Oid, IterErr>>, + mut progress: impl Progress, + should_interrupt: &AtomicBool, + input_object_expansion: ObjectExpansion, +) -> Result<find::existing::Error<Find::Error>, IterErr> +where + Find: crate::Find, + Oid: Into<ObjectId>, + IterErr: std::error::Error, +{ + let seen_objs = RefCell::new(gix_hashtable::HashSet::default()); + + let (mut buf1, mut buf2) = (Vec::new(), Vec::new()); + expand::this( + &db, + input_object_expansion, + &seen_objs, + object_ids, + &mut buf1, + &mut buf2, + &mut progress, + should_interrupt, + false, /*allow pack lookups*/ + ) +} + +mod expand { + use std::sync::atomic::{AtomicBool, Ordering}; + + use gix_features::progress::Progress; + use gix_hash::{oid, ObjectId}; + use gix_object::{CommitRefIter, TagRefIter}; + + use super::{ + tree, + types::{Error, ObjectExpansion, Outcome}, + util, + }; + use crate::{ + data::{output, output::count::PackLocation}, + find, FindExt, + }; + + #[allow(clippy::too_many_arguments)] + pub fn this<Find, IterErr, Oid>( + db: &Find, + input_object_expansion: ObjectExpansion, + seen_objs: &impl util::InsertImmutable, + oids: impl IntoIterator<Item = std::result::Result<Oid, IterErr>>, + buf1: &mut Vec<u8>, + #[allow(clippy::ptr_arg)] buf2: &mut Vec<u8>, + progress: &mut impl Progress, + should_interrupt: &AtomicBool, + allow_pack_lookups: bool, + ) -> super::Result<find::existing::Error<Find::Error>, IterErr> + where + Find: crate::Find, + Oid: Into<ObjectId>, + IterErr: std::error::Error, + { + use ObjectExpansion::*; + + let mut out = Vec::new(); + let mut tree_traversal_state = gix_traverse::tree::breadthfirst::State::default(); + let mut tree_diff_state = gix_diff::tree::State::default(); + let mut parent_commit_ids = Vec::new(); + let mut traverse_delegate = tree::traverse::AllUnseen::new(seen_objs); + let mut changes_delegate = tree::changes::AllNew::new(seen_objs); + let mut outcome = Outcome::default(); + + let stats = &mut outcome; + for id in oids.into_iter() { + if should_interrupt.load(Ordering::Relaxed) { + return Err(Error::Interrupted); + } + + let id = id.map(|oid| oid.into()).map_err(Error::InputIteration)?; + let (obj, location) = db.find(id, buf1)?; + stats.input_objects += 1; + match input_object_expansion { + TreeAdditionsComparedToAncestor => { + use gix_object::Kind::*; + let mut obj = obj; + let mut location = location; + let mut id = id.to_owned(); + + loop { + push_obj_count_unique(&mut out, seen_objs, &id, location, progress, stats, false); + match obj.kind { + Tree | Blob => break, + Tag => { + id = TagRefIter::from_bytes(obj.data) + .target_id() + .expect("every tag has a target"); + let tmp = db.find(id, buf1)?; + + obj = tmp.0; + location = tmp.1; + + stats.expanded_objects += 1; + continue; + } + Commit => { + let current_tree_iter = { + let mut commit_iter = CommitRefIter::from_bytes(obj.data); + let tree_id = commit_iter.tree_id().expect("every commit has a tree"); + parent_commit_ids.clear(); + for token in commit_iter { + match token { + Ok(gix_object::commit::ref_iter::Token::Parent { id }) => { + parent_commit_ids.push(id) + } + Ok(_) => break, + Err(err) => return Err(Error::CommitDecode(err)), + } + } + let (obj, location) = db.find(tree_id, buf1)?; + push_obj_count_unique( + &mut out, seen_objs, &tree_id, location, progress, stats, true, + ); + gix_object::TreeRefIter::from_bytes(obj.data) + }; + + let objects = if parent_commit_ids.is_empty() { + traverse_delegate.clear(); + 
gix_traverse::tree::breadthfirst( + current_tree_iter, + &mut tree_traversal_state, + |oid, buf| { + stats.decoded_objects += 1; + match db.find(oid, buf).ok() { + Some((obj, location)) => { + progress.inc(); + stats.expanded_objects += 1; + out.push(output::Count::from_data(oid, location)); + obj.try_into_tree_iter() + } + None => None, + } + }, + &mut traverse_delegate, + ) + .map_err(Error::TreeTraverse)?; + &traverse_delegate.non_trees + } else { + for commit_id in &parent_commit_ids { + let parent_tree_id = { + let (parent_commit_obj, location) = db.find(commit_id, buf2)?; + + push_obj_count_unique( + &mut out, seen_objs, commit_id, location, progress, stats, true, + ); + CommitRefIter::from_bytes(parent_commit_obj.data) + .tree_id() + .expect("every commit has a tree") + }; + let parent_tree = { + let (parent_tree_obj, location) = db.find(parent_tree_id, buf2)?; + push_obj_count_unique( + &mut out, + seen_objs, + &parent_tree_id, + location, + progress, + stats, + true, + ); + gix_object::TreeRefIter::from_bytes(parent_tree_obj.data) + }; + + changes_delegate.clear(); + gix_diff::tree::Changes::from(Some(parent_tree)) + .needed_to_obtain( + current_tree_iter.clone(), + &mut tree_diff_state, + |oid, buf| { + stats.decoded_objects += 1; + db.find_tree_iter(oid, buf).map(|t| t.0) + }, + &mut changes_delegate, + ) + .map_err(Error::TreeChanges)?; + } + &changes_delegate.objects + }; + for id in objects.iter() { + out.push(id_to_count(db, buf2, id, progress, stats, allow_pack_lookups)); + } + break; + } + } + } + } + TreeContents => { + use gix_object::Kind::*; + let mut id = id; + let mut obj = (obj, location); + loop { + push_obj_count_unique(&mut out, seen_objs, &id, obj.1.clone(), progress, stats, false); + match obj.0.kind { + Tree => { + traverse_delegate.clear(); + gix_traverse::tree::breadthfirst( + gix_object::TreeRefIter::from_bytes(obj.0.data), + &mut tree_traversal_state, + |oid, buf| { + stats.decoded_objects += 1; + match db.find(oid, buf).ok() { + Some((obj, location)) => { + progress.inc(); + stats.expanded_objects += 1; + out.push(output::Count::from_data(oid, location)); + obj.try_into_tree_iter() + } + None => None, + } + }, + &mut traverse_delegate, + ) + .map_err(Error::TreeTraverse)?; + for id in traverse_delegate.non_trees.iter() { + out.push(id_to_count(db, buf1, id, progress, stats, allow_pack_lookups)); + } + break; + } + Commit => { + id = CommitRefIter::from_bytes(obj.0.data) + .tree_id() + .expect("every commit has a tree"); + stats.expanded_objects += 1; + obj = db.find(id, buf1)?; + continue; + } + Blob => break, + Tag => { + id = TagRefIter::from_bytes(obj.0.data) + .target_id() + .expect("every tag has a target"); + stats.expanded_objects += 1; + obj = db.find(id, buf1)?; + continue; + } + } + } + } + AsIs => push_obj_count_unique(&mut out, seen_objs, &id, location, progress, stats, false), + } + } + outcome.total_objects = out.len(); + Ok((out, outcome)) + } + + #[inline] + fn push_obj_count_unique( + out: &mut Vec<output::Count>, + all_seen: &impl util::InsertImmutable, + id: &oid, + location: Option<crate::data::entry::Location>, + progress: &mut impl Progress, + statistics: &mut Outcome, + count_expanded: bool, + ) { + let inserted = all_seen.insert(id.to_owned()); + if inserted { + progress.inc(); + statistics.decoded_objects += 1; + if count_expanded { + statistics.expanded_objects += 1; + } + out.push(output::Count::from_data(id, location)); + } + } + + #[inline] + fn id_to_count<Find: crate::Find>( + db: &Find, + buf: &mut Vec<u8>, + id: &oid, + 
progress: &mut impl Progress, + statistics: &mut Outcome, + allow_pack_lookups: bool, + ) -> output::Count { + progress.inc(); + statistics.expanded_objects += 1; + output::Count { + id: id.to_owned(), + entry_pack_location: if allow_pack_lookups { + PackLocation::LookedUp(db.location_by_oid(id, buf)) + } else { + PackLocation::NotLookedUp + }, + } + } +} diff --git a/vendor/gix-pack/src/data/output/count/objects/reduce.rs b/vendor/gix-pack/src/data/output/count/objects/reduce.rs new file mode 100644 index 000000000..c6a61d467 --- /dev/null +++ b/vendor/gix-pack/src/data/output/count/objects/reduce.rs @@ -0,0 +1,49 @@ +use std::{marker::PhantomData, sync::Arc}; + +use gix_features::{parallel, progress::Progress}; + +use super::Outcome; +use crate::data::output; + +pub struct Statistics<E, P> { + total: Outcome, + counts: Vec<output::Count>, + progress: Arc<parking_lot::Mutex<P>>, + _err: PhantomData<E>, +} + +impl<E, P> Statistics<E, P> +where + P: Progress, +{ + pub fn new(progress: Arc<parking_lot::Mutex<P>>) -> Self { + Statistics { + total: Default::default(), + counts: Default::default(), + progress, + _err: PhantomData::default(), + } + } +} + +impl<E, P> parallel::Reduce for Statistics<E, P> +where + P: Progress, +{ + type Input = Result<(Vec<output::Count>, Outcome), E>; + type FeedProduce = (); + type Output = (Vec<output::Count>, Outcome); + type Error = E; + + fn feed(&mut self, item: Self::Input) -> Result<Self::FeedProduce, Self::Error> { + let (counts, stats) = item?; + self.total.aggregate(stats); + self.progress.lock().inc_by(counts.len()); + self.counts.extend(counts); + Ok(()) + } + + fn finalize(self) -> Result<Self::Output, Self::Error> { + Ok((self.counts, self.total)) + } +} diff --git a/vendor/gix-pack/src/data/output/count/objects/tree.rs b/vendor/gix-pack/src/data/output/count/objects/tree.rs new file mode 100644 index 000000000..d3f4f6b9a --- /dev/null +++ b/vendor/gix-pack/src/data/output/count/objects/tree.rs @@ -0,0 +1,124 @@ +pub mod changes { + use gix_diff::tree::{ + visit::{Action, Change}, + Visit, + }; + use gix_hash::ObjectId; + use gix_object::{bstr::BStr, tree::EntryMode}; + + use crate::data::output::count::objects_impl::util::InsertImmutable; + + pub struct AllNew<'a, H> { + pub objects: Vec<ObjectId>, + all_seen: &'a H, + } + + impl<'a, H> AllNew<'a, H> + where + H: InsertImmutable, + { + pub fn new(all_seen: &'a H) -> Self { + AllNew { + objects: Default::default(), + all_seen, + } + } + pub fn clear(&mut self) { + self.objects.clear(); + } + } + + impl<'a, H> Visit for AllNew<'a, H> + where + H: InsertImmutable, + { + fn pop_front_tracked_path_and_set_current(&mut self) {} + + fn push_back_tracked_path_component(&mut self, _component: &BStr) {} + + fn push_path_component(&mut self, _component: &BStr) {} + + fn pop_path_component(&mut self) {} + + fn visit(&mut self, change: Change) -> Action { + match change { + Change::Addition { oid, entry_mode } | Change::Modification { oid, entry_mode, .. } => { + if entry_mode == EntryMode::Commit { + return Action::Continue; + } + let inserted = self.all_seen.insert(oid); + if inserted { + self.objects.push(oid); + } + } + Change::Deletion { .. 
} => {} + }; + Action::Continue + } + } +} + +pub mod traverse { + use gix_hash::ObjectId; + use gix_object::{ + bstr::BStr, + tree::{EntryMode, EntryRef}, + }; + use gix_traverse::tree::{visit::Action, Visit}; + + use crate::data::output::count::objects_impl::util::InsertImmutable; + + pub struct AllUnseen<'a, H> { + pub non_trees: Vec<ObjectId>, + all_seen: &'a H, + } + + impl<'a, H> AllUnseen<'a, H> + where + H: InsertImmutable, + { + pub fn new(all_seen: &'a H) -> Self { + AllUnseen { + non_trees: Default::default(), + all_seen, + } + } + pub fn clear(&mut self) { + self.non_trees.clear(); + } + } + + impl<'a, H> Visit for AllUnseen<'a, H> + where + H: InsertImmutable, + { + fn pop_front_tracked_path_and_set_current(&mut self) {} + + fn push_back_tracked_path_component(&mut self, _component: &BStr) {} + + fn push_path_component(&mut self, _component: &BStr) {} + + fn pop_path_component(&mut self) {} + + fn visit_tree(&mut self, entry: &EntryRef<'_>) -> Action { + let inserted = self.all_seen.insert(entry.oid.to_owned()); + if inserted { + Action::Continue + } else { + Action::Skip + } + } + + fn visit_nontree(&mut self, entry: &EntryRef<'_>) -> Action { + if entry.mode == EntryMode::Commit { + // links don't have a representation + return Action::Continue; + } + let inserted = self.all_seen.insert(entry.oid.to_owned()); + if inserted { + self.non_trees.push(entry.oid.to_owned()); + } + Action::Continue + } + } +} diff --git a/vendor/gix-pack/src/data/output/count/objects/types.rs b/vendor/gix-pack/src/data/output/count/objects/types.rs new file mode 100644 index 000000000..8c8c939df --- /dev/null +++ b/vendor/gix-pack/src/data/output/count/objects/types.rs @@ -0,0 +1,105 @@ +/// Information gathered during the run of [`iter_from_objects()`][super::objects()]. +#[derive(Default, PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Outcome { + /// The amount of objects provided to start the iteration. + pub input_objects: usize, + /// The amount of objects that have been expanded from the input source. + /// It's desirable to do that as expansion happens on multiple threads, allowing the amount of input objects to be small. + /// `expanded_objects - decoded_objects` is the 'cheap' object we found without decoding the object itself. + pub expanded_objects: usize, + /// The amount of fully decoded objects. These are the most expensive as they are fully decoded + pub decoded_objects: usize, + /// The total amount of encountered objects. Should be `expanded_objects + input_objects`. + pub total_objects: usize, +} + +impl Outcome { + pub(in crate::data::output::count) fn aggregate( + &mut self, + Outcome { + input_objects, + decoded_objects, + expanded_objects, + total_objects, + }: Self, + ) { + self.input_objects += input_objects; + self.decoded_objects += decoded_objects; + self.expanded_objects += expanded_objects; + self.total_objects += total_objects; + } +} + +/// The way input objects are handled +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum ObjectExpansion { + /// Don't do anything with the input objects except for transforming them into pack entries + AsIs, + /// If the input object is a Commit then turn it into a pack entry. 
Additionally obtain its tree, turn it into a pack entry + /// along with all of its contents, that is nested trees, and any other objects reachable from it. + /// Otherwise, the same as [`AsIs`][ObjectExpansion::AsIs]. + /// + /// This mode is useful if all reachable objects should be added, as in cloning a repository. + TreeContents, + /// If the input is a commit, obtain its ancestors and turn them into pack entries. Obtain the ancestor trees along with the commits + /// tree and turn them into pack entries. Finally obtain the added/changed objects when comparing the ancestor trees with the + /// current tree and turn them into entries as well. + /// Otherwise, the same as [`AsIs`][ObjectExpansion::AsIs]. + /// + /// This mode is useful to build a pack containing only new objects compared to a previous state. + TreeAdditionsComparedToAncestor, +} + +impl Default for ObjectExpansion { + fn default() -> Self { + ObjectExpansion::AsIs + } +} + +/// Configuration options for the pack generation functions provided in [this module][crate::data::output]. +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub struct Options { + /// The amount of threads to use at most when resolving the pack. If `None`, all logical cores are used. + /// If more than one thread is used, the order of returned [counts][crate::data::output::Count] is not deterministic anymore + /// especially when tree traversal is involved. Thus deterministic ordering requires `Some(1)` to be set. + pub thread_limit: Option<usize>, + /// The amount of objects per chunk or unit of work to be sent to threads for processing + pub chunk_size: usize, + /// The way input objects are handled + pub input_object_expansion: ObjectExpansion, +} + +impl Default for Options { + fn default() -> Self { + Options { + thread_limit: None, + chunk_size: 10, + input_object_expansion: Default::default(), + } + } +} + +/// The error returned by the pack generation iterator [bytes::FromEntriesIter][crate::data::output::bytes::FromEntriesIter]. 
+#[derive(Debug, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error<FindErr, IterErr> +where + FindErr: std::error::Error + 'static, + IterErr: std::error::Error + 'static, +{ + #[error(transparent)] + CommitDecode(gix_object::decode::Error), + #[error(transparent)] + FindExisting(#[from] FindErr), + #[error(transparent)] + InputIteration(IterErr), + #[error(transparent)] + TreeTraverse(gix_traverse::tree::breadthfirst::Error), + #[error(transparent)] + TreeChanges(gix_diff::tree::changes::Error), + #[error("Operation interrupted")] + Interrupted, +} diff --git a/vendor/gix-pack/src/data/output/count/objects/util.rs b/vendor/gix-pack/src/data/output/count/objects/util.rs new file mode 100644 index 000000000..a80841313 --- /dev/null +++ b/vendor/gix-pack/src/data/output/count/objects/util.rs @@ -0,0 +1,24 @@ +pub trait InsertImmutable { + fn insert(&self, id: gix_hash::ObjectId) -> bool; +} + +mod trait_impls { + use gix_hash::ObjectId; + use std::cell::RefCell; + + use gix_hashtable::HashSet; + + use super::InsertImmutable; + + impl InsertImmutable for gix_hashtable::sync::ObjectIdMap<()> { + fn insert(&self, id: ObjectId) -> bool { + self.insert(id, ()).is_none() + } + } + + impl InsertImmutable for RefCell<HashSet<ObjectId>> { + fn insert(&self, item: ObjectId) -> bool { + self.borrow_mut().insert(item) + } + } +} diff --git a/vendor/gix-pack/src/data/output/entry/iter_from_counts.rs b/vendor/gix-pack/src/data/output/entry/iter_from_counts.rs new file mode 100644 index 000000000..25e256d5c --- /dev/null +++ b/vendor/gix-pack/src/data/output/entry/iter_from_counts.rs @@ -0,0 +1,428 @@ +pub(crate) mod function { + use std::{cmp::Ordering, sync::Arc}; + + use gix_features::{parallel, parallel::SequenceId, progress::Progress}; + + use super::{reduce, util, Error, Mode, Options, Outcome, ProgressId}; + use crate::data::output; + + /// Given a known list of object `counts`, calculate entries ready to be put into a data pack. + /// + /// This allows objects to be written quite soon without having to wait for the entire pack to be built in memory. + /// A chunk of objects is held in memory and compressed using DEFLATE, and serve the output of this iterator. + /// That way slow writers will naturally apply back pressure, and communicate to the implementation that more time can be + /// spent compressing objects. + /// + /// * `counts` + /// * A list of previously counted objects to add to the pack. Duplication checks are not performed, no object is expected to be duplicated. + /// * `progress` + /// * a way to obtain progress information + /// * `options` + /// * more configuration + /// + /// _Returns_ the checksum of the pack + /// + /// ## Discussion + /// + /// ### Advantages + /// + /// * Begins writing immediately and supports back-pressure. + /// * Abstract over object databases and how input is provided. + /// + /// ### Disadvantages + /// + /// * ~~currently there is no way to easily write the pack index, even though the state here is uniquely positioned to do + /// so with minimal overhead (especially compared to `gix index-from-pack`)~~ Probably works now by chaining Iterators + /// or keeping enough state to write a pack and then generate an index with recorded data. 
+ /// + pub fn iter_from_counts<Find>( + mut counts: Vec<output::Count>, + db: Find, + mut progress: impl Progress + 'static, + Options { + version, + mode, + allow_thin_pack, + thread_limit, + chunk_size, + }: Options, + ) -> impl Iterator<Item = Result<(SequenceId, Vec<output::Entry>), Error<Find::Error>>> + + parallel::reduce::Finalize<Reduce = reduce::Statistics<Error<Find::Error>>> + where + Find: crate::Find + Send + Clone + 'static, + <Find as crate::Find>::Error: Send, + { + assert!( + matches!(version, crate::data::Version::V2), + "currently we can only write version 2" + ); + let (chunk_size, thread_limit, _) = + parallel::optimize_chunk_size_and_thread_limit(chunk_size, Some(counts.len()), thread_limit, None); + { + let progress = Arc::new(parking_lot::Mutex::new( + progress.add_child_with_id("resolving", ProgressId::ResolveCounts.into()), + )); + progress.lock().init(None, gix_features::progress::count("counts")); + let enough_counts_present = counts.len() > 4_000; + let start = std::time::Instant::now(); + parallel::in_parallel_if( + || enough_counts_present, + counts.chunks_mut(chunk_size), + thread_limit, + |_n| Vec::<u8>::new(), + { + let progress = Arc::clone(&progress); + let db = db.clone(); + move |chunk, buf| { + let chunk_size = chunk.len(); + for count in chunk { + use crate::data::output::count::PackLocation::*; + match count.entry_pack_location { + LookedUp(_) => continue, + NotLookedUp => count.entry_pack_location = LookedUp(db.location_by_oid(count.id, buf)), + } + } + progress.lock().inc_by(chunk_size); + Ok::<_, ()>(()) + } + }, + parallel::reduce::IdentityWithResult::<(), ()>::default(), + ) + .expect("infallible - we ignore none-existing objects"); + progress.lock().show_throughput(start); + } + let counts_range_by_pack_id = match mode { + Mode::PackCopyAndBaseObjects => { + let mut progress = progress.add_child_with_id("sorting", ProgressId::SortEntries.into()); + progress.init(Some(counts.len()), gix_features::progress::count("counts")); + let start = std::time::Instant::now(); + + use crate::data::output::count::PackLocation::*; + counts.sort_by(|lhs, rhs| match (&lhs.entry_pack_location, &rhs.entry_pack_location) { + (LookedUp(None), LookedUp(None)) => Ordering::Equal, + (LookedUp(Some(_)), LookedUp(None)) => Ordering::Greater, + (LookedUp(None), LookedUp(Some(_))) => Ordering::Less, + (LookedUp(Some(lhs)), LookedUp(Some(rhs))) => lhs + .pack_id + .cmp(&rhs.pack_id) + .then(lhs.pack_offset.cmp(&rhs.pack_offset)), + (_, _) => unreachable!("counts were resolved beforehand"), + }); + + let mut index: Vec<(u32, std::ops::Range<usize>)> = Vec::new(); + let mut chunks_pack_start = counts.partition_point(|e| e.entry_pack_location.is_none()); + let mut slice = &counts[chunks_pack_start..]; + while !slice.is_empty() { + let current_pack_id = slice[0].entry_pack_location.as_ref().expect("packed object").pack_id; + let pack_end = slice.partition_point(|e| { + e.entry_pack_location.as_ref().expect("packed object").pack_id == current_pack_id + }); + index.push((current_pack_id, chunks_pack_start..chunks_pack_start + pack_end)); + slice = &slice[pack_end..]; + chunks_pack_start += pack_end; + } + + progress.set(counts.len()); + progress.show_throughput(start); + + index + } + }; + + let counts = Arc::new(counts); + let progress = Arc::new(parking_lot::Mutex::new(progress)); + let chunks = util::ChunkRanges::new(chunk_size, counts.len()); + + parallel::reduce::Stepwise::new( + chunks.enumerate(), + thread_limit, + { + let progress = Arc::clone(&progress); + move |n| { + 
( + Vec::new(), // object data buffer + progress + .lock() + .add_child_with_id(format!("thread {n}"), gix_features::progress::UNKNOWN), + ) + } + }, + { + let counts = Arc::clone(&counts); + move |(chunk_id, chunk_range): (SequenceId, std::ops::Range<usize>), (buf, progress)| { + let mut out = Vec::new(); + let chunk = &counts[chunk_range]; + let mut stats = Outcome::default(); + let mut pack_offsets_to_id = None; + progress.init(Some(chunk.len()), gix_features::progress::count("objects")); + + for count in chunk.iter() { + out.push(match count + .entry_pack_location + .as_ref() + .and_then(|l| db.entry_by_location(l).map(|pe| (l, pe))) + { + Some((location, pack_entry)) => { + if let Some((cached_pack_id, _)) = &pack_offsets_to_id { + if *cached_pack_id != location.pack_id { + pack_offsets_to_id = None; + } + } + let pack_range = counts_range_by_pack_id[counts_range_by_pack_id + .binary_search_by_key(&location.pack_id, |e| e.0) + .expect("pack-id always present")] + .1 + .clone(); + let base_index_offset = pack_range.start; + let counts_in_pack = &counts[pack_range]; + match output::Entry::from_pack_entry( + pack_entry, + count, + counts_in_pack, + base_index_offset, + allow_thin_pack.then_some({ + |pack_id, base_offset| { + let (cached_pack_id, cache) = pack_offsets_to_id.get_or_insert_with(|| { + db.pack_offsets_and_oid(pack_id) + .map(|mut v| { + v.sort_by_key(|e| e.0); + (pack_id, v) + }) + .expect("pack used for counts is still available") + }); + debug_assert_eq!(*cached_pack_id, pack_id); + stats.ref_delta_objects += 1; + cache + .binary_search_by_key(&base_offset, |e| e.0) + .ok() + .map(|idx| cache[idx].1) + } + }), + version, + ) { + Some(entry) => { + stats.objects_copied_from_pack += 1; + entry + } + None => match db.try_find(count.id, buf).map_err(Error::FindExisting)? { + Some((obj, _location)) => { + stats.decoded_and_recompressed_objects += 1; + output::Entry::from_data(count, &obj) + } + None => { + stats.missing_objects += 1; + Ok(output::Entry::invalid()) + } + }, + } + } + None => match db.try_find(count.id, buf).map_err(Error::FindExisting)? 
{ + Some((obj, _location)) => { + stats.decoded_and_recompressed_objects += 1; + output::Entry::from_data(count, &obj) + } + None => { + stats.missing_objects += 1; + Ok(output::Entry::invalid()) + } + }, + }?); + progress.inc(); + } + Ok((chunk_id, out, stats)) + } + }, + reduce::Statistics::default(), + ) + } +} + +mod util { + #[derive(Clone)] + pub struct ChunkRanges { + cursor: usize, + size: usize, + len: usize, + } + + impl ChunkRanges { + pub fn new(size: usize, total: usize) -> Self { + ChunkRanges { + cursor: 0, + size, + len: total, + } + } + } + + impl Iterator for ChunkRanges { + type Item = std::ops::Range<usize>; + + fn next(&mut self) -> Option<Self::Item> { + if self.cursor >= self.len { + None + } else { + let upper = (self.cursor + self.size).min(self.len); + let range = self.cursor..upper; + self.cursor = upper; + Some(range) + } + } + } +} + +mod reduce { + use std::marker::PhantomData; + + use gix_features::{parallel, parallel::SequenceId}; + + use super::Outcome; + use crate::data::output; + + pub struct Statistics<E> { + total: Outcome, + _err: PhantomData<E>, + } + + impl<E> Default for Statistics<E> { + fn default() -> Self { + Statistics { + total: Default::default(), + _err: PhantomData::default(), + } + } + } + + impl<Error> parallel::Reduce for Statistics<Error> { + type Input = Result<(SequenceId, Vec<output::Entry>, Outcome), Error>; + type FeedProduce = (SequenceId, Vec<output::Entry>); + type Output = Outcome; + type Error = Error; + + fn feed(&mut self, item: Self::Input) -> Result<Self::FeedProduce, Self::Error> { + item.map(|(cid, entries, stats)| { + self.total.aggregate(stats); + (cid, entries) + }) + } + + fn finalize(self) -> Result<Self::Output, Self::Error> { + Ok(self.total) + } + } +} + +mod types { + use crate::data::output::entry; + + /// Information gathered during the run of [`iter_from_counts()`][crate::data::output::entry::iter_from_counts()]. + #[derive(Default, PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] + #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] + pub struct Outcome { + /// The amount of fully decoded objects. These are the most expensive as they are fully decoded. + pub decoded_and_recompressed_objects: usize, + /// The amount of objects that could not be located despite them being mentioned during iteration + pub missing_objects: usize, + /// The amount of base or delta objects that could be copied directly from the pack. These are cheapest as they + /// only cost a memory copy for the most part. + pub objects_copied_from_pack: usize, + /// The amount of objects that ref to their base as ref-delta, an indication for a thin back being created. + pub ref_delta_objects: usize, + } + + impl Outcome { + pub(in crate::data::output::entry) fn aggregate( + &mut self, + Outcome { + decoded_and_recompressed_objects: decoded_objects, + missing_objects, + objects_copied_from_pack, + ref_delta_objects, + }: Self, + ) { + self.decoded_and_recompressed_objects += decoded_objects; + self.missing_objects += missing_objects; + self.objects_copied_from_pack += objects_copied_from_pack; + self.ref_delta_objects += ref_delta_objects; + } + } + + /// The way the iterator operates. + #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] + #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] + pub enum Mode { + /// Copy base objects and deltas from packs, while non-packed objects will be treated as base objects + /// (i.e. without trying to delta compress them). 
This is a fast way of obtaining a pack while benefiting
+ /// from existing pack compression and spending the smallest possible time on compressing unpacked objects at
+ /// the cost of bandwidth.
+ PackCopyAndBaseObjects,
+ }
+
+ /// Configuration options for the pack generation functions provided in [`iter_from_counts()`][crate::data::output::entry::iter_from_counts()].
+ #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)]
+ #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
+ pub struct Options {
+ /// The amount of threads to use at most when resolving the pack. If `None`, all logical cores are used.
+ pub thread_limit: Option<usize>,
+ /// The algorithm to produce a pack
+ pub mode: Mode,
+ /// If set, the resulting pack can have deltas that refer to an object which is not in the pack. This can happen
+ /// if the initial counted objects do not contain an object that an existing packed delta refers to, for example, because
+ /// it wasn't part of the iteration, for instance when the iteration was performed on tree deltas or only a part of the
+ /// commit graph. Please note that thin packs are not valid packs at rest, thus they are only valid for packs in transit.
+ ///
+ /// If set to false, delta objects will be decompressed and recompressed as base objects.
+ pub allow_thin_pack: bool,
+ /// The amount of objects per chunk or unit of work to be sent to threads for processing
+ /// TODO: could this become the window size?
+ pub chunk_size: usize,
+ /// The pack data version to produce for each entry
+ pub version: crate::data::Version,
+ }
+
+ impl Default for Options {
+ fn default() -> Self {
+ Options {
+ thread_limit: None,
+ mode: Mode::PackCopyAndBaseObjects,
+ allow_thin_pack: false,
+ chunk_size: 10,
+ version: Default::default(),
+ }
+ }
+ }
+
+ /// The error returned by the pack generation function [`iter_from_counts()`][crate::data::output::entry::iter_from_counts()].
+ #[derive(Debug, thiserror::Error)]
+ #[allow(missing_docs)]
+ pub enum Error<FindErr>
+ where
+ FindErr: std::error::Error + 'static,
+ {
+ #[error(transparent)]
+ FindExisting(FindErr),
+ #[error(transparent)]
+ NewEntry(#[from] entry::Error),
+ }
+
+ /// The progress ids used in [`write_to_directory()`][crate::Bundle::write_to_directory()].
+ ///
+ /// Use this information to selectively extract the progress of interest in case the parent application has custom visualization.
+ #[derive(Debug, Copy, Clone)]
+ pub enum ProgressId {
+ /// The amount of [`Count`][crate::data::output::Count] objects which are resolved to their pack location.
+ ResolveCounts,
+ /// Layout pack entries for placement into a pack (by pack-id and by offset).
+ SortEntries, + } + + impl From<ProgressId> for gix_features::progress::Id { + fn from(v: ProgressId) -> Self { + match v { + ProgressId::ResolveCounts => *b"ECRC", + ProgressId::SortEntries => *b"ECSE", + } + } + } +} +pub use types::{Error, Mode, Options, Outcome, ProgressId}; diff --git a/vendor/gix-pack/src/data/output/entry/mod.rs b/vendor/gix-pack/src/data/output/entry/mod.rs new file mode 100644 index 000000000..401d2f24c --- /dev/null +++ b/vendor/gix-pack/src/data/output/entry/mod.rs @@ -0,0 +1,181 @@ +use std::{convert::TryFrom, io::Write}; + +use gix_hash::ObjectId; + +use crate::{data, data::output, find}; + +/// +pub mod iter_from_counts; +pub use iter_from_counts::function::iter_from_counts; + +/// The kind of pack entry to be written +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] +pub enum Kind { + /// A complete base object, including its kind + Base(gix_object::Kind), + /// A delta against the object with the given index. It's always an index that was already encountered to refer only + /// to object we have written already. + DeltaRef { + /// The absolute index to the object to serve as base. It's up to the writer to maintain enough state to allow producing + /// a packed delta object from it. + object_index: usize, + }, + /// A delta against the given object as identified by its `ObjectId`. + /// This is the case for thin packs only, i.e. those that are sent over the wire. + /// Note that there is the option of the `ObjectId` being used to refer to an object within + /// the same pack, but it's a discontinued practice which won't be encountered here. + DeltaOid { + /// The object serving as base for this delta + id: ObjectId, + }, +} + +/// The error returned by [`output::Entry::from_data()`]. +#[allow(missing_docs)] +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("{0}")] + ZlibDeflate(#[from] std::io::Error), +} + +impl output::Entry { + /// An object which can be identified as invalid easily which happens if objects didn't exist even if they were referred to. + pub fn invalid() -> output::Entry { + output::Entry { + id: gix_hash::Kind::Sha1.null(), // NOTE: the actual object hash used in the repo doesn't matter here, this is a sentinel value. + kind: Kind::Base(gix_object::Kind::Blob), + decompressed_size: 0, + compressed_data: vec![], + } + } + + /// Returns true if this object doesn't really exist but still has to be handled responsibly + /// + /// Note that this is true for tree entries that are commits/git submodules, or for objects which aren't present in our local clone + /// due to shallow clones. + pub fn is_invalid(&self) -> bool { + self.id.is_null() + } + + /// Create an Entry from a previously counted object which is located in a pack. It's `entry` is provided here. + /// The `version` specifies what kind of target `Entry` version the caller desires. 
+    pub fn from_pack_entry(
+        mut entry: find::Entry,
+        count: &output::Count,
+        potential_bases: &[output::Count],
+        bases_index_offset: usize,
+        pack_offset_to_oid: Option<impl FnMut(u32, u64) -> Option<ObjectId>>,
+        target_version: crate::data::Version,
+    ) -> Option<Result<Self, Error>> {
+        if entry.version != target_version {
+            return None;
+        };
+
+        let pack_offset_must_be_zero = 0;
+        let pack_entry =
+            crate::data::Entry::from_bytes(&entry.data, pack_offset_must_be_zero, count.id.as_slice().len());
+
+        use crate::data::entry::Header::*;
+        match pack_entry.header {
+            Commit => Some(output::entry::Kind::Base(gix_object::Kind::Commit)),
+            Tree => Some(output::entry::Kind::Base(gix_object::Kind::Tree)),
+            Blob => Some(output::entry::Kind::Base(gix_object::Kind::Blob)),
+            Tag => Some(output::entry::Kind::Base(gix_object::Kind::Tag)),
+            OfsDelta { base_distance } => {
+                let pack_location = count.entry_pack_location.as_ref().expect("packed");
+                let base_offset = pack_location
+                    .pack_offset
+                    .checked_sub(base_distance)
+                    .expect("pack-offset - distance is firmly within the pack");
+                potential_bases
+                    .binary_search_by(|e| {
+                        e.entry_pack_location
+                            .as_ref()
+                            .expect("packed")
+                            .pack_offset
+                            .cmp(&base_offset)
+                    })
+                    .ok()
+                    .map(|idx| output::entry::Kind::DeltaRef {
+                        object_index: idx + bases_index_offset,
+                    })
+                    .or_else(|| {
+                        pack_offset_to_oid
+                            .and_then(|mut f| f(pack_location.pack_id, base_offset))
+                            .map(|id| output::entry::Kind::DeltaOid { id })
+                    })
+            }
+            RefDelta { base_id: _ } => None, // ref deltas are for thin packs or legacy; repack them as base objects
+        }
+        .map(|kind| {
+            Ok(output::Entry {
+                id: count.id.to_owned(),
+                kind,
+                decompressed_size: pack_entry.decompressed_size as usize,
+                compressed_data: {
+                    entry.data.copy_within(pack_entry.data_offset as usize.., 0);
+                    entry.data.resize(
+                        entry.data.len()
+                            - usize::try_from(pack_entry.data_offset).expect("offset representable as usize"),
+                        0,
+                    );
+                    entry.data
+                },
+            })
+        })
+    }
+
+    /// Create a new instance from the given `count` (which provides the object's id) and its corresponding git `obj`ect data.
+    pub fn from_data(count: &output::Count, obj: &gix_object::Data<'_>) -> Result<Self, Error> {
+        Ok(output::Entry {
+            id: count.id.to_owned(),
+            kind: Kind::Base(obj.kind),
+            decompressed_size: obj.data.len(),
+            compressed_data: {
+                let mut out = gix_features::zlib::stream::deflate::Write::new(Vec::new());
+                if let Err(err) = std::io::copy(&mut &*obj.data, &mut out) {
+                    match err.kind() {
+                        std::io::ErrorKind::Other => return Err(Error::ZlibDeflate(err)),
+                        err => unreachable!("Should never see other errors than zlib, but got {:?}", err,),
+                    }
+                };
+                out.flush()?;
+                out.into_inner()
+            },
+        })
+    }
+
+    /// Transform ourselves into a pack entry header of `version` which can be written into a pack.
+    ///
+    /// `index_to_base_distance(object_index) -> base_distance` is a function that converts the index of the base object
+    /// within the input object array (if each object is numbered) into the distance to that base within the pack.
+    /// This information is known to the caller of this method.
+    pub fn to_entry_header(
+        &self,
+        version: crate::data::Version,
+        index_to_base_distance: impl FnOnce(usize) -> u64,
+    ) -> crate::data::entry::Header {
+        assert!(
+            matches!(version, data::Version::V2),
+            "we can only write V2 pack entries for now"
+        );
+
+        use Kind::*;
+        match self.kind {
+            Base(kind) => {
+                use gix_object::Kind::*;
+                match kind {
+                    Tree => data::entry::Header::Tree,
+                    Blob => data::entry::Header::Blob,
+                    Commit => data::entry::Header::Commit,
+                    Tag => data::entry::Header::Tag,
+                }
+            }
+            DeltaOid { id } => data::entry::Header::RefDelta { base_id: id.to_owned() },
+            DeltaRef { object_index } => data::entry::Header::OfsDelta {
+                base_distance: index_to_base_distance(object_index),
+            },
+        }
+    }
+}
diff --git a/vendor/gix-pack/src/data/output/mod.rs b/vendor/gix-pack/src/data/output/mod.rs
new file mode 100644
index 000000000..f94d32e8e
--- /dev/null
+++ b/vendor/gix-pack/src/data/output/mod.rs
@@ -0,0 +1,41 @@
+use gix_hash::ObjectId;
+
+///
+pub mod count;
+
+/// An item representing a future Entry in the leanest way possible.
+///
+/// One can expect to have one of these in memory when building big objects, so smaller is better here.
+/// They should contain everything of importance to generate a pack as fast as possible.
+#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)]
+#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
+pub struct Count {
+    /// The hash of the object to write
+    pub id: ObjectId,
+    /// A way to locate a pack entry in the object database, only available if the object is in a pack.
+    pub entry_pack_location: count::PackLocation,
+}
+
+/// An entry to be written to a file.
+///
+/// Some of these will be in-flight and in memory while waiting to be written. Memory requirements depend on the amount of compressed
+/// data they hold.
+#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)]
+#[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
+pub struct Entry {
+    /// The hash of the object to write
+    pub id: ObjectId,
+    /// The kind of entry represented by `compressed_data`. It's used alongside it to complete the pack entry
+    /// at rest or in transit.
+    pub kind: entry::Kind,
+    /// The size in bytes needed once `compressed_data` gets decompressed
+    pub decompressed_size: usize,
+    /// The compressed data right behind the header
+    pub compressed_data: Vec<u8>,
+}
+
+///
+pub mod entry;
+
+///
+pub mod bytes;
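For orientation, here is a minimal sketch of how the types in the diff above fit together; it is not part of the vendored sources. It assumes the `gix-pack` and `gix-object` crates with the module layout shown here, and that `count` and `obj` stem from an earlier counting phase: an object is compressed into an `output::Entry` via `Entry::from_data()`, and the matching V2 pack entry header is derived via `to_entry_header()`.

use gix_pack::data::{self, output};

fn compressed_entry_with_header(
    count: &output::Count,
    obj: &gix_object::Data<'_>,
) -> Result<(output::Entry, data::entry::Header), output::entry::Error> {
    // zlib-compress the object data, recording its kind and decompressed size.
    let entry = output::Entry::from_data(count, obj)?;
    // A freshly created base entry never refers to another object, so the
    // index-to-base-distance closure is never invoked here.
    let header = entry.to_entry_header(data::Version::V2, |_object_index| {
        unreachable!("base objects have no delta base")
    });
    Ok((entry, header))
}

For `DeltaRef` entries the closure would instead have to map the base object's index to its offset distance within the pack being written, which only the caller can know.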