Diffstat (limited to 'vendor/gix-pack/src/multi_index/chunk.rs')
-rw-r--r-- | vendor/gix-pack/src/multi_index/chunk.rs | 276 |
1 file changed, 276 insertions, 0 deletions
diff --git a/vendor/gix-pack/src/multi_index/chunk.rs b/vendor/gix-pack/src/multi_index/chunk.rs
new file mode 100644
index 000000000..7ed8eebcb
--- /dev/null
+++ b/vendor/gix-pack/src/multi_index/chunk.rs
@@ -0,0 +1,276 @@
+/// Information for the chunk about index names
+pub mod index_names {
+    use std::path::{Path, PathBuf};
+
+    use gix_object::bstr::{BString, ByteSlice};
+
+    /// The ID used for the index-names chunk.
+    pub const ID: gix_chunk::Id = *b"PNAM";
+
+    ///
+    pub mod decode {
+        use gix_object::bstr::BString;
+
+        /// The error returned by [from_bytes()][super::from_bytes()].
+        #[derive(Debug, thiserror::Error)]
+        #[allow(missing_docs)]
+        pub enum Error {
+            #[error("The pack names were not ordered alphabetically.")]
+            NotOrderedAlphabetically,
+            #[error("Each pack path name must be terminated with a null byte")]
+            MissingNullByte,
+            #[error("Couldn't turn path '{path}' into OS path due to encoding issues")]
+            PathEncoding { path: BString },
+            #[error("non-padding bytes found after all paths were read.")]
+            UnknownTrailerBytes,
+        }
+    }
+
+    /// Parse null-separated index names from the given `chunk` of bytes and the expected number of packs and indices.
+    /// Ignore padding bytes which are typically \0.
+    pub fn from_bytes(mut chunk: &[u8], num_packs: u32) -> Result<Vec<PathBuf>, decode::Error> {
+        let mut out = Vec::new();
+        for _ in 0..num_packs {
+            let null_byte_pos = chunk.find_byte(b'\0').ok_or(decode::Error::MissingNullByte)?;
+
+            let path = &chunk[..null_byte_pos];
+            let path = gix_path::try_from_byte_slice(path)
+                .map_err(|_| decode::Error::PathEncoding {
+                    path: BString::from(path),
+                })?
+                .to_owned();
+
+            if let Some(previous) = out.last() {
+                if previous >= &path {
+                    return Err(decode::Error::NotOrderedAlphabetically);
+                }
+            }
+            out.push(path);
+
+            chunk = &chunk[null_byte_pos + 1..];
+        }
+
+        if !chunk.is_empty() && !chunk.iter().all(|b| *b == 0) {
+            return Err(decode::Error::UnknownTrailerBytes);
+        }
+        // NOTE: git writes garbage into this chunk, usually extra \0 bytes, which we simply ignore. If we were strict
+        // about it we couldn't read this chunk data at all.
+        Ok(out)
+    }
+
+    /// Calculate the size on disk for our chunk with the given index paths. Note that these are expected to have been processed already
+    /// to actually be file names.
+    pub fn storage_size(paths: impl IntoIterator<Item = impl AsRef<Path>>) -> u64 {
+        let mut count = 0u64;
+        for path in paths {
+            let path = path.as_ref();
+            let ascii_path = path.to_str().expect("UTF-8 compatible paths");
+            assert!(
+                ascii_path.is_ascii(),
+                "must use ascii bytes for correct size computation"
+            );
+            count += (ascii_path.as_bytes().len() + 1/* null byte */) as u64
+        }
+
+        let needed_alignment = CHUNK_ALIGNMENT - (count % CHUNK_ALIGNMENT);
+        if needed_alignment < CHUNK_ALIGNMENT {
+            count += needed_alignment;
+        }
+        count
+    }
+
+    /// Write all `paths` in order to `out`, including padding.
+    pub fn write(
+        paths: impl IntoIterator<Item = impl AsRef<Path>>,
+        mut out: impl std::io::Write,
+    ) -> std::io::Result<()> {
+        let mut written_bytes = 0;
+        for path in paths {
+            let path = path.as_ref().to_str().expect("UTF-8 path");
+            out.write_all(path.as_bytes())?;
+            out.write_all(&[0])?;
+            written_bytes += path.as_bytes().len() as u64 + 1;
+        }
+
+        let needed_alignment = CHUNK_ALIGNMENT - (written_bytes % CHUNK_ALIGNMENT);
+        if needed_alignment < CHUNK_ALIGNMENT {
+            let padding = [0u8; CHUNK_ALIGNMENT as usize];
+            out.write_all(&padding[..needed_alignment as usize])?;
+        }
+        Ok(())
+    }
+
+    const CHUNK_ALIGNMENT: u64 = 4;
+}
+
+/// Information for the chunk with the fanout table
+pub mod fanout {
+    use std::convert::TryInto;
+
+    use crate::multi_index;
+
+    /// The size of the fanout table
+    pub const SIZE: usize = 4 * 256;
+
+    /// The id uniquely identifying the fanout table.
+    pub const ID: gix_chunk::Id = *b"OIDF";
+
+    /// Decode the fanout table contained in `chunk`, or return `None` if it didn't have the expected size.
+    pub fn from_bytes(chunk: &[u8]) -> Option<[u32; 256]> {
+        if chunk.len() != SIZE {
+            return None;
+        }
+        let mut out = [0; 256];
+        for (c, f) in chunk.chunks(4).zip(out.iter_mut()) {
+            *f = u32::from_be_bytes(c.try_into().unwrap());
+        }
+        out.into()
+    }
+
+    /// Write the fanout for the given entries, which must be sorted by oid
+    pub(crate) fn write(
+        sorted_entries: &[multi_index::write::Entry],
+        mut out: impl std::io::Write,
+    ) -> std::io::Result<()> {
+        let fanout = crate::index::write::encode::fanout(sorted_entries.iter().map(|e| e.id.first_byte()));
+
+        for value in fanout.iter() {
+            out.write_all(&value.to_be_bytes())?;
+        }
+        Ok(())
+    }
+}
+
+/// Information about the oid lookup table.
+pub mod lookup {
+    use std::ops::Range;
+
+    use crate::multi_index;
+
+    /// The id uniquely identifying the oid lookup table.
+    pub const ID: gix_chunk::Id = *b"OIDL";
+
+    /// Return the amount of bytes needed to store the data on disk for the given amount of `entries`
+    pub fn storage_size(entries: usize, object_hash: gix_hash::Kind) -> u64 {
+        (entries * object_hash.len_in_bytes()) as u64
+    }
+
+    pub(crate) fn write(
+        sorted_entries: &[multi_index::write::Entry],
+        mut out: impl std::io::Write,
+    ) -> std::io::Result<()> {
+        for entry in sorted_entries {
+            out.write_all(entry.id.as_slice())?;
+        }
+        Ok(())
+    }
+
+    /// Return true if the size of the `offset` range seems to match for a `hash` of the given kind and the amount of objects.
+    pub fn is_valid(offset: &Range<usize>, hash: gix_hash::Kind, num_objects: u32) -> bool {
+        (offset.end - offset.start) / hash.len_in_bytes() == num_objects as usize
+    }
+}
+
+/// Information about the offsets table.
+pub mod offsets {
+    use std::{convert::TryInto, ops::Range};
+
+    use crate::multi_index;
+
+    /// The id uniquely identifying the offsets table.
+    pub const ID: gix_chunk::Id = *b"OOFF";
+
+    /// Return the amount of bytes needed to store the offset data for `entries`.
+    pub fn storage_size(entries: usize) -> u64 {
+        (entries * (4 /*pack-id*/ + 4/* pack offset */)) as u64
+    }
+
+    pub(crate) fn write(
+        sorted_entries: &[multi_index::write::Entry],
+        large_offsets_needed: bool,
+        mut out: impl std::io::Write,
+    ) -> std::io::Result<()> {
+        use crate::index::write::encode::{HIGH_BIT, LARGE_OFFSET_THRESHOLD};
+        let mut num_large_offsets = 0u32;
+
+        for entry in sorted_entries {
+            out.write_all(&entry.pack_index.to_be_bytes())?;
+
+            let offset: u32 = if large_offsets_needed {
+                if entry.pack_offset > LARGE_OFFSET_THRESHOLD {
+                    let res = num_large_offsets | HIGH_BIT;
+                    num_large_offsets += 1;
+                    res
+                } else {
+                    entry.pack_offset as u32
+                }
+            } else {
+                entry
+                    .pack_offset
+                    .try_into()
+                    .expect("without large offsets, pack-offset fits u32")
+            };
+            out.write_all(&offset.to_be_bytes())?;
+        }
+        Ok(())
+    }
+
+    /// Returns true if the `offset` range seems to match the size required for `num_objects`.
+    pub fn is_valid(offset: &Range<usize>, num_objects: u32) -> bool {
+        let entry_size = 4 /* pack-id */ + 4 /* pack-offset */;
+        ((offset.end - offset.start) / num_objects as usize) == entry_size
+    }
+}
+
+/// Information about the large offsets table.
+pub mod large_offsets {
+    use std::ops::Range;
+
+    use crate::{index::write::encode::LARGE_OFFSET_THRESHOLD, multi_index};
+
+    /// The id uniquely identifying the large offsets table (with 64 bit offsets)
+    pub const ID: gix_chunk::Id = *b"LOFF";
+
+    /// Returns Some(num-large-offset) if there are offsets larger than u32.
+    pub(crate) fn num_large_offsets(entries: &[multi_index::write::Entry]) -> Option<usize> {
+        let mut num_large_offsets = 0;
+        let mut needs_large_offsets = false;
+        for entry in entries {
+            if entry.pack_offset > LARGE_OFFSET_THRESHOLD {
+                num_large_offsets += 1;
+            }
+            if entry.pack_offset > u32::MAX as crate::data::Offset {
+                needs_large_offsets = true;
+            }
+        }
+
+        needs_large_offsets.then_some(num_large_offsets)
+    }
+    /// Returns true if the `offsets` range seems to be properly aligned for the data we expect.
+    pub fn is_valid(offset: &Range<usize>) -> bool {
+        (offset.end - offset.start) % 8 == 0
+    }
+
+    pub(crate) fn write(
+        sorted_entries: &[multi_index::write::Entry],
+        mut num_large_offsets: usize,
+        mut out: impl std::io::Write,
+    ) -> std::io::Result<()> {
+        for offset in sorted_entries
+            .iter()
+            .filter_map(|e| (e.pack_offset > LARGE_OFFSET_THRESHOLD).then_some(e.pack_offset))
+        {
+            out.write_all(&offset.to_be_bytes())?;
+            num_large_offsets = num_large_offsets
+                .checked_sub(1)
+                .expect("BUG: wrote more offsets than previously found");
+        }
+        assert_eq!(num_large_offsets, 0, "BUG: wrote fewer offsets than initially counted");
+        Ok(())
+    }
+
+    /// Return the amount of bytes needed to store the given amount of `large_offsets`
+    pub(crate) fn storage_size(large_offsets: usize) -> u64 {
+        8 * large_offsets as u64
+    }
+}
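
The index_names chunk in the diff above stores each pack-index file name followed by a NUL byte, then pads the chunk with zero bytes up to the next 4-byte boundary. The following standalone sketch mirrors the size arithmetic of index_names::storage_size; the pnam_size helper and the sample file names are invented for illustration and are not part of the vendored crate.

const CHUNK_ALIGNMENT: u64 = 4;

// Size of a PNAM chunk: every name plus its trailing NUL, padded to 4-byte alignment.
fn pnam_size(names: &[&str]) -> u64 {
    let mut count: u64 = names.iter().map(|n| n.len() as u64 + 1 /* NUL */).sum();
    let needed_alignment = CHUNK_ALIGNMENT - (count % CHUNK_ALIGNMENT);
    if needed_alignment < CHUNK_ALIGNMENT {
        count += needed_alignment;
    }
    count
}

fn main() {
    // "pack-a.idx" is 10 bytes + 1 NUL = 11, padded to 12.
    assert_eq!(pnam_size(&["pack-a.idx"]), 12);
    // Two names: 22 bytes including NULs, padded to 24.
    assert_eq!(pnam_size(&["pack-a.idx", "pack-b.idx"]), 24);
}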
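
fanout::write delegates to crate::index::write::encode::fanout, which is assumed here to produce the usual cumulative 256-entry table: slot N holds the number of objects whose first oid byte is <= N, so a reader only needs to binary-search the slice belonging to that byte. A minimal sketch of that construction, with an invented function name and sample bytes:

// Build a cumulative fanout table from the first byte of each (sorted) object id.
fn fanout_from_first_bytes(first_bytes: impl Iterator<Item = u8>) -> [u32; 256] {
    let mut fanout = [0u32; 256];
    for byte in first_bytes {
        fanout[byte as usize] += 1; // count objects per first byte
    }
    let mut acc = 0u32;
    for slot in fanout.iter_mut() {
        acc += *slot; // turn the counts into a running total
        *slot = acc;
    }
    fanout
}

fn main() {
    let fanout = fanout_from_first_bytes(vec![0x00u8, 0x00, 0x01, 0xff].into_iter());
    assert_eq!(fanout[0x00], 2);
    assert_eq!(fanout[0x01], 3);
    assert_eq!(fanout[0xff], 4);
}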
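
offsets::write stores one 4-byte word per object: the pack offset itself, or, when large offsets are in play and the offset exceeds the threshold, a running index into the LOFF chunk with the most significant bit set. The sketch below isolates that branch; HIGH_BIT and LARGE_OFFSET_THRESHOLD are stand-ins for the crate-internal constants, and their values here are assumptions rather than taken from the source.

use std::convert::TryFrom;

// Assumed stand-ins for crate::index::write::encode::{HIGH_BIT, LARGE_OFFSET_THRESHOLD}.
const HIGH_BIT: u32 = 0x8000_0000;
const LARGE_OFFSET_THRESHOLD: u64 = 0x7fff_ffff;

// Returns the 4-byte word for the OOFF chunk and, if needed, the 8-byte value
// that would be appended to the LOFF chunk for this entry.
fn encode_offset(pack_offset: u64, num_large_offsets: u32, large_offsets_needed: bool) -> (u32, Option<u64>) {
    if large_offsets_needed && pack_offset > LARGE_OFFSET_THRESHOLD {
        // Store the running index into the large-offset table, flagged by the high bit.
        (num_large_offsets | HIGH_BIT, Some(pack_offset))
    } else {
        (
            u32::try_from(pack_offset).expect("without large offsets, pack-offset fits u32"),
            None,
        )
    }
}

fn main() {
    // Small offsets are stored directly.
    assert_eq!(encode_offset(4096, 0, false), (4096, None));
    // A 5 GiB offset becomes index 0 into the LOFF chunk, with the high bit set.
    let five_gib = 5u64 * 1024 * 1024 * 1024;
    assert_eq!(encode_offset(five_gib, 0, true), (HIGH_BIT, Some(five_gib)));
}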