use filetime::FileTime;

use crate::{entry, extension, Entry, State, Version};

mod entries;
/// Parsing of the header of an index file.
pub mod header;

mod error {
    use crate::{decode, extension};

    /// The error returned by [`State::from_bytes()`][crate::State::from_bytes()].
    #[derive(Debug, thiserror::Error)]
    #[allow(missing_docs)]
    pub enum Error {
        #[error(transparent)]
        Header(#[from] decode::header::Error),
        #[error("Could not parse entry at index {index}")]
        Entry { index: u32 },
        #[error("Mandatory extension wasn't implemented or malformed.")]
        Extension(#[from] extension::decode::Error),
        #[error("Index trailer should have been {expected} bytes long, but was {actual}")]
        UnexpectedTrailerLength { expected: usize, actual: usize },
        #[error("Shared index checksum was {actual_checksum} but should have been {expected_checksum}")]
        ChecksumMismatch {
            actual_checksum: gix_hash::ObjectId,
            expected_checksum: gix_hash::ObjectId,
        },
    }
}
pub use error::Error;
use gix_features::parallel::InOrderIter;

use crate::util::read_u32;

/// Options to define how to decode an index state [from bytes][State::from_bytes()].
#[derive(Default, Clone, Copy)]
pub struct Options {
    /// If `Some(_)`, we are allowed to use more than one thread. If `Some(N)`, use no more than `N` threads.
    /// If `Some(0)` or `None`, use as many threads as there are logical cores.
    ///
    /// This applies to loading extensions in parallel to entries if the common `EOIE` extension is available.
    /// It also allows the use of multiple threads for loading entries if the `IEOT` extension is present.
    pub thread_limit: Option<usize>,
    /// The minimum size in bytes for extensions to be loaded in their own thread, assuming enough threads are available.
    /// If set to 0, for example, extensions will always be read in their own thread if enough threads are available.
    pub min_extension_block_in_bytes_for_threading: usize,
    /// Set the expected hash of this index if it is read as part of a `link` extension.
    ///
    /// We will abort reading this file if it doesn't match.
    pub expected_checksum: Option<gix_hash::ObjectId>,
}

impl State {
    /// Decode an index state from `data` and store `timestamp` in the resulting instance for pass-through,
    /// assuming `object_hash` to be used throughout the file.
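    ///
    /// A minimal usage sketch follows; the index location and the hash kind are assumptions
    /// for illustration, not requirements of this API.
    ///
    /// ```no_run
    /// use gix_index::{decode::Options, State};
    ///
    /// let path = std::path::Path::new(".git/index"); // hypothetical repository layout
    /// let data = std::fs::read(path)?;
    /// let timestamp = filetime::FileTime::from_last_modification_time(&path.metadata()?);
    /// let (state, checksum) = State::from_bytes(&data, timestamp, gix_hash::Kind::Sha1, Options::default())?;
    /// # let _ = (state, checksum);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```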
    pub fn from_bytes(
        data: &[u8],
        timestamp: FileTime,
        object_hash: gix_hash::Kind,
        Options {
            thread_limit,
            min_extension_block_in_bytes_for_threading,
            expected_checksum,
        }: Options,
    ) -> Result<(Self, gix_hash::ObjectId), Error> {
        let (version, num_entries, post_header_data) = header::decode(data, object_hash)?;
        // The EOIE extension, if present, tells us where the entries end and the extensions begin.
        let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash);

        let mut num_threads = gix_features::parallel::num_threads(thread_limit);
        let path_backing_buffer_size = entries::estimate_path_storage_requirements_in_bytes(
            num_entries,
            data.len(),
            start_of_extensions,
            object_hash,
            version,
        );

        let (entries, ext, data) = match start_of_extensions {
            // With the extension offset known and threads to spare, load extensions and entries in parallel.
            Some(offset) if num_threads > 1 => {
                let extensions_data = &data[offset..];
                // The IEOT extension, if present, provides chunk offsets so entry decoding itself can be parallelized.
                let index_offsets_table = extension::index_entry_offset_table::find(extensions_data, object_hash);
                let (entries_res, ext_res) = gix_features::parallel::threads(|scope| {
                    // Note that the block passed to `then()` is evaluated eagerly, so one thread is
                    // set aside for extension loading even if the closure is never spawned.
                    let extension_loading =
                        (extensions_data.len() > min_extension_block_in_bytes_for_threading).then({
                            num_threads -= 1;
                            || {
                                gix_features::parallel::build_thread()
                                    .name("gix-index.from_bytes.load-extensions".into())
                                    .spawn_scoped(scope, || extension::decode::all(extensions_data, object_hash))
                                    .expect("valid name")
                            }
                        });
                    let entries_res = match index_offsets_table {
                        Some(entry_offsets) => {
                            let chunk_size = (entry_offsets.len() as f32 / num_threads as f32).ceil() as usize;
                            let num_chunks = entry_offsets.chunks(chunk_size).count();
                            let mut threads = Vec::with_capacity(num_chunks);
                            for (id, chunks) in entry_offsets.chunks(chunk_size).enumerate() {
                                let chunks = chunks.to_vec();
                                threads.push(
                                    gix_features::parallel::build_thread()
                                        .name(format!("gix-index.from_bytes.read-entries.{id}"))
                                        .spawn_scoped(scope, move || {
                                            // Decode this thread's share of entry chunks into its own buffers.
                                            let num_entries_for_chunks =
                                                chunks.iter().map(|c| c.num_entries).sum::<u32>() as usize;
                                            let mut entries = Vec::with_capacity(num_entries_for_chunks);
                                            let path_backing_buffer_size_for_chunks =
                                                entries::estimate_path_storage_requirements_in_bytes(
                                                    num_entries_for_chunks as u32,
                                                    data.len() / num_chunks,
                                                    start_of_extensions.map(|ofs| ofs / num_chunks),
                                                    object_hash,
                                                    version,
                                                );
                                            let mut path_backing =
                                                Vec::with_capacity(path_backing_buffer_size_for_chunks);
                                            let mut is_sparse = false;
                                            for offset in chunks {
                                                let (
                                                    entries::Outcome {
                                                        is_sparse: chunk_is_sparse,
                                                    },
                                                    _data,
                                                ) = entries::chunk(
                                                    &data[offset.from_beginning_of_file as usize..],
                                                    &mut entries,
                                                    &mut path_backing,
                                                    offset.num_entries,
                                                    object_hash,
                                                    version,
                                                )?;
                                                is_sparse |= chunk_is_sparse;
                                            }
                                            Ok::<_, Error>((
                                                id,
                                                EntriesOutcome {
                                                    entries,
                                                    path_backing,
                                                    is_sparse,
                                                },
                                            ))
                                        })
                                        .expect("valid name"),
                                );
                            }
                            let mut results =
                                InOrderIter::from(threads.into_iter().map(|thread| thread.join().unwrap()));
                            let mut acc = results.next().expect("have at least two results, one per thread");
                            // We explicitly don't adjust the reserve in `acc` and rather allow for more copying
                            // to happen as vectors grow, to keep the peak memory size low.
                            // NOTE: one day we might use a memory pool for paths, encoding the block of memory
                            // in some bytes of the path offset. That way there is more indirection/slower access
                            // to the path, but it would save time here.
                            // As it stands, `git` is definitely more efficient at this and probably uses less memory too.
                            // Maybe benchmarks can tell if that is noticeable later at 200/400GB/s memory bandwidth,
                            // or maybe just 100GB/s on a single core.
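                            // Merge the per-chunk results in chunk order: append each chunk's path backing
                            // to the accumulator and shift the appended entries' path ranges by the previous
                            // backing length, so they keep pointing at their paths in the combined buffer.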
                            while let (Ok(lhs), Some(res)) = (acc.as_mut(), results.next()) {
                                match res {
                                    Ok(rhs) => {
                                        lhs.is_sparse |= rhs.is_sparse;
                                        let ofs = lhs.path_backing.len();
                                        lhs.path_backing.extend(rhs.path_backing);
                                        lhs.entries.extend(rhs.entries.into_iter().map(|mut e| {
                                            e.path.start += ofs;
                                            e.path.end += ofs;
                                            e
                                        }));
                                    }
                                    Err(err) => {
                                        acc = Err(err);
                                    }
                                }
                            }
                            // The remaining bytes are the trailing checksum.
                            acc.map(|acc| (acc, &data[data.len() - object_hash.len_in_bytes()..]))
                        }
                        None => entries(
                            post_header_data,
                            path_backing_buffer_size,
                            num_entries,
                            object_hash,
                            version,
                        ),
                    };
                    let ext_res = extension_loading.map_or_else(
                        || extension::decode::all(extensions_data, object_hash),
                        |thread| thread.join().unwrap(),
                    );
                    (entries_res, ext_res)
                });
                let (ext, data) = ext_res?;
                (entries_res?.0, ext, data)
            }
            None | Some(_) => {
                // Single-threaded fallback: read all entries first, then all extensions.
                let (entries, data) = entries(
                    post_header_data,
                    path_backing_buffer_size,
                    num_entries,
                    object_hash,
                    version,
                )?;
                let (ext, data) = extension::decode::all(data, object_hash)?;
                (entries, ext, data)
            }
        };

        if data.len() != object_hash.len_in_bytes() {
            return Err(Error::UnexpectedTrailerLength {
                expected: object_hash.len_in_bytes(),
                actual: data.len(),
            });
        }

        let checksum = gix_hash::ObjectId::from(data);
        if let Some(expected_checksum) = expected_checksum {
            if checksum != expected_checksum {
                return Err(Error::ChecksumMismatch {
                    actual_checksum: checksum,
                    expected_checksum,
                });
            }
        }
        let EntriesOutcome {
            entries,
            path_backing,
            mut is_sparse,
        } = entries;
        let extension::decode::Outcome {
            tree,
            link,
            resolve_undo,
            untracked,
            fs_monitor,
            is_sparse: is_sparse_from_ext, // a marker is needed in case there are no directories
        } = ext;
        is_sparse |= is_sparse_from_ext;

        Ok((
            State {
                object_hash,
                timestamp,
                version,
                entries,
                path_backing,
                is_sparse,
                tree,
                link,
                resolve_undo,
                untracked,
                fs_monitor,
            },
            checksum,
        ))
    }
}

struct EntriesOutcome {
    pub entries: Vec<Entry>,
    pub path_backing: Vec<u8>,
    pub is_sparse: bool,
}

fn entries(
    post_header_data: &[u8],
    path_backing_buffer_size: usize,
    num_entries: u32,
    object_hash: gix_hash::Kind,
    version: Version,
) -> Result<(EntriesOutcome, &[u8]), Error> {
    let mut entries = Vec::with_capacity(num_entries as usize);
    let mut path_backing = Vec::with_capacity(path_backing_buffer_size);
    entries::chunk(
        post_header_data,
        &mut entries,
        &mut path_backing,
        num_entries,
        object_hash,
        version,
    )
    .map(|(entries::Outcome { is_sparse }, data): (entries::Outcome, &[u8])| {
        (
            EntriesOutcome {
                entries,
                path_backing,
                is_sparse,
            },
            data,
        )
    })
}

pub(crate) fn stat(data: &[u8]) -> Option<(entry::Stat, &[u8])> {
    // All stat fields are stored as consecutive u32s in on-disk order.
    let (ctime_secs, data) = read_u32(data)?;
    let (ctime_nsecs, data) = read_u32(data)?;
    let (mtime_secs, data) = read_u32(data)?;
    let (mtime_nsecs, data) = read_u32(data)?;
    let (dev, data) = read_u32(data)?;
    let (ino, data) = read_u32(data)?;
    let (uid, data) = read_u32(data)?;
    let (gid, data) = read_u32(data)?;
    let (size, data) = read_u32(data)?;
    Some((
        entry::Stat {
            mtime: entry::stat::Time {
                secs: mtime_secs,
                nsecs: mtime_nsecs,
            },
            ctime: entry::stat::Time {
                secs: ctime_secs,
                nsecs: ctime_nsecs,
            },
            dev,
            ino,
            uid,
            gid,
            size,
        },
        data,
    ))
}
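
#[cfg(test)]
mod tests {
    use super::stat;

    // A minimal sketch of a round-trip check for `stat()`, assuming `read_u32` consumes
    // big-endian values as the index format prescribes; the sample values are made up
    // purely for illustration.
    #[test]
    fn stat_decodes_fields_in_on_disk_order() {
        let mut data = Vec::new();
        for value in 1u32..=9 {
            data.extend_from_slice(&value.to_be_bytes());
        }
        data.extend_from_slice(b"rest"); // unconsumed bytes must be handed back

        let (parsed, rest) = stat(&data).expect("enough bytes for all nine fields");
        assert_eq!(parsed.ctime.secs, 1);
        assert_eq!(parsed.ctime.nsecs, 2);
        assert_eq!(parsed.mtime.secs, 3);
        assert_eq!(parsed.mtime.nsecs, 4);
        assert_eq!(
            (parsed.dev, parsed.ino, parsed.uid, parsed.gid, parsed.size),
            (5, 6, 7, 8, 9)
        );
        assert_eq!(rest, b"rest");
    }
}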