diff options
Diffstat (limited to 'vendor/gix-chunk/src/file')
-rw-r--r-- | vendor/gix-chunk/src/file/decode.rs | 105 | ||||
-rw-r--r-- | vendor/gix-chunk/src/file/index.rs | 107 | ||||
-rw-r--r-- | vendor/gix-chunk/src/file/mod.rs | 20 | ||||
-rw-r--r-- | vendor/gix-chunk/src/file/write.rs | 136 |
4 files changed, 368 insertions, 0 deletions
diff --git a/vendor/gix-chunk/src/file/decode.rs b/vendor/gix-chunk/src/file/decode.rs new file mode 100644 index 000000000..1543ac9be --- /dev/null +++ b/vendor/gix-chunk/src/file/decode.rs @@ -0,0 +1,105 @@
use std::{convert::TryInto, ops::Range};

mod error {
    /// The error returned by [`crate::file::Index::from_bytes()`].
    #[derive(Debug, thiserror::Error)]
    #[allow(missing_docs)]
    pub enum Error {
        #[error("Sentinel value encountered while still processing chunks.")]
        EarlySentinelValue,
        #[error("Sentinel value wasn't found, saw {:?}", std::str::from_utf8(actual.as_ref()).unwrap_or("<non-ascii>"))]
        MissingSentinelValue { actual: crate::Id },
        #[error("The chunk offset {offset} went past the file of length {file_length} - was it truncated?")]
        ChunkSizeOutOfBounds {
            offset: crate::file::Offset,
            file_length: u64,
        },
        #[error("All chunk offsets must be incrementing.")]
        NonIncrementalChunkOffsets,
        #[error("The chunk of kind {:?} was encountered more than once", std::str::from_utf8(kind.as_ref()).unwrap_or("<non-ascii>"))]
        DuplicateChunk { kind: crate::Id },
        #[error("The table of contents would be {expected} bytes, but got only {actual}")]
        TocTooSmall { actual: usize, expected: usize },
        #[error("Empty chunk indices are not allowed as the point of chunked files is to have chunks.")]
        Empty,
    }
}
pub use error::Error;

use crate::{file, file::index};

impl file::Index {
    /// Provided a mapped file at the beginning via `data`, starting at `toc_offset` decode all chunk information to return
    /// an index with `num_chunks` chunks.
    pub fn from_bytes(data: &[u8], toc_offset: usize, num_chunks: u32) -> Result<Self, Error> {
        if num_chunks == 0 {
            return Err(Error::Empty);
        }

        let data_len: u64 = data.len() as u64;
        let mut chunks = Vec::with_capacity(num_chunks as usize);
        let mut toc_entry = &data[toc_offset..];
        // The table of contents must hold `num_chunks` entries plus one trailing sentinel entry.
        let expected_min_size = (num_chunks as usize + 1) * file::Index::ENTRY_SIZE;
        if toc_entry.len() < expected_min_size {
            return Err(Error::TocTooSmall {
                expected: expected_min_size,
                actual: toc_entry.len(),
            });
        }

        for _ in 0..num_chunks {
            // Each TOC entry is a 4-byte chunk id followed by a big-endian u64 offset.
            let (kind, offset) = toc_entry.split_at(4);
            let kind = to_kind(kind);
            if kind == crate::SENTINEL {
                return Err(Error::EarlySentinelValue);
            }
            if chunks.iter().any(|c: &index::Entry| c.kind == kind) {
                return Err(Error::DuplicateChunk { kind });
            }

            let offset = be_u64(offset);
            if offset > data_len {
                return Err(Error::ChunkSizeOutOfBounds {
                    offset,
                    file_length: data_len,
                });
            }
            toc_entry = &toc_entry[file::Index::ENTRY_SIZE..];
            // A chunk ends where the next entry's chunk begins; for the last chunk
            // that is the offset stored in the sentinel entry.
            let next_offset = be_u64(&toc_entry[4..]);
            if next_offset > data_len {
                return Err(Error::ChunkSizeOutOfBounds {
                    offset: next_offset,
                    file_length: data_len,
                });
            }
            if next_offset <= offset {
                return Err(Error::NonIncrementalChunkOffsets);
            }
            chunks.push(index::Entry {
                kind,
                offset: Range {
                    start: offset,
                    end: next_offset,
                },
            })
        }

        // After consuming all chunk entries, the sentinel id must terminate the TOC.
        let sentinel = to_kind(&toc_entry[..4]);
        if sentinel != crate::SENTINEL {
            return Err(Error::MissingSentinelValue { actual: sentinel });
        }

        Ok(file::Index {
            chunks,
            will_write: false,
        })
    }
}

/// Interpret the first four bytes of `data` as a chunk identifier.
fn to_kind(data: &[u8]) -> crate::Id {
    data[..4].try_into().unwrap()
}

/// Read a big-endian `u64` from the first eight bytes of `data`.
fn be_u64(data: &[u8]) -> u64 {
    u64::from_be_bytes(data[..8].try_into().unwrap())
}
diff --git a/vendor/gix-chunk/src/file/index.rs b/vendor/gix-chunk/src/file/index.rs new file mode 100644 index 000000000..5b59f6767 --- /dev/null +++ b/vendor/gix-chunk/src/file/index.rs @@ -0,0 +1,107 @@
use std::ops::Range;

use
crate::file::Index;

///
pub mod offset_by_kind {
    use std::fmt::{Display, Formatter};

    /// The error returned by [Index::offset_by_id()][super::Index::offset_by_id()].
    #[allow(missing_docs)]
    #[derive(Debug)]
    pub struct Error {
        pub kind: crate::Id,
    }

    impl Display for Error {
        fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
            write!(
                f,
                "Chunk named {:?} was not found in chunk file index",
                std::str::from_utf8(&self.kind).unwrap_or("<non-ascii>")
            )
        }
    }

    impl std::error::Error for Error {}
}

///
pub mod data_by_kind {
    /// The error returned by [Index::data_by_id()][super::Index::data_by_id()].
    #[derive(Debug, thiserror::Error)]
    #[allow(missing_docs)]
    pub enum Error {
        #[error("The chunk wasn't found in the file index")]
        NotFound(#[from] super::offset_by_kind::Error),
        #[error("The offsets into the file couldn't be represented by usize")]
        FileTooLarge,
    }
}

/// An entry of a chunk file index
pub struct Entry {
    /// The kind of the chunk file
    pub kind: crate::Id,
    /// The offset, relative to the beginning of the file, at which to find the chunk and its end.
    pub offset: Range<crate::file::Offset>,
}

impl Index {
    /// The size of a single index entry in bytes
    pub const ENTRY_SIZE: usize = std::mem::size_of::<u32>() + std::mem::size_of::<u64>();
    /// The smallest possible size of an index, consisting only of the sentinel value pointing past itself.
    pub const EMPTY_SIZE: usize = Index::ENTRY_SIZE;

    /// Returns the size in bytes an index with `num_entries` would take.
    pub const fn size_for_entries(num_entries: usize) -> usize {
        Self::ENTRY_SIZE * (num_entries + 1/*sentinel*/)
    }

    /// Find a chunk of `kind` and return its offset into the data if found
    pub fn offset_by_id(&self, kind: crate::Id) -> Result<Range<crate::file::Offset>, offset_by_kind::Error> {
        self.chunks
            .iter()
            .find_map(|c| (c.kind == kind).then(|| c.offset.clone()))
            .ok_or(offset_by_kind::Error { kind })
    }

    /// Find a chunk of `kind` and return its offset as usize range into the data if found.
    ///
    /// # Panics
    ///
    /// - if the usize conversion fails, which isn't expected as memory maps can't be created if files are too large
    ///   to require such offsets.
    pub fn usize_offset_by_id(&self, kind: crate::Id) -> Result<Range<usize>, offset_by_kind::Error> {
        self.chunks
            .iter()
            .find_map(|c| (c.kind == kind).then(|| crate::range::into_usize_or_panic(c.offset.clone())))
            .ok_or(offset_by_kind::Error { kind })
    }

    /// Like [`Index::usize_offset_by_id()`] but with support for validation and transformation using a function.
    pub fn validated_usize_offset_by_id<T>(
        &self,
        kind: crate::Id,
        validate: impl FnOnce(Range<usize>) -> T,
    ) -> Result<T, offset_by_kind::Error> {
        self.chunks
            .iter()
            .find_map(|c| (c.kind == kind).then(|| crate::range::into_usize_or_panic(c.offset.clone())))
            .map(validate)
            .ok_or(offset_by_kind::Error { kind })
    }

    /// Find a chunk of `kind` and return its data slice based on its offset.
    pub fn data_by_id<'a>(&self, data: &'a [u8], kind: crate::Id) -> Result<&'a [u8], data_by_kind::Error> {
        let offset = self.offset_by_id(kind)?;
        Ok(&data[crate::range::into_usize(offset).ok_or(data_by_kind::Error::FileTooLarge)?])
    }

    /// Return the end offset of the last chunk, which is the highest offset as well.
    /// It's definitely available as we have one or more chunks.
    pub fn highest_offset(&self) -> crate::file::Offset {
        self.chunks.last().expect("at least one chunk").offset.end
    }
}
diff --git a/vendor/gix-chunk/src/file/mod.rs b/vendor/gix-chunk/src/file/mod.rs new file mode 100644 index 000000000..4ddd94999 --- /dev/null +++ b/vendor/gix-chunk/src/file/mod.rs @@ -0,0 +1,20 @@
///
pub mod decode;
///
pub mod index;

///
pub mod write;

/// The offset to a chunk as seen relative to the beginning of the file containing it.
pub type Offset = u64;

/// A chunk file providing a table into the parent data.
pub struct Index {
    /// If true, we use `chunks` in a way that facilitates writing them.
    will_write: bool,
    /// Validated chunks as defined by their index entries.
    ///
    /// Note that this list cannot be empty.
    chunks: Vec<index::Entry>,
}
diff --git a/vendor/gix-chunk/src/file/write.rs b/vendor/gix-chunk/src/file/write.rs new file mode 100644 index 000000000..8189140fe --- /dev/null +++ b/vendor/gix-chunk/src/file/write.rs @@ -0,0 +1,136 @@
use crate::file::{index::Entry, Index};

mod write_chunk {
    use std::collections::VecDeque;

    use crate::file::index;

    /// A [`Write`][std::io::Write] implementation that validates chunk sizes while allowing the user to know
    /// which chunk is to be written next.
    pub struct Chunk<W> {
        // Remaining chunks, in the order they must be written.
        chunks_to_write: VecDeque<index::Entry>,
        inner: W,
        // The chunk currently being written, if any.
        next_chunk: Option<index::Entry>,
        // Bytes written to `inner` since the last call to `next_chunk()`.
        written_bytes: usize,
    }

    impl<W> Chunk<W>
    where
        W: std::io::Write,
    {
        pub(crate) fn new(out: W, chunks: VecDeque<index::Entry>) -> Chunk<W>
        where
            W: std::io::Write,
        {
            Chunk {
                chunks_to_write: chunks,
                inner: out,
                next_chunk: None,
                written_bytes: 0,
            }
        }
    }

    impl<W> std::io::Write for Chunk<W>
    where
        W: std::io::Write,
    {
        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
            // Count written bytes so the size of each chunk can be validated later.
            let written = self.inner.write(buf)?;
            self.written_bytes += written;
            Ok(written)
        }

        fn flush(&mut self) -> std::io::Result<()> {
            self.inner.flush()
        }
    }

    impl<W> Chunk<W> {
        /// Return the inner writer - should only be called once there is no more chunk to write.
        pub fn into_inner(self) -> W {
            self.inner
        }
        /// Return the next chunk-id to write, if there is one.
        pub fn next_chunk(&mut self) -> Option<crate::Id> {
            // Validate that the previous chunk was written with exactly its planned size.
            if let Some(entry) = self.next_chunk.take() {
                assert_eq!(
                    entry.offset.end,
                    self.written_bytes as u64,
                    "BUG: expected to write {} bytes, but only wrote {} for chunk {:?}",
                    entry.offset.end,
                    self.written_bytes,
                    std::str::from_utf8(&entry.kind)
                )
            }
            self.written_bytes = 0;
            self.next_chunk = self.chunks_to_write.pop_front();
            self.next_chunk.as_ref().map(|e| e.kind)
        }
    }
}
pub use write_chunk::Chunk;

/// Writing
impl Index {
    /// Create a new index whose sole purpose is to be receiving chunks using [`plan_chunk()`][Index::plan_chunk()] and to be written to
    /// an output using [`into_write()`][Index::into_write()]
    pub fn for_writing() -> Self {
        Index {
            will_write: true,
            chunks: Vec::new(),
        }
    }
    /// Plan to write a new chunk as part of the index when [`into_write()`][Index::into_write()] is called.
+ pub fn plan_chunk(&mut self, chunk: crate::Id, exact_size_on_disk: u64) { + assert!(self.will_write, "BUG: create the index with `for_writing()`"); + assert!( + !self.chunks.iter().any(|e| e.kind == chunk), + "BUG: must not add chunk of same kind twice: {:?}", + std::str::from_utf8(&chunk) + ); + self.chunks.push(Entry { + kind: chunk, + offset: 0..exact_size_on_disk, + }) + } + + /// Return the total size of all planned chunks thus far. + pub fn planned_storage_size(&self) -> u64 { + assert!(self.will_write, "BUG: create the index with `for_writing()`"); + self.chunks.iter().map(|e| e.offset.end).sum() + } + + /// Return the amount of chunks we currently know. + pub fn num_chunks(&self) -> usize { + self.chunks.len() + } + + /// After [planning all chunks][Index::plan_chunk()] call this method with the destination to write the chunks to. + /// Use the [Chunk] writer to write each chunk in order. + /// `current_offset` is the byte position at which `out` will continue writing. + pub fn into_write<W>(self, mut out: W, current_offset: usize) -> std::io::Result<Chunk<W>> + where + W: std::io::Write, + { + assert!( + self.will_write, + "BUG: create the index with `for_writing()`, cannot write decoded indices" + ); + // First chunk starts past the table of contents + let mut current_offset = (current_offset + Self::size_for_entries(self.num_chunks())) as u64; + + for entry in &self.chunks { + out.write_all(&entry.kind)?; + out.write_all(¤t_offset.to_be_bytes())?; + + current_offset += entry.offset.end; + } + + // sentinel to mark end of chunks + out.write_all(&0u32.to_be_bytes())?; + out.write_all(¤t_offset.to_be_bytes())?; + + Ok(Chunk::new(out, self.chunks.into())) + } +} |