From 837b550238aa671a591ccf282dddeab29cadb206 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 18 May 2024 04:49:42 +0200 Subject: Merging upstream version 1.71.1+dfsg1. Signed-off-by: Daniel Baumann --- compiler/rustc_serialize/src/leb128.rs | 16 +- compiler/rustc_serialize/src/lib.rs | 1 + compiler/rustc_serialize/src/opaque.rs | 503 +++++++++--------------------- compiler/rustc_serialize/src/serialize.rs | 79 ++++- 4 files changed, 223 insertions(+), 376 deletions(-) (limited to 'compiler/rustc_serialize/src') diff --git a/compiler/rustc_serialize/src/leb128.rs b/compiler/rustc_serialize/src/leb128.rs index 7dad9aa01..e568b9e67 100644 --- a/compiler/rustc_serialize/src/leb128.rs +++ b/compiler/rustc_serialize/src/leb128.rs @@ -1,3 +1,6 @@ +use crate::opaque::MemDecoder; +use crate::serialize::Decoder; + /// Returns the length of the longest LEB128 encoding for `T`, assuming `T` is an integer type pub const fn max_leb128_len() -> usize { // The longest LEB128 encoding for an integer uses 7 bits per byte. @@ -50,21 +53,19 @@ impl_write_unsigned_leb128!(write_usize_leb128, usize); macro_rules! impl_read_unsigned_leb128 { ($fn_name:ident, $int_ty:ty) => { #[inline] - pub fn $fn_name(slice: &[u8], position: &mut usize) -> $int_ty { + pub fn $fn_name(decoder: &mut MemDecoder<'_>) -> $int_ty { // The first iteration of this loop is unpeeled. This is a // performance win because this code is hot and integer values less // than 128 are very common, typically occurring 50-80% or more of // the time, even for u64 and u128. - let byte = slice[*position]; - *position += 1; + let byte = decoder.read_u8(); if (byte & 0x80) == 0 { return byte as $int_ty; } let mut result = (byte & 0x7F) as $int_ty; let mut shift = 7; loop { - let byte = slice[*position]; - *position += 1; + let byte = decoder.read_u8(); if (byte & 0x80) == 0 { result |= (byte as $int_ty) << shift; return result; @@ -127,14 +128,13 @@ impl_write_signed_leb128!(write_isize_leb128, isize); macro_rules! impl_read_signed_leb128 { ($fn_name:ident, $int_ty:ty) => { #[inline] - pub fn $fn_name(slice: &[u8], position: &mut usize) -> $int_ty { + pub fn $fn_name(decoder: &mut MemDecoder<'_>) -> $int_ty { let mut result = 0; let mut shift = 0; let mut byte; loop { - byte = slice[*position]; - *position += 1; + byte = decoder.read_u8(); result |= <$int_ty>::from(byte & 0x7F) << shift; shift += 7; diff --git a/compiler/rustc_serialize/src/lib.rs b/compiler/rustc_serialize/src/lib.rs index 1f8d2336c..ce8503918 100644 --- a/compiler/rustc_serialize/src/lib.rs +++ b/compiler/rustc_serialize/src/lib.rs @@ -16,6 +16,7 @@ Core encoding and decoding interfaces. #![feature(maybe_uninit_slice)] #![feature(new_uninit)] #![feature(allocator_api)] +#![feature(ptr_sub_ptr)] #![cfg_attr(test, feature(test))] #![allow(rustc::internal)] #![deny(rustc::untranslatable_diagnostic)] diff --git a/compiler/rustc_serialize/src/opaque.rs b/compiler/rustc_serialize/src/opaque.rs index 53e5c8967..0ffc537ee 100644 --- a/compiler/rustc_serialize/src/opaque.rs +++ b/compiler/rustc_serialize/src/opaque.rs @@ -1,8 +1,10 @@ -use crate::leb128::{self, largest_max_leb128_len}; +use crate::leb128; use crate::serialize::{Decodable, Decoder, Encodable, Encoder}; use std::fs::File; use std::io::{self, Write}; +use std::marker::PhantomData; use std::mem::MaybeUninit; +use std::ops::Range; use std::path::Path; use std::ptr; @@ -10,147 +12,17 @@ use std::ptr; // Encoder // ----------------------------------------------------------------------------- -pub struct MemEncoder { - pub data: Vec, -} - -impl MemEncoder { - pub fn new() -> MemEncoder { - MemEncoder { data: vec![] } - } - - #[inline] - pub fn position(&self) -> usize { - self.data.len() - } - - pub fn finish(self) -> Vec { - self.data - } -} - -macro_rules! write_leb128 { - ($enc:expr, $value:expr, $int_ty:ty, $fun:ident) => {{ - const MAX_ENCODED_LEN: usize = $crate::leb128::max_leb128_len::<$int_ty>(); - let old_len = $enc.data.len(); - - if MAX_ENCODED_LEN > $enc.data.capacity() - old_len { - $enc.data.reserve(MAX_ENCODED_LEN); - } - - // SAFETY: The above check and `reserve` ensures that there is enough - // room to write the encoded value to the vector's internal buffer. - unsafe { - let buf = &mut *($enc.data.as_mut_ptr().add(old_len) - as *mut [MaybeUninit; MAX_ENCODED_LEN]); - let encoded = leb128::$fun(buf, $value); - $enc.data.set_len(old_len + encoded.len()); - } - }}; -} - -/// A byte that [cannot occur in UTF8 sequences][utf8]. Used to mark the end of a string. -/// This way we can skip validation and still be relatively sure that deserialization -/// did not desynchronize. -/// -/// [utf8]: https://en.wikipedia.org/w/index.php?title=UTF-8&oldid=1058865525#Codepage_layout -const STR_SENTINEL: u8 = 0xC1; - -impl Encoder for MemEncoder { - #[inline] - fn emit_usize(&mut self, v: usize) { - write_leb128!(self, v, usize, write_usize_leb128) - } - - #[inline] - fn emit_u128(&mut self, v: u128) { - write_leb128!(self, v, u128, write_u128_leb128); - } - - #[inline] - fn emit_u64(&mut self, v: u64) { - write_leb128!(self, v, u64, write_u64_leb128); - } - - #[inline] - fn emit_u32(&mut self, v: u32) { - write_leb128!(self, v, u32, write_u32_leb128); - } - - #[inline] - fn emit_u16(&mut self, v: u16) { - self.data.extend_from_slice(&v.to_le_bytes()); - } - - #[inline] - fn emit_u8(&mut self, v: u8) { - self.data.push(v); - } - - #[inline] - fn emit_isize(&mut self, v: isize) { - write_leb128!(self, v, isize, write_isize_leb128) - } - - #[inline] - fn emit_i128(&mut self, v: i128) { - write_leb128!(self, v, i128, write_i128_leb128) - } - - #[inline] - fn emit_i64(&mut self, v: i64) { - write_leb128!(self, v, i64, write_i64_leb128) - } - - #[inline] - fn emit_i32(&mut self, v: i32) { - write_leb128!(self, v, i32, write_i32_leb128) - } - - #[inline] - fn emit_i16(&mut self, v: i16) { - self.data.extend_from_slice(&v.to_le_bytes()); - } - - #[inline] - fn emit_i8(&mut self, v: i8) { - self.emit_u8(v as u8); - } - - #[inline] - fn emit_bool(&mut self, v: bool) { - self.emit_u8(if v { 1 } else { 0 }); - } - - #[inline] - fn emit_char(&mut self, v: char) { - self.emit_u32(v as u32); - } - - #[inline] - fn emit_str(&mut self, v: &str) { - self.emit_usize(v.len()); - self.emit_raw_bytes(v.as_bytes()); - self.emit_u8(STR_SENTINEL); - } - - #[inline] - fn emit_raw_bytes(&mut self, s: &[u8]) { - self.data.extend_from_slice(s); - } -} - pub type FileEncodeResult = Result; +/// The size of the buffer in `FileEncoder`. +const BUF_SIZE: usize = 8192; + /// `FileEncoder` encodes data to file via fixed-size buffer. /// -/// When encoding large amounts of data to a file, using `FileEncoder` may be -/// preferred over using `MemEncoder` to encode to a `Vec`, and then writing the -/// `Vec` to file, as the latter uses as much memory as there is encoded data, -/// while the former uses the fixed amount of memory allocated to the buffer. -/// `FileEncoder` also has the advantage of not needing to reallocate as data -/// is appended to it, but the disadvantage of requiring more error handling, -/// which has some runtime overhead. +/// There used to be a `MemEncoder` type that encoded all the data into a +/// `Vec`. `FileEncoder` is better because its memory use is determined by the +/// size of the buffer, rather than the full length of the encoded data, and +/// because it doesn't need to reallocate memory along the way. pub struct FileEncoder { /// The input buffer. For adequate performance, we need more control over /// buffering than `BufWriter` offers. If `BufWriter` ever offers a raw @@ -166,26 +38,12 @@ pub struct FileEncoder { impl FileEncoder { pub fn new>(path: P) -> io::Result { - const DEFAULT_BUF_SIZE: usize = 8192; - FileEncoder::with_capacity(path, DEFAULT_BUF_SIZE) - } - - pub fn with_capacity>(path: P, capacity: usize) -> io::Result { - // Require capacity at least as large as the largest LEB128 encoding - // here, so that we don't have to check or handle this on every write. - assert!(capacity >= largest_max_leb128_len()); - - // Require capacity small enough such that some capacity checks can be - // done using guaranteed non-overflowing add rather than sub, which - // shaves an instruction off those code paths (on x86 at least). - assert!(capacity <= usize::MAX - largest_max_leb128_len()); - // Create the file for reading and writing, because some encoders do both // (e.g. the metadata encoder when -Zmeta-stats is enabled) let file = File::options().read(true).write(true).create(true).truncate(true).open(path)?; Ok(FileEncoder { - buf: Box::new_uninit_slice(capacity), + buf: Box::new_uninit_slice(BUF_SIZE), buffered: 0, flushed: 0, file, @@ -290,19 +148,11 @@ impl FileEncoder { &self.file } - #[inline] - fn capacity(&self) -> usize { - self.buf.len() - } - #[inline] fn write_one(&mut self, value: u8) { - // We ensure this during `FileEncoder` construction. - debug_assert!(self.capacity() >= 1); - let mut buffered = self.buffered; - if std::intrinsics::unlikely(buffered >= self.capacity()) { + if std::intrinsics::unlikely(buffered + 1 > BUF_SIZE) { self.flush(); buffered = 0; } @@ -318,13 +168,12 @@ impl FileEncoder { #[inline] fn write_all(&mut self, buf: &[u8]) { - let capacity = self.capacity(); let buf_len = buf.len(); - if std::intrinsics::likely(buf_len <= capacity) { + if std::intrinsics::likely(buf_len <= BUF_SIZE) { let mut buffered = self.buffered; - if std::intrinsics::unlikely(buf_len > capacity - buffered) { + if std::intrinsics::unlikely(buffered + buf_len > BUF_SIZE) { self.flush(); buffered = 0; } @@ -396,52 +245,39 @@ impl Drop for FileEncoder { } } -macro_rules! file_encoder_write_leb128 { - ($enc:expr, $value:expr, $int_ty:ty, $fun:ident) => {{ - const MAX_ENCODED_LEN: usize = $crate::leb128::max_leb128_len::<$int_ty>(); - - // We ensure this during `FileEncoder` construction. - debug_assert!($enc.capacity() >= MAX_ENCODED_LEN); +macro_rules! write_leb128 { + ($this_fn:ident, $int_ty:ty, $write_leb_fn:ident) => { + #[inline] + fn $this_fn(&mut self, v: $int_ty) { + const MAX_ENCODED_LEN: usize = $crate::leb128::max_leb128_len::<$int_ty>(); - let mut buffered = $enc.buffered; + let mut buffered = self.buffered; - // This can't overflow. See assertion in `FileEncoder::with_capacity`. - if std::intrinsics::unlikely(buffered + MAX_ENCODED_LEN > $enc.capacity()) { - $enc.flush(); - buffered = 0; - } + // This can't overflow because BUF_SIZE and MAX_ENCODED_LEN are both + // quite small. + if std::intrinsics::unlikely(buffered + MAX_ENCODED_LEN > BUF_SIZE) { + self.flush(); + buffered = 0; + } - // SAFETY: The above check and flush ensures that there is enough - // room to write the encoded value to the buffer. - let buf = unsafe { - &mut *($enc.buf.as_mut_ptr().add(buffered) as *mut [MaybeUninit; MAX_ENCODED_LEN]) - }; + // SAFETY: The above check and flush ensures that there is enough + // room to write the encoded value to the buffer. + let buf = unsafe { + &mut *(self.buf.as_mut_ptr().add(buffered) + as *mut [MaybeUninit; MAX_ENCODED_LEN]) + }; - let encoded = leb128::$fun(buf, $value); - $enc.buffered = buffered + encoded.len(); - }}; + let encoded = leb128::$write_leb_fn(buf, v); + self.buffered = buffered + encoded.len(); + } + }; } impl Encoder for FileEncoder { - #[inline] - fn emit_usize(&mut self, v: usize) { - file_encoder_write_leb128!(self, v, usize, write_usize_leb128) - } - - #[inline] - fn emit_u128(&mut self, v: u128) { - file_encoder_write_leb128!(self, v, u128, write_u128_leb128) - } - - #[inline] - fn emit_u64(&mut self, v: u64) { - file_encoder_write_leb128!(self, v, u64, write_u64_leb128) - } - - #[inline] - fn emit_u32(&mut self, v: u32) { - file_encoder_write_leb128!(self, v, u32, write_u32_leb128) - } + write_leb128!(emit_usize, usize, write_usize_leb128); + write_leb128!(emit_u128, u128, write_u128_leb128); + write_leb128!(emit_u64, u64, write_u64_leb128); + write_leb128!(emit_u32, u32, write_u32_leb128); #[inline] fn emit_u16(&mut self, v: u16) { @@ -453,53 +289,16 @@ impl Encoder for FileEncoder { self.write_one(v); } - #[inline] - fn emit_isize(&mut self, v: isize) { - file_encoder_write_leb128!(self, v, isize, write_isize_leb128) - } - - #[inline] - fn emit_i128(&mut self, v: i128) { - file_encoder_write_leb128!(self, v, i128, write_i128_leb128) - } - - #[inline] - fn emit_i64(&mut self, v: i64) { - file_encoder_write_leb128!(self, v, i64, write_i64_leb128) - } - - #[inline] - fn emit_i32(&mut self, v: i32) { - file_encoder_write_leb128!(self, v, i32, write_i32_leb128) - } + write_leb128!(emit_isize, isize, write_isize_leb128); + write_leb128!(emit_i128, i128, write_i128_leb128); + write_leb128!(emit_i64, i64, write_i64_leb128); + write_leb128!(emit_i32, i32, write_i32_leb128); #[inline] fn emit_i16(&mut self, v: i16) { self.write_all(&v.to_le_bytes()); } - #[inline] - fn emit_i8(&mut self, v: i8) { - self.emit_u8(v as u8); - } - - #[inline] - fn emit_bool(&mut self, v: bool) { - self.emit_u8(if v { 1 } else { 0 }); - } - - #[inline] - fn emit_char(&mut self, v: char) { - self.emit_u32(v as u32); - } - - #[inline] - fn emit_str(&mut self, v: &str) { - self.emit_usize(v.len()); - self.emit_raw_bytes(v.as_bytes()); - self.emit_u8(STR_SENTINEL); - } - #[inline] fn emit_raw_bytes(&mut self, s: &[u8]) { self.write_all(s); @@ -510,137 +309,155 @@ impl Encoder for FileEncoder { // Decoder // ----------------------------------------------------------------------------- +// Conceptually, `MemDecoder` wraps a `&[u8]` with a cursor into it that is always valid. +// This is implemented with three pointers, two which represent the original slice and a +// third that is our cursor. +// It is an invariant of this type that start <= current <= end. +// Additionally, the implementation of this type never modifies start and end. pub struct MemDecoder<'a> { - pub data: &'a [u8], - position: usize, + start: *const u8, + current: *const u8, + end: *const u8, + _marker: PhantomData<&'a u8>, } impl<'a> MemDecoder<'a> { #[inline] pub fn new(data: &'a [u8], position: usize) -> MemDecoder<'a> { - MemDecoder { data, position } + let Range { start, end } = data.as_ptr_range(); + MemDecoder { start, current: data[position..].as_ptr(), end, _marker: PhantomData } } #[inline] - pub fn position(&self) -> usize { - self.position + pub fn data(&self) -> &'a [u8] { + // SAFETY: This recovers the original slice, only using members we never modify. + unsafe { std::slice::from_raw_parts(self.start, self.len()) } } #[inline] - pub fn set_position(&mut self, pos: usize) { - self.position = pos + pub fn len(&self) -> usize { + // SAFETY: This recovers the length of the original slice, only using members we never modify. + unsafe { self.end.sub_ptr(self.start) } } #[inline] - pub fn advance(&mut self, bytes: usize) { - self.position += bytes; + pub fn remaining(&self) -> usize { + // SAFETY: This type guarantees current <= end. + unsafe { self.end.sub_ptr(self.current) } } -} - -macro_rules! read_leb128 { - ($dec:expr, $fun:ident) => {{ leb128::$fun($dec.data, &mut $dec.position) }}; -} -impl<'a> Decoder for MemDecoder<'a> { - #[inline] - fn read_u128(&mut self) -> u128 { - read_leb128!(self, read_u128_leb128) + #[cold] + #[inline(never)] + fn decoder_exhausted() -> ! { + panic!("MemDecoder exhausted") } #[inline] - fn read_u64(&mut self) -> u64 { - read_leb128!(self, read_u64_leb128) + fn read_array(&mut self) -> [u8; N] { + self.read_raw_bytes(N).try_into().unwrap() } + /// While we could manually expose manipulation of the decoder position, + /// all current users of that method would need to reset the position later, + /// incurring the bounds check of set_position twice. #[inline] - fn read_u32(&mut self) -> u32 { - read_leb128!(self, read_u32_leb128) - } + pub fn with_position(&mut self, pos: usize, func: F) -> T + where + F: Fn(&mut MemDecoder<'a>) -> T, + { + struct SetOnDrop<'a, 'guarded> { + decoder: &'guarded mut MemDecoder<'a>, + current: *const u8, + } + impl Drop for SetOnDrop<'_, '_> { + fn drop(&mut self) { + self.decoder.current = self.current; + } + } - #[inline] - fn read_u16(&mut self) -> u16 { - let bytes = [self.data[self.position], self.data[self.position + 1]]; - let value = u16::from_le_bytes(bytes); - self.position += 2; - value + if pos >= self.len() { + Self::decoder_exhausted(); + } + let previous = self.current; + // SAFETY: We just checked if this add is in-bounds above. + unsafe { + self.current = self.start.add(pos); + } + let guard = SetOnDrop { current: previous, decoder: self }; + func(guard.decoder) } +} - #[inline] - fn read_u8(&mut self) -> u8 { - let value = self.data[self.position]; - self.position += 1; - value - } +macro_rules! read_leb128 { + ($this_fn:ident, $int_ty:ty, $read_leb_fn:ident) => { + #[inline] + fn $this_fn(&mut self) -> $int_ty { + leb128::$read_leb_fn(self) + } + }; +} - #[inline] - fn read_usize(&mut self) -> usize { - read_leb128!(self, read_usize_leb128) - } +impl<'a> Decoder for MemDecoder<'a> { + read_leb128!(read_usize, usize, read_usize_leb128); + read_leb128!(read_u128, u128, read_u128_leb128); + read_leb128!(read_u64, u64, read_u64_leb128); + read_leb128!(read_u32, u32, read_u32_leb128); #[inline] - fn read_i128(&mut self) -> i128 { - read_leb128!(self, read_i128_leb128) + fn read_u16(&mut self) -> u16 { + u16::from_le_bytes(self.read_array()) } #[inline] - fn read_i64(&mut self) -> i64 { - read_leb128!(self, read_i64_leb128) + fn read_u8(&mut self) -> u8 { + if self.current == self.end { + Self::decoder_exhausted(); + } + // SAFETY: This type guarantees current <= end, and we just checked current == end. + unsafe { + let byte = *self.current; + self.current = self.current.add(1); + byte + } } - #[inline] - fn read_i32(&mut self) -> i32 { - read_leb128!(self, read_i32_leb128) - } + read_leb128!(read_isize, isize, read_isize_leb128); + read_leb128!(read_i128, i128, read_i128_leb128); + read_leb128!(read_i64, i64, read_i64_leb128); + read_leb128!(read_i32, i32, read_i32_leb128); #[inline] fn read_i16(&mut self) -> i16 { - let bytes = [self.data[self.position], self.data[self.position + 1]]; - let value = i16::from_le_bytes(bytes); - self.position += 2; - value - } - - #[inline] - fn read_i8(&mut self) -> i8 { - let value = self.data[self.position]; - self.position += 1; - value as i8 - } - - #[inline] - fn read_isize(&mut self) -> isize { - read_leb128!(self, read_isize_leb128) + i16::from_le_bytes(self.read_array()) } #[inline] - fn read_bool(&mut self) -> bool { - let value = self.read_u8(); - value != 0 - } - - #[inline] - fn read_char(&mut self) -> char { - let bits = self.read_u32(); - std::char::from_u32(bits).unwrap() + fn read_raw_bytes(&mut self, bytes: usize) -> &'a [u8] { + if bytes > self.remaining() { + Self::decoder_exhausted(); + } + // SAFETY: We just checked if this range is in-bounds above. + unsafe { + let slice = std::slice::from_raw_parts(self.current, bytes); + self.current = self.current.add(bytes); + slice + } } #[inline] - fn read_str(&mut self) -> &'a str { - let len = self.read_usize(); - let sentinel = self.data[self.position + len]; - assert!(sentinel == STR_SENTINEL); - let s = unsafe { - std::str::from_utf8_unchecked(&self.data[self.position..self.position + len]) - }; - self.position += len + 1; - s + fn peek_byte(&self) -> u8 { + if self.current == self.end { + Self::decoder_exhausted(); + } + // SAFETY: This type guarantees current is inbounds or one-past-the-end, which is end. + // Since we just checked current == end, the current pointer must be inbounds. + unsafe { *self.current } } #[inline] - fn read_raw_bytes(&mut self, bytes: usize) -> &'a [u8] { - let start = self.position; - self.position += bytes; - &self.data[start..self.position] + fn position(&self) -> usize { + // SAFETY: This type guarantees start <= current + unsafe { self.current.sub_ptr(self.start) } } } @@ -651,13 +468,6 @@ impl<'a> Decoder for MemDecoder<'a> { // Specialize encoding byte slices. This specialization also applies to encoding `Vec`s, etc., // since the default implementations call `encode` on their slices internally. -impl Encodable for [u8] { - fn encode(&self, e: &mut MemEncoder) { - Encoder::emit_usize(e, self.len()); - e.emit_raw_bytes(self); - } -} - impl Encodable for [u8] { fn encode(&self, e: &mut FileEncoder) { Encoder::emit_usize(e, self.len()); @@ -681,16 +491,6 @@ impl IntEncodedWithFixedSize { pub const ENCODED_SIZE: usize = 8; } -impl Encodable for IntEncodedWithFixedSize { - #[inline] - fn encode(&self, e: &mut MemEncoder) { - let _start_pos = e.position(); - e.emit_raw_bytes(&self.0.to_le_bytes()); - let _end_pos = e.position(); - debug_assert_eq!((_end_pos - _start_pos), IntEncodedWithFixedSize::ENCODED_SIZE); - } -} - impl Encodable for IntEncodedWithFixedSize { #[inline] fn encode(&self, e: &mut FileEncoder) { @@ -704,12 +504,7 @@ impl Encodable for IntEncodedWithFixedSize { impl<'a> Decodable> for IntEncodedWithFixedSize { #[inline] fn decode(decoder: &mut MemDecoder<'a>) -> IntEncodedWithFixedSize { - let _start_pos = decoder.position(); - let bytes = decoder.read_raw_bytes(IntEncodedWithFixedSize::ENCODED_SIZE); - let value = u64::from_le_bytes(bytes.try_into().unwrap()); - let _end_pos = decoder.position(); - debug_assert_eq!((_end_pos - _start_pos), IntEncodedWithFixedSize::ENCODED_SIZE); - - IntEncodedWithFixedSize(value) + let bytes = decoder.read_array::<{ IntEncodedWithFixedSize::ENCODED_SIZE }>(); + IntEncodedWithFixedSize(u64::from_le_bytes(bytes)) } } diff --git a/compiler/rustc_serialize/src/serialize.rs b/compiler/rustc_serialize/src/serialize.rs index 527abc237..06166cabc 100644 --- a/compiler/rustc_serialize/src/serialize.rs +++ b/compiler/rustc_serialize/src/serialize.rs @@ -1,9 +1,5 @@ //! Support code for encoding and decoding types. -/* -Core encoding and decoding interfaces. -*/ - use std::alloc::Allocator; use std::borrow::Cow; use std::cell::{Cell, RefCell}; @@ -12,6 +8,13 @@ use std::path; use std::rc::Rc; use std::sync::Arc; +/// A byte that [cannot occur in UTF8 sequences][utf8]. Used to mark the end of a string. +/// This way we can skip validation and still be relatively sure that deserialization +/// did not desynchronize. +/// +/// [utf8]: https://en.wikipedia.org/w/index.php?title=UTF-8&oldid=1058865525#Codepage_layout +const STR_SENTINEL: u8 = 0xC1; + /// A note about error handling. /// /// Encoders may be fallible, but in practice failure is rare and there are so @@ -28,22 +31,41 @@ use std::sync::Arc; /// really makes sense to store floating-point values at all. /// (If you need it, revert .) pub trait Encoder { - // Primitive types: fn emit_usize(&mut self, v: usize); fn emit_u128(&mut self, v: u128); fn emit_u64(&mut self, v: u64); fn emit_u32(&mut self, v: u32); fn emit_u16(&mut self, v: u16); fn emit_u8(&mut self, v: u8); + fn emit_isize(&mut self, v: isize); fn emit_i128(&mut self, v: i128); fn emit_i64(&mut self, v: i64); fn emit_i32(&mut self, v: i32); fn emit_i16(&mut self, v: i16); - fn emit_i8(&mut self, v: i8); - fn emit_bool(&mut self, v: bool); - fn emit_char(&mut self, v: char); - fn emit_str(&mut self, v: &str); + + #[inline] + fn emit_i8(&mut self, v: i8) { + self.emit_u8(v as u8); + } + + #[inline] + fn emit_bool(&mut self, v: bool) { + self.emit_u8(if v { 1 } else { 0 }); + } + + #[inline] + fn emit_char(&mut self, v: char) { + self.emit_u32(v as u32); + } + + #[inline] + fn emit_str(&mut self, v: &str) { + self.emit_usize(v.len()); + self.emit_raw_bytes(v.as_bytes()); + self.emit_u8(STR_SENTINEL); + } + fn emit_raw_bytes(&mut self, s: &[u8]); fn emit_enum_variant(&mut self, v_id: usize, f: F) @@ -67,23 +89,52 @@ pub trait Encoder { /// really makes sense to store floating-point values at all. /// (If you need it, revert .) pub trait Decoder { - // Primitive types: fn read_usize(&mut self) -> usize; fn read_u128(&mut self) -> u128; fn read_u64(&mut self) -> u64; fn read_u32(&mut self) -> u32; fn read_u16(&mut self) -> u16; fn read_u8(&mut self) -> u8; + fn read_isize(&mut self) -> isize; fn read_i128(&mut self) -> i128; fn read_i64(&mut self) -> i64; fn read_i32(&mut self) -> i32; fn read_i16(&mut self) -> i16; - fn read_i8(&mut self) -> i8; - fn read_bool(&mut self) -> bool; - fn read_char(&mut self) -> char; - fn read_str(&mut self) -> &str; + + #[inline] + fn read_i8(&mut self) -> i8 { + self.read_u8() as i8 + } + + #[inline] + fn read_bool(&mut self) -> bool { + let value = self.read_u8(); + value != 0 + } + + #[inline] + fn read_char(&mut self) -> char { + let bits = self.read_u32(); + std::char::from_u32(bits).unwrap() + } + + #[inline] + fn read_str(&mut self) -> &str { + let len = self.read_usize(); + let bytes = self.read_raw_bytes(len + 1); + assert!(bytes[len] == STR_SENTINEL); + unsafe { std::str::from_utf8_unchecked(&bytes[..len]) } + } + fn read_raw_bytes(&mut self, len: usize) -> &[u8]; + + // Although there is an `emit_enum_variant` method in `Encoder`, the code + // patterns in decoding are different enough to encoding that there is no + // need for a corresponding `read_enum_variant` method here. + + fn peek_byte(&self) -> u8; + fn position(&self) -> usize; } /// Trait for types that can be serialized -- cgit v1.2.3