diff options
Diffstat (limited to '')
-rw-r--r-- | third_party/rust/semver/src/identifier.rs | 422 |
1 files changed, 422 insertions, 0 deletions
diff --git a/third_party/rust/semver/src/identifier.rs b/third_party/rust/semver/src/identifier.rs new file mode 100644 index 0000000000..0273ae62a8 --- /dev/null +++ b/third_party/rust/semver/src/identifier.rs @@ -0,0 +1,422 @@ +// This module implements Identifier, a short-optimized string allowed to +// contain only the ASCII characters hyphen, dot, 0-9, A-Z, a-z. +// +// As of mid-2021, the distribution of pre-release lengths on crates.io is: +// +// length count length count length count +// 0 355929 11 81 24 2 +// 1 208 12 48 25 6 +// 2 236 13 55 26 10 +// 3 1909 14 25 27 4 +// 4 1284 15 15 28 1 +// 5 1742 16 35 30 1 +// 6 3440 17 9 31 5 +// 7 5624 18 6 32 1 +// 8 1321 19 12 36 2 +// 9 179 20 2 37 379 +// 10 65 23 11 +// +// and the distribution of build metadata lengths is: +// +// length count length count length count +// 0 364445 8 7725 18 1 +// 1 72 9 16 19 1 +// 2 7 10 85 20 1 +// 3 28 11 17 22 4 +// 4 9 12 10 26 1 +// 5 68 13 9 27 1 +// 6 73 14 10 40 5 +// 7 53 15 6 +// +// Therefore it really behooves us to be able to use the entire 8 bytes of a +// pointer for inline storage. For both pre-release and build metadata there are +// vastly more strings with length exactly 8 bytes than the sum over all lengths +// longer than 8 bytes. +// +// To differentiate the inline representation from the heap allocated long +// representation, we'll allocate heap pointers with 2-byte alignment so that +// they are guaranteed to have an unset least significant bit. Then in the repr +// we store for pointers, we rotate a 1 into the most significant bit of the +// most significant byte, which is never set for an ASCII byte. +// +// Inline repr: +// +// 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx +// +// Heap allocated repr: +// +// 1ppppppp pppppppp pppppppp pppppppp pppppppp pppppppp pppppppp pppppppp 0 +// ^ most significant bit least significant bit of orig ptr, rotated out ^ +// +// Since the most significant bit doubles as a sign bit for the similarly sized +// signed integer type, the CPU has an efficient instruction for inspecting it, +// meaning we can differentiate between an inline repr and a heap allocated repr +// in one instruction. Effectively an inline repr always looks like a positive +// i64 while a heap allocated repr always looks like a negative i64. +// +// For the inline repr, we store \0 padding on the end of the stored characters, +// and thus the string length is readily determined efficiently by a cttz (count +// trailing zeros) or bsf (bit scan forward) instruction. +// +// For the heap allocated repr, the length is encoded as a base-128 varint at +// the head of the allocation. +// +// Empty strings are stored as an all-1 bit pattern, corresponding to -1i64. +// Consequently the all-0 bit pattern is never a legal representation in any +// repr, leaving it available as a niche for downstream code. For example this +// allows size_of::<Version>() == size_of::<Option<Version>>(). + +use crate::alloc::alloc::{alloc, dealloc, handle_alloc_error, Layout}; +use core::isize; +use core::mem; +use core::num::{NonZeroU64, NonZeroUsize}; +use core::ptr::{self, NonNull}; +use core::slice; +use core::str; +use core::usize; + +const PTR_BYTES: usize = mem::size_of::<NonNull<u8>>(); + +// If pointers are already 8 bytes or bigger, then 0. If pointers are smaller +// than 8 bytes, then Identifier will contain a byte array to raise its size up +// to 8 bytes total. +const TAIL_BYTES: usize = 8 * (PTR_BYTES < 8) as usize - PTR_BYTES * (PTR_BYTES < 8) as usize; + +#[repr(C, align(8))] +pub(crate) struct Identifier { + head: NonNull<u8>, + tail: [u8; TAIL_BYTES], +} + +impl Identifier { + pub(crate) const fn empty() -> Self { + // This is a separate constant because unsafe function calls are not + // allowed in a const fn body, only in a const, until later rustc than + // what we support. + const HEAD: NonNull<u8> = unsafe { NonNull::new_unchecked(!0 as *mut u8) }; + + // `mov rax, -1` + Identifier { + head: HEAD, + tail: [!0; TAIL_BYTES], + } + } + + // SAFETY: string must be ASCII and not contain \0 bytes. + pub(crate) unsafe fn new_unchecked(string: &str) -> Self { + let len = string.len(); + debug_assert!(len <= isize::MAX as usize); + match len as u64 { + 0 => Self::empty(), + 1..=8 => { + let mut bytes = [0u8; mem::size_of::<Identifier>()]; + // SAFETY: string is big enough to read len bytes, bytes is big + // enough to write len bytes, and they do not overlap. + unsafe { ptr::copy_nonoverlapping(string.as_ptr(), bytes.as_mut_ptr(), len) }; + // SAFETY: the head field is nonzero because the input string + // was at least 1 byte of ASCII and did not contain \0. + unsafe { mem::transmute::<[u8; mem::size_of::<Identifier>()], Identifier>(bytes) } + } + 9..=0xff_ffff_ffff_ffff => { + // SAFETY: len is in a range that does not contain 0. + let size = bytes_for_varint(unsafe { NonZeroUsize::new_unchecked(len) }) + len; + let align = 2; + // On 32-bit and 16-bit architecture, check for size overflowing + // isize::MAX. Making an allocation request bigger than this to + // the allocator is considered UB. All allocations (including + // static ones) are limited to isize::MAX so we're guaranteed + // len <= isize::MAX, and we know bytes_for_varint(len) <= 5 + // because 128**5 > isize::MAX, which means the only problem + // that can arise is when isize::MAX - 5 <= len <= isize::MAX. + // This is pretty much guaranteed to be malicious input so we + // don't need to care about returning a good error message. + if mem::size_of::<usize>() < 8 { + let max_alloc = usize::MAX / 2 - align; + assert!(size <= max_alloc); + } + // SAFETY: align is not zero, align is a power of two, and + // rounding size up to align does not overflow isize::MAX. + let layout = unsafe { Layout::from_size_align_unchecked(size, align) }; + // SAFETY: layout's size is nonzero. + let ptr = unsafe { alloc(layout) }; + if ptr.is_null() { + handle_alloc_error(layout); + } + let mut write = ptr; + let mut varint_remaining = len; + while varint_remaining > 0 { + // SAFETY: size is bytes_for_varint(len) bytes + len bytes. + // This is writing the first bytes_for_varint(len) bytes. + unsafe { ptr::write(write, varint_remaining as u8 | 0x80) }; + varint_remaining >>= 7; + // SAFETY: still in bounds of the same allocation. + write = unsafe { write.add(1) }; + } + // SAFETY: size is bytes_for_varint(len) bytes + len bytes. This + // is writing to the last len bytes. + unsafe { ptr::copy_nonoverlapping(string.as_ptr(), write, len) }; + Identifier { + head: ptr_to_repr(ptr), + tail: [0; TAIL_BYTES], + } + } + 0x100_0000_0000_0000..=0xffff_ffff_ffff_ffff => { + unreachable!("please refrain from storing >64 petabytes of text in semver version"); + } + #[cfg(no_exhaustive_int_match)] // rustc <1.33 + _ => unreachable!(), + } + } + + pub(crate) fn is_empty(&self) -> bool { + // `cmp rdi, -1` -- basically: `repr as i64 == -1` + let empty = Self::empty(); + let is_empty = self.head == empty.head && self.tail == empty.tail; + // The empty representation does nothing on Drop. We can't let this one + // drop normally because `impl Drop for Identifier` calls is_empty; that + // would be an infinite recursion. + mem::forget(empty); + is_empty + } + + fn is_inline(&self) -> bool { + // `test rdi, rdi` -- basically: `repr as i64 >= 0` + self.head.as_ptr() as usize >> (PTR_BYTES * 8 - 1) == 0 + } + + fn is_empty_or_inline(&self) -> bool { + // `cmp rdi, -2` -- basically: `repr as i64 > -2` + self.is_empty() || self.is_inline() + } + + pub(crate) fn as_str(&self) -> &str { + if self.is_empty() { + "" + } else if self.is_inline() { + // SAFETY: repr is in the inline representation. + unsafe { inline_as_str(self) } + } else { + // SAFETY: repr is in the heap allocated representation. + unsafe { ptr_as_str(&self.head) } + } + } +} + +impl Clone for Identifier { + fn clone(&self) -> Self { + if self.is_empty_or_inline() { + Identifier { + head: self.head, + tail: self.tail, + } + } else { + let ptr = repr_to_ptr(self.head); + // SAFETY: ptr is one of our own heap allocations. + let len = unsafe { decode_len(ptr) }; + let size = bytes_for_varint(len) + len.get(); + let align = 2; + // SAFETY: align is not zero, align is a power of two, and rounding + // size up to align does not overflow isize::MAX. This is just + // duplicating a previous allocation where all of these guarantees + // were already made. + let layout = unsafe { Layout::from_size_align_unchecked(size, align) }; + // SAFETY: layout's size is nonzero. + let clone = unsafe { alloc(layout) }; + if clone.is_null() { + handle_alloc_error(layout); + } + // SAFETY: new allocation cannot overlap the previous one (this was + // not a realloc). The argument ptrs are readable/writeable + // respectively for size bytes. + unsafe { ptr::copy_nonoverlapping(ptr, clone, size) } + Identifier { + head: ptr_to_repr(clone), + tail: [0; TAIL_BYTES], + } + } + } +} + +impl Drop for Identifier { + fn drop(&mut self) { + if self.is_empty_or_inline() { + return; + } + let ptr = repr_to_ptr_mut(self.head); + // SAFETY: ptr is one of our own heap allocations. + let len = unsafe { decode_len(ptr) }; + let size = bytes_for_varint(len) + len.get(); + let align = 2; + // SAFETY: align is not zero, align is a power of two, and rounding + // size up to align does not overflow usize::MAX. These guarantees were + // made when originally allocating this memory. + let layout = unsafe { Layout::from_size_align_unchecked(size, align) }; + // SAFETY: ptr was previously allocated by the same allocator with the + // same layout. + unsafe { dealloc(ptr, layout) } + } +} + +impl PartialEq for Identifier { + fn eq(&self, rhs: &Self) -> bool { + if self.is_empty_or_inline() { + // Fast path (most common) + self.head == rhs.head && self.tail == rhs.tail + } else if rhs.is_empty_or_inline() { + false + } else { + // SAFETY: both reprs are in the heap allocated representation. + unsafe { ptr_as_str(&self.head) == ptr_as_str(&rhs.head) } + } + } +} + +unsafe impl Send for Identifier {} +unsafe impl Sync for Identifier {} + +// We use heap pointers that are 2-byte aligned, meaning they have an +// insignificant 0 in the least significant bit. We take advantage of that +// unneeded bit to rotate a 1 into the most significant bit to make the repr +// distinguishable from ASCII bytes. +fn ptr_to_repr(original: *mut u8) -> NonNull<u8> { + // `mov eax, 1` + // `shld rax, rdi, 63` + let modified = (original as usize | 1).rotate_right(1); + + // `original + (modified - original)`, but being mindful of provenance. + let diff = modified.wrapping_sub(original as usize); + let modified = original.wrapping_add(diff); + + // SAFETY: the most significant bit of repr is known to be set, so the value + // is not zero. + unsafe { NonNull::new_unchecked(modified) } +} + +// Shift out the 1 previously placed into the most significant bit of the least +// significant byte. Shift in a low 0 bit to reconstruct the original 2-byte +// aligned pointer. +fn repr_to_ptr(modified: NonNull<u8>) -> *const u8 { + // `lea rax, [rdi + rdi]` + let modified = modified.as_ptr(); + let original = (modified as usize) << 1; + + // `modified + (original - modified)`, but being mindful of provenance. + let diff = original.wrapping_sub(modified as usize); + modified.wrapping_add(diff) +} + +fn repr_to_ptr_mut(repr: NonNull<u8>) -> *mut u8 { + repr_to_ptr(repr) as *mut u8 +} + +// Compute the length of the inline string, assuming the argument is in short +// string representation. Short strings are stored as 1 to 8 nonzero ASCII +// bytes, followed by \0 padding for the remaining bytes. +// +// SAFETY: the identifier must indeed be in the inline representation. +unsafe fn inline_len(repr: &Identifier) -> NonZeroUsize { + // SAFETY: Identifier's layout is align(8) and at least size 8. We're doing + // an aligned read of the first 8 bytes from it. The bytes are not all zero + // because inline strings are at least 1 byte long and cannot contain \0. + let repr = unsafe { ptr::read(repr as *const Identifier as *const NonZeroU64) }; + + // Rustc >=1.53 has intrinsics for counting zeros on a non-zeroable integer. + // On many architectures these are more efficient than counting on ordinary + // zeroable integers (bsf vs cttz). On rustc <1.53 without those intrinsics, + // we count zeros in the u64 rather than the NonZeroU64. + #[cfg(no_nonzero_bitscan)] + let repr = repr.get(); + + #[cfg(target_endian = "little")] + let zero_bits_on_string_end = repr.leading_zeros(); + #[cfg(target_endian = "big")] + let zero_bits_on_string_end = repr.trailing_zeros(); + + let nonzero_bytes = 8 - zero_bits_on_string_end as usize / 8; + + // SAFETY: repr is nonzero, so it has at most 63 zero bits on either end, + // thus at least one nonzero byte. + unsafe { NonZeroUsize::new_unchecked(nonzero_bytes) } +} + +// SAFETY: repr must be in the inline representation, i.e. at least 1 and at +// most 8 nonzero ASCII bytes padded on the end with \0 bytes. +unsafe fn inline_as_str(repr: &Identifier) -> &str { + let ptr = repr as *const Identifier as *const u8; + let len = unsafe { inline_len(repr) }.get(); + // SAFETY: we are viewing the nonzero ASCII prefix of the inline repr's + // contents as a slice of bytes. Input/output lifetimes are correctly + // associated. + let slice = unsafe { slice::from_raw_parts(ptr, len) }; + // SAFETY: the string contents are known to be only ASCII bytes, which are + // always valid UTF-8. + unsafe { str::from_utf8_unchecked(slice) } +} + +// Decode varint. Varints consist of between one and eight base-128 digits, each +// of which is stored in a byte with most significant bit set. Adjacent to the +// varint in memory there is guaranteed to be at least 9 ASCII bytes, each of +// which has an unset most significant bit. +// +// SAFETY: ptr must be one of our own heap allocations, with the varint header +// already written. +unsafe fn decode_len(ptr: *const u8) -> NonZeroUsize { + // SAFETY: There is at least one byte of varint followed by at least 9 bytes + // of string content, which is at least 10 bytes total for the allocation, + // so reading the first two is no problem. + let [first, second] = unsafe { ptr::read(ptr as *const [u8; 2]) }; + if second < 0x80 { + // SAFETY: the length of this heap allocated string has been encoded as + // one base-128 digit, so the length is at least 9 and at most 127. It + // cannot be zero. + unsafe { NonZeroUsize::new_unchecked((first & 0x7f) as usize) } + } else { + return unsafe { decode_len_cold(ptr) }; + + // Identifiers 128 bytes or longer. This is not exercised by any crate + // version currently published to crates.io. + #[cold] + #[inline(never)] + unsafe fn decode_len_cold(mut ptr: *const u8) -> NonZeroUsize { + let mut len = 0; + let mut shift = 0; + loop { + // SAFETY: varint continues while there are bytes having the + // most significant bit set, i.e. until we start hitting the + // ASCII string content with msb unset. + let byte = unsafe { *ptr }; + if byte < 0x80 { + // SAFETY: the string length is known to be 128 bytes or + // longer. + return unsafe { NonZeroUsize::new_unchecked(len) }; + } + // SAFETY: still in bounds of the same allocation. + ptr = unsafe { ptr.add(1) }; + len += ((byte & 0x7f) as usize) << shift; + shift += 7; + } + } + } +} + +// SAFETY: repr must be in the heap allocated representation, with varint header +// and string contents already written. +unsafe fn ptr_as_str(repr: &NonNull<u8>) -> &str { + let ptr = repr_to_ptr(*repr); + let len = unsafe { decode_len(ptr) }; + let header = bytes_for_varint(len); + let slice = unsafe { slice::from_raw_parts(ptr.add(header), len.get()) }; + // SAFETY: all identifier contents are ASCII bytes, which are always valid + // UTF-8. + unsafe { str::from_utf8_unchecked(slice) } +} + +// Number of base-128 digits required for the varint representation of a length. +fn bytes_for_varint(len: NonZeroUsize) -> usize { + #[cfg(no_nonzero_bitscan)] // rustc <1.53 + let len = len.get(); + + let usize_bits = mem::size_of::<usize>() * 8; + let len_bits = usize_bits - len.leading_zeros() as usize; + (len_bits + 6) / 7 +} |