summaryrefslogtreecommitdiffstats
path: root/third_party/rust/semver/src/identifier.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/semver/src/identifier.rs')
-rw-r--r--third_party/rust/semver/src/identifier.rs422
1 files changed, 422 insertions, 0 deletions
diff --git a/third_party/rust/semver/src/identifier.rs b/third_party/rust/semver/src/identifier.rs
new file mode 100644
index 0000000000..0273ae62a8
--- /dev/null
+++ b/third_party/rust/semver/src/identifier.rs
@@ -0,0 +1,422 @@
+// This module implements Identifier, a short-optimized string allowed to
+// contain only the ASCII characters hyphen, dot, 0-9, A-Z, a-z.
+//
+// As of mid-2021, the distribution of pre-release lengths on crates.io is:
+//
+// length count length count length count
+// 0 355929 11 81 24 2
+// 1 208 12 48 25 6
+// 2 236 13 55 26 10
+// 3 1909 14 25 27 4
+// 4 1284 15 15 28 1
+// 5 1742 16 35 30 1
+// 6 3440 17 9 31 5
+// 7 5624 18 6 32 1
+// 8 1321 19 12 36 2
+// 9 179 20 2 37 379
+// 10 65 23 11
+//
+// and the distribution of build metadata lengths is:
+//
+// length count length count length count
+// 0 364445 8 7725 18 1
+// 1 72 9 16 19 1
+// 2 7 10 85 20 1
+// 3 28 11 17 22 4
+// 4 9 12 10 26 1
+// 5 68 13 9 27 1
+// 6 73 14 10 40 5
+// 7 53 15 6
+//
+// Therefore it really behooves us to be able to use the entire 8 bytes of a
+// pointer for inline storage. For both pre-release and build metadata there are
+// vastly more strings with length exactly 8 bytes than the sum over all lengths
+// longer than 8 bytes.
+//
+// To differentiate the inline representation from the heap allocated long
+// representation, we'll allocate heap pointers with 2-byte alignment so that
+// they are guaranteed to have an unset least significant bit. Then in the repr
+// we store for pointers, we rotate a 1 into the most significant bit of the
+// most significant byte, which is never set for an ASCII byte.
+//
+// Inline repr:
+//
+// 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx
+//
+// Heap allocated repr:
+//
+// 1ppppppp pppppppp pppppppp pppppppp pppppppp pppppppp pppppppp pppppppp 0
+// ^ most significant bit least significant bit of orig ptr, rotated out ^
+//
+// Since the most significant bit doubles as a sign bit for the similarly sized
+// signed integer type, the CPU has an efficient instruction for inspecting it,
+// meaning we can differentiate between an inline repr and a heap allocated repr
+// in one instruction. Effectively an inline repr always looks like a positive
+// i64 while a heap allocated repr always looks like a negative i64.
+//
+// For the inline repr, we store \0 padding on the end of the stored characters,
+// and thus the string length is readily determined efficiently by a cttz (count
+// trailing zeros) or bsf (bit scan forward) instruction.
+//
+// For the heap allocated repr, the length is encoded as a base-128 varint at
+// the head of the allocation.
+//
+// Empty strings are stored as an all-1 bit pattern, corresponding to -1i64.
+// Consequently the all-0 bit pattern is never a legal representation in any
+// repr, leaving it available as a niche for downstream code. For example this
+// allows size_of::<Version>() == size_of::<Option<Version>>().
+
+use crate::alloc::alloc::{alloc, dealloc, handle_alloc_error, Layout};
+use core::isize;
+use core::mem;
+use core::num::{NonZeroU64, NonZeroUsize};
+use core::ptr::{self, NonNull};
+use core::slice;
+use core::str;
+use core::usize;
+
+const PTR_BYTES: usize = mem::size_of::<NonNull<u8>>();
+
+// If pointers are already 8 bytes or bigger, then 0. If pointers are smaller
+// than 8 bytes, then Identifier will contain a byte array to raise its size up
+// to 8 bytes total.
+const TAIL_BYTES: usize = 8 * (PTR_BYTES < 8) as usize - PTR_BYTES * (PTR_BYTES < 8) as usize;
+
+#[repr(C, align(8))]
+pub(crate) struct Identifier {
+ head: NonNull<u8>,
+ tail: [u8; TAIL_BYTES],
+}
+
+impl Identifier {
+ pub(crate) const fn empty() -> Self {
+ // This is a separate constant because unsafe function calls are not
+ // allowed in a const fn body, only in a const, until later rustc than
+ // what we support.
+ const HEAD: NonNull<u8> = unsafe { NonNull::new_unchecked(!0 as *mut u8) };
+
+ // `mov rax, -1`
+ Identifier {
+ head: HEAD,
+ tail: [!0; TAIL_BYTES],
+ }
+ }
+
+ // SAFETY: string must be ASCII and not contain \0 bytes.
+ pub(crate) unsafe fn new_unchecked(string: &str) -> Self {
+ let len = string.len();
+ debug_assert!(len <= isize::MAX as usize);
+ match len as u64 {
+ 0 => Self::empty(),
+ 1..=8 => {
+ let mut bytes = [0u8; mem::size_of::<Identifier>()];
+ // SAFETY: string is big enough to read len bytes, bytes is big
+ // enough to write len bytes, and they do not overlap.
+ unsafe { ptr::copy_nonoverlapping(string.as_ptr(), bytes.as_mut_ptr(), len) };
+ // SAFETY: the head field is nonzero because the input string
+ // was at least 1 byte of ASCII and did not contain \0.
+ unsafe { mem::transmute::<[u8; mem::size_of::<Identifier>()], Identifier>(bytes) }
+ }
+ 9..=0xff_ffff_ffff_ffff => {
+ // SAFETY: len is in a range that does not contain 0.
+ let size = bytes_for_varint(unsafe { NonZeroUsize::new_unchecked(len) }) + len;
+ let align = 2;
+ // On 32-bit and 16-bit architecture, check for size overflowing
+ // isize::MAX. Making an allocation request bigger than this to
+ // the allocator is considered UB. All allocations (including
+ // static ones) are limited to isize::MAX so we're guaranteed
+ // len <= isize::MAX, and we know bytes_for_varint(len) <= 5
+ // because 128**5 > isize::MAX, which means the only problem
+ // that can arise is when isize::MAX - 5 <= len <= isize::MAX.
+ // This is pretty much guaranteed to be malicious input so we
+ // don't need to care about returning a good error message.
+ if mem::size_of::<usize>() < 8 {
+ let max_alloc = usize::MAX / 2 - align;
+ assert!(size <= max_alloc);
+ }
+ // SAFETY: align is not zero, align is a power of two, and
+ // rounding size up to align does not overflow isize::MAX.
+ let layout = unsafe { Layout::from_size_align_unchecked(size, align) };
+ // SAFETY: layout's size is nonzero.
+ let ptr = unsafe { alloc(layout) };
+ if ptr.is_null() {
+ handle_alloc_error(layout);
+ }
+ let mut write = ptr;
+ let mut varint_remaining = len;
+ while varint_remaining > 0 {
+ // SAFETY: size is bytes_for_varint(len) bytes + len bytes.
+ // This is writing the first bytes_for_varint(len) bytes.
+ unsafe { ptr::write(write, varint_remaining as u8 | 0x80) };
+ varint_remaining >>= 7;
+ // SAFETY: still in bounds of the same allocation.
+ write = unsafe { write.add(1) };
+ }
+ // SAFETY: size is bytes_for_varint(len) bytes + len bytes. This
+ // is writing to the last len bytes.
+ unsafe { ptr::copy_nonoverlapping(string.as_ptr(), write, len) };
+ Identifier {
+ head: ptr_to_repr(ptr),
+ tail: [0; TAIL_BYTES],
+ }
+ }
+ 0x100_0000_0000_0000..=0xffff_ffff_ffff_ffff => {
+ unreachable!("please refrain from storing >64 petabytes of text in semver version");
+ }
+ #[cfg(no_exhaustive_int_match)] // rustc <1.33
+ _ => unreachable!(),
+ }
+ }
+
+ pub(crate) fn is_empty(&self) -> bool {
+ // `cmp rdi, -1` -- basically: `repr as i64 == -1`
+ let empty = Self::empty();
+ let is_empty = self.head == empty.head && self.tail == empty.tail;
+ // The empty representation does nothing on Drop. We can't let this one
+ // drop normally because `impl Drop for Identifier` calls is_empty; that
+ // would be an infinite recursion.
+ mem::forget(empty);
+ is_empty
+ }
+
+ fn is_inline(&self) -> bool {
+ // `test rdi, rdi` -- basically: `repr as i64 >= 0`
+ self.head.as_ptr() as usize >> (PTR_BYTES * 8 - 1) == 0
+ }
+
+ fn is_empty_or_inline(&self) -> bool {
+ // `cmp rdi, -2` -- basically: `repr as i64 > -2`
+ self.is_empty() || self.is_inline()
+ }
+
+ pub(crate) fn as_str(&self) -> &str {
+ if self.is_empty() {
+ ""
+ } else if self.is_inline() {
+ // SAFETY: repr is in the inline representation.
+ unsafe { inline_as_str(self) }
+ } else {
+ // SAFETY: repr is in the heap allocated representation.
+ unsafe { ptr_as_str(&self.head) }
+ }
+ }
+}
+
+impl Clone for Identifier {
+ fn clone(&self) -> Self {
+ if self.is_empty_or_inline() {
+ Identifier {
+ head: self.head,
+ tail: self.tail,
+ }
+ } else {
+ let ptr = repr_to_ptr(self.head);
+ // SAFETY: ptr is one of our own heap allocations.
+ let len = unsafe { decode_len(ptr) };
+ let size = bytes_for_varint(len) + len.get();
+ let align = 2;
+ // SAFETY: align is not zero, align is a power of two, and rounding
+ // size up to align does not overflow isize::MAX. This is just
+ // duplicating a previous allocation where all of these guarantees
+ // were already made.
+ let layout = unsafe { Layout::from_size_align_unchecked(size, align) };
+ // SAFETY: layout's size is nonzero.
+ let clone = unsafe { alloc(layout) };
+ if clone.is_null() {
+ handle_alloc_error(layout);
+ }
+ // SAFETY: new allocation cannot overlap the previous one (this was
+ // not a realloc). The argument ptrs are readable/writeable
+ // respectively for size bytes.
+ unsafe { ptr::copy_nonoverlapping(ptr, clone, size) }
+ Identifier {
+ head: ptr_to_repr(clone),
+ tail: [0; TAIL_BYTES],
+ }
+ }
+ }
+}
+
+impl Drop for Identifier {
+ fn drop(&mut self) {
+ if self.is_empty_or_inline() {
+ return;
+ }
+ let ptr = repr_to_ptr_mut(self.head);
+ // SAFETY: ptr is one of our own heap allocations.
+ let len = unsafe { decode_len(ptr) };
+ let size = bytes_for_varint(len) + len.get();
+ let align = 2;
+ // SAFETY: align is not zero, align is a power of two, and rounding
+ // size up to align does not overflow usize::MAX. These guarantees were
+ // made when originally allocating this memory.
+ let layout = unsafe { Layout::from_size_align_unchecked(size, align) };
+ // SAFETY: ptr was previously allocated by the same allocator with the
+ // same layout.
+ unsafe { dealloc(ptr, layout) }
+ }
+}
+
+impl PartialEq for Identifier {
+ fn eq(&self, rhs: &Self) -> bool {
+ if self.is_empty_or_inline() {
+ // Fast path (most common)
+ self.head == rhs.head && self.tail == rhs.tail
+ } else if rhs.is_empty_or_inline() {
+ false
+ } else {
+ // SAFETY: both reprs are in the heap allocated representation.
+ unsafe { ptr_as_str(&self.head) == ptr_as_str(&rhs.head) }
+ }
+ }
+}
+
+unsafe impl Send for Identifier {}
+unsafe impl Sync for Identifier {}
+
+// We use heap pointers that are 2-byte aligned, meaning they have an
+// insignificant 0 in the least significant bit. We take advantage of that
+// unneeded bit to rotate a 1 into the most significant bit to make the repr
+// distinguishable from ASCII bytes.
+fn ptr_to_repr(original: *mut u8) -> NonNull<u8> {
+ // `mov eax, 1`
+ // `shld rax, rdi, 63`
+ let modified = (original as usize | 1).rotate_right(1);
+
+ // `original + (modified - original)`, but being mindful of provenance.
+ let diff = modified.wrapping_sub(original as usize);
+ let modified = original.wrapping_add(diff);
+
+ // SAFETY: the most significant bit of repr is known to be set, so the value
+ // is not zero.
+ unsafe { NonNull::new_unchecked(modified) }
+}
+
+// Shift out the 1 previously placed into the most significant bit of the least
+// significant byte. Shift in a low 0 bit to reconstruct the original 2-byte
+// aligned pointer.
+fn repr_to_ptr(modified: NonNull<u8>) -> *const u8 {
+ // `lea rax, [rdi + rdi]`
+ let modified = modified.as_ptr();
+ let original = (modified as usize) << 1;
+
+ // `modified + (original - modified)`, but being mindful of provenance.
+ let diff = original.wrapping_sub(modified as usize);
+ modified.wrapping_add(diff)
+}
+
+fn repr_to_ptr_mut(repr: NonNull<u8>) -> *mut u8 {
+ repr_to_ptr(repr) as *mut u8
+}
+
+// Compute the length of the inline string, assuming the argument is in short
+// string representation. Short strings are stored as 1 to 8 nonzero ASCII
+// bytes, followed by \0 padding for the remaining bytes.
+//
+// SAFETY: the identifier must indeed be in the inline representation.
+unsafe fn inline_len(repr: &Identifier) -> NonZeroUsize {
+ // SAFETY: Identifier's layout is align(8) and at least size 8. We're doing
+ // an aligned read of the first 8 bytes from it. The bytes are not all zero
+ // because inline strings are at least 1 byte long and cannot contain \0.
+ let repr = unsafe { ptr::read(repr as *const Identifier as *const NonZeroU64) };
+
+ // Rustc >=1.53 has intrinsics for counting zeros on a non-zeroable integer.
+ // On many architectures these are more efficient than counting on ordinary
+ // zeroable integers (bsf vs cttz). On rustc <1.53 without those intrinsics,
+ // we count zeros in the u64 rather than the NonZeroU64.
+ #[cfg(no_nonzero_bitscan)]
+ let repr = repr.get();
+
+ #[cfg(target_endian = "little")]
+ let zero_bits_on_string_end = repr.leading_zeros();
+ #[cfg(target_endian = "big")]
+ let zero_bits_on_string_end = repr.trailing_zeros();
+
+ let nonzero_bytes = 8 - zero_bits_on_string_end as usize / 8;
+
+ // SAFETY: repr is nonzero, so it has at most 63 zero bits on either end,
+ // thus at least one nonzero byte.
+ unsafe { NonZeroUsize::new_unchecked(nonzero_bytes) }
+}
+
+// SAFETY: repr must be in the inline representation, i.e. at least 1 and at
+// most 8 nonzero ASCII bytes padded on the end with \0 bytes.
+unsafe fn inline_as_str(repr: &Identifier) -> &str {
+ let ptr = repr as *const Identifier as *const u8;
+ let len = unsafe { inline_len(repr) }.get();
+ // SAFETY: we are viewing the nonzero ASCII prefix of the inline repr's
+ // contents as a slice of bytes. Input/output lifetimes are correctly
+ // associated.
+ let slice = unsafe { slice::from_raw_parts(ptr, len) };
+ // SAFETY: the string contents are known to be only ASCII bytes, which are
+ // always valid UTF-8.
+ unsafe { str::from_utf8_unchecked(slice) }
+}
+
+// Decode varint. Varints consist of between one and eight base-128 digits, each
+// of which is stored in a byte with most significant bit set. Adjacent to the
+// varint in memory there is guaranteed to be at least 9 ASCII bytes, each of
+// which has an unset most significant bit.
+//
+// SAFETY: ptr must be one of our own heap allocations, with the varint header
+// already written.
+unsafe fn decode_len(ptr: *const u8) -> NonZeroUsize {
+ // SAFETY: There is at least one byte of varint followed by at least 9 bytes
+ // of string content, which is at least 10 bytes total for the allocation,
+ // so reading the first two is no problem.
+ let [first, second] = unsafe { ptr::read(ptr as *const [u8; 2]) };
+ if second < 0x80 {
+ // SAFETY: the length of this heap allocated string has been encoded as
+ // one base-128 digit, so the length is at least 9 and at most 127. It
+ // cannot be zero.
+ unsafe { NonZeroUsize::new_unchecked((first & 0x7f) as usize) }
+ } else {
+ return unsafe { decode_len_cold(ptr) };
+
+ // Identifiers 128 bytes or longer. This is not exercised by any crate
+ // version currently published to crates.io.
+ #[cold]
+ #[inline(never)]
+ unsafe fn decode_len_cold(mut ptr: *const u8) -> NonZeroUsize {
+ let mut len = 0;
+ let mut shift = 0;
+ loop {
+ // SAFETY: varint continues while there are bytes having the
+ // most significant bit set, i.e. until we start hitting the
+ // ASCII string content with msb unset.
+ let byte = unsafe { *ptr };
+ if byte < 0x80 {
+ // SAFETY: the string length is known to be 128 bytes or
+ // longer.
+ return unsafe { NonZeroUsize::new_unchecked(len) };
+ }
+ // SAFETY: still in bounds of the same allocation.
+ ptr = unsafe { ptr.add(1) };
+ len += ((byte & 0x7f) as usize) << shift;
+ shift += 7;
+ }
+ }
+ }
+}
+
+// SAFETY: repr must be in the heap allocated representation, with varint header
+// and string contents already written.
+unsafe fn ptr_as_str(repr: &NonNull<u8>) -> &str {
+ let ptr = repr_to_ptr(*repr);
+ let len = unsafe { decode_len(ptr) };
+ let header = bytes_for_varint(len);
+ let slice = unsafe { slice::from_raw_parts(ptr.add(header), len.get()) };
+ // SAFETY: all identifier contents are ASCII bytes, which are always valid
+ // UTF-8.
+ unsafe { str::from_utf8_unchecked(slice) }
+}
+
+// Number of base-128 digits required for the varint representation of a length.
+fn bytes_for_varint(len: NonZeroUsize) -> usize {
+ #[cfg(no_nonzero_bitscan)] // rustc <1.53
+ let len = len.get();
+
+ let usize_bits = mem::size_of::<usize>() * 8;
+ let len_bits = usize_bits - len.leading_zeros() as usize;
+ (len_bits + 6) / 7
+}