summaryrefslogtreecommitdiffstats
path: root/vendor/measureme/src/stringtable.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/measureme/src/stringtable.rs')
-rw-r--r--vendor/measureme/src/stringtable.rs328
1 files changed, 328 insertions, 0 deletions
diff --git a/vendor/measureme/src/stringtable.rs b/vendor/measureme/src/stringtable.rs
new file mode 100644
index 000000000..a56bbcbfc
--- /dev/null
+++ b/vendor/measureme/src/stringtable.rs
@@ -0,0 +1,328 @@
+//! A string table implementation with a tree-like encoding.
+//!
+//! Each entry in the table represents a string and is encoded as a list of
+//! components where each component can either be
+//!
+//! 1. a string _value_ that contains actual UTF-8 string content,
+//! 2. a string _ID_ that contains a reference to another entry, or
+//! 3. a terminator tag which marks the end of a component list.
+//!
+//! The string _content_ of an entry is defined as the concatenation of the
+//! content of its components. The content of a string value is its actual
+//! UTF-8 bytes. The content of a string ID is the contents of the entry
+//! it references.
+//!
+//! The byte-level encoding of component lists uses the structure of UTF-8 in
+//! order to save space:
+//!
+//! - A valid UTF-8 codepoint never starts with the byte `0xFE`. We make use
+//! of this fact by letting all string ID components start with this `0xFE`
+//! prefix. Thus when we parse the contents of a value we know to stop if
+//! we encounter this byte.
+//!
+//! - A valid UTF-8 string cannot contain the `0xFF` byte. Thus we can safely
+//! use `0xFF` as our component list terminator.
+//!
+//! The sample composite string ["abc", ID(42), "def", TERMINATOR] would thus be
+//! encoded as:
+//!
+//! ```ignore
+//! ['a', 'b' , 'c', 254, 42, 0, 0, 0, 'd', 'e', 'f', 255]
+//! ^^^^^^^^^^^^^^^^ ^^^
+//! string ID with 0xFE prefix terminator (0xFF)
+//! ```
+//!
+//! As you can see string IDs are encoded in little endian format.
+//!
+//! ----------------------------------------------------------------------------
+//!
+//! Each string in the table is referred to via a `StringId`. `StringId`s may
+//! be generated in two ways:
+//!
+//! 1. Calling `StringTableBuilder::alloc()` which returns the `StringId` for
+//! the allocated string.
+//! 2. Calling `StringId::new_virtual()` to create a "virtual" `StringId` that
+//! later can be mapped to an actual string via
+//! `StringTableBuilder::map_virtual_to_concrete_string()`.
+//!
+//! String IDs allow you to deduplicate strings by allocating a string
+//! once and then referring to it by id over and over. This is a useful trick
+//! for strings which are recorded many times and it can significantly reduce
+//! the size of profile trace files.
+//!
+//! `StringId`s are partitioned according to type:
+//!
+//! > [0 .. MAX_VIRTUAL_STRING_ID, METADATA_STRING_ID, .. ]
+//!
+//! From `0` to `MAX_VIRTUAL_STRING_ID` are the allowed values for virtual strings.
+//! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`)
+//! which is used internally by `measureme` to record additional metadata about
+//! the profiling session. After `METADATA_STRING_ID` are all other `StringId`
+//! values.
+
+use crate::file_header::{
+ write_file_header, FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX,
+};
+use crate::serialization::Addr;
+use crate::serialization::SerializationSink;
+use std::{error::Error, sync::Arc};
+
+/// A `StringId` is used to identify a string in the `StringTable`. It is
+/// either a regular `StringId`, meaning that it contains the absolute address
+/// of a string within the string table data. Or it is "virtual", which means
+/// that the address it points to is resolved via the string table index data,
+/// that maps virtual `StringId`s to addresses.
+#[derive(Clone, Copy, Eq, PartialEq, Debug, Hash)]
+#[repr(C)]
+pub struct StringId(u32);
+
+impl StringId {
+ pub const INVALID: StringId = StringId(INVALID_STRING_ID);
+
+ #[inline]
+ pub fn new(id: u32) -> StringId {
+ StringId(id)
+ }
+
+ #[inline]
+ pub fn new_virtual(id: u32) -> StringId {
+ assert!(id <= MAX_USER_VIRTUAL_STRING_ID);
+ StringId(id)
+ }
+
+ #[inline]
+ pub fn is_virtual(self) -> bool {
+ self.0 <= METADATA_STRING_ID
+ }
+
+ #[inline]
+ pub fn as_u32(self) -> u32 {
+ self.0
+ }
+
+ #[inline]
+ pub fn from_addr(addr: Addr) -> StringId {
+ let id = addr.0.checked_add(FIRST_REGULAR_STRING_ID).unwrap();
+ StringId::new(id)
+ }
+
+ #[inline]
+ pub fn to_addr(self) -> Addr {
+ Addr(self.0.checked_sub(FIRST_REGULAR_STRING_ID).unwrap())
+ }
+}
+
+// See module-level documentation for more information on the encoding.
+pub const TERMINATOR: u8 = 0xFF;
+pub const STRING_REF_TAG: u8 = 0xFE;
+pub const STRING_REF_ENCODED_SIZE: usize = 5;
+
+/// The maximum id value a virtual string may be.
+const MAX_USER_VIRTUAL_STRING_ID: u32 = 100_000_000;
+
+/// The id of the profile metadata string entry.
+pub const METADATA_STRING_ID: u32 = MAX_USER_VIRTUAL_STRING_ID + 1;
+
+/// Some random string ID that we make sure cannot be generated or assigned to.
+const INVALID_STRING_ID: u32 = METADATA_STRING_ID + 1;
+
+pub const FIRST_REGULAR_STRING_ID: u32 = INVALID_STRING_ID + 1;
+
+/// Write-only version of the string table
+pub struct StringTableBuilder {
+ data_sink: Arc<SerializationSink>,
+ index_sink: Arc<SerializationSink>,
+}
+
+/// Anything that implements `SerializableString` can be written to a
+/// `StringTable`.
+pub trait SerializableString {
+ fn serialized_size(&self) -> usize;
+ fn serialize(&self, bytes: &mut [u8]);
+}
+
+// A single string is encoded as `[UTF-8 bytes][TERMINATOR]`
+impl SerializableString for str {
+ #[inline]
+ fn serialized_size(&self) -> usize {
+ self.len() + // actual bytes
+ 1 // terminator
+ }
+
+ #[inline]
+ fn serialize(&self, bytes: &mut [u8]) {
+ let last_byte_index = bytes.len() - 1;
+ bytes[0..last_byte_index].copy_from_slice(self.as_bytes());
+ bytes[last_byte_index] = TERMINATOR;
+ }
+}
+
+/// A single component of a string. Used for building composite table entries.
+pub enum StringComponent<'s> {
+ Value(&'s str),
+ Ref(StringId),
+}
+
+impl<'s> StringComponent<'s> {
+ #[inline]
+ fn serialized_size(&self) -> usize {
+ match *self {
+ StringComponent::Value(s) => s.len(),
+ StringComponent::Ref(_) => STRING_REF_ENCODED_SIZE,
+ }
+ }
+
+ #[inline]
+ fn serialize<'b>(&self, bytes: &'b mut [u8]) -> &'b mut [u8] {
+ match *self {
+ StringComponent::Value(s) => {
+ bytes[..s.len()].copy_from_slice(s.as_bytes());
+ &mut bytes[s.len()..]
+ }
+ StringComponent::Ref(string_id) => {
+ // The code below assumes we use a 5-byte encoding for string
+ // refs, where the first byte is STRING_REF_TAG and the
+ // following 4 bytes are a little-endian u32 string ID value.
+ assert!(STRING_REF_ENCODED_SIZE == 5);
+
+ bytes[0] = STRING_REF_TAG;
+ bytes[1..5].copy_from_slice(&string_id.0.to_le_bytes());
+ &mut bytes[5..]
+ }
+ }
+ }
+}
+
+impl<'a> SerializableString for [StringComponent<'a>] {
+ #[inline]
+ fn serialized_size(&self) -> usize {
+ self.iter().map(|c| c.serialized_size()).sum::<usize>() + // size of components
+ 1 // terminator
+ }
+
+ #[inline]
+ fn serialize(&self, mut bytes: &mut [u8]) {
+ assert!(bytes.len() == self.serialized_size());
+ for component in self.iter() {
+ bytes = component.serialize(bytes);
+ }
+
+ // Assert that we used the exact number of bytes we anticipated.
+ assert!(bytes.len() == 1);
+ bytes[0] = TERMINATOR;
+ }
+}
+
+macro_rules! impl_serializable_string_for_fixed_size {
+ ($n:expr) => {
+ impl<'a> SerializableString for [StringComponent<'a>; $n] {
+ #[inline(always)]
+ fn serialized_size(&self) -> usize {
+ (&self[..]).serialized_size()
+ }
+
+ #[inline(always)]
+ fn serialize(&self, bytes: &mut [u8]) {
+ (&self[..]).serialize(bytes);
+ }
+ }
+ };
+}
+
+impl_serializable_string_for_fixed_size!(0);
+impl_serializable_string_for_fixed_size!(1);
+impl_serializable_string_for_fixed_size!(2);
+impl_serializable_string_for_fixed_size!(3);
+impl_serializable_string_for_fixed_size!(4);
+impl_serializable_string_for_fixed_size!(5);
+impl_serializable_string_for_fixed_size!(6);
+impl_serializable_string_for_fixed_size!(7);
+impl_serializable_string_for_fixed_size!(8);
+impl_serializable_string_for_fixed_size!(9);
+impl_serializable_string_for_fixed_size!(10);
+impl_serializable_string_for_fixed_size!(11);
+impl_serializable_string_for_fixed_size!(12);
+impl_serializable_string_for_fixed_size!(13);
+impl_serializable_string_for_fixed_size!(14);
+impl_serializable_string_for_fixed_size!(15);
+impl_serializable_string_for_fixed_size!(16);
+
+fn serialize_index_entry(sink: &SerializationSink, id: StringId, addr: Addr) {
+ sink.write_atomic(8, |bytes| {
+ bytes[0..4].copy_from_slice(&id.0.to_le_bytes());
+ bytes[4..8].copy_from_slice(&addr.0.to_le_bytes());
+ });
+}
+
+impl StringTableBuilder {
+ pub fn new(
+ data_sink: Arc<SerializationSink>,
+ index_sink: Arc<SerializationSink>,
+ ) -> Result<StringTableBuilder, Box<dyn Error + Send + Sync>> {
+ // The first thing in every stream we generate must be the stream header.
+ write_file_header(&mut data_sink.as_std_write(), FILE_MAGIC_STRINGTABLE_DATA)?;
+ write_file_header(&mut index_sink.as_std_write(), FILE_MAGIC_STRINGTABLE_INDEX)?;
+
+ Ok(StringTableBuilder {
+ data_sink,
+ index_sink,
+ })
+ }
+
+ /// Creates a mapping so that `virtual_id` will resolve to the contents of
+ /// `concrete_id` when reading the string table.
+ pub fn map_virtual_to_concrete_string(&self, virtual_id: StringId, concrete_id: StringId) {
+ // This assertion does not use `is_virtual` on purpose because that
+ // would also allow to overwrite `METADATA_STRING_ID`.
+ assert!(virtual_id.0 <= MAX_USER_VIRTUAL_STRING_ID);
+ serialize_index_entry(&*self.index_sink, virtual_id, concrete_id.to_addr());
+ }
+
+ pub fn bulk_map_virtual_to_single_concrete_string<I>(
+ &self,
+ virtual_ids: I,
+ concrete_id: StringId,
+ ) where
+ I: Iterator<Item = StringId> + ExactSizeIterator,
+ {
+ // TODO: Index data encoding could have a special bulk mode that assigns
+ // multiple StringIds to the same addr, so we don't have to repeat
+ // the `concrete_id` over and over.
+
+ type MappingEntry = [u32; 2];
+ assert!(std::mem::size_of::<MappingEntry>() == 8);
+
+ let to_addr_le = concrete_id.to_addr().0.to_le();
+
+ let serialized: Vec<MappingEntry> = virtual_ids
+ .map(|from| {
+ let id = from.0;
+ assert!(id <= MAX_USER_VIRTUAL_STRING_ID);
+ [id.to_le(), to_addr_le]
+ })
+ .collect();
+
+ let num_bytes = serialized.len() * std::mem::size_of::<MappingEntry>();
+ let byte_ptr = serialized.as_ptr() as *const u8;
+
+ let bytes = unsafe { std::slice::from_raw_parts(byte_ptr, num_bytes) };
+
+ self.index_sink.write_bytes_atomic(bytes);
+ }
+
+ pub fn alloc_metadata<STR: SerializableString + ?Sized>(&self, s: &STR) {
+ let concrete_id = self.alloc(s);
+ let virtual_id = StringId(METADATA_STRING_ID);
+ assert!(virtual_id.is_virtual());
+ serialize_index_entry(&*self.index_sink, virtual_id, concrete_id.to_addr());
+ }
+
+ pub fn alloc<STR: SerializableString + ?Sized>(&self, s: &STR) -> StringId {
+ let size_in_bytes = s.serialized_size();
+ let addr = self.data_sink.write_atomic(size_in_bytes, |mem| {
+ s.serialize(mem);
+ });
+
+ StringId::from_addr(addr)
+ }
+}