diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/tendril/src/fmt.rs | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1. upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/tendril/src/fmt.rs')
-rw-r--r-- | vendor/tendril/src/fmt.rs | 519 |
1 files changed, 519 insertions, 0 deletions
diff --git a/vendor/tendril/src/fmt.rs b/vendor/tendril/src/fmt.rs new file mode 100644 index 000000000..2ff04bbca --- /dev/null +++ b/vendor/tendril/src/fmt.rs @@ -0,0 +1,519 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Marker types for formats. +//! +//! This module defines the types and traits used to mark a `Tendril` +//! with the format of data it contains. It includes those formats +//! for which `Tendril` supports at least some operations without +//! conversion. +//! +//! To convert a string tendril to/from a byte tendril in an arbitrary +//! character encoding, see the `encode` and `decode` methods on +//! `Tendril`. +//! +//! `Tendril` operations may become memory-unsafe if data invalid for +//! the format sneaks in. For that reason, these traits require +//! `unsafe impl`. + +use std::default::Default; +use std::{char, mem, str}; + +use futf::{self, Codepoint, Meaning}; + +/// Implementation details. +/// +/// You don't need these unless you are implementing +/// a new format. +pub mod imp { + use std::default::Default; + use std::{iter, mem, slice}; + + /// Describes how to fix up encodings when concatenating. + /// + /// We can drop characters on either side of the splice, + /// and insert up to 4 bytes in the middle. 
+ pub struct Fixup { + pub drop_left: u32, + pub drop_right: u32, + pub insert_len: u32, + pub insert_bytes: [u8; 4], + } + + impl Default for Fixup { + #[inline(always)] + fn default() -> Fixup { + Fixup { + drop_left: 0, + drop_right: 0, + insert_len: 0, + insert_bytes: [0; 4], + } + } + } + + #[inline(always)] + unsafe fn from_u32_unchecked(n: u32) -> char { + mem::transmute(n) + } + + pub struct SingleByteCharIndices<'a> { + inner: iter::Enumerate<slice::Iter<'a, u8>>, + } + + impl<'a> Iterator for SingleByteCharIndices<'a> { + type Item = (usize, char); + + #[inline] + fn next(&mut self) -> Option<(usize, char)> { + self.inner + .next() + .map(|(i, &b)| unsafe { (i, from_u32_unchecked(b as u32)) }) + } + } + + impl<'a> SingleByteCharIndices<'a> { + #[inline] + pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> { + SingleByteCharIndices { + inner: buf.iter().enumerate(), + } + } + } +} + +/// Trait for format marker types. +/// +/// The type implementing this trait is usually not instantiated. +/// It's used with a phantom type parameter of `Tendril`. +pub unsafe trait Format { + /// Check whether the buffer is valid for this format. + fn validate(buf: &[u8]) -> bool; + + /// Check whether the buffer is valid for this format. + /// + /// You may assume the buffer is a prefix of a valid buffer. + #[inline] + fn validate_prefix(buf: &[u8]) -> bool { + <Self as Format>::validate(buf) + } + + /// Check whether the buffer is valid for this format. + /// + /// You may assume the buffer is a suffix of a valid buffer. + #[inline] + fn validate_suffix(buf: &[u8]) -> bool { + <Self as Format>::validate(buf) + } + + /// Check whether the buffer is valid for this format. + /// + /// You may assume the buffer is a contiguous subsequence + /// of a valid buffer, but not necessarily a prefix or + /// a suffix. + #[inline] + fn validate_subseq(buf: &[u8]) -> bool { + <Self as Format>::validate(buf) + } + + /// Compute any fixup needed when concatenating buffers. 
+ /// + /// The default is to do nothing. + /// + /// The function is `unsafe` because it may assume the input + /// buffers are already valid for the format. Also, no + /// bounds-checking is performed on the return value! + #[inline(always)] + unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup { + Default::default() + } +} + +/// Indicates that one format is a subset of another. +/// +/// The subset format can be converted to the superset format +/// for free. +pub unsafe trait SubsetOf<Super>: Format +where + Super: Format, +{ + /// Validate the *other* direction of conversion; check if + /// this buffer from the superset format conforms to the + /// subset format. + /// + /// The default calls `Self::validate`, but some conversions + /// may implement a check which is cheaper than validating + /// from scratch. + fn revalidate_subset(x: &[u8]) -> bool { + Self::validate(x) + } +} + +/// Indicates a format which corresponds to a Rust slice type, +/// representing exactly the same invariants. +pub unsafe trait SliceFormat: Format + Sized { + type Slice: ?Sized + Slice; +} + +/// Indicates a format which contains characters from Unicode +/// (all of it, or some proper subset). +pub unsafe trait CharFormat<'a>: Format { + /// Iterator for characters and their byte indices. + type Iter: Iterator<Item = (usize, char)>; + + /// Iterate over the characters of the string and their byte + /// indices. + /// + /// You may assume the buffer is *already validated* for `Format`. + unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter; + + /// Encode the character as bytes and pass them to a continuation. + /// + /// Returns `Err(())` iff the character cannot be represented. + fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]); +} + +/// Indicates a Rust slice type that is represented in memory as bytes. +pub unsafe trait Slice { + /// Access the raw bytes of the slice. 
+ fn as_bytes(&self) -> &[u8]; + + /// Convert a byte slice to this kind of slice. + /// + /// You may assume the buffer is *already validated* + /// for `Format`. + unsafe fn from_bytes(x: &[u8]) -> &Self; + + /// Convert a byte slice to this kind of slice. + /// + /// You may assume the buffer is *already validated* + /// for `Format`. + unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self; +} + +/// Marker type for uninterpreted bytes. +/// +/// Validation will never fail for this format. +#[derive(Copy, Clone, Default, Debug)] +pub struct Bytes; + +unsafe impl Format for Bytes { + #[inline(always)] + fn validate(_: &[u8]) -> bool { + true + } +} + +unsafe impl SliceFormat for Bytes { + type Slice = [u8]; +} + +unsafe impl Slice for [u8] { + #[inline(always)] + fn as_bytes(&self) -> &[u8] { + self + } + + #[inline(always)] + unsafe fn from_bytes(x: &[u8]) -> &[u8] { + x + } + + #[inline(always)] + unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] { + x + } +} + +/// Marker type for ASCII text. +#[derive(Copy, Clone, Default, Debug)] +pub struct ASCII; + +unsafe impl Format for ASCII { + #[inline] + fn validate(buf: &[u8]) -> bool { + buf.iter().all(|&n| n <= 127) + } + + #[inline(always)] + fn validate_prefix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_suffix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_subseq(_: &[u8]) -> bool { + true + } +} + +unsafe impl SubsetOf<UTF8> for ASCII {} +unsafe impl SubsetOf<Latin1> for ASCII {} + +unsafe impl<'a> CharFormat<'a> for ASCII { + type Iter = imp::SingleByteCharIndices<'a>; + + #[inline] + unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { + imp::SingleByteCharIndices::new(buf) + } + + #[inline] + fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]), + { + let n = ch as u32; + if n > 0x7F { + return Err(()); + } + cont(&[n as u8]); + Ok(()) + } +} + +/// Marker type for UTF-8 text. 
+#[derive(Copy, Clone, Default, Debug)] +pub struct UTF8; + +unsafe impl Format for UTF8 { + #[inline] + fn validate(buf: &[u8]) -> bool { + str::from_utf8(buf).is_ok() + } + + #[inline] + fn validate_prefix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, buf.len() - 1) { + Some(Codepoint { + meaning: Meaning::Whole(_), + .. + }) => true, + _ => false, + } + } + + #[inline] + fn validate_suffix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, 0) { + Some(Codepoint { + meaning: Meaning::Whole(_), + .. + }) => true, + _ => false, + } + } + + #[inline] + fn validate_subseq(buf: &[u8]) -> bool { + <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf) + } +} + +unsafe impl SubsetOf<WTF8> for UTF8 {} + +unsafe impl SliceFormat for UTF8 { + type Slice = str; +} + +unsafe impl Slice for str { + #[inline(always)] + fn as_bytes(&self) -> &[u8] { + str::as_bytes(self) + } + + #[inline(always)] + unsafe fn from_bytes(x: &[u8]) -> &str { + str::from_utf8_unchecked(x) + } + + #[inline(always)] + unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str { + mem::transmute(x) + } +} + +unsafe impl<'a> CharFormat<'a> for UTF8 { + type Iter = str::CharIndices<'a>; + + #[inline] + unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> { + str::from_utf8_unchecked(buf).char_indices() + } + + #[inline] + fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]), + { + cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes()); + Ok(()) + } +} + +/// Marker type for WTF-8 text. +/// +/// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/). 
+#[derive(Copy, Clone, Default, Debug)] +pub struct WTF8; + +#[inline] +fn wtf8_meaningful(m: Meaning) -> bool { + match m { + Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true, + _ => false, + } +} + +unsafe impl Format for WTF8 { + #[inline] + fn validate(buf: &[u8]) -> bool { + let mut i = 0; + let mut prev_lead = false; + while i < buf.len() { + let codept = unwrap_or_return!(futf::classify(buf, i), false); + if !wtf8_meaningful(codept.meaning) { + return false; + } + i += codept.bytes.len(); + prev_lead = match codept.meaning { + Meaning::TrailSurrogate(_) if prev_lead => return false, + Meaning::LeadSurrogate(_) => true, + _ => false, + }; + } + + true + } + + #[inline] + fn validate_prefix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, buf.len() - 1) { + Some(c) => wtf8_meaningful(c.meaning), + _ => false, + } + } + + #[inline] + fn validate_suffix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, 0) { + Some(c) => wtf8_meaningful(c.meaning), + _ => false, + } + } + + #[inline] + fn validate_subseq(buf: &[u8]) -> bool { + <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf) + } + + #[inline] + unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup { + const ERR: &'static str = "WTF8: internal error"; + + if lhs.len() >= 3 && rhs.len() >= 3 { + if let ( + Some(Codepoint { + meaning: Meaning::LeadSurrogate(hi), + .. + }), + Some(Codepoint { + meaning: Meaning::TrailSurrogate(lo), + .. 
+ }), + ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0)) + { + let mut fixup = imp::Fixup { + drop_left: 3, + drop_right: 3, + insert_len: 0, + insert_bytes: [0_u8; 4], + }; + + let n = 0x10000 + ((hi as u32) << 10) + (lo as u32); + + let ch = char::from_u32(n).expect(ERR); + fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32; + + return fixup; + } + } + + Default::default() + } +} + +/// Marker type for the single-byte encoding of the first 256 Unicode codepoints. +/// +/// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the +/// C0 and C1 control characters from ECMA-48 / ISO 6429. +/// +/// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the +/// many other aliases), which actually stand for Windows-1252. +#[derive(Copy, Clone, Default, Debug)] +pub struct Latin1; + +unsafe impl Format for Latin1 { + #[inline(always)] + fn validate(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_prefix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_suffix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_subseq(_: &[u8]) -> bool { + true + } +} + +unsafe impl<'a> CharFormat<'a> for Latin1 { + type Iter = imp::SingleByteCharIndices<'a>; + + #[inline] + unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { + imp::SingleByteCharIndices::new(buf) + } + + #[inline] + fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]), + { + let n = ch as u32; + if n > 0xFF { + return Err(()); + } + cont(&[n as u8]); + Ok(()) + } +} |