diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:22:09 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:22:09 +0000 |
commit | 43a97878ce14b72f0981164f87f2e35e14151312 (patch) | |
tree | 620249daf56c0258faa40cbdcf9cfba06de2a846 /third_party/rust/unix_str/src/lossy.rs | |
parent | Initial commit. (diff) | |
download | firefox-43a97878ce14b72f0981164f87f2e35e14151312.tar.xz firefox-43a97878ce14b72f0981164f87f2e35e14151312.zip |
Adding upstream version 110.0.1.upstream/110.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/unix_str/src/lossy.rs')
-rw-r--r-- | third_party/rust/unix_str/src/lossy.rs | 222 |
1 files changed, 222 insertions, 0 deletions
diff --git a/third_party/rust/unix_str/src/lossy.rs b/third_party/rust/unix_str/src/lossy.rs new file mode 100644 index 0000000000..270ae30d71 --- /dev/null +++ b/third_party/rust/unix_str/src/lossy.rs @@ -0,0 +1,222 @@ +use core::char; +use core::fmt::{self, Write}; +use core::mem; +use core::str as core_str; + +// https://tools.ietf.org/html/rfc3629 +static UTF8_CHAR_WIDTH: [u8; 256] = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x1F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x3F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x5F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x7F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 0x9F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 0xBF + 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, // 0xDF + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF +]; + +/// Given a first byte, determines how many bytes are in this UTF-8 character. +#[inline] +pub fn utf8_char_width(b: u8) -> usize { + UTF8_CHAR_WIDTH[b as usize] as usize +} + +/// Lossy UTF-8 string. +pub struct Utf8Lossy { + bytes: [u8], +} + +impl Utf8Lossy { + pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy { + // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required. + unsafe { mem::transmute(bytes) } + } + + pub fn chunks(&self) -> Utf8LossyChunksIter<'_> { + Utf8LossyChunksIter { + source: &self.bytes, + } + } +} + +/// Iterator over lossy UTF-8 string +#[allow(missing_debug_implementations)] +pub struct Utf8LossyChunksIter<'a> { + source: &'a [u8], +} + +#[derive(PartialEq, Eq, Debug)] +pub struct Utf8LossyChunk<'a> { + /// Sequence of valid chars. + /// Can be empty between broken UTF-8 chars. + pub valid: &'a str, + /// Single broken char, empty if none. + /// Empty iff iterator item is last. + pub broken: &'a [u8], +} + +impl<'a> Iterator for Utf8LossyChunksIter<'a> { + type Item = Utf8LossyChunk<'a>; + + fn next(&mut self) -> Option<Utf8LossyChunk<'a>> { + if self.source.is_empty() { + return None; + } + + const TAG_CONT_U8: u8 = 128; + fn safe_get(xs: &[u8], i: usize) -> u8 { + *xs.get(i).unwrap_or(&0) + } + + let mut i = 0; + while i < self.source.len() { + let i_ = i; + + // SAFETY: `i` starts at `0`, is less than `self.source.len()`, and + // only increases, so `0 <= i < self.source.len()`. + let byte = unsafe { *self.source.get_unchecked(i) }; + i += 1; + + if byte < 128 { + } else { + let w = utf8_char_width(byte); + + macro_rules! error { + () => {{ + // SAFETY: We have checked up to `i` that source is valid UTF-8. + unsafe { + let r = Utf8LossyChunk { + valid: core_str::from_utf8_unchecked(&self.source[0..i_]), + broken: &self.source[i_..i], + }; + self.source = &self.source[i..]; + return Some(r); + } + }}; + } + + match w { + 2 => { + if safe_get(self.source, i) & 192 != TAG_CONT_U8 { + error!(); + } + i += 1; + } + 3 => { + match (byte, safe_get(self.source, i)) { + (0xE0, 0xA0..=0xBF) => (), + (0xE1..=0xEC, 0x80..=0xBF) => (), + (0xED, 0x80..=0x9F) => (), + (0xEE..=0xEF, 0x80..=0xBF) => (), + _ => { + error!(); + } + } + i += 1; + if safe_get(self.source, i) & 192 != TAG_CONT_U8 { + error!(); + } + i += 1; + } + 4 => { + match (byte, safe_get(self.source, i)) { + (0xF0, 0x90..=0xBF) => (), + (0xF1..=0xF3, 0x80..=0xBF) => (), + (0xF4, 0x80..=0x8F) => (), + _ => { + error!(); + } + } + i += 1; + if safe_get(self.source, i) & 192 != TAG_CONT_U8 { + error!(); + } + i += 1; + if safe_get(self.source, i) & 192 != TAG_CONT_U8 { + error!(); + } + i += 1; + } + _ => { + error!(); + } + } + } + } + + let r = Utf8LossyChunk { + // SAFETY: We have checked that the entire source is valid UTF-8. + valid: unsafe { core_str::from_utf8_unchecked(self.source) }, + broken: &[], + }; + self.source = &[]; + Some(r) + } +} + +impl fmt::Display for Utf8Lossy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // If we're the empty string then our iterator won't actually yield + // anything, so perform the formatting manually + if self.bytes.is_empty() { + return "".fmt(f); + } + + for Utf8LossyChunk { valid, broken } in self.chunks() { + // If we successfully decoded the whole chunk as a valid string then + // we can return a direct formatting of the string which will also + // respect various formatting flags if possible. + if valid.len() == self.bytes.len() { + assert!(broken.is_empty()); + return valid.fmt(f); + } + + f.write_str(valid)?; + if !broken.is_empty() { + f.write_char(char::REPLACEMENT_CHARACTER)?; + } + } + Ok(()) + } +} + +impl fmt::Debug for Utf8Lossy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_char('"')?; + + for Utf8LossyChunk { valid, broken } in self.chunks() { + // Valid part. + // Here we partially parse UTF-8 again which is suboptimal. + { + let mut from = 0; + for (i, c) in valid.char_indices() { + let esc = c.escape_debug(); + // If char needs escaping, flush backlog so far and write, else skip + if esc.len() != 1 { + f.write_str(&valid[from..i])?; + for c in esc { + f.write_char(c)?; + } + from = i + c.len_utf8(); + } + } + f.write_str(&valid[from..])?; + } + + // Broken parts of string as hex escape. + for &b in broken { + write!(f, "\\x{:02x}", b)?; + } + } + + f.write_char('"') + } +} |