diff options
Diffstat (limited to 'vendor/tendril/src')
-rw-r--r-- | vendor/tendril/src/bench.rs | 159 | ||||
-rw-r--r-- | vendor/tendril/src/buf32.rs | 120 | ||||
-rw-r--r-- | vendor/tendril/src/fmt.rs | 519 | ||||
-rw-r--r-- | vendor/tendril/src/lib.rs | 35 | ||||
-rw-r--r-- | vendor/tendril/src/stream.rs | 752 | ||||
-rw-r--r-- | vendor/tendril/src/tendril.rs | 2472 | ||||
-rw-r--r-- | vendor/tendril/src/utf8_decode.rs | 98 | ||||
-rw-r--r-- | vendor/tendril/src/util.rs | 45 |
8 files changed, 4200 insertions, 0 deletions
diff --git a/vendor/tendril/src/bench.rs b/vendor/tendril/src/bench.rs new file mode 100644 index 000000000..a9d2c30af --- /dev/null +++ b/vendor/tendril/src/bench.rs @@ -0,0 +1,159 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::borrow::ToOwned; +use std::collections::hash_map::{Entry, HashMap}; + +use tendril::StrTendril; + +fn index_words_string(input: &String) -> HashMap<char, Vec<String>> { + let mut index = HashMap::new(); + for word in input.split(|c| c == ' ') { + if word.len() == 0 { + continue; + } + let word = word.to_owned(); + match index.entry(word.chars().next().unwrap()) { + Entry::Occupied(mut e) => { + let x: &mut Vec<String> = e.get_mut(); + x.push(word); + } + Entry::Vacant(e) => { + e.insert(vec![word]); + } + } + } + index +} + +fn index_words_tendril(input: &StrTendril) -> HashMap<char, Vec<StrTendril>> { + let mut index = HashMap::new(); + let mut t = input.clone(); + loop { + match t.pop_front_char_run(|c| c != ' ') { + None => return index, + Some((_, false)) => (), + Some((word, true)) => match index.entry(word.chars().next().unwrap()) { + Entry::Occupied(mut e) => { + e.get_mut().push(word); + } + Entry::Vacant(e) => { + e.insert(vec![word]); + } + }, + } + } +} + +static EN_1: &'static str = "Days turn to nights turn to paper into rocks into plastic"; + +static EN_2: &'static str = + "Here the notes in my laboratory journal cease. I was able to write the last \ + words only with great effort. By now it was already clear to me that LSD had \ + been the cause of the remarkable experience of the previous Friday, for the \ + altered perceptions were of the same type as before, only much more intense. I \ + had to struggle to speak intelligibly. I asked my laboratory assistant, who was \ + informed of the self-experiment, to escort me home. We went by bicycle, no \ + automobile being available because of wartime restrictions on their use. On the \ + way home, my condition began to assume threatening forms. Everything in my \ + field of vision wavered and was distorted as if seen in a curved mirror. I also \ + had the sensation of being unable to move from the spot. Nevertheless, my \ + assistant later told me that we had traveled very rapidly. Finally, we arrived \ + at home safe and sound, and I was just barely capable of asking my companion to \ + summon our family doctor and request milk from the neighbors.\n\n\ + In spite of my delirious, bewildered condition, I had brief periods of clear \ + and effective thinking—and chose milk as a nonspecific antidote for poisoning."; + +static KR_1: &'static str = + "러스트(Rust)는 모질라(mozilla.org)에서 개발하고 있는, 메모리-안전하고 병렬 \ + 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. 아직 \ + 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다."; + +static HTML_KR_1: &'static str = + "<p>러스트(<a href=\"http://rust-lang.org\">Rust</a>)는 모질라(<a href=\"\ + https://www.mozilla.org/\">mozilla.org</a>)에서 개발하고 있는, \ + 메모리-안전하고 병렬 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. \ + 아직 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다.</p>"; + +mod index_words { + macro_rules! bench { + ($txt:ident) => { + #[allow(non_snake_case)] + mod $txt { + const SMALL_SIZE: usize = 65536; + const LARGE_SIZE: usize = (1 << 20); + + #[bench] + fn index_words_string(b: &mut ::test::Bencher) { + let mut s = String::new(); + while s.len() < SMALL_SIZE { + s.push_str(::tendril::bench::$txt); + } + b.iter(|| ::tendril::bench::index_words_string(&s)); + } + + #[bench] + fn index_words_tendril(b: &mut ::test::Bencher) { + let mut t = ::tendril::StrTendril::new(); + while t.len() < SMALL_SIZE { + t.push_slice(::tendril::bench::$txt); + } + b.iter(|| ::tendril::bench::index_words_tendril(&t)); + } + + #[bench] + fn index_words_big_string(b: &mut ::test::Bencher) { + let mut s = String::new(); + while s.len() < LARGE_SIZE { + s.push_str(::tendril::bench::$txt); + } + b.iter(|| ::tendril::bench::index_words_string(&s)); + } + + #[bench] + fn index_words_big_tendril(b: &mut ::test::Bencher) { + let mut t = ::tendril::StrTendril::new(); + while t.len() < LARGE_SIZE { + t.push_slice(::tendril::bench::$txt); + } + b.iter(|| ::tendril::bench::index_words_tendril(&t)); + } + + #[test] + fn correctness() { + use std::borrow::ToOwned; + use tendril::bench::{index_words_string, index_words_tendril}; + use tendril::SliceExt; + + let txt = ::tendril::bench::$txt; + let input_string = txt.to_owned(); + let count_s = index_words_string(&input_string); + let mut keys: Vec<char> = count_s.keys().cloned().collect(); + keys.sort(); + + let input_tendril = txt.to_tendril(); + let count_t = index_words_tendril(&input_tendril); + let mut keys_t: Vec<char> = count_t.keys().cloned().collect(); + keys_t.sort(); + + assert_eq!(keys, keys_t); + + for k in &keys { + let vs = &count_s[k]; + let vt = &count_t[k]; + assert_eq!(vs.len(), vt.len()); + assert!(vs.iter().zip(vt.iter()).all(|(s, t)| **s == **t)); + } + } + } + }; + } + + bench!(EN_1); + bench!(EN_2); + bench!(KR_1); + bench!(HTML_KR_1); +} diff --git a/vendor/tendril/src/buf32.rs b/vendor/tendril/src/buf32.rs new file mode 100644 index 000000000..d60a277a1 --- /dev/null +++ b/vendor/tendril/src/buf32.rs @@ -0,0 +1,120 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Provides an unsafe owned buffer type, used in implementing `Tendril`. + +use std::{mem, ptr, slice, u32}; + +use OFLOW; + +pub const MIN_CAP: u32 = 16; + +pub const MAX_LEN: usize = u32::MAX as usize; + +/// A buffer points to a header of type `H`, which is followed by `MIN_CAP` or more +/// bytes of storage. +pub struct Buf32<H> { + pub ptr: *mut H, + pub len: u32, + pub cap: u32, +} + +#[inline(always)] +fn bytes_to_vec_capacity<H>(x: u32) -> usize { + let header = mem::size_of::<H>(); + debug_assert!(header > 0); + let x = (x as usize).checked_add(header).expect(OFLOW); + // Integer ceil https://stackoverflow.com/a/2745086/1162888 + 1 + ((x - 1) / header) +} + +impl<H> Buf32<H> { + #[inline] + pub unsafe fn with_capacity(mut cap: u32, h: H) -> Buf32<H> { + if cap < MIN_CAP { + cap = MIN_CAP; + } + + let mut vec = Vec::<H>::with_capacity(bytes_to_vec_capacity::<H>(cap)); + let ptr = vec.as_mut_ptr(); + mem::forget(vec); + ptr::write(ptr, h); + + Buf32 { + ptr: ptr, + len: 0, + cap: cap, + } + } + + #[inline] + pub unsafe fn destroy(self) { + mem::drop(Vec::from_raw_parts( + self.ptr, + 1, + bytes_to_vec_capacity::<H>(self.cap), + )); + } + + #[inline(always)] + pub unsafe fn data_ptr(&self) -> *mut u8 { + (self.ptr as *mut u8).offset(mem::size_of::<H>() as isize) + } + + #[inline(always)] + pub unsafe fn data(&self) -> &[u8] { + slice::from_raw_parts(self.data_ptr(), self.len as usize) + } + + #[inline(always)] + pub unsafe fn data_mut(&mut self) -> &mut [u8] { + slice::from_raw_parts_mut(self.data_ptr(), self.len as usize) + } + + /// Grow the capacity to at least `new_cap`. + /// + /// This will panic if the capacity calculation overflows `u32`. + #[inline] + pub unsafe fn grow(&mut self, new_cap: u32) { + if new_cap <= self.cap { + return; + } + + let new_cap = new_cap.checked_next_power_of_two().expect(OFLOW); + let mut vec = Vec::from_raw_parts(self.ptr, 0, bytes_to_vec_capacity::<H>(self.cap)); + vec.reserve_exact(bytes_to_vec_capacity::<H>(new_cap)); + self.ptr = vec.as_mut_ptr(); + self.cap = new_cap; + mem::forget(vec); + } +} + +#[cfg(test)] +mod test { + use super::Buf32; + use std::ptr; + + #[test] + fn smoke_test() { + unsafe { + let mut b = Buf32::with_capacity(0, 0u8); + assert_eq!(b"", b.data()); + + b.grow(5); + ptr::copy_nonoverlapping(b"Hello".as_ptr(), b.data_ptr(), 5); + + assert_eq!(b"", b.data()); + b.len = 5; + assert_eq!(b"Hello", b.data()); + + b.grow(1337); + assert!(b.cap >= 1337); + assert_eq!(b"Hello", b.data()); + + b.destroy(); + } + } +} diff --git a/vendor/tendril/src/fmt.rs b/vendor/tendril/src/fmt.rs new file mode 100644 index 000000000..2ff04bbca --- /dev/null +++ b/vendor/tendril/src/fmt.rs @@ -0,0 +1,519 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Marker types for formats. +//! +//! This module defines the types and traits used to mark a `Tendril` +//! with the format of data it contains. It includes those formats +//! for which `Tendril` supports at least some operations without +//! conversion. +//! +//! To convert a string tendril to/from a byte tendril in an arbitrary +//! character encoding, see the `encode` and `decode` methods on +//! `Tendril`. +//! +//! `Tendril` operations may become memory-unsafe if data invalid for +//! the format sneaks in. For that reason, these traits require +//! `unsafe impl`. + +use std::default::Default; +use std::{char, mem, str}; + +use futf::{self, Codepoint, Meaning}; + +/// Implementation details. +/// +/// You don't need these unless you are implementing +/// a new format. +pub mod imp { + use std::default::Default; + use std::{iter, mem, slice}; + + /// Describes how to fix up encodings when concatenating. + /// + /// We can drop characters on either side of the splice, + /// and insert up to 4 bytes in the middle. + pub struct Fixup { + pub drop_left: u32, + pub drop_right: u32, + pub insert_len: u32, + pub insert_bytes: [u8; 4], + } + + impl Default for Fixup { + #[inline(always)] + fn default() -> Fixup { + Fixup { + drop_left: 0, + drop_right: 0, + insert_len: 0, + insert_bytes: [0; 4], + } + } + } + + #[inline(always)] + unsafe fn from_u32_unchecked(n: u32) -> char { + mem::transmute(n) + } + + pub struct SingleByteCharIndices<'a> { + inner: iter::Enumerate<slice::Iter<'a, u8>>, + } + + impl<'a> Iterator for SingleByteCharIndices<'a> { + type Item = (usize, char); + + #[inline] + fn next(&mut self) -> Option<(usize, char)> { + self.inner + .next() + .map(|(i, &b)| unsafe { (i, from_u32_unchecked(b as u32)) }) + } + } + + impl<'a> SingleByteCharIndices<'a> { + #[inline] + pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> { + SingleByteCharIndices { + inner: buf.iter().enumerate(), + } + } + } +} + +/// Trait for format marker types. +/// +/// The type implementing this trait is usually not instantiated. +/// It's used with a phantom type parameter of `Tendril`. +pub unsafe trait Format { + /// Check whether the buffer is valid for this format. + fn validate(buf: &[u8]) -> bool; + + /// Check whether the buffer is valid for this format. + /// + /// You may assume the buffer is a prefix of a valid buffer. + #[inline] + fn validate_prefix(buf: &[u8]) -> bool { + <Self as Format>::validate(buf) + } + + /// Check whether the buffer is valid for this format. + /// + /// You may assume the buffer is a suffix of a valid buffer. + #[inline] + fn validate_suffix(buf: &[u8]) -> bool { + <Self as Format>::validate(buf) + } + + /// Check whether the buffer is valid for this format. + /// + /// You may assume the buffer is a contiguous subsequence + /// of a valid buffer, but not necessarily a prefix or + /// a suffix. + #[inline] + fn validate_subseq(buf: &[u8]) -> bool { + <Self as Format>::validate(buf) + } + + /// Compute any fixup needed when concatenating buffers. + /// + /// The default is to do nothing. + /// + /// The function is `unsafe` because it may assume the input + /// buffers are already valid for the format. Also, no + /// bounds-checking is performed on the return value! + #[inline(always)] + unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup { + Default::default() + } +} + +/// Indicates that one format is a subset of another. +/// +/// The subset format can be converted to the superset format +/// for free. +pub unsafe trait SubsetOf<Super>: Format +where + Super: Format, +{ + /// Validate the *other* direction of conversion; check if + /// this buffer from the superset format conforms to the + /// subset format. + /// + /// The default calls `Self::validate`, but some conversions + /// may implement a check which is cheaper than validating + /// from scratch. + fn revalidate_subset(x: &[u8]) -> bool { + Self::validate(x) + } +} + +/// Indicates a format which corresponds to a Rust slice type, +/// representing exactly the same invariants. +pub unsafe trait SliceFormat: Format + Sized { + type Slice: ?Sized + Slice; +} + +/// Indicates a format which contains characters from Unicode +/// (all of it, or some proper subset). +pub unsafe trait CharFormat<'a>: Format { + /// Iterator for characters and their byte indices. + type Iter: Iterator<Item = (usize, char)>; + + /// Iterate over the characters of the string and their byte + /// indices. + /// + /// You may assume the buffer is *already validated* for `Format`. + unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter; + + /// Encode the character as bytes and pass them to a continuation. + /// + /// Returns `Err(())` iff the character cannot be represented. + fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]); +} + +/// Indicates a Rust slice type that is represented in memory as bytes. +pub unsafe trait Slice { + /// Access the raw bytes of the slice. + fn as_bytes(&self) -> &[u8]; + + /// Convert a byte slice to this kind of slice. + /// + /// You may assume the buffer is *already validated* + /// for `Format`. + unsafe fn from_bytes(x: &[u8]) -> &Self; + + /// Convert a byte slice to this kind of slice. + /// + /// You may assume the buffer is *already validated* + /// for `Format`. + unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self; +} + +/// Marker type for uninterpreted bytes. +/// +/// Validation will never fail for this format. +#[derive(Copy, Clone, Default, Debug)] +pub struct Bytes; + +unsafe impl Format for Bytes { + #[inline(always)] + fn validate(_: &[u8]) -> bool { + true + } +} + +unsafe impl SliceFormat for Bytes { + type Slice = [u8]; +} + +unsafe impl Slice for [u8] { + #[inline(always)] + fn as_bytes(&self) -> &[u8] { + self + } + + #[inline(always)] + unsafe fn from_bytes(x: &[u8]) -> &[u8] { + x + } + + #[inline(always)] + unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] { + x + } +} + +/// Marker type for ASCII text. +#[derive(Copy, Clone, Default, Debug)] +pub struct ASCII; + +unsafe impl Format for ASCII { + #[inline] + fn validate(buf: &[u8]) -> bool { + buf.iter().all(|&n| n <= 127) + } + + #[inline(always)] + fn validate_prefix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_suffix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_subseq(_: &[u8]) -> bool { + true + } +} + +unsafe impl SubsetOf<UTF8> for ASCII {} +unsafe impl SubsetOf<Latin1> for ASCII {} + +unsafe impl<'a> CharFormat<'a> for ASCII { + type Iter = imp::SingleByteCharIndices<'a>; + + #[inline] + unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { + imp::SingleByteCharIndices::new(buf) + } + + #[inline] + fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]), + { + let n = ch as u32; + if n > 0x7F { + return Err(()); + } + cont(&[n as u8]); + Ok(()) + } +} + +/// Marker type for UTF-8 text. +#[derive(Copy, Clone, Default, Debug)] +pub struct UTF8; + +unsafe impl Format for UTF8 { + #[inline] + fn validate(buf: &[u8]) -> bool { + str::from_utf8(buf).is_ok() + } + + #[inline] + fn validate_prefix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, buf.len() - 1) { + Some(Codepoint { + meaning: Meaning::Whole(_), + .. + }) => true, + _ => false, + } + } + + #[inline] + fn validate_suffix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, 0) { + Some(Codepoint { + meaning: Meaning::Whole(_), + .. + }) => true, + _ => false, + } + } + + #[inline] + fn validate_subseq(buf: &[u8]) -> bool { + <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf) + } +} + +unsafe impl SubsetOf<WTF8> for UTF8 {} + +unsafe impl SliceFormat for UTF8 { + type Slice = str; +} + +unsafe impl Slice for str { + #[inline(always)] + fn as_bytes(&self) -> &[u8] { + str::as_bytes(self) + } + + #[inline(always)] + unsafe fn from_bytes(x: &[u8]) -> &str { + str::from_utf8_unchecked(x) + } + + #[inline(always)] + unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str { + mem::transmute(x) + } +} + +unsafe impl<'a> CharFormat<'a> for UTF8 { + type Iter = str::CharIndices<'a>; + + #[inline] + unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> { + str::from_utf8_unchecked(buf).char_indices() + } + + #[inline] + fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]), + { + cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes()); + Ok(()) + } +} + +/// Marker type for WTF-8 text. +/// +/// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/). +#[derive(Copy, Clone, Default, Debug)] +pub struct WTF8; + +#[inline] +fn wtf8_meaningful(m: Meaning) -> bool { + match m { + Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true, + _ => false, + } +} + +unsafe impl Format for WTF8 { + #[inline] + fn validate(buf: &[u8]) -> bool { + let mut i = 0; + let mut prev_lead = false; + while i < buf.len() { + let codept = unwrap_or_return!(futf::classify(buf, i), false); + if !wtf8_meaningful(codept.meaning) { + return false; + } + i += codept.bytes.len(); + prev_lead = match codept.meaning { + Meaning::TrailSurrogate(_) if prev_lead => return false, + Meaning::LeadSurrogate(_) => true, + _ => false, + }; + } + + true + } + + #[inline] + fn validate_prefix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, buf.len() - 1) { + Some(c) => wtf8_meaningful(c.meaning), + _ => false, + } + } + + #[inline] + fn validate_suffix(buf: &[u8]) -> bool { + if buf.len() == 0 { + return true; + } + match futf::classify(buf, 0) { + Some(c) => wtf8_meaningful(c.meaning), + _ => false, + } + } + + #[inline] + fn validate_subseq(buf: &[u8]) -> bool { + <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf) + } + + #[inline] + unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup { + const ERR: &'static str = "WTF8: internal error"; + + if lhs.len() >= 3 && rhs.len() >= 3 { + if let ( + Some(Codepoint { + meaning: Meaning::LeadSurrogate(hi), + .. + }), + Some(Codepoint { + meaning: Meaning::TrailSurrogate(lo), + .. + }), + ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0)) + { + let mut fixup = imp::Fixup { + drop_left: 3, + drop_right: 3, + insert_len: 0, + insert_bytes: [0_u8; 4], + }; + + let n = 0x10000 + ((hi as u32) << 10) + (lo as u32); + + let ch = char::from_u32(n).expect(ERR); + fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32; + + return fixup; + } + } + + Default::default() + } +} + +/// Marker type for the single-byte encoding of the first 256 Unicode codepoints. +/// +/// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the +/// C0 and C1 control characters from ECMA-48 / ISO 6429. +/// +/// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the +/// many other aliases), which actually stand for Windows-1252. +#[derive(Copy, Clone, Default, Debug)] +pub struct Latin1; + +unsafe impl Format for Latin1 { + #[inline(always)] + fn validate(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_prefix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_suffix(_: &[u8]) -> bool { + true + } + + #[inline(always)] + fn validate_subseq(_: &[u8]) -> bool { + true + } +} + +unsafe impl<'a> CharFormat<'a> for Latin1 { + type Iter = imp::SingleByteCharIndices<'a>; + + #[inline] + unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { + imp::SingleByteCharIndices::new(buf) + } + + #[inline] + fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> + where + F: FnOnce(&[u8]), + { + let n = ch as u32; + if n > 0xFF { + return Err(()); + } + cont(&[n as u8]); + Ok(()) + } +} diff --git a/vendor/tendril/src/lib.rs b/vendor/tendril/src/lib.rs new file mode 100644 index 000000000..33782fdc2 --- /dev/null +++ b/vendor/tendril/src/lib.rs @@ -0,0 +1,35 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![cfg_attr(all(test, feature = "bench"), feature(test))] +//#![cfg_attr(test, deny(warnings))] + +#[cfg(feature = "encoding")] +pub extern crate encoding; +#[cfg(feature = "encoding_rs")] +pub extern crate encoding_rs; +#[cfg(all(test, feature = "bench"))] +extern crate test; +#[macro_use] +extern crate mac; +extern crate futf; +extern crate utf8; + +pub use fmt::Format; +pub use stream::TendrilSink; +pub use tendril::{Atomic, Atomicity, NonAtomic, SendTendril}; +pub use tendril::{ByteTendril, ReadExt, SliceExt, StrTendril, SubtendrilError, Tendril}; +pub use utf8_decode::IncompleteUtf8; + +pub mod fmt; +pub mod stream; + +mod buf32; +mod tendril; +mod utf8_decode; +mod util; + +static OFLOW: &'static str = "tendril: overflow in buffer arithmetic"; diff --git a/vendor/tendril/src/stream.rs b/vendor/tendril/src/stream.rs new file mode 100644 index 000000000..469d58c9b --- /dev/null +++ b/vendor/tendril/src/stream.rs @@ -0,0 +1,752 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Streams of tendrils. + +use fmt; +use tendril::{Atomicity, NonAtomic, Tendril}; + +use std::borrow::Cow; +use std::fs::File; +use std::io; +use std::marker::PhantomData; +use std::path::Path; + +#[cfg(feature = "encoding")] +use encoding; +#[cfg(feature = "encoding_rs")] +use encoding_rs::{self, DecoderResult}; +use utf8; + +/// Trait for types that can process a tendril. +/// +/// This is a "push" interface, unlike the "pull" interface of +/// `Iterator<Item=Tendril<F>>`. The push interface matches +/// [html5ever][] and other incremental parsers with a similar +/// architecture. +/// +/// [html5ever]: https://github.com/servo/html5ever +pub trait TendrilSink<F, A = NonAtomic> +where + F: fmt::Format, + A: Atomicity, +{ + /// Process this tendril. + fn process(&mut self, t: Tendril<F, A>); + + /// Indicates that an error has occurred. + fn error(&mut self, desc: Cow<'static, str>); + + /// What the overall result of processing is. + type Output; + + /// Indicates the end of the stream. + fn finish(self) -> Self::Output; + + /// Process one tendril and finish. + fn one<T>(mut self, t: T) -> Self::Output + where + Self: Sized, + T: Into<Tendril<F, A>>, + { + self.process(t.into()); + self.finish() + } + + /// Consume an iterator of tendrils, processing each item, then finish. + fn from_iter<I>(mut self, i: I) -> Self::Output + where + Self: Sized, + I: IntoIterator, + I::Item: Into<Tendril<F, A>>, + { + for t in i { + self.process(t.into()) + } + self.finish() + } + + /// Read from the given stream of bytes until exhaustion and process incrementally, + /// then finish. Return `Err` at the first I/O error. + fn read_from<R>(mut self, r: &mut R) -> io::Result<Self::Output> + where + Self: Sized, + R: io::Read, + F: fmt::SliceFormat<Slice = [u8]>, + { + const BUFFER_SIZE: u32 = 4 * 1024; + loop { + let mut tendril = Tendril::<F, A>::new(); + // FIXME: this exposes uninitialized bytes to a generic R type + // this is fine for R=File which never reads these bytes, + // but user-defined types might. + // The standard library pushes zeros to `Vec<u8>` for that reason. + unsafe { + tendril.push_uninitialized(BUFFER_SIZE); + } + loop { + match r.read(&mut tendril) { + Ok(0) => return Ok(self.finish()), + Ok(n) => { + tendril.pop_back(BUFFER_SIZE - n as u32); + self.process(tendril); + break; + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + } + } + } + + /// Read from the file at the given path and process incrementally, + /// then finish. Return `Err` at the first I/O error. + fn from_file<P>(self, path: P) -> io::Result<Self::Output> + where + Self: Sized, + P: AsRef<Path>, + F: fmt::SliceFormat<Slice = [u8]>, + { + self.read_from(&mut File::open(path)?) + } +} + +/// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8, +/// lossily replace ill-formed byte sequences with U+FFFD replacement characters, +/// and emits Unicode (`StrTendril`). +/// +/// This does not allocate memory: the output is either subtendrils on the input, +/// on inline tendrils for a single code point. +pub struct Utf8LossyDecoder<Sink, A = NonAtomic> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + pub inner_sink: Sink, + incomplete: Option<utf8::Incomplete>, + marker: PhantomData<A>, +} + +impl<Sink, A> Utf8LossyDecoder<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + /// Create a new incremental UTF-8 decoder. + #[inline] + pub fn new(inner_sink: Sink) -> Self { + Utf8LossyDecoder { + inner_sink: inner_sink, + incomplete: None, + marker: PhantomData, + } + } +} + +impl<Sink, A> TendrilSink<fmt::Bytes, A> for Utf8LossyDecoder<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + #[inline] + fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) { + // FIXME: remove take() and map() when non-lexical borrows are stable. + if let Some(mut incomplete) = self.incomplete.take() { + let resume_at = incomplete.try_complete(&t).map(|(result, rest)| { + match result { + Ok(s) => self.inner_sink.process(Tendril::from_slice(s)), + Err(_) => { + self.inner_sink.error("invalid byte sequence".into()); + self.inner_sink + .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + } + } + t.len() - rest.len() + }); + match resume_at { + None => { + self.incomplete = Some(incomplete); + return; + } + Some(resume_at) => t.pop_front(resume_at as u32), + } + } + while !t.is_empty() { + let unborrowed_result = match utf8::decode(&t) { + Ok(s) => { + debug_assert!(s.as_ptr() == t.as_ptr()); + debug_assert!(s.len() == t.len()); + Ok(()) + } + Err(utf8::DecodeError::Invalid { + valid_prefix, + invalid_sequence, + .. + }) => { + debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); + debug_assert!(valid_prefix.len() <= t.len()); + Err(( + valid_prefix.len(), + Err(valid_prefix.len() + invalid_sequence.len()), + )) + } + Err(utf8::DecodeError::Incomplete { + valid_prefix, + incomplete_suffix, + }) => { + debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); + debug_assert!(valid_prefix.len() <= t.len()); + Err((valid_prefix.len(), Ok(incomplete_suffix))) + } + }; + match unborrowed_result { + Ok(()) => { + unsafe { self.inner_sink.process(t.reinterpret_without_validating()) } + return; + } + Err((valid_len, and_then)) => { + if valid_len > 0 { + let subtendril = t.subtendril(0, valid_len as u32); + unsafe { + self.inner_sink + .process(subtendril.reinterpret_without_validating()) + } + } + match and_then { + Ok(incomplete) => { + self.incomplete = Some(incomplete); + return; + } + Err(offset) => { + self.inner_sink.error("invalid byte sequence".into()); + self.inner_sink + .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + t.pop_front(offset as u32); + } + } + } + } + } + } + + #[inline] + fn error(&mut self, desc: Cow<'static, str>) { + self.inner_sink.error(desc); + } + + type Output = Sink::Output; + + #[inline] + fn finish(mut self) -> Sink::Output { + if self.incomplete.is_some() { + self.inner_sink + .error("incomplete byte sequence at end of stream".into()); + self.inner_sink + .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + } + self.inner_sink.finish() + } +} + +/// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding, +/// lossily replace ill-formed byte sequences with U+FFFD replacement characters, +/// and emits Unicode (`StrTendril`). +/// +/// This allocates new tendrils for encodings other than UTF-8. +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +pub struct LossyDecoder<Sink, A = NonAtomic> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + inner: LossyDecoderInner<Sink, A>, +} + +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +enum LossyDecoderInner<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + Utf8(Utf8LossyDecoder<Sink, A>), + #[cfg(feature = "encoding")] + Encoding(Box<encoding::RawDecoder>, Sink), + #[cfg(feature = "encoding_rs")] + EncodingRs(encoding_rs::Decoder, Sink), +} + +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +impl<Sink, A> LossyDecoder<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + /// Create a new incremental decoder using the encoding crate. + #[cfg(feature = "encoding")] + #[inline] + pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self { + if encoding.name() == "utf-8" { + LossyDecoder::utf8(sink) + } else { + LossyDecoder { + inner: LossyDecoderInner::Encoding(encoding.raw_decoder(), sink), + } + } + } + + /// Create a new incremental decoder using the encoding_rs crate. + #[cfg(feature = "encoding_rs")] + #[inline] + pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self { + if encoding == encoding_rs::UTF_8 { + return Self::utf8(sink); + } + Self { + inner: LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink), + } + } + + /// Create a new incremental decoder for the UTF-8 encoding. + /// + /// This is useful for content that is known at run-time to be UTF-8 + /// (whereas `Utf8LossyDecoder` requires knowning at compile-time.) + #[inline] + pub fn utf8(sink: Sink) -> LossyDecoder<Sink, A> { + LossyDecoder { + inner: LossyDecoderInner::Utf8(Utf8LossyDecoder::new(sink)), + } + } + + /// Give a reference to the inner sink. + pub fn inner_sink(&self) -> &Sink { + match self.inner { + LossyDecoderInner::Utf8(ref utf8) => &utf8.inner_sink, + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(_, ref inner_sink) => inner_sink, + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(_, ref inner_sink) => inner_sink, + } + } + + /// Give a mutable reference to the inner sink. + pub fn inner_sink_mut(&mut self) -> &mut Sink { + match self.inner { + LossyDecoderInner::Utf8(ref mut utf8) => &mut utf8.inner_sink, + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(_, ref mut inner_sink) => inner_sink, + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(_, ref mut inner_sink) => inner_sink, + } + } +} + +#[cfg(any(feature = "encoding", feature = "encoding_rs"))] +impl<Sink, A> TendrilSink<fmt::Bytes, A> for LossyDecoder<Sink, A> +where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + #[inline] + fn process(&mut self, t: Tendril<fmt::Bytes, A>) { + match self.inner { + LossyDecoderInner::Utf8(ref mut utf8) => return utf8.process(t), + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(ref mut decoder, ref mut sink) => { + let mut out = Tendril::new(); + let mut t = t; + loop { + match decoder.raw_feed(&*t, &mut out) { + (_, Some(err)) => { + out.push_char('\u{fffd}'); + sink.error(err.cause); + debug_assert!(err.upto >= 0); + t.pop_front(err.upto as u32); + // continue loop and process remainder of t + } + (_, None) => break, + } + } + if out.len() > 0 { + sink.process(out); + } + } + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => { + if t.is_empty() { + return; + } + decode_to_sink(t, decoder, sink, false); + } + } + } + + #[inline] + fn error(&mut self, desc: Cow<'static, str>) { + match self.inner { + LossyDecoderInner::Utf8(ref mut utf8) => utf8.error(desc), + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(_, ref mut sink) => sink.error(desc), + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(_, ref mut sink) => sink.error(desc), + } + } + + type Output = Sink::Output; + + #[inline] + fn finish(self) -> Sink::Output { + match self.inner { + LossyDecoderInner::Utf8(utf8) => return utf8.finish(), + #[cfg(feature = "encoding")] + LossyDecoderInner::Encoding(mut decoder, mut sink) => { + let mut out = Tendril::new(); + if let Some(err) = decoder.raw_finish(&mut out) { + out.push_char('\u{fffd}'); + sink.error(err.cause); + } + if out.len() > 0 { + sink.process(out); + } + sink.finish() + } + #[cfg(feature = "encoding_rs")] + LossyDecoderInner::EncodingRs(mut decoder, mut sink) => { + decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true); + sink.finish() + } + } + } +} + +#[cfg(feature = "encoding_rs")] +fn decode_to_sink<Sink, A>( + mut t: Tendril<fmt::Bytes, A>, + decoder: &mut encoding_rs::Decoder, + sink: &mut Sink, + last: bool, +) where + Sink: TendrilSink<fmt::UTF8, A>, + A: Atomicity, +{ + loop { + let mut out = <Tendril<fmt::Bytes, A>>::new(); + let max_len = decoder + .max_utf8_buffer_length_without_replacement(t.len()) + .unwrap_or(8192); + unsafe { + out.push_uninitialized(std::cmp::min(max_len as u32, 8192)); + } + let (result, bytes_read, bytes_written) = + decoder.decode_to_utf8_without_replacement(&t, &mut out, last); + if bytes_written > 0 { + sink.process(unsafe { + out.subtendril(0, bytes_written as u32) + .reinterpret_without_validating() + }); + } + match result { + DecoderResult::InputEmpty => return, + DecoderResult::OutputFull => {} + DecoderResult::Malformed(_, _) => { + sink.error(Cow::Borrowed("invalid sequence")); + sink.process("\u{FFFD}".into()); + } + } + t.pop_front(bytes_read as u32); + if t.is_empty() { + return; + } + } +} + +#[cfg(test)] +mod test { + use super::{TendrilSink, Utf8LossyDecoder}; + use fmt; + use std::borrow::Cow; + use tendril::{Atomicity, NonAtomic, Tendril}; + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + use super::LossyDecoder; + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + use tendril::SliceExt; + + #[cfg(feature = "encoding")] + use encoding::all as enc; + #[cfg(feature = "encoding_rs")] + use encoding_rs as enc_rs; + + struct Accumulate<A> + where + A: Atomicity, + { + tendrils: Vec<Tendril<fmt::UTF8, A>>, + errors: Vec<String>, + } + + impl<A> Accumulate<A> + where + A: Atomicity, + { + fn new() -> Accumulate<A> { + Accumulate { + tendrils: vec![], + errors: vec![], + } + } + } + + impl<A> TendrilSink<fmt::UTF8, A> for Accumulate<A> + where + A: Atomicity, + { + fn process(&mut self, t: Tendril<fmt::UTF8, A>) { + self.tendrils.push(t); + } + + fn error(&mut self, desc: Cow<'static, str>) { + self.errors.push(desc.into_owned()); + } + + type Output = (Vec<Tendril<fmt::UTF8, A>>, Vec<String>); + + fn finish(self) -> Self::Output { + (self.tendrils, self.errors) + } + } + + fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) { + let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new()); + let (tendrils, errors) = decoder.from_iter(input.iter().cloned()); + assert_eq!( + expected, + &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>() + ); + assert_eq!(errs, errors.len()); + } + + #[test] + fn utf8() { + check_utf8(&[], &[], 0); + check_utf8(&[b""], &[], 0); + check_utf8(&[b"xyz"], &["xyz"], 0); + check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0); + + check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0); + check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); + check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); + check_utf8( + &[b"xy\xEA", b"\x99", b"\xAEzw"], + &["xy", "\u{a66e}z", "w"], + 0, + ); + check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0); + check_utf8( + &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], + &["\u{a66e}"], + 0, + ); + + check_utf8( + &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], + &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"], + 4, + ); + check_utf8( + &[b"xy\xEA\x99", b"\xFFz"], + &["xy", "\u{fffd}", "\u{fffd}", "z"], + 2, + ); + + check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0); + check_utf8( + &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"], + &["ő", "ő", "ő"], + 0, + ); + check_utf8( + &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"], + &["ő", "ő", "ő"], + 0, + ); + check_utf8( + &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"], + &["ő", "\u{fffd}", "\u{fffd}", "ő"], + 2, + ); + + // incomplete char at end of input + check_utf8(&[b"\xC0"], &["\u{fffd}"], 1); + check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1); + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + fn check_decode( + mut decoder: LossyDecoder<Accumulate<NonAtomic>>, + input: &[&[u8]], + expected: &str, + errs: usize, + ) { + for x in input { + decoder.process(x.to_tendril()); + } + let (tendrils, errors) = decoder.finish(); + let mut tendril: Tendril<fmt::UTF8> = Tendril::new(); + for t in tendrils { + tendril.push_tendril(&t); + } + assert_eq!(expected, &*tendril); + assert_eq!(errs, errors.len()); + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)]; + + #[cfg(any(feature = "encoding"))] + const ASCII: Tests = &[ + (&[], "", 0), + (&[b""], "", 0), + (&[b"xyz"], "xyz", 0), + (&[b"xy", b"", b"", b"z"], "xyz", 0), + (&[b"x", b"y", b"z"], "xyz", 0), + (&[b"\xFF"], "\u{fffd}", 1), + (&[b"x\xC0yz"], "x\u{fffd}yz", 1), + (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1), + (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3), + ]; + + #[cfg(feature = "encoding")] + #[test] + fn decode_ascii() { + for &(input, expected, errs) in ASCII { + let decoder = LossyDecoder::new(enc::ASCII, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + const UTF_8: Tests = &[ + (&[], "", 0), + (&[b""], "", 0), + (&[b"xyz"], "xyz", 0), + (&[b"x", b"y", b"z"], "xyz", 0), + (&[b"\xEA\x99\xAE"], "\u{a66e}", 0), + (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0), + (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0), + (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0), + (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0), + ( + &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], + "\u{a66e}", + 0, + ), + (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0), + ( + &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], + "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z", + 4, + ), + (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2), + // incomplete char at end of input + (&[b"\xC0"], "\u{fffd}", 1), + (&[b"\xEA\x99"], "\u{fffd}", 1), + ]; + + #[cfg(feature = "encoding")] + #[test] + fn decode_utf8() { + for &(input, expected, errs) in UTF_8 { + let decoder = LossyDecoder::new(enc::UTF_8, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(feature = "encoding_rs")] + #[test] + fn decode_utf8_encoding_rs() { + for &(input, expected, errs) in UTF_8 { + let decoder = LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + const KOI8_U: Tests = &[ + (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), + (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), + (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0), + ( + &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""], + "Энергия", + 0, + ), + ]; + + #[cfg(feature = "encoding")] + #[test] + fn decode_koi8_u() { + for &(input, expected, errs) in KOI8_U { + let decoder = LossyDecoder::new(enc::KOI8_U, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(feature = "encoding_rs")] + #[test] + fn decode_koi8_u_encoding_rs() { + for &(input, expected, errs) in KOI8_U { + let decoder = LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(any(feature = "encoding", feature = "encoding_rs"))] + const WINDOWS_949: Tests = &[ + (&[], "", 0), + (&[b""], "", 0), + (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0), + (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0), + (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0), + ( + &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"], + "안녕하세요", + 0, + ), + (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1), + (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1), + (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1), + ]; + + #[cfg(feature = "encoding")] + #[test] + fn decode_windows_949() { + for &(input, expected, errs) in WINDOWS_949 { + let decoder = LossyDecoder::new(enc::WINDOWS_949, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[cfg(feature = "encoding_rs")] + #[test] + fn decode_windows_949_encoding_rs() { + for &(input, expected, errs) in WINDOWS_949 { + let decoder = LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new()); + check_decode(decoder, input, expected, errs); + } + } + + #[test] + fn read_from() { + let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new()); + let mut bytes: &[u8] = b"foo\xffbar"; + let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap(); + assert_eq!( + &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>(), + &["foo", "\u{FFFD}", "bar"] + ); + assert_eq!(errors, &["invalid byte sequence"]); + } +} diff --git a/vendor/tendril/src/tendril.rs b/vendor/tendril/src/tendril.rs new file mode 100644 index 000000000..0941b267e --- /dev/null +++ b/vendor/tendril/src/tendril.rs @@ -0,0 +1,2472 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::borrow::Borrow; +use std::cell::{Cell, UnsafeCell}; +use std::cmp::Ordering; +use std::default::Default; +use std::fmt as strfmt; +use std::iter::FromIterator; +use std::marker::PhantomData; +use std::num::NonZeroUsize; +use std::ops::{Deref, DerefMut}; +use std::sync::atomic::Ordering as AtomicOrdering; +use std::sync::atomic::{self, AtomicUsize}; +use std::{hash, io, mem, ptr, str, u32}; + +#[cfg(feature = "encoding")] +use encoding::{self, DecoderTrap, EncoderTrap, EncodingRef}; + +use buf32::{self, Buf32}; +use fmt::imp::Fixup; +use fmt::{self, Slice}; +use util::{copy_and_advance, copy_lifetime, copy_lifetime_mut, unsafe_slice, unsafe_slice_mut}; +use OFLOW; + +const MAX_INLINE_LEN: usize = 8; +const MAX_INLINE_TAG: usize = 0xF; +const EMPTY_TAG: usize = 0xF; + +#[inline(always)] +fn inline_tag(len: u32) -> NonZeroUsize { + debug_assert!(len <= MAX_INLINE_LEN as u32); + unsafe { NonZeroUsize::new_unchecked(if len == 0 { EMPTY_TAG } else { len as usize }) } +} + +/// The multithreadedness of a tendril. +/// +/// Exactly two types implement this trait: +/// +/// - `Atomic`: use this in your tendril and you will have a `Send` tendril which works +/// across threads; this is akin to `Arc`. +/// +/// - `NonAtomic`: use this in your tendril and you will have a tendril which is neither +/// `Send` nor `Sync` but should be a tad faster; this is akin to `Rc`. +/// +/// The layout of this trait is also mandated to be that of a `usize`, +/// for it is used for reference counting. +pub unsafe trait Atomicity: 'static { + #[doc(hidden)] + fn new() -> Self; + + #[doc(hidden)] + fn increment(&self) -> usize; + + #[doc(hidden)] + fn decrement(&self) -> usize; + + #[doc(hidden)] + fn fence_acquire(); +} + +/// A marker of a non-atomic tendril. +/// +/// This is the default for the second type parameter of a `Tendril` +/// and so doesn't typically need to be written. +/// +/// This is akin to using `Rc` for reference counting. +#[repr(C)] +pub struct NonAtomic(Cell<usize>); + +unsafe impl Atomicity for NonAtomic { + #[inline] + fn new() -> Self { + NonAtomic(Cell::new(1)) + } + + #[inline] + fn increment(&self) -> usize { + let value = self.0.get(); + self.0.set(value.checked_add(1).expect(OFLOW)); + value + } + + #[inline] + fn decrement(&self) -> usize { + let value = self.0.get(); + self.0.set(value - 1); + value + } + + #[inline] + fn fence_acquire() {} +} + +/// A marker of an atomic (and hence concurrent) tendril. +/// +/// This is used as the second, optional type parameter of a `Tendril`; +/// `Tendril<F, Atomic>` thus implements`Send`. +/// +/// This is akin to using `Arc` for reference counting. +pub struct Atomic(AtomicUsize); + +unsafe impl Atomicity for Atomic { + #[inline] + fn new() -> Self { + Atomic(AtomicUsize::new(1)) + } + + #[inline] + fn increment(&self) -> usize { + // Relaxed is OK because we have a reference already. + self.0.fetch_add(1, AtomicOrdering::Relaxed) + } + + #[inline] + fn decrement(&self) -> usize { + self.0.fetch_sub(1, AtomicOrdering::Release) + } + + #[inline] + fn fence_acquire() { + atomic::fence(AtomicOrdering::Acquire); + } +} + +#[repr(C)] // Preserve field order for cross-atomicity transmutes +struct Header<A: Atomicity> { + refcount: A, + cap: u32, +} + +impl<A> Header<A> +where + A: Atomicity, +{ + #[inline(always)] + unsafe fn new() -> Header<A> { + Header { + refcount: A::new(), + cap: 0, + } + } +} + +/// Errors that can occur when slicing a `Tendril`. +#[derive(Copy, Clone, Hash, Debug, PartialEq, Eq)] +pub enum SubtendrilError { + OutOfBounds, + ValidationFailed, +} + +/// Compact string type for zero-copy parsing. +/// +/// `Tendril`s have the semantics of owned strings, but are sometimes views +/// into shared buffers. When you mutate a `Tendril`, an owned copy is made +/// if necessary. Further mutations occur in-place until the string becomes +/// shared, e.g. with `clone()` or `subtendril()`. +/// +/// Buffer sharing is accomplished through thread-local (non-atomic) reference +/// counting, which has very low overhead. The Rust type system will prevent +/// you at compile time from sending a `Tendril` between threads. We plan to +/// relax this restriction in the future; see `README.md`. +/// +/// Whereas `String` allocates in the heap for any non-empty string, `Tendril` +/// can store small strings (up to 8 bytes) in-line, without a heap allocation. +/// `Tendril` is also smaller than `String` on 64-bit platforms — 16 bytes +/// versus 24. +/// +/// The type parameter `F` specifies the format of the tendril, for example +/// UTF-8 text or uninterpreted bytes. The parameter will be instantiated +/// with one of the marker types from `tendril::fmt`. See the `StrTendril` +/// and `ByteTendril` type aliases for two examples. +/// +/// The type parameter `A` indicates the atomicity of the tendril; it is by +/// default `NonAtomic`, but can be specified as `Atomic` to get a tendril +/// which implements `Send` (viz. a thread-safe tendril). +/// +/// The maximum length of a `Tendril` is 4 GB. The library will panic if +/// you attempt to go over the limit. +#[repr(C)] +pub struct Tendril<F, A = NonAtomic> +where + F: fmt::Format, + A: Atomicity, +{ + ptr: Cell<NonZeroUsize>, + buf: UnsafeCell<Buffer>, + marker: PhantomData<*mut F>, + refcount_marker: PhantomData<A>, +} + +#[repr(C)] +union Buffer { + heap: Heap, + inline: [u8; 8], +} + +#[derive(Copy, Clone)] +#[repr(C)] +struct Heap { + len: u32, + aux: u32, +} + +unsafe impl<F, A> Send for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity + Sync, +{ +} + +/// `Tendril` for storing native Rust strings. +pub type StrTendril = Tendril<fmt::UTF8>; + +/// `Tendril` for storing binary data. +pub type ByteTendril = Tendril<fmt::Bytes>; + +impl<F, A> Clone for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn clone(&self) -> Tendril<F, A> { + unsafe { + if self.ptr.get().get() > MAX_INLINE_TAG { + self.make_buf_shared(); + self.incref(); + } + + ptr::read(self) + } + } +} + +impl<F, A> Drop for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn drop(&mut self) { + unsafe { + let p = self.ptr.get().get(); + if p <= MAX_INLINE_TAG { + return; + } + + let (buf, shared, _) = self.assume_buf(); + if shared { + let header = self.header(); + if (*header).refcount.decrement() == 1 { + A::fence_acquire(); + buf.destroy(); + } + } else { + buf.destroy(); + } + } + } +} + +macro_rules! from_iter_method { + ($ty:ty) => { + #[inline] + fn from_iter<I>(iterable: I) -> Self + where + I: IntoIterator<Item = $ty>, + { + let mut output = Self::new(); + output.extend(iterable); + output + } + }; +} + +impl<A> Extend<char> for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + #[inline] + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = char>, + { + let iterator = iterable.into_iter(); + self.force_reserve(iterator.size_hint().0 as u32); + for c in iterator { + self.push_char(c); + } + } +} + +impl<A> FromIterator<char> for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + from_iter_method!(char); +} + +impl<A> Extend<u8> for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + #[inline] + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = u8>, + { + let iterator = iterable.into_iter(); + self.force_reserve(iterator.size_hint().0 as u32); + for b in iterator { + self.push_slice(&[b]); + } + } +} + +impl<A> FromIterator<u8> for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + from_iter_method!(u8); +} + +impl<'a, A> Extend<&'a u8> for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + #[inline] + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = &'a u8>, + { + let iterator = iterable.into_iter(); + self.force_reserve(iterator.size_hint().0 as u32); + for &b in iterator { + self.push_slice(&[b]); + } + } +} + +impl<'a, A> FromIterator<&'a u8> for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + from_iter_method!(&'a u8); +} + +impl<'a, A> Extend<&'a str> for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + #[inline] + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = &'a str>, + { + for s in iterable { + self.push_slice(s); + } + } +} + +impl<'a, A> FromIterator<&'a str> for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + from_iter_method!(&'a str); +} + +impl<'a, A> Extend<&'a [u8]> for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + #[inline] + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = &'a [u8]>, + { + for s in iterable { + self.push_slice(s); + } + } +} + +impl<'a, A> FromIterator<&'a [u8]> for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + from_iter_method!(&'a [u8]); +} + +impl<'a, F, A> Extend<&'a Tendril<F, A>> for Tendril<F, A> +where + F: fmt::Format + 'a, + A: Atomicity, +{ + #[inline] + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = &'a Tendril<F, A>>, + { + for t in iterable { + self.push_tendril(t); + } + } +} + +impl<'a, F, A> FromIterator<&'a Tendril<F, A>> for Tendril<F, A> +where + F: fmt::Format + 'a, + A: Atomicity, +{ + from_iter_method!(&'a Tendril<F, A>); +} + +impl<F, A> Deref for Tendril<F, A> +where + F: fmt::SliceFormat, + A: Atomicity, +{ + type Target = F::Slice; + + #[inline] + fn deref(&self) -> &F::Slice { + unsafe { F::Slice::from_bytes(self.as_byte_slice()) } + } +} + +impl<F, A> DerefMut for Tendril<F, A> +where + F: fmt::SliceFormat, + A: Atomicity, +{ + #[inline] + fn deref_mut(&mut self) -> &mut F::Slice { + unsafe { F::Slice::from_mut_bytes(self.as_mut_byte_slice()) } + } +} + +impl<F, A> Borrow<[u8]> for Tendril<F, A> +where + F: fmt::SliceFormat, + A: Atomicity, +{ + fn borrow(&self) -> &[u8] { + self.as_byte_slice() + } +} + +// Why not impl Borrow<str> for Tendril<fmt::UTF8>? str and [u8] hash differently, +// and so a HashMap<StrTendril, _> would silently break if we indexed by str. Ick. +// https://github.com/rust-lang/rust/issues/27108 + +impl<F, A> PartialEq for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn eq(&self, other: &Self) -> bool { + self.as_byte_slice() == other.as_byte_slice() + } + + #[inline] + fn ne(&self, other: &Self) -> bool { + self.as_byte_slice() != other.as_byte_slice() + } +} + +impl<F, A> Eq for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ +} + +impl<F, A> PartialOrd for Tendril<F, A> +where + F: fmt::SliceFormat, + <F as fmt::SliceFormat>::Slice: PartialOrd, + A: Atomicity, +{ + #[inline] + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + PartialOrd::partial_cmp(&**self, &**other) + } +} + +impl<F, A> Ord for Tendril<F, A> +where + F: fmt::SliceFormat, + <F as fmt::SliceFormat>::Slice: Ord, + A: Atomicity, +{ + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + Ord::cmp(&**self, &**other) + } +} + +impl<F, A> Default for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + #[inline(always)] + fn default() -> Tendril<F, A> { + Tendril::new() + } +} + +impl<F, A> strfmt::Debug for Tendril<F, A> +where + F: fmt::SliceFormat + Default + strfmt::Debug, + <F as fmt::SliceFormat>::Slice: strfmt::Debug, + A: Atomicity, +{ + #[inline] + fn fmt(&self, f: &mut strfmt::Formatter) -> strfmt::Result { + let kind = match self.ptr.get().get() { + p if p <= MAX_INLINE_TAG => "inline", + p if p & 1 == 1 => "shared", + _ => "owned", + }; + + write!(f, "Tendril<{:?}>({}: ", <F as Default>::default(), kind)?; + <<F as fmt::SliceFormat>::Slice as strfmt::Debug>::fmt(&**self, f)?; + write!(f, ")") + } +} + +impl<F, A> hash::Hash for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn hash<H: hash::Hasher>(&self, hasher: &mut H) { + self.as_byte_slice().hash(hasher) + } +} + +impl<F, A> Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + /// Create a new, empty `Tendril` in any format. + #[inline(always)] + pub fn new() -> Tendril<F, A> { + unsafe { Tendril::inline(&[]) } + } + + /// Create a new, empty `Tendril` with a specified capacity. + #[inline] + pub fn with_capacity(capacity: u32) -> Tendril<F, A> { + let mut t: Tendril<F, A> = Tendril::new(); + if capacity > MAX_INLINE_LEN as u32 { + unsafe { + t.make_owned_with_capacity(capacity); + } + } + t + } + + /// Reserve space for additional bytes. + /// + /// This is only a suggestion. There are cases where `Tendril` will + /// decline to allocate until the buffer is actually modified. + #[inline] + pub fn reserve(&mut self, additional: u32) { + if !self.is_shared() { + // Don't grow a shared tendril because we'd have to copy + // right away. + self.force_reserve(additional); + } + } + + /// Reserve space for additional bytes, even for shared buffers. + #[inline] + fn force_reserve(&mut self, additional: u32) { + let new_len = self.len32().checked_add(additional).expect(OFLOW); + if new_len > MAX_INLINE_LEN as u32 { + unsafe { + self.make_owned_with_capacity(new_len); + } + } + } + + /// Get the length of the `Tendril`. + /// + /// This is named not to conflict with `len()` on the underlying + /// slice, if any. + #[inline(always)] + pub fn len32(&self) -> u32 { + match self.ptr.get().get() { + EMPTY_TAG => 0, + n if n <= MAX_INLINE_LEN => n as u32, + _ => unsafe { self.raw_len() }, + } + } + + /// Is the backing buffer shared? + #[inline] + pub fn is_shared(&self) -> bool { + let n = self.ptr.get().get(); + + (n > MAX_INLINE_TAG) && ((n & 1) == 1) + } + + /// Is the backing buffer shared with this other `Tendril`? + #[inline] + pub fn is_shared_with(&self, other: &Tendril<F, A>) -> bool { + let n = self.ptr.get().get(); + + (n > MAX_INLINE_TAG) && (n == other.ptr.get().get()) + } + + /// Truncate to length 0 without discarding any owned storage. + #[inline] + pub fn clear(&mut self) { + if self.ptr.get().get() <= MAX_INLINE_TAG { + self.ptr + .set(unsafe { NonZeroUsize::new_unchecked(EMPTY_TAG) }); + } else { + let (_, shared, _) = unsafe { self.assume_buf() }; + if shared { + // No need to keep a reference alive for a 0-size slice. + *self = Tendril::new(); + } else { + unsafe { self.set_len(0) }; + } + } + } + + /// Build a `Tendril` by copying a byte slice, if it conforms to the format. + #[inline] + pub fn try_from_byte_slice(x: &[u8]) -> Result<Tendril<F, A>, ()> { + match F::validate(x) { + true => Ok(unsafe { Tendril::from_byte_slice_without_validating(x) }), + false => Err(()), + } + } + + /// View as uninterpreted bytes. + #[inline(always)] + pub fn as_bytes(&self) -> &Tendril<fmt::Bytes, A> { + unsafe { mem::transmute(self) } + } + + /// Convert into uninterpreted bytes. + #[inline(always)] + pub fn into_bytes(self) -> Tendril<fmt::Bytes, A> { + unsafe { mem::transmute(self) } + } + + /// Convert `self` into a type which is `Send`. + /// + /// If the tendril is owned or inline, this is free, + /// but if it's shared this will entail a copy of the contents. + #[inline] + pub fn into_send(mut self) -> SendTendril<F> { + self.make_owned(); + SendTendril { + // This changes the header.refcount from A to NonAtomic, but that's + // OK because we have defined the format of A as a usize. + tendril: unsafe { mem::transmute(self) }, + } + } + + /// View as a superset format, for free. + #[inline(always)] + pub fn as_superset<Super>(&self) -> &Tendril<Super, A> + where + F: fmt::SubsetOf<Super>, + Super: fmt::Format, + { + unsafe { mem::transmute(self) } + } + + /// Convert into a superset format, for free. + #[inline(always)] + pub fn into_superset<Super>(self) -> Tendril<Super, A> + where + F: fmt::SubsetOf<Super>, + Super: fmt::Format, + { + unsafe { mem::transmute(self) } + } + + /// View as a subset format, if the `Tendril` conforms to that subset. + #[inline] + pub fn try_as_subset<Sub>(&self) -> Result<&Tendril<Sub, A>, ()> + where + Sub: fmt::SubsetOf<F>, + { + match Sub::revalidate_subset(self.as_byte_slice()) { + true => Ok(unsafe { mem::transmute(self) }), + false => Err(()), + } + } + + /// Convert into a subset format, if the `Tendril` conforms to that subset. + #[inline] + pub fn try_into_subset<Sub>(self) -> Result<Tendril<Sub, A>, Self> + where + Sub: fmt::SubsetOf<F>, + { + match Sub::revalidate_subset(self.as_byte_slice()) { + true => Ok(unsafe { mem::transmute(self) }), + false => Err(self), + } + } + + /// View as another format, if the bytes of the `Tendril` are valid for + /// that format. + #[inline] + pub fn try_reinterpret_view<Other>(&self) -> Result<&Tendril<Other, A>, ()> + where + Other: fmt::Format, + { + match Other::validate(self.as_byte_slice()) { + true => Ok(unsafe { mem::transmute(self) }), + false => Err(()), + } + } + + /// Convert into another format, if the `Tendril` conforms to that format. + /// + /// This only re-validates the existing bytes under the new format. It + /// will *not* change the byte content of the tendril! + /// + /// See the `encode` and `decode` methods for character encoding conversion. + #[inline] + pub fn try_reinterpret<Other>(self) -> Result<Tendril<Other, A>, Self> + where + Other: fmt::Format, + { + match Other::validate(self.as_byte_slice()) { + true => Ok(unsafe { mem::transmute(self) }), + false => Err(self), + } + } + + /// Push some bytes onto the end of the `Tendril`, if they conform to the + /// format. + #[inline] + pub fn try_push_bytes(&mut self, buf: &[u8]) -> Result<(), ()> { + match F::validate(buf) { + true => unsafe { + self.push_bytes_without_validating(buf); + Ok(()) + }, + false => Err(()), + } + } + + /// Push another `Tendril` onto the end of this one. + #[inline] + pub fn push_tendril(&mut self, other: &Tendril<F, A>) { + let new_len = self.len32().checked_add(other.len32()).expect(OFLOW); + + unsafe { + if (self.ptr.get().get() > MAX_INLINE_TAG) && (other.ptr.get().get() > MAX_INLINE_TAG) { + let (self_buf, self_shared, _) = self.assume_buf(); + let (other_buf, other_shared, _) = other.assume_buf(); + + if self_shared + && other_shared + && (self_buf.data_ptr() == other_buf.data_ptr()) + && other.aux() == self.aux() + self.raw_len() + { + self.set_len(new_len); + return; + } + } + + self.push_bytes_without_validating(other.as_byte_slice()) + } + } + + /// Attempt to slice this `Tendril` as a new `Tendril`. + /// + /// This will share the buffer when possible. Mutating a shared buffer + /// will copy the contents. + /// + /// The offset and length are in bytes. The function will return + /// `Err` if these are out of bounds, or if the resulting slice + /// does not conform to the format. + #[inline] + pub fn try_subtendril( + &self, + offset: u32, + length: u32, + ) -> Result<Tendril<F, A>, SubtendrilError> { + let self_len = self.len32(); + if offset > self_len || length > (self_len - offset) { + return Err(SubtendrilError::OutOfBounds); + } + + unsafe { + let byte_slice = unsafe_slice(self.as_byte_slice(), offset as usize, length as usize); + if !F::validate_subseq(byte_slice) { + return Err(SubtendrilError::ValidationFailed); + } + + Ok(self.unsafe_subtendril(offset, length)) + } + } + + /// Slice this `Tendril` as a new `Tendril`. + /// + /// Panics on bounds or validity check failure. + #[inline] + pub fn subtendril(&self, offset: u32, length: u32) -> Tendril<F, A> { + self.try_subtendril(offset, length).unwrap() + } + + /// Try to drop `n` bytes from the front. + /// + /// Returns `Err` if the bytes are not available, or the suffix fails + /// validation. + #[inline] + pub fn try_pop_front(&mut self, n: u32) -> Result<(), SubtendrilError> { + if n == 0 { + return Ok(()); + } + let old_len = self.len32(); + if n > old_len { + return Err(SubtendrilError::OutOfBounds); + } + let new_len = old_len - n; + + unsafe { + if !F::validate_suffix(unsafe_slice( + self.as_byte_slice(), + n as usize, + new_len as usize, + )) { + return Err(SubtendrilError::ValidationFailed); + } + + self.unsafe_pop_front(n); + Ok(()) + } + } + + /// Drop `n` bytes from the front. + /// + /// Panics if the bytes are not available, or the suffix fails + /// validation. + #[inline] + pub fn pop_front(&mut self, n: u32) { + self.try_pop_front(n).unwrap() + } + + /// Drop `n` bytes from the back. + /// + /// Returns `Err` if the bytes are not available, or the prefix fails + /// validation. + #[inline] + pub fn try_pop_back(&mut self, n: u32) -> Result<(), SubtendrilError> { + if n == 0 { + return Ok(()); + } + let old_len = self.len32(); + if n > old_len { + return Err(SubtendrilError::OutOfBounds); + } + let new_len = old_len - n; + + unsafe { + if !F::validate_prefix(unsafe_slice(self.as_byte_slice(), 0, new_len as usize)) { + return Err(SubtendrilError::ValidationFailed); + } + + self.unsafe_pop_back(n); + Ok(()) + } + } + + /// Drop `n` bytes from the back. + /// + /// Panics if the bytes are not available, or the prefix fails + /// validation. + #[inline] + pub fn pop_back(&mut self, n: u32) { + self.try_pop_back(n).unwrap() + } + + /// View as another format, without validating. + #[inline(always)] + pub unsafe fn reinterpret_view_without_validating<Other>(&self) -> &Tendril<Other, A> + where + Other: fmt::Format, + { + mem::transmute(self) + } + + /// Convert into another format, without validating. + #[inline(always)] + pub unsafe fn reinterpret_without_validating<Other>(self) -> Tendril<Other, A> + where + Other: fmt::Format, + { + mem::transmute(self) + } + + /// Build a `Tendril` by copying a byte slice, without validating. + #[inline] + pub unsafe fn from_byte_slice_without_validating(x: &[u8]) -> Tendril<F, A> { + assert!(x.len() <= buf32::MAX_LEN); + if x.len() <= MAX_INLINE_LEN { + Tendril::inline(x) + } else { + Tendril::owned_copy(x) + } + } + + /// Push some bytes onto the end of the `Tendril`, without validating. + #[inline] + pub unsafe fn push_bytes_without_validating(&mut self, buf: &[u8]) { + assert!(buf.len() <= buf32::MAX_LEN); + + let Fixup { + drop_left, + drop_right, + insert_len, + insert_bytes, + } = F::fixup(self.as_byte_slice(), buf); + + // FIXME: think more about overflow + let adj_len = self.len32() + insert_len - drop_left; + + let new_len = adj_len.checked_add(buf.len() as u32).expect(OFLOW) - drop_right; + + let drop_left = drop_left as usize; + let drop_right = drop_right as usize; + + if new_len <= MAX_INLINE_LEN as u32 { + let mut tmp = [0_u8; MAX_INLINE_LEN]; + { + let old = self.as_byte_slice(); + let mut dest = tmp.as_mut_ptr(); + copy_and_advance(&mut dest, unsafe_slice(old, 0, old.len() - drop_left)); + copy_and_advance( + &mut dest, + unsafe_slice(&insert_bytes, 0, insert_len as usize), + ); + copy_and_advance( + &mut dest, + unsafe_slice(buf, drop_right, buf.len() - drop_right), + ); + } + *self = Tendril::inline(&tmp[..new_len as usize]); + } else { + self.make_owned_with_capacity(new_len); + let (owned, _, _) = self.assume_buf(); + let mut dest = owned + .data_ptr() + .offset((owned.len as usize - drop_left) as isize); + copy_and_advance( + &mut dest, + unsafe_slice(&insert_bytes, 0, insert_len as usize), + ); + copy_and_advance( + &mut dest, + unsafe_slice(buf, drop_right, buf.len() - drop_right), + ); + self.set_len(new_len); + } + } + + /// Slice this `Tendril` as a new `Tendril`. + /// + /// Does not check validity or bounds! + #[inline] + pub unsafe fn unsafe_subtendril(&self, offset: u32, length: u32) -> Tendril<F, A> { + if length <= MAX_INLINE_LEN as u32 { + Tendril::inline(unsafe_slice( + self.as_byte_slice(), + offset as usize, + length as usize, + )) + } else { + self.make_buf_shared(); + self.incref(); + let (buf, _, _) = self.assume_buf(); + Tendril::shared(buf, self.aux() + offset, length) + } + } + + /// Drop `n` bytes from the front. + /// + /// Does not check validity or bounds! + #[inline] + pub unsafe fn unsafe_pop_front(&mut self, n: u32) { + let new_len = self.len32() - n; + if new_len <= MAX_INLINE_LEN as u32 { + *self = Tendril::inline(unsafe_slice( + self.as_byte_slice(), + n as usize, + new_len as usize, + )); + } else { + self.make_buf_shared(); + self.set_aux(self.aux() + n); + let len = self.raw_len(); + self.set_len(len - n); + } + } + + /// Drop `n` bytes from the back. + /// + /// Does not check validity or bounds! + #[inline] + pub unsafe fn unsafe_pop_back(&mut self, n: u32) { + let new_len = self.len32() - n; + if new_len <= MAX_INLINE_LEN as u32 { + *self = Tendril::inline(unsafe_slice(self.as_byte_slice(), 0, new_len as usize)); + } else { + self.make_buf_shared(); + let len = self.raw_len(); + self.set_len(len - n); + } + } + + #[inline] + unsafe fn incref(&self) { + (*self.header()).refcount.increment(); + } + + #[inline] + unsafe fn make_buf_shared(&self) { + let p = self.ptr.get().get(); + if p & 1 == 0 { + let header = p as *mut Header<A>; + (*header).cap = self.aux(); + + self.ptr.set(NonZeroUsize::new_unchecked(p | 1)); + self.set_aux(0); + } + } + + // This is not public as it is of no practical value to users. + // By and large they shouldn't need to worry about the distinction at all, + // and going out of your way to make it owned is pointless. + #[inline] + fn make_owned(&mut self) { + unsafe { + let ptr = self.ptr.get().get(); + if ptr <= MAX_INLINE_TAG || (ptr & 1) == 1 { + *self = Tendril::owned_copy(self.as_byte_slice()); + } + } + } + + #[inline] + unsafe fn make_owned_with_capacity(&mut self, cap: u32) { + self.make_owned(); + let mut buf = self.assume_buf().0; + buf.grow(cap); + self.ptr.set(NonZeroUsize::new_unchecked(buf.ptr as usize)); + self.set_aux(buf.cap); + } + + #[inline(always)] + unsafe fn header(&self) -> *mut Header<A> { + (self.ptr.get().get() & !1) as *mut Header<A> + } + + #[inline] + unsafe fn assume_buf(&self) -> (Buf32<Header<A>>, bool, u32) { + let ptr = self.ptr.get().get(); + let header = self.header(); + let shared = (ptr & 1) == 1; + let (cap, offset) = match shared { + true => ((*header).cap, self.aux()), + false => (self.aux(), 0), + }; + + ( + Buf32 { + ptr: header, + len: offset + self.len32(), + cap: cap, + }, + shared, + offset, + ) + } + + #[inline] + unsafe fn inline(x: &[u8]) -> Tendril<F, A> { + let len = x.len(); + let t = Tendril { + ptr: Cell::new(inline_tag(len as u32)), + buf: UnsafeCell::new(Buffer { inline: [0; 8] }), + marker: PhantomData, + refcount_marker: PhantomData, + }; + ptr::copy_nonoverlapping(x.as_ptr(), (*t.buf.get()).inline.as_mut_ptr(), len); + t + } + + #[inline] + unsafe fn owned(x: Buf32<Header<A>>) -> Tendril<F, A> { + Tendril { + ptr: Cell::new(NonZeroUsize::new_unchecked(x.ptr as usize)), + buf: UnsafeCell::new(Buffer { + heap: Heap { + len: x.len, + aux: x.cap, + }, + }), + marker: PhantomData, + refcount_marker: PhantomData, + } + } + + #[inline] + unsafe fn owned_copy(x: &[u8]) -> Tendril<F, A> { + let len32 = x.len() as u32; + let mut b = Buf32::with_capacity(len32, Header::new()); + ptr::copy_nonoverlapping(x.as_ptr(), b.data_ptr(), x.len()); + b.len = len32; + Tendril::owned(b) + } + + #[inline] + unsafe fn shared(buf: Buf32<Header<A>>, off: u32, len: u32) -> Tendril<F, A> { + Tendril { + ptr: Cell::new(NonZeroUsize::new_unchecked((buf.ptr as usize) | 1)), + buf: UnsafeCell::new(Buffer { + heap: Heap { len, aux: off }, + }), + marker: PhantomData, + refcount_marker: PhantomData, + } + } + + #[inline] + fn as_byte_slice<'a>(&'a self) -> &'a [u8] { + unsafe { + match self.ptr.get().get() { + EMPTY_TAG => &[], + n if n <= MAX_INLINE_LEN => (*self.buf.get()).inline.get_unchecked(..n), + _ => { + let (buf, _, offset) = self.assume_buf(); + copy_lifetime( + self, + unsafe_slice(buf.data(), offset as usize, self.len32() as usize), + ) + } + } + } + } + + // There's no need to worry about locking on an atomic Tendril, because it makes it unique as + // soon as you do that. + #[inline] + fn as_mut_byte_slice<'a>(&'a mut self) -> &'a mut [u8] { + unsafe { + match self.ptr.get().get() { + EMPTY_TAG => &mut [], + n if n <= MAX_INLINE_LEN => (*self.buf.get()).inline.get_unchecked_mut(..n), + _ => { + self.make_owned(); + let (mut buf, _, offset) = self.assume_buf(); + let len = self.len32() as usize; + copy_lifetime_mut(self, unsafe_slice_mut(buf.data_mut(), offset as usize, len)) + } + } + } + } + + unsafe fn raw_len(&self) -> u32 { + (*self.buf.get()).heap.len + } + + unsafe fn set_len(&mut self, len: u32) { + (*self.buf.get()).heap.len = len; + } + + unsafe fn aux(&self) -> u32 { + (*self.buf.get()).heap.aux + } + + unsafe fn set_aux(&self, aux: u32) { + (*self.buf.get()).heap.aux = aux; + } +} + +impl<F, A> Tendril<F, A> +where + F: fmt::SliceFormat, + A: Atomicity, +{ + /// Build a `Tendril` by copying a slice. + #[inline] + pub fn from_slice(x: &F::Slice) -> Tendril<F, A> { + unsafe { Tendril::from_byte_slice_without_validating(x.as_bytes()) } + } + + /// Push a slice onto the end of the `Tendril`. + #[inline] + pub fn push_slice(&mut self, x: &F::Slice) { + unsafe { self.push_bytes_without_validating(x.as_bytes()) } + } +} + +/// A simple wrapper to make `Tendril` `Send`. +/// +/// Although there is a certain subset of the operations on a `Tendril` that a `SendTendril` could +/// reasonably implement, in order to clearly separate concerns this type is deliberately +/// minimalist, acting as a safe encapsulation around the invariants which permit `Send`ness and +/// behaving as an opaque object. +/// +/// A `SendTendril` may be produced by `Tendril.into_send()` or `SendTendril::from(tendril)`, +/// and may be returned to a `Tendril` by `Tendril::from(self)`. +#[derive(Clone)] +pub struct SendTendril<F> +where + F: fmt::Format, +{ + tendril: Tendril<F>, +} + +unsafe impl<F> Send for SendTendril<F> where F: fmt::Format {} + +impl<F, A> From<Tendril<F, A>> for SendTendril<F> +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn from(tendril: Tendril<F, A>) -> SendTendril<F> { + tendril.into_send() + } +} + +impl<F, A> From<SendTendril<F>> for Tendril<F, A> +where + F: fmt::Format, + A: Atomicity, +{ + #[inline] + fn from(send: SendTendril<F>) -> Tendril<F, A> { + unsafe { mem::transmute(send.tendril) } + // header.refcount may have been initialised as an Atomic or a NonAtomic, but the value + // will be the same (1) regardless, because the layout is defined. + // Thus we don't need to fiddle about resetting it or anything like that. + } +} + +/// `Tendril`-related methods for Rust slices. +pub trait SliceExt<F>: fmt::Slice +where + F: fmt::SliceFormat<Slice = Self>, +{ + /// Make a `Tendril` from this slice. + #[inline] + fn to_tendril(&self) -> Tendril<F> { + // It should be done thusly, but at the time of writing the defaults don't help inference: + //fn to_tendril<A = NonAtomic>(&self) -> Tendril<Self::Format, A> + // where A: Atomicity, + //{ + Tendril::from_slice(self) + } +} + +impl SliceExt<fmt::UTF8> for str {} +impl SliceExt<fmt::Bytes> for [u8] {} + +impl<F, A> Tendril<F, A> +where + F: for<'a> fmt::CharFormat<'a>, + A: Atomicity, +{ + /// Remove and return the first character, if any. + #[inline] + pub fn pop_front_char<'a>(&'a mut self) -> Option<char> { + unsafe { + let next_char; // first char in iterator + let mut skip = 0; // number of bytes to skip, or 0 to clear + + { + // <--+ + // | Creating an iterator borrows self, so introduce a + // +- scope to contain the borrow (that way we can mutate + // self below, after this scope exits). + + let mut iter = F::char_indices(self.as_byte_slice()); + match iter.next() { + Some((_, c)) => { + next_char = Some(c); + if let Some((n, _)) = iter.next() { + skip = n as u32; + } + } + None => { + next_char = None; + } + } + } + + if skip != 0 { + self.unsafe_pop_front(skip); + } else { + self.clear(); + } + + next_char + } + } + + /// Remove and return a run of characters at the front of the `Tendril` + /// which are classified the same according to the function `classify`. + /// + /// Returns `None` on an empty string. + #[inline] + pub fn pop_front_char_run<'a, C, R>(&'a mut self, mut classify: C) -> Option<(Tendril<F, A>, R)> + where + C: FnMut(char) -> R, + R: PartialEq, + { + let (class, first_mismatch); + { + let mut chars = unsafe { F::char_indices(self.as_byte_slice()) }; + let (_, first) = unwrap_or_return!(chars.next(), None); + class = classify(first); + first_mismatch = chars.find(|&(_, ch)| &classify(ch) != &class); + } + + match first_mismatch { + Some((idx, _)) => unsafe { + let t = self.unsafe_subtendril(0, idx as u32); + self.unsafe_pop_front(idx as u32); + Some((t, class)) + }, + None => { + let t = self.clone(); + self.clear(); + Some((t, class)) + } + } + } + + /// Push a character, if it can be represented in this format. + #[inline] + pub fn try_push_char(&mut self, c: char) -> Result<(), ()> { + F::encode_char(c, |b| unsafe { + self.push_bytes_without_validating(b); + }) + } +} + +/// Extension trait for `io::Read`. +pub trait ReadExt: io::Read { + fn read_to_tendril<A>(&mut self, buf: &mut Tendril<fmt::Bytes, A>) -> io::Result<usize> + where + A: Atomicity; +} + +impl<T> ReadExt for T +where + T: io::Read, +{ + /// Read all bytes until EOF. + fn read_to_tendril<A>(&mut self, buf: &mut Tendril<fmt::Bytes, A>) -> io::Result<usize> + where + A: Atomicity, + { + // Adapted from libstd/io/mod.rs. + const DEFAULT_BUF_SIZE: u32 = 64 * 1024; + + let start_len = buf.len(); + let mut len = start_len; + let mut new_write_size = 16; + let ret; + loop { + if len == buf.len() { + if new_write_size < DEFAULT_BUF_SIZE { + new_write_size *= 2; + } + // FIXME: this exposes uninitialized bytes to a generic R type + // this is fine for R=File which never reads these bytes, + // but user-defined types might. + // The standard library pushes zeros to `Vec<u8>` for that reason. + unsafe { + buf.push_uninitialized(new_write_size); + } + } + + match self.read(&mut buf[len..]) { + Ok(0) => { + ret = Ok(len - start_len); + break; + } + Ok(n) => len += n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} + Err(e) => { + ret = Err(e); + break; + } + } + } + + let buf_len = buf.len32(); + buf.pop_back(buf_len - (len as u32)); + ret + } +} + +impl<A> io::Write for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + #[inline] + fn write(&mut self, buf: &[u8]) -> io::Result<usize> { + self.push_slice(buf); + Ok(buf.len()) + } + + #[inline] + fn write_all(&mut self, buf: &[u8]) -> io::Result<()> { + self.push_slice(buf); + Ok(()) + } + + #[inline(always)] + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +#[cfg(feature = "encoding")] +impl<A> encoding::ByteWriter for Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + #[inline] + fn write_byte(&mut self, b: u8) { + self.push_slice(&[b]); + } + + #[inline] + fn write_bytes(&mut self, v: &[u8]) { + self.push_slice(v); + } + + #[inline] + fn writer_hint(&mut self, additional: usize) { + self.reserve(::std::cmp::min(u32::MAX as usize, additional) as u32); + } +} + +impl<F, A> Tendril<F, A> +where + A: Atomicity, + F: fmt::SliceFormat<Slice = [u8]>, +{ + /// Decode from some character encoding into UTF-8. + /// + /// See the [rust-encoding docs](https://lifthrasiir.github.io/rust-encoding/encoding/) + /// for more information. + #[inline] + #[cfg(feature = "encoding")] + pub fn decode( + &self, + encoding: EncodingRef, + trap: DecoderTrap, + ) -> Result<Tendril<fmt::UTF8, A>, ::std::borrow::Cow<'static, str>> { + let mut ret = Tendril::new(); + encoding.decode_to(&*self, trap, &mut ret).map(|_| ret) + } + + /// Push "uninitialized bytes" onto the end. + /// + /// Really, this grows the tendril without writing anything to the new area. + /// It's only defined for byte tendrils because it's only useful if you + /// plan to then mutate the buffer. + #[inline] + pub unsafe fn push_uninitialized(&mut self, n: u32) { + let new_len = self.len32().checked_add(n).expect(OFLOW); + if new_len <= MAX_INLINE_LEN as u32 && self.ptr.get().get() <= MAX_INLINE_TAG { + self.ptr.set(inline_tag(new_len)) + } else { + self.make_owned_with_capacity(new_len); + self.set_len(new_len); + } + } +} + +impl<A> strfmt::Display for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + #[inline] + fn fmt(&self, f: &mut strfmt::Formatter) -> strfmt::Result { + <str as strfmt::Display>::fmt(&**self, f) + } +} + +impl<A> str::FromStr for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + type Err = (); + + #[inline] + fn from_str(s: &str) -> Result<Self, ()> { + Ok(Tendril::from_slice(s)) + } +} + +impl<A> strfmt::Write for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + #[inline] + fn write_str(&mut self, s: &str) -> strfmt::Result { + self.push_slice(s); + Ok(()) + } +} + +#[cfg(feature = "encoding")] +impl<A> encoding::StringWriter for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + #[inline] + fn write_char(&mut self, c: char) { + self.push_char(c); + } + + #[inline] + fn write_str(&mut self, s: &str) { + self.push_slice(s); + } + + #[inline] + fn writer_hint(&mut self, additional: usize) { + self.reserve(::std::cmp::min(u32::MAX as usize, additional) as u32); + } +} + +impl<A> Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + /// Encode from UTF-8 into some other character encoding. + /// + /// See the [rust-encoding docs](https://lifthrasiir.github.io/rust-encoding/encoding/) + /// for more information. + #[inline] + #[cfg(feature = "encoding")] + pub fn encode( + &self, + encoding: EncodingRef, + trap: EncoderTrap, + ) -> Result<Tendril<fmt::Bytes, A>, ::std::borrow::Cow<'static, str>> { + let mut ret = Tendril::new(); + encoding.encode_to(&*self, trap, &mut ret).map(|_| ret) + } + + /// Push a character onto the end. + #[inline] + pub fn push_char(&mut self, c: char) { + unsafe { + self.push_bytes_without_validating(c.encode_utf8(&mut [0_u8; 4]).as_bytes()); + } + } + + /// Create a `Tendril` from a single character. + #[inline] + pub fn from_char(c: char) -> Tendril<fmt::UTF8, A> { + let mut t: Tendril<fmt::UTF8, A> = Tendril::new(); + t.push_char(c); + t + } + + /// Helper for the `format_tendril!` macro. + #[inline] + pub fn format(args: strfmt::Arguments) -> Tendril<fmt::UTF8, A> { + use std::fmt::Write; + let mut output: Tendril<fmt::UTF8, A> = Tendril::new(); + let _ = write!(&mut output, "{}", args); + output + } +} + +/// Create a `StrTendril` through string formatting. +/// +/// Works just like the standard `format!` macro. +#[macro_export] +macro_rules! format_tendril { + ($($arg:tt)*) => ($crate::StrTendril::format(format_args!($($arg)*))) +} + +impl<'a, F, A> From<&'a F::Slice> for Tendril<F, A> +where + F: fmt::SliceFormat, + A: Atomicity, +{ + #[inline] + fn from(input: &F::Slice) -> Tendril<F, A> { + Tendril::from_slice(input) + } +} + +impl<A> From<String> for Tendril<fmt::UTF8, A> +where + A: Atomicity, +{ + #[inline] + fn from(input: String) -> Tendril<fmt::UTF8, A> { + Tendril::from_slice(&*input) + } +} + +impl<F, A> AsRef<F::Slice> for Tendril<F, A> +where + F: fmt::SliceFormat, + A: Atomicity, +{ + #[inline] + fn as_ref(&self) -> &F::Slice { + &**self + } +} + +impl<A> From<Tendril<fmt::UTF8, A>> for String +where + A: Atomicity, +{ + #[inline] + fn from(input: Tendril<fmt::UTF8, A>) -> String { + String::from(&*input) + } +} + +impl<'a, A> From<&'a Tendril<fmt::UTF8, A>> for String +where + A: Atomicity, +{ + #[inline] + fn from(input: &'a Tendril<fmt::UTF8, A>) -> String { + String::from(&**input) + } +} + +#[cfg(all(test, feature = "bench"))] +#[path = "bench.rs"] +mod bench; + +#[cfg(test)] +mod test { + use super::{ + Atomic, ByteTendril, Header, NonAtomic, ReadExt, SendTendril, SliceExt, StrTendril, Tendril, + }; + use fmt; + use std::iter; + use std::thread; + + fn assert_send<T: Send>() {} + + #[test] + fn smoke_test() { + assert_eq!("", &*"".to_tendril()); + assert_eq!("abc", &*"abc".to_tendril()); + assert_eq!("Hello, world!", &*"Hello, world!".to_tendril()); + + assert_eq!(b"", &*b"".to_tendril()); + assert_eq!(b"abc", &*b"abc".to_tendril()); + assert_eq!(b"Hello, world!", &*b"Hello, world!".to_tendril()); + } + + #[test] + fn assert_sizes() { + use std::mem; + struct EmptyWithDrop; + impl Drop for EmptyWithDrop { + fn drop(&mut self) {} + } + let compiler_uses_inline_drop_flags = mem::size_of::<EmptyWithDrop>() > 0; + + let correct = mem::size_of::<*const ()>() + + 8 + + if compiler_uses_inline_drop_flags { + 1 + } else { + 0 + }; + + assert_eq!(correct, mem::size_of::<ByteTendril>()); + assert_eq!(correct, mem::size_of::<StrTendril>()); + + assert_eq!(correct, mem::size_of::<Option<ByteTendril>>()); + assert_eq!(correct, mem::size_of::<Option<StrTendril>>()); + + assert_eq!( + mem::size_of::<*const ()>() * 2, + mem::size_of::<Header<Atomic>>(), + ); + assert_eq!( + mem::size_of::<Header<Atomic>>(), + mem::size_of::<Header<NonAtomic>>(), + ); + } + + #[test] + fn validate_utf8() { + assert!(ByteTendril::try_from_byte_slice(b"\xFF").is_ok()); + assert!(StrTendril::try_from_byte_slice(b"\xFF").is_err()); + assert!(StrTendril::try_from_byte_slice(b"\xEA\x99\xFF").is_err()); + assert!(StrTendril::try_from_byte_slice(b"\xEA\x99").is_err()); + assert!(StrTendril::try_from_byte_slice(b"\xEA\x99\xAE\xEA").is_err()); + assert_eq!( + "\u{a66e}", + &*StrTendril::try_from_byte_slice(b"\xEA\x99\xAE").unwrap() + ); + + let mut t = StrTendril::new(); + assert!(t.try_push_bytes(b"\xEA\x99").is_err()); + assert!(t.try_push_bytes(b"\xAE").is_err()); + assert!(t.try_push_bytes(b"\xEA\x99\xAE").is_ok()); + assert_eq!("\u{a66e}", &*t); + } + + #[test] + fn share_and_unshare() { + let s = b"foobarbaz".to_tendril(); + assert_eq!(b"foobarbaz", &*s); + assert!(!s.is_shared()); + + let mut t = s.clone(); + assert_eq!(s.as_ptr(), t.as_ptr()); + assert!(s.is_shared()); + assert!(t.is_shared()); + + t.push_slice(b"quux"); + assert_eq!(b"foobarbaz", &*s); + assert_eq!(b"foobarbazquux", &*t); + assert!(s.as_ptr() != t.as_ptr()); + assert!(!t.is_shared()); + } + + #[test] + fn format_display() { + assert_eq!("foobar", &*format!("{}", "foobar".to_tendril())); + + let mut s = "foo".to_tendril(); + assert_eq!("foo", &*format!("{}", s)); + + let t = s.clone(); + assert_eq!("foo", &*format!("{}", s)); + assert_eq!("foo", &*format!("{}", t)); + + s.push_slice("barbaz!"); + assert_eq!("foobarbaz!", &*format!("{}", s)); + assert_eq!("foo", &*format!("{}", t)); + } + + #[test] + fn format_debug() { + assert_eq!( + r#"Tendril<UTF8>(inline: "foobar")"#, + &*format!("{:?}", "foobar".to_tendril()) + ); + assert_eq!( + r#"Tendril<Bytes>(inline: [102, 111, 111, 98, 97, 114])"#, + &*format!("{:?}", b"foobar".to_tendril()) + ); + + let t = "anextralongstring".to_tendril(); + assert_eq!( + r#"Tendril<UTF8>(owned: "anextralongstring")"#, + &*format!("{:?}", t) + ); + let _ = t.clone(); + assert_eq!( + r#"Tendril<UTF8>(shared: "anextralongstring")"#, + &*format!("{:?}", t) + ); + } + + #[test] + fn subtendril() { + assert_eq!("foo".to_tendril(), "foo-bar".to_tendril().subtendril(0, 3)); + assert_eq!("bar".to_tendril(), "foo-bar".to_tendril().subtendril(4, 3)); + + let mut t = "foo-bar".to_tendril(); + t.pop_front(2); + assert_eq!("o-bar".to_tendril(), t); + t.pop_back(1); + assert_eq!("o-ba".to_tendril(), t); + + assert_eq!( + "foo".to_tendril(), + "foo-a-longer-string-bar-baz".to_tendril().subtendril(0, 3) + ); + assert_eq!( + "oo-a-".to_tendril(), + "foo-a-longer-string-bar-baz".to_tendril().subtendril(1, 5) + ); + assert_eq!( + "bar".to_tendril(), + "foo-a-longer-string-bar-baz".to_tendril().subtendril(20, 3) + ); + + let mut t = "another rather long string".to_tendril(); + t.pop_front(2); + assert!(t.starts_with("other rather")); + t.pop_back(1); + assert_eq!("other rather long strin".to_tendril(), t); + assert!(t.is_shared()); + } + + #[test] + fn subtendril_invalid() { + assert!("\u{a66e}".to_tendril().try_subtendril(0, 2).is_err()); + assert!("\u{a66e}".to_tendril().try_subtendril(1, 2).is_err()); + + assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 3).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 2).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 1).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 3).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 2).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 1).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(2, 2).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(2, 1).is_err()); + assert!("\u{1f4a9}".to_tendril().try_subtendril(3, 1).is_err()); + + let mut t = "\u{1f4a9}zzzzzz".to_tendril(); + assert!(t.try_pop_front(1).is_err()); + assert!(t.try_pop_front(2).is_err()); + assert!(t.try_pop_front(3).is_err()); + assert!(t.try_pop_front(4).is_ok()); + assert_eq!("zzzzzz", &*t); + + let mut t = "zzzzzz\u{1f4a9}".to_tendril(); + assert!(t.try_pop_back(1).is_err()); + assert!(t.try_pop_back(2).is_err()); + assert!(t.try_pop_back(3).is_err()); + assert!(t.try_pop_back(4).is_ok()); + assert_eq!("zzzzzz", &*t); + } + + #[test] + fn conversion() { + assert_eq!( + &[0x66, 0x6F, 0x6F].to_tendril(), + "foo".to_tendril().as_bytes() + ); + assert_eq!( + [0x66, 0x6F, 0x6F].to_tendril(), + "foo".to_tendril().into_bytes() + ); + + let ascii: Tendril<fmt::ASCII> = b"hello".to_tendril().try_reinterpret().unwrap(); + assert_eq!(&"hello".to_tendril(), ascii.as_superset()); + assert_eq!("hello".to_tendril(), ascii.clone().into_superset()); + + assert!(b"\xFF" + .to_tendril() + .try_reinterpret::<fmt::ASCII>() + .is_err()); + + let t = "hello".to_tendril(); + let ascii: &Tendril<fmt::ASCII> = t.try_as_subset().unwrap(); + assert_eq!(b"hello", &**ascii.as_bytes()); + + assert!("ő" + .to_tendril() + .try_reinterpret_view::<fmt::ASCII>() + .is_err()); + assert!("ő".to_tendril().try_as_subset::<fmt::ASCII>().is_err()); + + let ascii: Tendril<fmt::ASCII> = "hello".to_tendril().try_into_subset().unwrap(); + assert_eq!(b"hello", &**ascii.as_bytes()); + + assert!("ő".to_tendril().try_reinterpret::<fmt::ASCII>().is_err()); + assert!("ő".to_tendril().try_into_subset::<fmt::ASCII>().is_err()); + } + + #[test] + fn clear() { + let mut t = "foo-".to_tendril(); + t.clear(); + assert_eq!(t.len(), 0); + assert_eq!(t.len32(), 0); + assert_eq!(&*t, ""); + + let mut t = "much longer".to_tendril(); + let s = t.clone(); + t.clear(); + assert_eq!(t.len(), 0); + assert_eq!(t.len32(), 0); + assert_eq!(&*t, ""); + assert_eq!(&*s, "much longer"); + } + + #[test] + fn push_tendril() { + let mut t = "abc".to_tendril(); + t.push_tendril(&"xyz".to_tendril()); + assert_eq!("abcxyz", &*t); + } + + #[test] + fn wtf8() { + assert!(Tendril::<fmt::WTF8>::try_from_byte_slice(b"\xED\xA0\xBD").is_ok()); + assert!(Tendril::<fmt::WTF8>::try_from_byte_slice(b"\xED\xB2\xA9").is_ok()); + assert!(Tendril::<fmt::WTF8>::try_from_byte_slice(b"\xED\xA0\xBD\xED\xB2\xA9").is_err()); + + let t: Tendril<fmt::WTF8> = + Tendril::try_from_byte_slice(b"\xED\xA0\xBD\xEA\x99\xAE").unwrap(); + assert!(b"\xED\xA0\xBD".to_tendril().try_reinterpret().unwrap() == t.subtendril(0, 3)); + assert!(b"\xEA\x99\xAE".to_tendril().try_reinterpret().unwrap() == t.subtendril(3, 3)); + assert!(t.try_reinterpret_view::<fmt::UTF8>().is_err()); + + assert!(t.try_subtendril(0, 1).is_err()); + assert!(t.try_subtendril(0, 2).is_err()); + assert!(t.try_subtendril(1, 1).is_err()); + + assert!(t.try_subtendril(3, 1).is_err()); + assert!(t.try_subtendril(3, 2).is_err()); + assert!(t.try_subtendril(4, 1).is_err()); + + // paired surrogates + let mut t: Tendril<fmt::WTF8> = Tendril::try_from_byte_slice(b"\xED\xA0\xBD").unwrap(); + assert!(t.try_push_bytes(b"\xED\xB2\xA9").is_ok()); + assert_eq!(b"\xF0\x9F\x92\xA9", t.as_byte_slice()); + assert!(t.try_reinterpret_view::<fmt::UTF8>().is_ok()); + + // unpaired surrogates + let mut t: Tendril<fmt::WTF8> = Tendril::try_from_byte_slice(b"\xED\xA0\xBB").unwrap(); + assert!(t.try_push_bytes(b"\xED\xA0").is_err()); + assert!(t.try_push_bytes(b"\xED").is_err()); + assert!(t.try_push_bytes(b"\xA0").is_err()); + assert!(t.try_push_bytes(b"\xED\xA0\xBD").is_ok()); + assert_eq!(b"\xED\xA0\xBB\xED\xA0\xBD", t.as_byte_slice()); + assert!(t.try_push_bytes(b"\xED\xB2\xA9").is_ok()); + assert_eq!(b"\xED\xA0\xBB\xF0\x9F\x92\xA9", t.as_byte_slice()); + assert!(t.try_reinterpret_view::<fmt::UTF8>().is_err()); + } + + #[test] + fn front_char() { + let mut t = "".to_tendril(); + assert_eq!(None, t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + + let mut t = "abc".to_tendril(); + assert_eq!(Some('a'), t.pop_front_char()); + assert_eq!(Some('b'), t.pop_front_char()); + assert_eq!(Some('c'), t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + + let mut t = "főo-a-longer-string-bar-baz".to_tendril(); + assert_eq!(28, t.len()); + assert_eq!(Some('f'), t.pop_front_char()); + assert_eq!(Some('ő'), t.pop_front_char()); + assert_eq!(Some('o'), t.pop_front_char()); + assert_eq!(Some('-'), t.pop_front_char()); + assert_eq!(23, t.len()); + } + + #[test] + fn char_run() { + for &(s, exp) in &[ + ("", None), + (" ", Some((" ", true))), + ("x", Some(("x", false))), + (" \t \n", Some((" \t \n", true))), + ("xyzzy", Some(("xyzzy", false))), + (" xyzzy", Some((" ", true))), + ("xyzzy ", Some(("xyzzy", false))), + (" xyzzy ", Some((" ", true))), + ("xyzzy hi", Some(("xyzzy", false))), + ("中 ", Some(("中", false))), + (" 中 ", Some((" ", true))), + (" 中 ", Some((" ", true))), + (" 中 ", Some((" ", true))), + ] { + let mut t = s.to_tendril(); + let res = t.pop_front_char_run(char::is_whitespace); + match exp { + None => assert!(res.is_none()), + Some((es, ec)) => { + let (rt, rc) = res.unwrap(); + assert_eq!(es, &*rt); + assert_eq!(ec, rc); + } + } + } + } + + #[test] + fn deref_mut_inline() { + let mut t = "xyő".to_tendril().into_bytes(); + t[3] = 0xff; + assert_eq!(b"xy\xC5\xFF", &*t); + assert!(t.try_reinterpret_view::<fmt::UTF8>().is_err()); + t[3] = 0x8b; + assert_eq!("xyŋ", &**t.try_reinterpret_view::<fmt::UTF8>().unwrap()); + + unsafe { + t.push_uninitialized(3); + t[4] = 0xEA; + t[5] = 0x99; + t[6] = 0xAE; + assert_eq!( + "xyŋ\u{a66e}", + &**t.try_reinterpret_view::<fmt::UTF8>().unwrap() + ); + t.push_uninitialized(20); + t.pop_back(20); + assert_eq!( + "xyŋ\u{a66e}", + &**t.try_reinterpret_view::<fmt::UTF8>().unwrap() + ); + } + } + + #[test] + fn deref_mut() { + let mut t = b"0123456789".to_tendril(); + let u = t.clone(); + assert!(t.is_shared()); + t[9] = 0xff; + assert!(!t.is_shared()); + assert_eq!(b"0123456789", &*u); + assert_eq!(b"012345678\xff", &*t); + } + + #[test] + fn push_char() { + let mut t = "xyz".to_tendril(); + t.push_char('o'); + assert_eq!("xyzo", &*t); + t.push_char('ő'); + assert_eq!("xyzoő", &*t); + t.push_char('\u{a66e}'); + assert_eq!("xyzoő\u{a66e}", &*t); + t.push_char('\u{1f4a9}'); + assert_eq!("xyzoő\u{a66e}\u{1f4a9}", &*t); + assert_eq!(t.len(), 13); + } + + #[test] + #[cfg(feature = "encoding")] + fn encode() { + use encoding::{all, EncoderTrap}; + + let t = "안녕하세요 러스트".to_tendril(); + assert_eq!( + b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4\x20\xb7\xaf\xbd\xba\xc6\xae", + &*t.encode(all::WINDOWS_949, EncoderTrap::Strict).unwrap() + ); + + let t = "Энергия пробуждения ия-я-я! \u{a66e}".to_tendril(); + assert_eq!( + b"\xfc\xce\xc5\xd2\xc7\xc9\xd1 \xd0\xd2\xcf\xc2\xd5\xd6\xc4\xc5\xce\ + \xc9\xd1 \xc9\xd1\x2d\xd1\x2d\xd1\x21 ?", + &*t.encode(all::KOI8_U, EncoderTrap::Replace).unwrap() + ); + + let t = "\u{1f4a9}".to_tendril(); + assert!(t.encode(all::WINDOWS_1252, EncoderTrap::Strict).is_err()); + } + + #[test] + #[cfg(feature = "encoding")] + fn decode() { + use encoding::{all, DecoderTrap}; + + let t = b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\ + \xbf\xe4\x20\xb7\xaf\xbd\xba\xc6\xae" + .to_tendril(); + assert_eq!( + "안녕하세요 러스트", + &*t.decode(all::WINDOWS_949, DecoderTrap::Strict).unwrap() + ); + + let t = b"\xfc\xce\xc5\xd2\xc7\xc9\xd1 \xd0\xd2\xcf\xc2\xd5\xd6\xc4\xc5\xce\ + \xc9\xd1 \xc9\xd1\x2d\xd1\x2d\xd1\x21" + .to_tendril(); + assert_eq!( + "Энергия пробуждения ия-я-я!", + &*t.decode(all::KOI8_U, DecoderTrap::Replace).unwrap() + ); + + let t = b"x \xff y".to_tendril(); + assert!(t.decode(all::UTF_8, DecoderTrap::Strict).is_err()); + + let t = b"x \xff y".to_tendril(); + assert_eq!( + "x \u{fffd} y", + &*t.decode(all::UTF_8, DecoderTrap::Replace).unwrap() + ); + } + + #[test] + fn ascii() { + fn mk(x: &[u8]) -> Tendril<fmt::ASCII> { + x.to_tendril().try_reinterpret().unwrap() + } + + let mut t = mk(b"xyz"); + assert_eq!(Some('x'), t.pop_front_char()); + assert_eq!(Some('y'), t.pop_front_char()); + assert_eq!(Some('z'), t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + + let mut t = mk(b" \t xyz"); + assert!(Some((mk(b" \t "), true)) == t.pop_front_char_run(char::is_whitespace)); + assert!(Some((mk(b"xyz"), false)) == t.pop_front_char_run(char::is_whitespace)); + assert!(t.pop_front_char_run(char::is_whitespace).is_none()); + + let mut t = Tendril::<fmt::ASCII>::new(); + assert!(t.try_push_char('x').is_ok()); + assert!(t.try_push_char('\0').is_ok()); + assert!(t.try_push_char('\u{a0}').is_err()); + assert_eq!(b"x\0", t.as_byte_slice()); + } + + #[test] + fn latin1() { + fn mk(x: &[u8]) -> Tendril<fmt::Latin1> { + x.to_tendril().try_reinterpret().unwrap() + } + + let mut t = mk(b"\xd8_\xd8"); + assert_eq!(Some('Ø'), t.pop_front_char()); + assert_eq!(Some('_'), t.pop_front_char()); + assert_eq!(Some('Ø'), t.pop_front_char()); + assert_eq!(None, t.pop_front_char()); + + let mut t = mk(b" \t \xfe\xa7z"); + assert!(Some((mk(b" \t "), true)) == t.pop_front_char_run(char::is_whitespace)); + assert!(Some((mk(b"\xfe\xa7z"), false)) == t.pop_front_char_run(char::is_whitespace)); + assert!(t.pop_front_char_run(char::is_whitespace).is_none()); + + let mut t = Tendril::<fmt::Latin1>::new(); + assert!(t.try_push_char('x').is_ok()); + assert!(t.try_push_char('\0').is_ok()); + assert!(t.try_push_char('\u{a0}').is_ok()); + assert!(t.try_push_char('ő').is_err()); + assert!(t.try_push_char('я').is_err()); + assert!(t.try_push_char('\u{a66e}').is_err()); + assert!(t.try_push_char('\u{1f4a9}').is_err()); + assert_eq!(b"x\0\xa0", t.as_byte_slice()); + } + + #[test] + fn format() { + assert_eq!("", &*format_tendril!("")); + assert_eq!( + "two and two make 4", + &*format_tendril!("two and two make {}", 2 + 2) + ); + } + + #[test] + fn merge_shared() { + let t = "012345678901234567890123456789".to_tendril(); + let a = t.subtendril(10, 20); + assert!(a.is_shared()); + assert_eq!("01234567890123456789", &*a); + let mut b = t.subtendril(0, 10); + assert!(b.is_shared()); + assert_eq!("0123456789", &*b); + + b.push_tendril(&a); + assert!(b.is_shared()); + assert!(a.is_shared()); + assert!(a.is_shared_with(&b)); + assert!(b.is_shared_with(&a)); + assert_eq!("012345678901234567890123456789", &*b); + + assert!(t.is_shared()); + assert!(t.is_shared_with(&a)); + assert!(t.is_shared_with(&b)); + } + + #[test] + fn merge_cant_share() { + let t = "012345678901234567890123456789".to_tendril(); + let mut b = t.subtendril(0, 10); + assert!(b.is_shared()); + assert_eq!("0123456789", &*b); + + b.push_tendril(&"abcd".to_tendril()); + assert!(!b.is_shared()); + assert_eq!("0123456789abcd", &*b); + } + + #[test] + fn shared_doesnt_reserve() { + let mut t = "012345678901234567890123456789".to_tendril(); + let a = t.subtendril(1, 10); + + assert!(t.is_shared()); + t.reserve(10); + assert!(t.is_shared()); + + let _ = a; + } + + #[test] + fn out_of_bounds() { + assert!("".to_tendril().try_subtendril(0, 1).is_err()); + assert!("abc".to_tendril().try_subtendril(0, 4).is_err()); + assert!("abc".to_tendril().try_subtendril(3, 1).is_err()); + assert!("abc".to_tendril().try_subtendril(7, 1).is_err()); + + let mut t = "".to_tendril(); + assert!(t.try_pop_front(1).is_err()); + assert!(t.try_pop_front(5).is_err()); + assert!(t.try_pop_front(500).is_err()); + assert!(t.try_pop_back(1).is_err()); + assert!(t.try_pop_back(5).is_err()); + assert!(t.try_pop_back(500).is_err()); + + let mut t = "abcd".to_tendril(); + assert!(t.try_pop_front(1).is_ok()); + assert!(t.try_pop_front(4).is_err()); + assert!(t.try_pop_front(500).is_err()); + assert!(t.try_pop_back(1).is_ok()); + assert!(t.try_pop_back(3).is_err()); + assert!(t.try_pop_back(500).is_err()); + } + + #[test] + fn compare() { + for &a in &[ + "indiscretions", + "validity", + "hallucinogenics", + "timelessness", + "original", + "microcosms", + "boilers", + "mammoth", + ] { + for &b in &[ + "intrepidly", + "frigid", + "spa", + "cardigans", + "guileful", + "evaporated", + "unenthusiastic", + "legitimate", + ] { + let ta = a.to_tendril(); + let tb = b.to_tendril(); + + assert_eq!(a.eq(b), ta.eq(&tb)); + assert_eq!(a.ne(b), ta.ne(&tb)); + assert_eq!(a.lt(b), ta.lt(&tb)); + assert_eq!(a.le(b), ta.le(&tb)); + assert_eq!(a.gt(b), ta.gt(&tb)); + assert_eq!(a.ge(b), ta.ge(&tb)); + assert_eq!(a.partial_cmp(b), ta.partial_cmp(&tb)); + assert_eq!(a.cmp(b), ta.cmp(&tb)); + } + } + } + + #[test] + fn extend_and_from_iterator() { + // Testing Extend<T> and FromIterator<T> for the various Ts. + + // Tendril<F> + let mut t = "Hello".to_tendril(); + t.extend(None::<&Tendril<_>>.into_iter()); + assert_eq!("Hello", &*t); + t.extend(&[", ".to_tendril(), "world".to_tendril(), "!".to_tendril()]); + assert_eq!("Hello, world!", &*t); + assert_eq!( + "Hello, world!", + &*[ + "Hello".to_tendril(), + ", ".to_tendril(), + "world".to_tendril(), + "!".to_tendril() + ] + .iter() + .collect::<StrTendril>() + ); + + // &str + let mut t = "Hello".to_tendril(); + t.extend(None::<&str>.into_iter()); + assert_eq!("Hello", &*t); + t.extend([", ", "world", "!"].iter().map(|&s| s)); + assert_eq!("Hello, world!", &*t); + assert_eq!( + "Hello, world!", + &*["Hello", ", ", "world", "!"] + .iter() + .map(|&s| s) + .collect::<StrTendril>() + ); + + // &[u8] + let mut t = b"Hello".to_tendril(); + t.extend(None::<&[u8]>.into_iter()); + assert_eq!(b"Hello", &*t); + t.extend( + [b", ".as_ref(), b"world".as_ref(), b"!".as_ref()] + .iter() + .map(|&s| s), + ); + assert_eq!(b"Hello, world!", &*t); + assert_eq!( + b"Hello, world!", + &*[ + b"Hello".as_ref(), + b", ".as_ref(), + b"world".as_ref(), + b"!".as_ref() + ] + .iter() + .map(|&s| s) + .collect::<ByteTendril>() + ); + + let string = "the quick brown fox jumps over the lazy dog"; + let string_expected = string.to_tendril(); + let bytes = string.as_bytes(); + let bytes_expected = bytes.to_tendril(); + + // char + assert_eq!(string_expected, string.chars().collect()); + let mut tendril = StrTendril::new(); + tendril.extend(string.chars()); + assert_eq!(string_expected, tendril); + + // &u8 + assert_eq!(bytes_expected, bytes.iter().collect()); + let mut tendril = ByteTendril::new(); + tendril.extend(bytes); + assert_eq!(bytes_expected, tendril); + + // u8 + assert_eq!(bytes_expected, bytes.iter().map(|&b| b).collect()); + let mut tendril = ByteTendril::new(); + tendril.extend(bytes.iter().map(|&b| b)); + assert_eq!(bytes_expected, tendril); + } + + #[test] + fn from_str() { + use std::str::FromStr; + let t: Tendril<_> = FromStr::from_str("foo bar baz").unwrap(); + assert_eq!("foo bar baz", &*t); + } + + #[test] + fn from_char() { + assert_eq!("o", &*StrTendril::from_char('o')); + assert_eq!("ő", &*StrTendril::from_char('ő')); + assert_eq!("\u{a66e}", &*StrTendril::from_char('\u{a66e}')); + assert_eq!("\u{1f4a9}", &*StrTendril::from_char('\u{1f4a9}')); + } + + #[test] + #[cfg_attr(miri, ignore)] // slow + fn read() { + fn check(x: &[u8]) { + use std::io::Cursor; + let mut t = ByteTendril::new(); + assert_eq!(x.len(), Cursor::new(x).read_to_tendril(&mut t).unwrap()); + assert_eq!(x, &*t); + } + + check(b""); + check(b"abcd"); + + let long: Vec<u8> = iter::repeat(b'x').take(1_000_000).collect(); + check(&long); + } + + #[test] + fn hash_map_key() { + use std::collections::HashMap; + + // As noted with Borrow, indexing on HashMap<StrTendril, _> is byte-based because of + // https://github.com/rust-lang/rust/issues/27108. + let mut map = HashMap::new(); + map.insert("foo".to_tendril(), 1); + assert_eq!(map.get(b"foo".as_ref()), Some(&1)); + assert_eq!(map.get(b"bar".as_ref()), None); + + let mut map = HashMap::new(); + map.insert(b"foo".to_tendril(), 1); + assert_eq!(map.get(b"foo".as_ref()), Some(&1)); + assert_eq!(map.get(b"bar".as_ref()), None); + } + + #[test] + fn atomic() { + assert_send::<Tendril<fmt::UTF8, Atomic>>(); + let s: Tendril<fmt::UTF8, Atomic> = Tendril::from_slice("this is a string"); + assert!(!s.is_shared()); + let mut t = s.clone(); + assert!(s.is_shared()); + let sp = s.as_ptr() as usize; + thread::spawn(move || { + assert!(t.is_shared()); + t.push_slice(" extended"); + assert_eq!("this is a string extended", &*t); + assert!(t.as_ptr() as usize != sp); + assert!(!t.is_shared()); + }) + .join() + .unwrap(); + assert!(s.is_shared()); + assert_eq!("this is a string", &*s); + } + + #[test] + fn send() { + assert_send::<SendTendril<fmt::UTF8>>(); + let s = "this is a string".to_tendril(); + let t = s.clone(); + let s2 = s.into_send(); + thread::spawn(move || { + let s = StrTendril::from(s2); + assert!(!s.is_shared()); + assert_eq!("this is a string", &*s); + }) + .join() + .unwrap(); + assert_eq!("this is a string", &*t); + } + + /// https://github.com/servo/tendril/issues/58 + #[test] + fn issue_58() { + let data = "<p><i>Hello!</p>, World!</i>"; + let s: Tendril<fmt::UTF8, NonAtomic> = data.into(); + assert_eq!(&*s, data); + let s: Tendril<fmt::UTF8, Atomic> = s.into_send().into(); + assert_eq!(&*s, data); + } + + #[test] + fn inline_send() { + let s = "x".to_tendril(); + let t = s.clone(); + let s2 = s.into_send(); + thread::spawn(move || { + let s = StrTendril::from(s2); + assert!(!s.is_shared()); + assert_eq!("x", &*s); + }) + .join() + .unwrap(); + assert_eq!("x", &*t); + } +} diff --git a/vendor/tendril/src/utf8_decode.rs b/vendor/tendril/src/utf8_decode.rs new file mode 100644 index 000000000..b682d57a3 --- /dev/null +++ b/vendor/tendril/src/utf8_decode.rs @@ -0,0 +1,98 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use fmt; +use tendril::{Atomicity, Tendril}; +use utf8; + +pub struct IncompleteUtf8(utf8::Incomplete); + +impl<A> Tendril<fmt::Bytes, A> +where + A: Atomicity, +{ + pub fn decode_utf8_lossy<F>(mut self, mut push_utf8: F) -> Option<IncompleteUtf8> + where + F: FnMut(Tendril<fmt::UTF8, A>), + { + loop { + if self.is_empty() { + return None; + } + let unborrowed_result = match utf8::decode(&self) { + Ok(s) => { + debug_assert!(s.as_ptr() == self.as_ptr()); + debug_assert!(s.len() == self.len()); + Ok(()) + } + Err(utf8::DecodeError::Invalid { + valid_prefix, + invalid_sequence, + .. + }) => { + debug_assert!(valid_prefix.as_ptr() == self.as_ptr()); + debug_assert!(valid_prefix.len() <= self.len()); + Err(( + valid_prefix.len(), + Err(valid_prefix.len() + invalid_sequence.len()), + )) + } + Err(utf8::DecodeError::Incomplete { + valid_prefix, + incomplete_suffix, + }) => { + debug_assert!(valid_prefix.as_ptr() == self.as_ptr()); + debug_assert!(valid_prefix.len() <= self.len()); + Err((valid_prefix.len(), Ok(incomplete_suffix))) + } + }; + match unborrowed_result { + Ok(()) => { + unsafe { push_utf8(self.reinterpret_without_validating()) } + return None; + } + Err((valid_len, and_then)) => { + if valid_len > 0 { + let subtendril = self.subtendril(0, valid_len as u32); + unsafe { push_utf8(subtendril.reinterpret_without_validating()) } + } + match and_then { + Ok(incomplete) => return Some(IncompleteUtf8(incomplete)), + Err(offset) => { + push_utf8(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); + self.pop_front(offset as u32) + } + } + } + } + } + } +} + +impl IncompleteUtf8 { + pub fn try_complete<A, F>( + &mut self, + mut input: Tendril<fmt::Bytes, A>, + mut push_utf8: F, + ) -> Result<Tendril<fmt::Bytes, A>, ()> + where + A: Atomicity, + F: FnMut(Tendril<fmt::UTF8, A>), + { + let resume_at; + match self.0.try_complete(&input) { + None => return Err(()), + Some((result, rest)) => { + push_utf8(Tendril::from_slice( + result.unwrap_or(utf8::REPLACEMENT_CHARACTER), + )); + resume_at = input.len() - rest.len(); + } + } + input.pop_front(resume_at as u32); + Ok(input) + } +} diff --git a/vendor/tendril/src/util.rs b/vendor/tendril/src/util.rs new file mode 100644 index 000000000..28c55c128 --- /dev/null +++ b/vendor/tendril/src/util.rs @@ -0,0 +1,45 @@ +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::mem; +use std::{ptr, slice}; + +#[inline(always)] +pub unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] { + debug_assert!(start <= buf.len()); + debug_assert!(new_len <= (buf.len() - start)); + slice::from_raw_parts(buf.as_ptr().offset(start as isize), new_len) +} + +#[inline(always)] +pub unsafe fn unsafe_slice_mut<'a>( + buf: &'a mut [u8], + start: usize, + new_len: usize, +) -> &'a mut [u8] { + debug_assert!(start <= buf.len()); + debug_assert!(new_len <= (buf.len() - start)); + slice::from_raw_parts_mut(buf.as_mut_ptr().offset(start as isize), new_len) +} + +#[inline(always)] +pub unsafe fn copy_and_advance(dest: &mut *mut u8, src: &[u8]) { + ptr::copy_nonoverlapping(src.as_ptr(), *dest, src.len()); + *dest = dest.offset(src.len() as isize) +} + +#[inline(always)] +pub unsafe fn copy_lifetime_mut<'a, S: ?Sized, T: ?Sized + 'a>( + _ptr: &'a mut S, + ptr: &mut T, +) -> &'a mut T { + mem::transmute(ptr) +} + +#[inline(always)] +pub unsafe fn copy_lifetime<'a, S: ?Sized, T: ?Sized + 'a>(_ptr: &'a S, ptr: &T) -> &'a T { + mem::transmute(ptr) +} |