diff options
Diffstat (limited to 'third_party/rust/encoding_rs/src/mem.rs')
-rw-r--r-- | third_party/rust/encoding_rs/src/mem.rs | 3356 |
1 files changed, 3356 insertions, 0 deletions
diff --git a/third_party/rust/encoding_rs/src/mem.rs b/third_party/rust/encoding_rs/src/mem.rs new file mode 100644 index 0000000000..3330619310 --- /dev/null +++ b/third_party/rust/encoding_rs/src/mem.rs @@ -0,0 +1,3356 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Functions for converting between different in-RAM representations of text +//! and for quickly checking if the Unicode Bidirectional Algorithm can be +//! avoided. +//! +//! By using slices for output, the functions here seek to enable by-register +//! (ALU register or SIMD register as available) operations in order to +//! outperform iterator-based conversions available in the Rust standard +//! library. +//! +//! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to +//! U+00FF, inclusive, and does not refer to the windows-1252 range. This +//! in-memory encoding is sometimes used as a storage optimization of text +//! when UTF-16 indexing and length semantics are exposed. +//! +//! The FFI binding for this module are in the +//! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem). + +#[cfg(feature = "alloc")] +use alloc::borrow::Cow; +#[cfg(feature = "alloc")] +use alloc::string::String; +#[cfg(feature = "alloc")] +use alloc::vec::Vec; + +use super::in_inclusive_range16; +use super::in_inclusive_range32; +use super::in_inclusive_range8; +use super::in_range16; +use super::in_range32; +use super::DecoderResult; +use crate::ascii::*; +use crate::utf_8::*; + +macro_rules! non_fuzz_debug_assert { + ($($arg:tt)*) => (if !cfg!(fuzzing) { debug_assert!($($arg)*); }) +} + +cfg_if! { + if #[cfg(feature = "simd-accel")] { + use ::core::intrinsics::likely; + use ::core::intrinsics::unlikely; + } else { + #[inline(always)] + // Unsafe to match the intrinsic, which is needlessly unsafe. + unsafe fn likely(b: bool) -> bool { + b + } + #[inline(always)] + // Unsafe to match the intrinsic, which is needlessly unsafe. + unsafe fn unlikely(b: bool) -> bool { + b + } + } +} + +/// Classification of text as Latin1 (all code points are below U+0100), +/// left-to-right with some non-Latin1 characters or as containing at least +/// some right-to-left characters. +#[must_use] +#[derive(Debug, PartialEq, Eq)] +#[repr(C)] +pub enum Latin1Bidi { + /// Every character is below U+0100. + Latin1 = 0, + /// There is at least one character that's U+0100 or higher, but there + /// are no right-to-left characters. + LeftToRight = 1, + /// There is at least one right-to-left character. + Bidi = 2, +} + +// `as` truncates, so works on 32-bit, too. +#[allow(dead_code)] +const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize; + +#[allow(unused_macros)] +macro_rules! by_unit_check_alu { + ($name:ident, $unit:ty, $bound:expr, $mask:ident) => { + #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))] + #[inline(always)] + fn $name(buffer: &[$unit]) -> bool { + let mut offset = 0usize; + let mut accu = 0usize; + let unit_size = ::core::mem::size_of::<$unit>(); + let len = buffer.len(); + if len >= ALU_ALIGNMENT / unit_size { + // The most common reason to return `false` is for the first code + // unit to fail the test, so check that first. + if buffer[0] >= $bound { + return false; + } + let src = buffer.as_ptr(); + let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) + & ALU_ALIGNMENT_MASK) + / unit_size; + if until_alignment + ALU_ALIGNMENT / unit_size <= len { + if until_alignment != 0 { + accu |= buffer[offset] as usize; + offset += 1; + until_alignment -= 1; + while until_alignment != 0 { + accu |= buffer[offset] as usize; + offset += 1; + until_alignment -= 1; + } + if accu >= $bound { + return false; + } + } + let len_minus_stride = len - ALU_ALIGNMENT / unit_size; + if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len { + let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size)); + loop { + let unroll_accu = unsafe { *(src.add(offset) as *const usize) } + | unsafe { + *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize) + } + | unsafe { + *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size))) + as *const usize) + } + | unsafe { + *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size))) + as *const usize) + }; + if unroll_accu & $mask != 0 { + return false; + } + offset += 4 * (ALU_ALIGNMENT / unit_size); + if offset > len_minus_unroll { + break; + } + } + } + while offset <= len_minus_stride { + accu |= unsafe { *(src.add(offset) as *const usize) }; + offset += ALU_ALIGNMENT / unit_size; + } + } + } + for &unit in &buffer[offset..] { + accu |= unit as usize; + } + accu & $mask == 0 + } + }; +} + +#[allow(unused_macros)] +macro_rules! by_unit_check_simd { + ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => { + #[inline(always)] + fn $name(buffer: &[$unit]) -> bool { + let mut offset = 0usize; + let mut accu = 0usize; + let unit_size = ::core::mem::size_of::<$unit>(); + let len = buffer.len(); + if len >= SIMD_STRIDE_SIZE / unit_size { + // The most common reason to return `false` is for the first code + // unit to fail the test, so check that first. + if buffer[0] >= $bound { + return false; + } + let src = buffer.as_ptr(); + let mut until_alignment = ((SIMD_ALIGNMENT + - ((src as usize) & SIMD_ALIGNMENT_MASK)) + & SIMD_ALIGNMENT_MASK) + / unit_size; + if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len { + if until_alignment != 0 { + accu |= buffer[offset] as usize; + offset += 1; + until_alignment -= 1; + while until_alignment != 0 { + accu |= buffer[offset] as usize; + offset += 1; + until_alignment -= 1; + } + if accu >= $bound { + return false; + } + } + let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size; + if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len { + let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size)); + loop { + let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) } + | unsafe { + *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size)) + as *const $simd_ty) + } + | unsafe { + *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size))) + as *const $simd_ty) + } + | unsafe { + *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size))) + as *const $simd_ty) + }; + if !$func(unroll_accu) { + return false; + } + offset += 4 * (SIMD_STRIDE_SIZE / unit_size); + if offset > len_minus_unroll { + break; + } + } + } + let mut simd_accu = $splat; + while offset <= len_minus_stride { + simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) }; + offset += SIMD_STRIDE_SIZE / unit_size; + } + if !$func(simd_accu) { + return false; + } + } + } + for &unit in &buffer[offset..] { + accu |= unit as usize; + } + accu < $bound + } + }; +} + +cfg_if! { + if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { + use crate::simd_funcs::*; + use packed_simd::u8x16; + use packed_simd::u16x8; + + const SIMD_ALIGNMENT: usize = 16; + + const SIMD_ALIGNMENT_MASK: usize = 15; + + by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii); + by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin); + by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1); + + #[inline(always)] + fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize { + // This function is a mess, because it simultaneously tries to do + // only aligned SIMD (perhaps misguidedly) and needs to deal with + // the last code unit in a SIMD stride being part of a valid + // surrogate pair. + let unit_size = ::core::mem::size_of::<u16>(); + let src = buffer.as_ptr(); + let len = buffer.len(); + let mut offset = 0usize; + 'outer: loop { + let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) & + SIMD_ALIGNMENT_MASK) / unit_size; + if until_alignment == 0 { + if offset + SIMD_STRIDE_SIZE / unit_size > len { + break; + } + } else { + let offset_plus_until_alignment = offset + until_alignment; + let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1; + if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len { + break; + } + let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]); + if up_to < until_alignment { + return offset + up_to; + } + if last_valid_low { + offset = offset_plus_until_alignment_plus_one; + continue; + } + offset = offset_plus_until_alignment; + } + let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size; + loop { + let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size; + if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) { + if offset_plus_stride == len { + break 'outer; + } + let offset_plus_stride_plus_one = offset_plus_stride + 1; + let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]); + if up_to < SIMD_STRIDE_SIZE / unit_size { + return offset + up_to; + } + if last_valid_low { + offset = offset_plus_stride_plus_one; + continue 'outer; + } + } + offset = offset_plus_stride; + if offset > len_minus_stride { + break 'outer; + } + } + } + let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]); + offset + up_to + } + } else { + by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK); + by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK); + by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK); + + #[inline(always)] + fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize { + let (up_to, _) = utf16_valid_up_to_alu(buffer); + up_to + } + } +} + +/// The second return value is true iff the last code unit of the slice was +/// reached and turned out to be a low surrogate that is part of a valid pair. +#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))] +#[inline(always)] +fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) { + let len = buffer.len(); + if len == 0 { + return (0, false); + } + let mut offset = 0usize; + loop { + let unit = buffer[offset]; + let next = offset + 1; + let unit_minus_surrogate_start = unit.wrapping_sub(0xD800); + if unit_minus_surrogate_start > (0xDFFF - 0xD800) { + // Not a surrogate + offset = next; + if offset == len { + return (offset, false); + } + continue; + } + if unit_minus_surrogate_start <= (0xDBFF - 0xD800) { + // high surrogate + if next < len { + let second = buffer[next]; + let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00); + if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) { + // The next code unit is a low surrogate. Advance position. + offset = next + 1; + if offset == len { + return (offset, true); + } + continue; + } + // The next code unit is not a low surrogate. Don't advance + // position and treat the high surrogate as unpaired. + // fall through + } + // Unpaired, fall through + } + // Unpaired surrogate + return (offset, false); + } +} + +cfg_if! { + if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { + #[inline(always)] + fn is_str_latin1_impl(buffer: &str) -> Option<usize> { + let mut offset = 0usize; + let bytes = buffer.as_bytes(); + let len = bytes.len(); + if len >= SIMD_STRIDE_SIZE { + let src = bytes.as_ptr(); + let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & + SIMD_ALIGNMENT_MASK; + if until_alignment + SIMD_STRIDE_SIZE <= len { + while until_alignment != 0 { + if bytes[offset] > 0xC3 { + return Some(offset); + } + offset += 1; + until_alignment -= 1; + } + let len_minus_stride = len - SIMD_STRIDE_SIZE; + loop { + if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) { + // TODO: Ensure this compiles away when inlined into `is_str_latin1()`. + while bytes[offset] & 0xC0 == 0x80 { + offset += 1; + } + return Some(offset); + } + offset += SIMD_STRIDE_SIZE; + if offset > len_minus_stride { + break; + } + } + } + } + for i in offset..len { + if bytes[i] > 0xC3 { + return Some(i); + } + } + None + } + } else { + #[inline(always)] + fn is_str_latin1_impl(buffer: &str) -> Option<usize> { + let mut bytes = buffer.as_bytes(); + let mut total = 0; + loop { + if let Some((byte, offset)) = validate_ascii(bytes) { + total += offset; + if byte > 0xC3 { + return Some(total); + } + bytes = &bytes[offset + 2..]; + total += 2; + } else { + return None; + } + } + } + } +} + +#[inline(always)] +fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> { + let mut bytes = buffer; + let mut total = 0; + loop { + if let Some((byte, offset)) = validate_ascii(bytes) { + total += offset; + if in_inclusive_range8(byte, 0xC2, 0xC3) { + let next = offset + 1; + if next == bytes.len() { + return Some(total); + } + if bytes[next] & 0xC0 != 0x80 { + return Some(total); + } + bytes = &bytes[offset + 2..]; + total += 2; + } else { + return Some(total); + } + } else { + return None; + } + } +} + +cfg_if! { + if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { + #[inline(always)] + fn is_utf16_bidi_impl(buffer: &[u16]) -> bool { + let mut offset = 0usize; + let len = buffer.len(); + if len >= SIMD_STRIDE_SIZE / 2 { + let src = buffer.as_ptr(); + let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & + SIMD_ALIGNMENT_MASK) / 2; + if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len { + while until_alignment != 0 { + if is_utf16_code_unit_bidi(buffer[offset]) { + return true; + } + offset += 1; + until_alignment -= 1; + } + let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2); + loop { + if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) { + return true; + } + offset += SIMD_STRIDE_SIZE / 2; + if offset > len_minus_stride { + break; + } + } + } + } + for &u in &buffer[offset..] { + if is_utf16_code_unit_bidi(u) { + return true; + } + } + false + } + } else { + #[inline(always)] + fn is_utf16_bidi_impl(buffer: &[u16]) -> bool { + for &u in buffer { + if is_utf16_code_unit_bidi(u) { + return true; + } + } + false + } + } +} + +cfg_if! { + if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { + #[inline(always)] + fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi { + let mut offset = 0usize; + let len = buffer.len(); + if len >= SIMD_STRIDE_SIZE / 2 { + let src = buffer.as_ptr(); + let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & + SIMD_ALIGNMENT_MASK) / 2; + if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len { + while until_alignment != 0 { + if buffer[offset] > 0xFF { + // This transition isn't optimal, since the aligment is recomputing + // but not tweaking further today. + if is_utf16_bidi_impl(&buffer[offset..]) { + return Latin1Bidi::Bidi; + } + return Latin1Bidi::LeftToRight; + } + offset += 1; + until_alignment -= 1; + } + let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2); + loop { + let mut s = unsafe { *(src.add(offset) as *const u16x8) }; + if !simd_is_latin1(s) { + loop { + if is_u16x8_bidi(s) { + return Latin1Bidi::Bidi; + } + offset += SIMD_STRIDE_SIZE / 2; + if offset > len_minus_stride { + for &u in &buffer[offset..] { + if is_utf16_code_unit_bidi(u) { + return Latin1Bidi::Bidi; + } + } + return Latin1Bidi::LeftToRight; + } + s = unsafe { *(src.add(offset) as *const u16x8) }; + } + } + offset += SIMD_STRIDE_SIZE / 2; + if offset > len_minus_stride { + break; + } + } + } + } + let mut iter = (&buffer[offset..]).iter(); + loop { + if let Some(&u) = iter.next() { + if u > 0xFF { + let mut inner_u = u; + loop { + if is_utf16_code_unit_bidi(inner_u) { + return Latin1Bidi::Bidi; + } + if let Some(&code_unit) = iter.next() { + inner_u = code_unit; + } else { + return Latin1Bidi::LeftToRight; + } + } + } + } else { + return Latin1Bidi::Latin1; + } + } + } + } else { + #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))] + #[inline(always)] + fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi { + let mut offset = 0usize; + let len = buffer.len(); + if len >= ALU_ALIGNMENT / 2 { + let src = buffer.as_ptr(); + let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) & + ALU_ALIGNMENT_MASK) / 2; + if until_alignment + ALU_ALIGNMENT / 2 <= len { + while until_alignment != 0 { + if buffer[offset] > 0xFF { + if is_utf16_bidi_impl(&buffer[offset..]) { + return Latin1Bidi::Bidi; + } + return Latin1Bidi::LeftToRight; + } + offset += 1; + until_alignment -= 1; + } + let len_minus_stride = len - ALU_ALIGNMENT / 2; + loop { + if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 { + if is_utf16_bidi_impl(&buffer[offset..]) { + return Latin1Bidi::Bidi; + } + return Latin1Bidi::LeftToRight; + } + offset += ALU_ALIGNMENT / 2; + if offset > len_minus_stride { + break; + } + } + } + } + let mut iter = (&buffer[offset..]).iter(); + loop { + if let Some(&u) = iter.next() { + if u > 0xFF { + let mut inner_u = u; + loop { + if is_utf16_code_unit_bidi(inner_u) { + return Latin1Bidi::Bidi; + } + if let Some(&code_unit) = iter.next() { + inner_u = code_unit; + } else { + return Latin1Bidi::LeftToRight; + } + } + } + } else { + return Latin1Bidi::Latin1; + } + } + } + } +} + +/// Checks whether the buffer is all-ASCII. +/// +/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function +/// is not guaranteed to fail fast.) +pub fn is_ascii(buffer: &[u8]) -> bool { + is_ascii_impl(buffer) +} + +/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing +/// only ASCII characters). +/// +/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function +/// is not guaranteed to fail fast.) +pub fn is_basic_latin(buffer: &[u16]) -> bool { + is_basic_latin_impl(buffer) +} + +/// Checks whether the buffer is valid UTF-8 representing only code points +/// less than or equal to U+00FF. +/// +/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8 +/// invalidity or code points above U+00FF are discovered. +pub fn is_utf8_latin1(buffer: &[u8]) -> bool { + is_utf8_latin1_impl(buffer).is_none() +} + +/// Checks whether the buffer represents only code points less than or equal +/// to U+00FF. +/// +/// Fails fast. (I.e. returns before having read the whole buffer if code +/// points above U+00FF are discovered. +pub fn is_str_latin1(buffer: &str) -> bool { + is_str_latin1_impl(buffer).is_none() +} + +/// Checks whether the buffer represents only code point less than or equal +/// to U+00FF. +/// +/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function +/// is not guaranteed to fail fast.) +pub fn is_utf16_latin1(buffer: &[u16]) -> bool { + is_utf16_latin1_impl(buffer) +} + +/// Checks whether a potentially-invalid UTF-8 buffer contains code points +/// that trigger right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. Hebrew presentation forms in +/// the Alphabetic Presentation Forms block are treated as if they formed +/// a block on their own (i.e. it treated as right-to-left). Additionally, +/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked +/// for. Control characters that are technically bidi controls but do not +/// cause right-to-left behavior without the presence of right-to-left +/// characters or right-to-left controls are not checked for. As a special +/// case, U+FEFF is excluded from Arabic Presentation Forms-B. +/// +/// Returns `true` if the input is invalid UTF-8 or the input contains an +/// RTL character. Returns `false` if the input is valid UTF-8 and contains +/// no RTL characters. +#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))] +#[inline] +pub fn is_utf8_bidi(buffer: &[u8]) -> bool { + // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster + // than UTF-8 validation followed by `is_str_bidi()` for German, + // Russian and Japanese. However, this is considerably slower for Thai. + // Chances are that the compiler makes some branch predictions that are + // unfortunate for Thai. Not spending the time to manually optimize + // further at this time, since it's unclear if this variant even has + // use cases. However, this is worth revisiting once Rust gets the + // ability to annotate relative priorities of match arms. + + // U+058F: D6 8F + // U+0590: D6 90 + // U+08FF: E0 A3 BF + // U+0900: E0 A4 80 + // + // U+200F: E2 80 8F + // U+202B: E2 80 AB + // U+202E: E2 80 AE + // U+2067: E2 81 A7 + // + // U+FB1C: EF AC 9C + // U+FB1D: EF AC 9D + // U+FDFF: EF B7 BF + // U+FE00: EF B8 80 + // + // U+FE6F: EF B9 AF + // U+FE70: EF B9 B0 + // U+FEFE: EF BB BE + // U+FEFF: EF BB BF + // + // U+107FF: F0 90 9F BF + // U+10800: F0 90 A0 80 + // U+10FFF: F0 90 BF BF + // U+11000: F0 91 80 80 + // + // U+1E7FF: F0 9E 9F BF + // U+1E800: F0 9E A0 80 + // U+1EFFF: F0 9E BF BF + // U+1F000: F0 9F 80 80 + let mut src = buffer; + 'outer: loop { + if let Some((mut byte, mut read)) = validate_ascii(src) { + // Check for the longest sequence to avoid checking twice for the + // multi-byte sequences. + if read + 4 <= src.len() { + 'inner: loop { + // At this point, `byte` is not included in `read`. + match byte { + 0..=0x7F => { + // ASCII: go back to SIMD. + read += 1; + src = &src[read..]; + continue 'outer; + } + 0xC2..=0xD5 => { + // Two-byte + let second = unsafe { *(src.get_unchecked(read + 1)) }; + if !in_inclusive_range8(second, 0x80, 0xBF) { + return true; + } + read += 2; + } + 0xD6 => { + // Two-byte + let second = unsafe { *(src.get_unchecked(read + 1)) }; + if !in_inclusive_range8(second, 0x80, 0xBF) { + return true; + } + // XXX consider folding the above and below checks + if second > 0x8F { + return true; + } + read += 2; + } + // two-byte starting with 0xD7 and above is bidi + 0xE1 | 0xE3..=0xEC | 0xEE => { + // Three-byte normal + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { + *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) + }) + | (third >> 6)) + != 2 + { + return true; + } + read += 3; + } + 0xE2 => { + // Three-byte normal, potentially bidi + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { + *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) + }) + | (third >> 6)) + != 2 + { + return true; + } + if second == 0x80 { + if third == 0x8F || third == 0xAB || third == 0xAE { + return true; + } + } else if second == 0x81 { + if third == 0xA7 { + return true; + } + } + read += 3; + } + 0xEF => { + // Three-byte normal, potentially bidi + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { + *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) + }) + | (third >> 6)) + != 2 + { + return true; + } + if in_inclusive_range8(second, 0xAC, 0xB7) { + if second == 0xAC { + if third > 0x9C { + return true; + } + } else { + return true; + } + } else if in_inclusive_range8(second, 0xB9, 0xBB) { + if second == 0xB9 { + if third > 0xAF { + return true; + } + } else if second == 0xBB { + if third != 0xBF { + return true; + } + } else { + return true; + } + } + read += 3; + } + 0xE0 => { + // Three-byte special lower bound, potentially bidi + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { + *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) + }) + | (third >> 6)) + != 2 + { + return true; + } + // XXX can this be folded into the above validity check + if second < 0xA4 { + return true; + } + read += 3; + } + 0xED => { + // Three-byte special upper bound + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { + *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) + }) + | (third >> 6)) + != 2 + { + return true; + } + read += 3; + } + 0xF1..=0xF4 => { + // Four-byte normal + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + let fourth = unsafe { *(src.get_unchecked(read + 3)) }; + if (u16::from( + UTF8_DATA.table[usize::from(second)] + & unsafe { + *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) + }, + ) | u16::from(third >> 6) + | (u16::from(fourth & 0xC0) << 2)) + != 0x202 + { + return true; + } + read += 4; + } + 0xF0 => { + // Four-byte special lower bound, potentially bidi + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + let fourth = unsafe { *(src.get_unchecked(read + 3)) }; + if (u16::from( + UTF8_DATA.table[usize::from(second)] + & unsafe { + *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) + }, + ) | u16::from(third >> 6) + | (u16::from(fourth & 0xC0) << 2)) + != 0x202 + { + return true; + } + if unsafe { unlikely(second == 0x90 || second == 0x9E) } { + let third = src[read + 2]; + if third >= 0xA0 { + return true; + } + } + read += 4; + } + _ => { + // Invalid lead or bidi-only lead + return true; + } + } + if read + 4 > src.len() { + if read == src.len() { + return false; + } + byte = src[read]; + break 'inner; + } + byte = src[read]; + continue 'inner; + } + } + // We can't have a complete 4-byte sequence, but we could still have + // a complete shorter sequence. + + // At this point, `byte` is not included in `read`. + match byte { + 0..=0x7F => { + // ASCII: go back to SIMD. + read += 1; + src = &src[read..]; + continue 'outer; + } + 0xC2..=0xD5 => { + // Two-byte + let new_read = read + 2; + if new_read > src.len() { + return true; + } + let second = unsafe { *(src.get_unchecked(read + 1)) }; + if !in_inclusive_range8(second, 0x80, 0xBF) { + return true; + } + read = new_read; + // We need to deal with the case where we came here with 3 bytes + // left, so we need to take a look at the last one. + src = &src[read..]; + continue 'outer; + } + 0xD6 => { + // Two-byte, potentially bidi + let new_read = read + 2; + if new_read > src.len() { + return true; + } + let second = unsafe { *(src.get_unchecked(read + 1)) }; + if !in_inclusive_range8(second, 0x80, 0xBF) { + return true; + } + // XXX consider folding the above and below checks + if second > 0x8F { + return true; + } + read = new_read; + // We need to deal with the case where we came here with 3 bytes + // left, so we need to take a look at the last one. + src = &src[read..]; + continue 'outer; + } + // two-byte starting with 0xD7 and above is bidi + 0xE1 | 0xE3..=0xEC | 0xEE => { + // Three-byte normal + let new_read = read + 3; + if new_read > src.len() { + return true; + } + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) + | (third >> 6)) + != 2 + { + return true; + } + } + 0xE2 => { + // Three-byte normal, potentially bidi + let new_read = read + 3; + if new_read > src.len() { + return true; + } + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) + | (third >> 6)) + != 2 + { + return true; + } + if second == 0x80 { + if third == 0x8F || third == 0xAB || third == 0xAE { + return true; + } + } else if second == 0x81 { + if third == 0xA7 { + return true; + } + } + } + 0xEF => { + // Three-byte normal, potentially bidi + let new_read = read + 3; + if new_read > src.len() { + return true; + } + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) + | (third >> 6)) + != 2 + { + return true; + } + if in_inclusive_range8(second, 0xAC, 0xB7) { + if second == 0xAC { + if third > 0x9C { + return true; + } + } else { + return true; + } + } else if in_inclusive_range8(second, 0xB9, 0xBB) { + if second == 0xB9 { + if third > 0xAF { + return true; + } + } else if second == 0xBB { + if third != 0xBF { + return true; + } + } else { + return true; + } + } + } + 0xE0 => { + // Three-byte special lower bound, potentially bidi + let new_read = read + 3; + if new_read > src.len() { + return true; + } + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) + | (third >> 6)) + != 2 + { + return true; + } + // XXX can this be folded into the above validity check + if second < 0xA4 { + return true; + } + } + 0xED => { + // Three-byte special upper bound + let new_read = read + 3; + if new_read > src.len() { + return true; + } + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) + | (third >> 6)) + != 2 + { + return true; + } + } + _ => { + // Invalid lead, 4-byte lead or 2-byte bidi-only lead + return true; + } + } + return false; + } else { + return false; + } + } +} + +/// Checks whether a valid UTF-8 buffer contains code points that trigger +/// right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. Hebrew presentation forms in +/// the Alphabetic Presentation Forms block are treated as if they formed +/// a block on their own (i.e. it treated as right-to-left). Additionally, +/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked +/// for. Control characters that are technically bidi controls but do not +/// cause right-to-left behavior without the presence of right-to-left +/// characters or right-to-left controls are not checked for. As a special +/// case, U+FEFF is excluded from Arabic Presentation Forms-B. +#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))] +#[inline] +pub fn is_str_bidi(buffer: &str) -> bool { + // U+058F: D6 8F + // U+0590: D6 90 + // U+08FF: E0 A3 BF + // U+0900: E0 A4 80 + // + // U+200F: E2 80 8F + // U+202B: E2 80 AB + // U+202E: E2 80 AE + // U+2067: E2 81 A7 + // + // U+FB1C: EF AC 9C + // U+FB1D: EF AC 9D + // U+FDFF: EF B7 BF + // U+FE00: EF B8 80 + // + // U+FE6F: EF B9 AF + // U+FE70: EF B9 B0 + // U+FEFE: EF BB BE + // U+FEFF: EF BB BF + // + // U+107FF: F0 90 9F BF + // U+10800: F0 90 A0 80 + // U+10FFF: F0 90 BF BF + // U+11000: F0 91 80 80 + // + // U+1E7FF: F0 9E 9F BF + // U+1E800: F0 9E A0 80 + // U+1EFFF: F0 9E BF BF + // U+1F000: F0 9F 80 80 + let mut bytes = buffer.as_bytes(); + 'outer: loop { + // TODO: Instead of just validating ASCII using SIMD, use SIMD + // to check for non-ASCII lead bytes, too, to quickly conclude + // that the vector consist entirely of CJK and below-Hebrew + // code points. + // Unfortunately, scripts above Arabic but below CJK share + // lead bytes with RTL. + if let Some((mut byte, mut read)) = validate_ascii(bytes) { + 'inner: loop { + // At this point, `byte` is not included in `read`. + if byte < 0xE0 { + if byte >= 0x80 { + // Two-byte + // Adding `unlikely` here improved throughput on + // Russian plain text by 33%! + if unsafe { unlikely(byte >= 0xD6) } { + if byte == 0xD6 { + let second = bytes[read + 1]; + if second > 0x8F { + return true; + } + } else { + return true; + } + } + read += 2; + } else { + // ASCII: write and go back to SIMD. + read += 1; + // Intuitively, we should go back to the outer loop only + // if byte is 0x30 or above, so as to avoid trashing on + // ASCII space, comma and period in non-Latin context. + // However, the extra branch seems to cost more than it's + // worth. + bytes = &bytes[read..]; + continue 'outer; + } + } else if byte < 0xF0 { + // Three-byte + if unsafe { unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) } { + let second = bytes[read + 1]; + if byte == 0xE0 { + if second < 0xA4 { + return true; + } + } else if byte == 0xE2 { + let third = bytes[read + 2]; + if second == 0x80 { + if third == 0x8F || third == 0xAB || third == 0xAE { + return true; + } + } else if second == 0x81 { + if third == 0xA7 { + return true; + } + } + } else { + debug_assert_eq!(byte, 0xEF); + if in_inclusive_range8(second, 0xAC, 0xB7) { + if second == 0xAC { + let third = bytes[read + 2]; + if third > 0x9C { + return true; + } + } else { + return true; + } + } else if in_inclusive_range8(second, 0xB9, 0xBB) { + if second == 0xB9 { + let third = bytes[read + 2]; + if third > 0xAF { + return true; + } + } else if second == 0xBB { + let third = bytes[read + 2]; + if third != 0xBF { + return true; + } + } else { + return true; + } + } + } + } + read += 3; + } else { + // Four-byte + let second = bytes[read + 1]; + if unsafe { unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) } { + let third = bytes[read + 2]; + if third >= 0xA0 { + return true; + } + } + read += 4; + } + // The comparison is always < or == and never >, but including + // > here to let the compiler assume that < is true if this + // comparison is false. + if read >= bytes.len() { + return false; + } + byte = bytes[read]; + continue 'inner; + } + } else { + return false; + } + } +} + +/// Checks whether a UTF-16 buffer contains code points that trigger +/// right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. Hebrew presentation forms in +/// the Alphabetic Presentation Forms block are treated as if they formed +/// a block on their own (i.e. it treated as right-to-left). Additionally, +/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked +/// for. Control characters that are technically bidi controls but do not +/// cause right-to-left behavior without the presence of right-to-left +/// characters or right-to-left controls are not checked for. As a special +/// case, U+FEFF is excluded from Arabic Presentation Forms-B. +/// +/// Returns `true` if the input contains an RTL character or an unpaired +/// high surrogate that could be the high half of an RTL character. +/// Returns `false` if the input contains neither RTL characters nor +/// unpaired high surrogates that could be higher halves of RTL characters. +pub fn is_utf16_bidi(buffer: &[u16]) -> bool { + is_utf16_bidi_impl(buffer) +} + +/// Checks whether a scalar value triggers right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. Hebrew presentation forms in +/// the Alphabetic Presentation Forms block are treated as if they formed +/// a block on their own (i.e. it treated as right-to-left). Additionally, +/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked +/// for. Control characters that are technically bidi controls but do not +/// cause right-to-left behavior without the presence of right-to-left +/// characters or right-to-left controls are not checked for. As a special +/// case, U+FEFF is excluded from Arabic Presentation Forms-B. +#[inline(always)] +pub fn is_char_bidi(c: char) -> bool { + // Controls: + // Every control with RIGHT-TO-LEFT in its name in + // https://www.unicode.org/charts/PDF/U2000.pdf + // U+200F RLM + // U+202B RLE + // U+202E RLO + // U+2067 RLI + // + // BMP RTL: + // https://www.unicode.org/roadmaps/bmp/ + // U+0590...U+08FF + // U+FB1D...U+FDFF Hebrew presentation forms and + // Arabic Presentation Forms A + // U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM) + // + // Supplementary RTL: + // https://www.unicode.org/roadmaps/smp/ + // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803) + // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B) + let code_point = u32::from(c); + if code_point < 0x0590 { + // Below Hebrew + return false; + } + if in_range32(code_point, 0x0900, 0xFB1D) { + // Above Arabic Extended-A and below Hebrew presentation forms + if in_inclusive_range32(code_point, 0x200F, 0x2067) { + // In the range that contains the RTL controls + return code_point == 0x200F + || code_point == 0x202B + || code_point == 0x202E + || code_point == 0x2067; + } + return false; + } + if code_point > 0x1EFFF { + // Above second astral RTL. (Emoji is here.) + return false; + } + if in_range32(code_point, 0x11000, 0x1E800) { + // Between astral RTL blocks + return false; + } + if in_range32(code_point, 0xFEFF, 0x10800) { + // Above Arabic Presentations Forms B (excl. BOM) and below first + // astral RTL + return false; + } + if in_range32(code_point, 0xFE00, 0xFE70) { + // Between Arabic Presentations Forms + return false; + } + true +} + +/// Checks whether a UTF-16 code unit triggers right-to-left processing. +/// +/// The check is done on a Unicode block basis without regard to assigned +/// vs. unassigned code points in the block. Hebrew presentation forms in +/// the Alphabetic Presentation Forms block are treated as if they formed +/// a block on their own (i.e. it treated as right-to-left). Additionally, +/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked +/// for. Control characters that are technically bidi controls but do not +/// cause right-to-left behavior without the presence of right-to-left +/// characters or right-to-left controls are not checked for. As a special +/// case, U+FEFF is excluded from Arabic Presentation Forms-B. +/// +/// Since supplementary-plane right-to-left blocks are identifiable from the +/// high surrogate without examining the low surrogate, this function returns +/// `true` for such high surrogates making the function suitable for handling +/// supplementary-plane text without decoding surrogate pairs to scalar +/// values. Obviously, such high surrogates are then reported as right-to-left +/// even if actually unpaired. +#[inline(always)] +pub fn is_utf16_code_unit_bidi(u: u16) -> bool { + if u < 0x0590 { + // Below Hebrew + return false; + } + if in_range16(u, 0x0900, 0xD802) { + // Above Arabic Extended-A and below first RTL surrogate + if in_inclusive_range16(u, 0x200F, 0x2067) { + // In the range that contains the RTL controls + return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067; + } + return false; + } + if in_range16(u, 0xD83C, 0xFB1D) { + // Between astral RTL high surrogates and Hebrew presentation forms + // (Emoji is here) + return false; + } + if in_range16(u, 0xD804, 0xD83A) { + // Between RTL high surragates + return false; + } + if u > 0xFEFE { + // Above Arabic Presentation Forms (excl. BOM) + return false; + } + if in_range16(u, 0xFE00, 0xFE70) { + // Between Arabic Presentations Forms + return false; + } + true +} + +/// Checks whether a potentially invalid UTF-8 buffer contains code points +/// that trigger right-to-left processing or is all-Latin1. +/// +/// Possibly more efficient than performing the checks separately. +/// +/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`. +/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return +/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. +pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi { + if let Some(offset) = is_utf8_latin1_impl(buffer) { + if is_utf8_bidi(&buffer[offset..]) { + Latin1Bidi::Bidi + } else { + Latin1Bidi::LeftToRight + } + } else { + Latin1Bidi::Latin1 + } +} + +/// Checks whether a valid UTF-8 buffer contains code points +/// that trigger right-to-left processing or is all-Latin1. +/// +/// Possibly more efficient than performing the checks separately. +/// +/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`. +/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return +/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. +pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi { + // The transition from the latin1 check to the bidi check isn't + // optimal but not tweaking it to perfection today. + if let Some(offset) = is_str_latin1_impl(buffer) { + if is_str_bidi(&buffer[offset..]) { + Latin1Bidi::Bidi + } else { + Latin1Bidi::LeftToRight + } + } else { + Latin1Bidi::Latin1 + } +} + +/// Checks whether a potentially invalid UTF-16 buffer contains code points +/// that trigger right-to-left processing or is all-Latin1. +/// +/// Possibly more efficient than performing the checks separately. +/// +/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`. +/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return +/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. +pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi { + check_utf16_for_latin1_and_bidi_impl(buffer) +} + +/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced +/// with the REPLACEMENT CHARACTER. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer _plus one_. +/// +/// Returns the number of `u16`s written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize { + // TODO: Can the requirement for dst to be at least one unit longer + // be eliminated? + assert!(dst.len() > src.len()); + let mut decoder = Utf8Decoder::new_inner(); + let mut total_read = 0usize; + let mut total_written = 0usize; + loop { + let (result, read, written) = + decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true); + total_read += read; + total_written += written; + match result { + DecoderResult::InputEmpty => { + return total_written; + } + DecoderResult::OutputFull => { + unreachable!("The assert at the top of the function should have caught this."); + } + DecoderResult::Malformed(_, _) => { + // There should always be space for the U+FFFD, because + // otherwise we'd have gotten OutputFull already. + dst[total_written] = 0xFFFD; + total_written += 1; + } + } + } +} + +/// Converts valid UTF-8 to valid UTF-16. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of `u16`s written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + let bytes = src.as_bytes(); + let mut read = 0; + let mut written = 0; + 'outer: loop { + let mut byte = { + let src_remaining = &bytes[read..]; + let dst_remaining = &mut dst[written..]; + let length = src_remaining.len(); + match unsafe { + ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + } { + None => { + written += length; + return written; + } + Some((non_ascii, consumed)) => { + read += consumed; + written += consumed; + non_ascii + } + } + }; + 'inner: loop { + // At this point, `byte` is not included in `read`. + if byte < 0xE0 { + if byte >= 0x80 { + // Two-byte + let second = unsafe { *(bytes.get_unchecked(read + 1)) }; + let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F); + unsafe { *(dst.get_unchecked_mut(written)) = point }; + read += 2; + written += 1; + } else { + // ASCII: write and go back to SIMD. + unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) }; + read += 1; + written += 1; + // Intuitively, we should go back to the outer loop only + // if byte is 0x30 or above, so as to avoid trashing on + // ASCII space, comma and period in non-Latin context. + // However, the extra branch seems to cost more than it's + // worth. + continue 'outer; + } + } else if byte < 0xF0 { + // Three-byte + let second = unsafe { *(bytes.get_unchecked(read + 1)) }; + let third = unsafe { *(bytes.get_unchecked(read + 2)) }; + let point = ((u16::from(byte) & 0xF) << 12) + | ((u16::from(second) & 0x3F) << 6) + | (u16::from(third) & 0x3F); + unsafe { *(dst.get_unchecked_mut(written)) = point }; + read += 3; + written += 1; + } else { + // Four-byte + let second = unsafe { *(bytes.get_unchecked(read + 1)) }; + let third = unsafe { *(bytes.get_unchecked(read + 2)) }; + let fourth = unsafe { *(bytes.get_unchecked(read + 3)) }; + let point = ((u32::from(byte) & 0x7) << 18) + | ((u32::from(second) & 0x3F) << 12) + | ((u32::from(third) & 0x3F) << 6) + | (u32::from(fourth) & 0x3F); + unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 }; + unsafe { + *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16 + }; + read += 4; + written += 2; + } + // The comparison is always < or == and never >, but including + // > here to let the compiler assume that < is true if this + // comparison is false. + if read >= src.len() { + return written; + } + byte = bytes[read]; + continue 'inner; + } + } +} + +/// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of `u16`s written or `None` if the input was invalid. +/// +/// When the input was invalid, some output may have been written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst); + if read == src.len() { + return Some(written); + } + None +} + +/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced +/// with the REPLACEMENT CHARACTER with potentially insufficient output +/// space. +/// +/// Returns the number of code units read and the number of bytes written. +/// +/// Guarantees that the bytes in the destination beyond the number of +/// bytes claimed as written by the second item of the return tuple +/// are left unmodified. +/// +/// Not all code units are read if there isn't enough output space. +/// +/// Note that this method isn't designed for general streamability but for +/// not allocating memory for the worst case up front. Specifically, +/// if the input starts with or ends with an unpaired surrogate, those are +/// replaced with the REPLACEMENT CHARACTER. +/// +/// Matches the semantics of `TextEncoder.encodeInto()` from the +/// Encoding Standard. +/// +/// # Safety +/// +/// If you want to convert into a `&mut str`, use +/// `convert_utf16_to_str_partial()` instead of using this function +/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. +#[inline(always)] +pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) { + // The two functions called below are marked `inline(never)` to make + // transitions from the hot part (first function) into the cold part + // (second function) go through a return and another call to discouge + // the CPU from speculating from the hot code into the cold code. + // Letting the transitions be mere intra-function jumps, even to + // basic blocks out-of-lined to the end of the function would wipe + // away a quarter of Arabic encode performance on Haswell! + let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst); + if unsafe { likely(read == src.len()) } { + return (read, written); + } + let (tail_read, tail_written) = + convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]); + (read + tail_read, written + tail_written) +} + +/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced +/// with the REPLACEMENT CHARACTER. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer times three. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// # Safety +/// +/// If you want to convert into a `&mut str`, use `convert_utf16_to_str()` +/// instead of using this function together with the `unsafe` method +/// `as_bytes_mut()` on `&mut str`. +#[inline(always)] +pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize { + assert!(dst.len() >= src.len() * 3); + let (read, written) = convert_utf16_to_utf8_partial(src, dst); + debug_assert_eq!(read, src.len()); + written +} + +/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced +/// with the REPLACEMENT CHARACTER such that the validity of the output is +/// signaled using the Rust type system with potentially insufficient output +/// space. +/// +/// Returns the number of code units read and the number of bytes written. +/// +/// Not all code units are read if there isn't enough output space. +/// +/// Note that this method isn't designed for general streamability but for +/// not allocating memory for the worst case up front. Specifically, +/// if the input starts with or ends with an unpaired surrogate, those are +/// replaced with the REPLACEMENT CHARACTER. +pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) { + let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() }; + let (read, written) = convert_utf16_to_utf8_partial(src, bytes); + let len = bytes.len(); + let mut trail = written; + while trail < len && ((bytes[trail] & 0xC0) == 0x80) { + bytes[trail] = 0; + trail += 1; + } + (read, written) +} + +/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced +/// with the REPLACEMENT CHARACTER such that the validity of the output is +/// signaled using the Rust type system. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer times three. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +#[inline(always)] +pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize { + assert!(dst.len() >= src.len() * 3); + let (read, written) = convert_utf16_to_str_partial(src, dst); + debug_assert_eq!(read, src.len()); + written +} + +/// Converts bytes whose unsigned value is interpreted as Unicode code point +/// (i.e. U+0000 to U+00FF, inclusive) to UTF-16. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// The number of `u16`s written equals the length of the source buffer. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + // TODO: On aarch64, the safe version autovectorizes to the same unpacking + // instructions and this code, but, yet, the autovectorized version is + // faster. + unsafe { + unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len()); + } +} + +/// Converts bytes whose unsigned value is interpreted as Unicode code point +/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient +/// output space. +/// +/// Returns the number of bytes read and the number of bytes written. +/// +/// If the output isn't large enough, not all input is consumed. +/// +/// # Safety +/// +/// If you want to convert into a `&mut str`, use +/// `convert_utf16_to_str_partial()` instead of using this function +/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. +pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) { + let src_len = src.len(); + let src_ptr = src.as_ptr(); + let dst_ptr = dst.as_mut_ptr(); + let dst_len = dst.len(); + let mut total_read = 0usize; + let mut total_written = 0usize; + loop { + // src can't advance more than dst + let src_left = src_len - total_read; + let dst_left = dst_len - total_written; + let min_left = ::core::cmp::min(src_left, dst_left); + if let Some((non_ascii, consumed)) = unsafe { + ascii_to_ascii( + src_ptr.add(total_read), + dst_ptr.add(total_written), + min_left, + ) + } { + total_read += consumed; + total_written += consumed; + if total_written.checked_add(2).unwrap() > dst_len { + return (total_read, total_written); + } + + total_read += 1; // consume `non_ascii` + + dst[total_written] = (non_ascii >> 6) | 0xC0; + total_written += 1; + dst[total_written] = (non_ascii & 0x3F) | 0x80; + total_written += 1; + continue; + } + return (total_read + min_left, total_written + min_left); + } +} + +/// Converts bytes whose unsigned value is interpreted as Unicode code point +/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer times two. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// # Safety +/// +/// Note that this function may write garbage beyond the number of bytes +/// indicated by the return value, so using a `&mut str` interpreted as +/// `&mut [u8]` as the destination is not safe. If you want to convert into +/// a `&mut str`, use `convert_utf16_to_str()` instead of this function. +#[inline] +pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize { + assert!( + dst.len() >= src.len() * 2, + "Destination must not be shorter than the source times two." + ); + let (read, written) = convert_latin1_to_utf8_partial(src, dst); + debug_assert_eq!(read, src.len()); + written +} + +/// Converts bytes whose unsigned value is interpreted as Unicode code point +/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the +/// output is signaled using the Rust type system with potentially insufficient +/// output space. +/// +/// Returns the number of bytes read and the number of bytes written. +/// +/// If the output isn't large enough, not all input is consumed. +#[inline] +pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) { + let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() }; + let (read, written) = convert_latin1_to_utf8_partial(src, bytes); + let len = bytes.len(); + let mut trail = written; + let max = ::core::cmp::min(len, trail + MAX_STRIDE_SIZE); + while trail < max { + bytes[trail] = 0; + trail += 1; + } + while trail < len && ((bytes[trail] & 0xC0) == 0x80) { + bytes[trail] = 0; + trail += 1; + } + (read, written) +} + +/// Converts bytes whose unsigned value is interpreted as Unicode code point +/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the +/// output is signaled using the Rust type system. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer times two. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +#[inline] +pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize { + assert!( + dst.len() >= src.len() * 2, + "Destination must not be shorter than the source times two." + ); + let (read, written) = convert_latin1_to_str_partial(src, dst); + debug_assert_eq!(read, src.len()); + written +} + +/// If the input is valid UTF-8 representing only Unicode code points from +/// U+0000 to U+00FF, inclusive, converts the input into output that +/// represents the value of each code point as the unsigned byte value of +/// each output byte. +/// +/// If the input does not fulfill the condition stated above, this function +/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise +/// does something that is memory-safe without any promises about any +/// properties of the output. In particular, callers shouldn't assume the +/// output to be the same across crate versions or CPU architectures and +/// should not assume that non-ASCII input can't map to ASCII output. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// If debug assertions are enabled (and not fuzzing) and the input is +/// not in the range U+0000 to U+00FF, inclusive. +pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + non_fuzz_debug_assert!(is_utf8_latin1(src)); + let src_len = src.len(); + let src_ptr = src.as_ptr(); + let dst_ptr = dst.as_mut_ptr(); + let mut total_read = 0usize; + let mut total_written = 0usize; + loop { + // dst can't advance more than src + let src_left = src_len - total_read; + if let Some((non_ascii, consumed)) = unsafe { + ascii_to_ascii( + src_ptr.add(total_read), + dst_ptr.add(total_written), + src_left, + ) + } { + total_read += consumed + 1; + total_written += consumed; + + if total_read == src_len { + return total_written; + } + + let trail = src[total_read]; + total_read += 1; + + dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F); + total_written += 1; + continue; + } + return total_written + src_left; + } +} + +/// If the input is valid UTF-16 representing only Unicode code points from +/// U+0000 to U+00FF, inclusive, converts the input into output that +/// represents the value of each code point as the unsigned byte value of +/// each output byte. +/// +/// If the input does not fulfill the condition stated above, does something +/// that is memory-safe without any promises about any properties of the +/// output and will probably assert in debug builds in future versions. +/// In particular, callers shouldn't assume the output to be the same across +/// crate versions or CPU architectures and should not assume that non-ASCII +/// input can't map to ASCII output. +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// The number of bytes written equals the length of the source buffer. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +/// +/// (Probably in future versions if debug assertions are enabled (and not +/// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.) +pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + // non_fuzz_debug_assert!(is_utf16_latin1(src)); + unsafe { + pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len()); + } +} + +/// Converts bytes whose unsigned value is interpreted as Unicode code point +/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8. +/// +/// Borrows if input is ASCII-only. Performs a single heap allocation +/// otherwise. +/// +/// Only available if the `alloc` feature is enabled (enabled by default). +#[cfg(feature = "alloc")] +pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> { + let up_to = ascii_valid_up_to(bytes); + // >= makes later things optimize better than == + if up_to >= bytes.len() { + debug_assert_eq!(up_to, bytes.len()); + let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) }; + return Cow::Borrowed(s); + } + let (head, tail) = bytes.split_at(up_to); + let capacity = head.len() + tail.len() * 2; + let mut vec = Vec::with_capacity(capacity); + unsafe { + vec.set_len(capacity); + } + (&mut vec[..up_to]).copy_from_slice(head); + let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]); + vec.truncate(up_to + written); + Cow::Owned(unsafe { String::from_utf8_unchecked(vec) }) +} + +/// If the input is valid UTF-8 representing only Unicode code points from +/// U+0000 to U+00FF, inclusive, converts the input into output that +/// represents the value of each code point as the unsigned byte value of +/// each output byte. +/// +/// If the input does not fulfill the condition stated above, this function +/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise +/// does something that is memory-safe without any promises about any +/// properties of the output. In particular, callers shouldn't assume the +/// output to be the same across crate versions or CPU architectures and +/// should not assume that non-ASCII input can't map to ASCII output. +/// +/// Borrows if input is ASCII-only. Performs a single heap allocation +/// otherwise. +/// +/// Only available if the `alloc` feature is enabled (enabled by default). +#[cfg(feature = "alloc")] +pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> { + let bytes = string.as_bytes(); + let up_to = ascii_valid_up_to(bytes); + // >= makes later things optimize better than == + if up_to >= bytes.len() { + debug_assert_eq!(up_to, bytes.len()); + return Cow::Borrowed(bytes); + } + let (head, tail) = bytes.split_at(up_to); + let capacity = bytes.len(); + let mut vec = Vec::with_capacity(capacity); + unsafe { + vec.set_len(capacity); + } + (&mut vec[..up_to]).copy_from_slice(head); + let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]); + vec.truncate(up_to + written); + Cow::Owned(vec) +} + +/// Returns the index of the first unpaired surrogate or, if the input is +/// valid UTF-16 in its entirety, the length of the input. +pub fn utf16_valid_up_to(buffer: &[u16]) -> usize { + utf16_valid_up_to_impl(buffer) +} + +/// Returns the index of first byte that starts an invalid byte +/// sequence or a non-Latin1 byte sequence, or the length of the +/// string if there are neither. +pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize { + is_utf8_latin1_impl(buffer).unwrap_or(buffer.len()) +} + +/// Returns the index of first byte that starts a non-Latin1 byte +/// sequence, or the length of the string if there are none. +pub fn str_latin1_up_to(buffer: &str) -> usize { + is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len()) +} + +/// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. +#[inline] +pub fn ensure_utf16_validity(buffer: &mut [u16]) { + let mut offset = 0; + loop { + offset += utf16_valid_up_to(&buffer[offset..]); + if offset == buffer.len() { + return; + } + buffer[offset] = 0xFFFD; + offset += 1; + } +} + +/// Copies ASCII from source to destination up to the first non-ASCII byte +/// (or the end of the input if it is ASCII in its entirety). +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + if let Some((_, consumed)) = + unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) } + { + consumed + } else { + src.len() + } +} + +/// Copies ASCII from source to destination zero-extending it to UTF-16 up to +/// the first non-ASCII byte (or the end of the input if it is ASCII in its +/// entirety). +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of `u16`s written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + if let Some((_, consumed)) = + unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) } + { + consumed + } else { + src.len() + } +} + +/// Copies Basic Latin from source to destination narrowing it to ASCII up to +/// the first non-Basic Latin code unit (or the end of the input if it is +/// Basic Latin in its entirety). +/// +/// The length of the destination buffer must be at least the length of the +/// source buffer. +/// +/// Returns the number of bytes written. +/// +/// # Panics +/// +/// Panics if the destination buffer is shorter than stated above. +pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize { + assert!( + dst.len() >= src.len(), + "Destination must not be shorter than the source." + ); + if let Some((_, consumed)) = + unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) } + { + consumed + } else { + src.len() + } +} + +// Any copyright to the test code below this comment is dedicated to the +// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use super::*; + + #[test] + fn test_is_ascii_success() { + let mut src: Vec<u8> = Vec::with_capacity(128); + src.resize(128, 0); + for i in 0..src.len() { + src[i] = i as u8; + } + for i in 0..src.len() { + assert!(is_ascii(&src[i..])); + } + } + + #[test] + fn test_is_ascii_fail() { + let mut src: Vec<u8> = Vec::with_capacity(128); + src.resize(128, 0); + for i in 0..src.len() { + src[i] = i as u8; + } + for i in 0..src.len() { + let tail = &mut src[i..]; + for j in 0..tail.len() { + tail[j] = 0xA0; + assert!(!is_ascii(tail)); + } + } + } + + #[test] + fn test_is_basic_latin_success() { + let mut src: Vec<u16> = Vec::with_capacity(128); + src.resize(128, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + assert!(is_basic_latin(&src[i..])); + } + } + + #[test] + fn test_is_basic_latin_fail() { + let mut src: Vec<u16> = Vec::with_capacity(128); + src.resize(128, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + let tail = &mut src[i..]; + for j in 0..tail.len() { + tail[j] = 0xA0; + assert!(!is_basic_latin(tail)); + } + } + } + + #[test] + fn test_is_utf16_latin1_success() { + let mut src: Vec<u16> = Vec::with_capacity(256); + src.resize(256, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + assert!(is_utf16_latin1(&src[i..])); + assert_eq!( + check_utf16_for_latin1_and_bidi(&src[i..]), + Latin1Bidi::Latin1 + ); + } + } + + #[test] + fn test_is_utf16_latin1_fail() { + let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow + let mut src: Vec<u16> = Vec::with_capacity(len); + src.resize(len, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + let tail = &mut src[i..]; + for j in 0..tail.len() { + tail[j] = 0x100 + j as u16; + assert!(!is_utf16_latin1(tail)); + assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1); + } + } + } + + #[test] + fn test_is_str_latin1_success() { + let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow + let mut src: Vec<u16> = Vec::with_capacity(len); + src.resize(len, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + let s = String::from_utf16(&src[i..]).unwrap(); + assert!(is_str_latin1(&s[..])); + assert_eq!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1); + } + } + + #[test] + fn test_is_str_latin1_fail() { + let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow + let mut src: Vec<u16> = Vec::with_capacity(len); + src.resize(len, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + let tail = &mut src[i..]; + for j in 0..tail.len() { + tail[j] = 0x100 + j as u16; + let s = String::from_utf16(tail).unwrap(); + assert!(!is_str_latin1(&s[..])); + assert_ne!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1); + } + } + } + + #[test] + fn test_is_utf8_latin1_success() { + let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow + let mut src: Vec<u16> = Vec::with_capacity(len); + src.resize(len, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + let s = String::from_utf16(&src[i..]).unwrap(); + assert!(is_utf8_latin1(s.as_bytes())); + assert_eq!( + check_utf8_for_latin1_and_bidi(s.as_bytes()), + Latin1Bidi::Latin1 + ); + } + } + + #[test] + fn test_is_utf8_latin1_fail() { + let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow + let mut src: Vec<u16> = Vec::with_capacity(len); + src.resize(len, 0); + for i in 0..src.len() { + src[i] = i as u16; + } + for i in 0..src.len() { + let tail = &mut src[i..]; + for j in 0..tail.len() { + tail[j] = 0x100 + j as u16; + let s = String::from_utf16(tail).unwrap(); + assert!(!is_utf8_latin1(s.as_bytes())); + assert_ne!( + check_utf8_for_latin1_and_bidi(s.as_bytes()), + Latin1Bidi::Latin1 + ); + } + } + } + + #[test] + fn test_is_utf8_latin1_invalid() { + assert!(!is_utf8_latin1(b"\xC3")); + assert!(!is_utf8_latin1(b"a\xC3")); + assert!(!is_utf8_latin1(b"\xFF")); + assert!(!is_utf8_latin1(b"a\xFF")); + assert!(!is_utf8_latin1(b"\xC3\xFF")); + assert!(!is_utf8_latin1(b"a\xC3\xFF")); + } + + #[test] + fn test_convert_utf8_to_utf16() { + let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; + let mut dst: Vec<u16> = Vec::with_capacity(src.len() + 1); + dst.resize(src.len() + 1, 0); + let len = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]); + dst.truncate(len); + let reference: Vec<u16> = src.encode_utf16().collect(); + assert_eq!(dst, reference); + } + + #[test] + fn test_convert_str_to_utf16() { + let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; + let mut dst: Vec<u16> = Vec::with_capacity(src.len()); + dst.resize(src.len(), 0); + let len = convert_str_to_utf16(src, &mut dst[..]); + dst.truncate(len); + let reference: Vec<u16> = src.encode_utf16().collect(); + assert_eq!(dst, reference); + } + + #[test] + fn test_convert_utf16_to_utf8_partial() { + let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; + let src: Vec<u16> = reference.encode_utf16().collect(); + let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1); + dst.resize(src.len() * 3 + 1, 0); + let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]); + let len = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]); + dst.truncate(len); + assert_eq!(dst, reference.as_bytes()); + } + + #[test] + fn test_convert_utf16_to_utf8() { + let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; + let src: Vec<u16> = reference.encode_utf16().collect(); + let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1); + dst.resize(src.len() * 3 + 1, 0); + let len = convert_utf16_to_utf8(&src[..], &mut dst[..]); + dst.truncate(len); + assert_eq!(dst, reference.as_bytes()); + } + + #[test] + fn test_convert_latin1_to_utf16() { + let mut src: Vec<u8> = Vec::with_capacity(256); + src.resize(256, 0); + let mut reference: Vec<u16> = Vec::with_capacity(256); + reference.resize(256, 0); + for i in 0..256 { + src[i] = i as u8; + reference[i] = i as u16; + } + let mut dst: Vec<u16> = Vec::with_capacity(src.len()); + dst.resize(src.len(), 0); + convert_latin1_to_utf16(&src[..], &mut dst[..]); + assert_eq!(dst, reference); + } + + #[test] + fn test_convert_latin1_to_utf8_partial() { + let mut dst = [0u8, 2]; + let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]); + assert_eq!(read, 1); + assert_eq!(written, 1); + } + + #[test] + fn test_convert_latin1_to_utf8() { + let mut src: Vec<u8> = Vec::with_capacity(256); + src.resize(256, 0); + let mut reference: Vec<u16> = Vec::with_capacity(256); + reference.resize(256, 0); + for i in 0..256 { + src[i] = i as u8; + reference[i] = i as u16; + } + let s = String::from_utf16(&reference[..]).unwrap(); + let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 2); + dst.resize(src.len() * 2, 0); + let len = convert_latin1_to_utf8(&src[..], &mut dst[..]); + dst.truncate(len); + assert_eq!(&dst[..], s.as_bytes()); + } + + #[test] + fn test_convert_utf8_to_latin1_lossy() { + let mut reference: Vec<u8> = Vec::with_capacity(256); + reference.resize(256, 0); + let mut src16: Vec<u16> = Vec::with_capacity(256); + src16.resize(256, 0); + for i in 0..256 { + src16[i] = i as u16; + reference[i] = i as u8; + } + let src = String::from_utf16(&src16[..]).unwrap(); + let mut dst: Vec<u8> = Vec::with_capacity(src.len()); + dst.resize(src.len(), 0); + let len = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]); + dst.truncate(len); + assert_eq!(dst, reference); + } + + #[cfg(all(debug_assertions, not(fuzzing)))] + #[test] + #[should_panic] + fn test_convert_utf8_to_latin1_lossy_panics() { + let mut dst = [0u8; 16]; + let _ = convert_utf8_to_latin1_lossy("\u{100}".as_bytes(), &mut dst[..]); + } + + #[test] + fn test_convert_utf16_to_latin1_lossy() { + let mut src: Vec<u16> = Vec::with_capacity(256); + src.resize(256, 0); + let mut reference: Vec<u8> = Vec::with_capacity(256); + reference.resize(256, 0); + for i in 0..256 { + src[i] = i as u16; + reference[i] = i as u8; + } + let mut dst: Vec<u8> = Vec::with_capacity(src.len()); + dst.resize(src.len(), 0); + convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]); + assert_eq!(dst, reference); + } + + #[test] + // #[should_panic] + fn test_convert_utf16_to_latin1_lossy_panics() { + let mut dst = [0u8; 16]; + let _ = convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]); + } + + #[test] + fn test_utf16_valid_up_to() { + let valid = vec![ + 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16, + 0xD83Du16, 0xDCA9u16, 0x00B6u16, + ]; + assert_eq!(utf16_valid_up_to(&valid[..]), 16); + let lone_high = vec![ + 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0x2603u16, 0xD83Du16, 0x00B6u16, + ]; + assert_eq!(utf16_valid_up_to(&lone_high[..]), 14); + let lone_low = vec![ + 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0x2603u16, 0xDCA9u16, 0x00B6u16, + ]; + assert_eq!(utf16_valid_up_to(&lone_low[..]), 14); + let lone_high_at_end = vec![ + 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0x2603u16, 0x00B6u16, 0xD83Du16, + ]; + assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15); + } + + #[test] + fn test_ensure_utf16_validity() { + let mut src = vec![ + 0u16, 0xD83Du16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0u16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + ]; + let reference = vec![ + 0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, + ]; + ensure_utf16_validity(&mut src[..]); + assert_eq!(src, reference); + } + + #[test] + fn test_is_char_bidi() { + assert!(!is_char_bidi('a')); + assert!(!is_char_bidi('\u{03B1}')); + assert!(!is_char_bidi('\u{3041}')); + assert!(!is_char_bidi('\u{1F4A9}')); + assert!(!is_char_bidi('\u{FE00}')); + assert!(!is_char_bidi('\u{202C}')); + assert!(!is_char_bidi('\u{FEFF}')); + assert!(is_char_bidi('\u{0590}')); + assert!(is_char_bidi('\u{08FF}')); + assert!(is_char_bidi('\u{061C}')); + assert!(is_char_bidi('\u{FB50}')); + assert!(is_char_bidi('\u{FDFF}')); + assert!(is_char_bidi('\u{FE70}')); + assert!(is_char_bidi('\u{FEFE}')); + assert!(is_char_bidi('\u{200F}')); + assert!(is_char_bidi('\u{202B}')); + assert!(is_char_bidi('\u{202E}')); + assert!(is_char_bidi('\u{2067}')); + assert!(is_char_bidi('\u{10800}')); + assert!(is_char_bidi('\u{10FFF}')); + assert!(is_char_bidi('\u{1E800}')); + assert!(is_char_bidi('\u{1EFFF}')); + } + + #[test] + fn test_is_utf16_code_unit_bidi() { + assert!(!is_utf16_code_unit_bidi(0x0062)); + assert!(!is_utf16_code_unit_bidi(0x03B1)); + assert!(!is_utf16_code_unit_bidi(0x3041)); + assert!(!is_utf16_code_unit_bidi(0xD801)); + assert!(!is_utf16_code_unit_bidi(0xFE00)); + assert!(!is_utf16_code_unit_bidi(0x202C)); + assert!(!is_utf16_code_unit_bidi(0xFEFF)); + assert!(is_utf16_code_unit_bidi(0x0590)); + assert!(is_utf16_code_unit_bidi(0x08FF)); + assert!(is_utf16_code_unit_bidi(0x061C)); + assert!(is_utf16_code_unit_bidi(0xFB1D)); + assert!(is_utf16_code_unit_bidi(0xFB50)); + assert!(is_utf16_code_unit_bidi(0xFDFF)); + assert!(is_utf16_code_unit_bidi(0xFE70)); + assert!(is_utf16_code_unit_bidi(0xFEFE)); + assert!(is_utf16_code_unit_bidi(0x200F)); + assert!(is_utf16_code_unit_bidi(0x202B)); + assert!(is_utf16_code_unit_bidi(0x202E)); + assert!(is_utf16_code_unit_bidi(0x2067)); + assert!(is_utf16_code_unit_bidi(0xD802)); + assert!(is_utf16_code_unit_bidi(0xD803)); + assert!(is_utf16_code_unit_bidi(0xD83A)); + assert!(is_utf16_code_unit_bidi(0xD83B)); + } + + #[test] + fn test_is_str_bidi() { + assert!(!is_str_bidi("abcdefghijklmnopaabcdefghijklmnop")); + assert!(!is_str_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop")); + assert!(!is_str_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop")); + assert!(!is_str_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop")); + assert!(!is_str_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop")); + assert!(!is_str_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop")); + assert!(!is_str_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop")); + assert!(is_str_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop")); + } + + #[test] + fn test_is_utf8_bidi() { + assert!(!is_utf8_bidi( + "abcdefghijklmnopaabcdefghijklmnop".as_bytes() + )); + assert!(!is_utf8_bidi( + "abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes() + )); + assert!(!is_utf8_bidi( + "abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes() + )); + assert!(!is_utf8_bidi( + "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes() + )); + assert!(!is_utf8_bidi( + "abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes() + )); + assert!(!is_utf8_bidi( + "abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes() + )); + assert!(!is_utf8_bidi( + "abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes() + )); + assert!(is_utf8_bidi( + "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes() + )); + } + + #[test] + fn test_is_utf16_bidi() { + assert!(!is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(!is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(!is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(!is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(!is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(!is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(!is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, + ])); + + assert!(is_utf16_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ])); + } + + #[test] + fn test_check_str_for_latin1_and_bidi() { + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_ne!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + assert_eq!( + check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"), + Latin1Bidi::Bidi + ); + } + + #[test] + fn test_check_utf8_for_latin1_and_bidi() { + assert_ne!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()), + Latin1Bidi::Bidi + ); + } + + #[test] + fn test_check_utf16_for_latin1_and_bidi() { + assert_ne!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_ne!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + + assert_eq!( + check_utf16_for_latin1_and_bidi(&[ + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, + 0x65, 0x66, 0x67, 0x68, 0x69, + ]), + Latin1Bidi::Bidi + ); + } + + #[inline(always)] + pub fn reference_is_char_bidi(c: char) -> bool { + match c { + '\u{0590}'..='\u{08FF}' + | '\u{FB1D}'..='\u{FDFF}' + | '\u{FE70}'..='\u{FEFE}' + | '\u{10800}'..='\u{10FFF}' + | '\u{1E800}'..='\u{1EFFF}' + | '\u{200F}' + | '\u{202B}' + | '\u{202E}' + | '\u{2067}' => true, + _ => false, + } + } + + #[inline(always)] + pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool { + match u { + 0x0590..=0x08FF + | 0xFB1D..=0xFDFF + | 0xFE70..=0xFEFE + | 0xD802 + | 0xD803 + | 0xD83A + | 0xD83B + | 0x200F + | 0x202B + | 0x202E + | 0x2067 => true, + _ => false, + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_is_char_bidi_thoroughly() { + for i in 0..0xD800u32 { + let c: char = ::core::char::from_u32(i).unwrap(); + assert_eq!(is_char_bidi(c), reference_is_char_bidi(c)); + } + for i in 0xE000..0x110000u32 { + let c: char = ::core::char::from_u32(i).unwrap(); + assert_eq!(is_char_bidi(c), reference_is_char_bidi(c)); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_is_utf16_code_unit_bidi_thoroughly() { + for i in 0..0x10000u32 { + let u = i as u16; + assert_eq!( + is_utf16_code_unit_bidi(u), + reference_is_utf16_code_unit_bidi(u) + ); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_is_str_bidi_thoroughly() { + let mut buf = [0; 4]; + for i in 0..0xD800u32 { + let c: char = ::core::char::from_u32(i).unwrap(); + assert_eq!( + is_str_bidi(c.encode_utf8(&mut buf[..])), + reference_is_char_bidi(c) + ); + } + for i in 0xE000..0x110000u32 { + let c: char = ::core::char::from_u32(i).unwrap(); + assert_eq!( + is_str_bidi(c.encode_utf8(&mut buf[..])), + reference_is_char_bidi(c) + ); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_is_utf8_bidi_thoroughly() { + let mut buf = [0; 8]; + for i in 0..0xD800u32 { + let c: char = ::core::char::from_u32(i).unwrap(); + let expect = reference_is_char_bidi(c); + { + let len = { + let bytes = c.encode_utf8(&mut buf[..]).as_bytes(); + assert_eq!(is_utf8_bidi(bytes), expect); + bytes.len() + }; + { + let tail = &mut buf[len..]; + for b in tail.iter_mut() { + *b = 0; + } + } + } + assert_eq!(is_utf8_bidi(&buf[..]), expect); + } + for i in 0xE000..0x110000u32 { + let c: char = ::core::char::from_u32(i).unwrap(); + let expect = reference_is_char_bidi(c); + { + let len = { + let bytes = c.encode_utf8(&mut buf[..]).as_bytes(); + assert_eq!(is_utf8_bidi(bytes), expect); + bytes.len() + }; + { + let tail = &mut buf[len..]; + for b in tail.iter_mut() { + *b = 0; + } + } + } + assert_eq!(is_utf8_bidi(&buf[..]), expect); + } + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_is_utf16_bidi_thoroughly() { + let mut buf = [0; 32]; + for i in 0..0x10000u32 { + let u = i as u16; + buf[15] = u; + assert_eq!( + is_utf16_bidi(&buf[..]), + reference_is_utf16_code_unit_bidi(u) + ); + } + } + + #[test] + fn test_is_utf8_bidi_edge_cases() { + assert!(!is_utf8_bidi(b"\xD5\xBF\x61")); + assert!(!is_utf8_bidi(b"\xD6\x80\x61")); + assert!(!is_utf8_bidi(b"abc")); + assert!(is_utf8_bidi(b"\xD5\xBF\xC2")); + assert!(is_utf8_bidi(b"\xD6\x80\xC2")); + assert!(is_utf8_bidi(b"ab\xC2")); + } + + #[test] + fn test_decode_latin1() { + match decode_latin1(b"ab") { + Cow::Borrowed(s) => { + assert_eq!(s, "ab"); + } + Cow::Owned(_) => { + unreachable!("Should have borrowed"); + } + } + assert_eq!(decode_latin1(b"a\xE4"), "a\u{E4}"); + } + + #[test] + fn test_encode_latin1_lossy() { + match encode_latin1_lossy("ab") { + Cow::Borrowed(s) => { + assert_eq!(s, b"ab"); + } + Cow::Owned(_) => { + unreachable!("Should have borrowed"); + } + } + assert_eq!(encode_latin1_lossy("a\u{E4}"), &(b"a\xE4")[..]); + } + + #[test] + fn test_convert_utf8_to_utf16_without_replacement() { + let mut buf = [0u16; 5]; + assert_eq!( + convert_utf8_to_utf16_without_replacement(b"ab", &mut buf[..2]), + Some(2) + ); + assert_eq!(buf[0], u16::from(b'a')); + assert_eq!(buf[1], u16::from(b'b')); + assert_eq!(buf[2], 0); + assert_eq!( + convert_utf8_to_utf16_without_replacement(b"\xC3\xA4c", &mut buf[..3]), + Some(2) + ); + assert_eq!(buf[0], 0xE4); + assert_eq!(buf[1], u16::from(b'c')); + assert_eq!(buf[2], 0); + assert_eq!( + convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83", &mut buf[..3]), + Some(1) + ); + assert_eq!(buf[0], 0x2603); + assert_eq!(buf[1], u16::from(b'c')); + assert_eq!(buf[2], 0); + assert_eq!( + convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83d", &mut buf[..4]), + Some(2) + ); + assert_eq!(buf[0], 0x2603); + assert_eq!(buf[1], u16::from(b'd')); + assert_eq!(buf[2], 0); + assert_eq!( + convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83\xC3\xA4", &mut buf[..5]), + Some(2) + ); + assert_eq!(buf[0], 0x2603); + assert_eq!(buf[1], 0xE4); + assert_eq!(buf[2], 0); + assert_eq!( + convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8E", &mut buf[..4]), + Some(2) + ); + assert_eq!(buf[0], 0xD83D); + assert_eq!(buf[1], 0xDCCE); + assert_eq!(buf[2], 0); + assert_eq!( + convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8Ee", &mut buf[..5]), + Some(3) + ); + assert_eq!(buf[0], 0xD83D); + assert_eq!(buf[1], 0xDCCE); + assert_eq!(buf[2], u16::from(b'e')); + assert_eq!( + convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]), + None + ); + } +} |