// Copyright Mozilla Foundation. See the COPYRIGHT // file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! Functions for converting between different in-RAM representations of text //! and for quickly checking if the Unicode Bidirectional Algorithm can be //! avoided. //! //! By using slices for output, the functions here seek to enable by-register //! (ALU register or SIMD register as available) operations in order to //! outperform iterator-based conversions available in the Rust standard //! library. //! //! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to //! U+00FF, inclusive, and does not refer to the windows-1252 range. This //! in-memory encoding is sometimes used as a storage optimization of text //! when UTF-16 indexing and length semantics are exposed. //! //! The FFI binding for this module are in the //! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem). #[cfg(feature = "alloc")] use alloc::borrow::Cow; #[cfg(feature = "alloc")] use alloc::string::String; #[cfg(feature = "alloc")] use alloc::vec::Vec; use super::in_inclusive_range16; use super::in_inclusive_range32; use super::in_inclusive_range8; use super::in_range16; use super::in_range32; use super::DecoderResult; use crate::ascii::*; use crate::utf_8::*; macro_rules! non_fuzz_debug_assert { ($($arg:tt)*) => (if !cfg!(fuzzing) { debug_assert!($($arg)*); }) } cfg_if! { if #[cfg(feature = "simd-accel")] { use ::core::intrinsics::likely; use ::core::intrinsics::unlikely; } else { #[inline(always)] fn likely(b: bool) -> bool { b } #[inline(always)] fn unlikely(b: bool) -> bool { b } } } /// Classification of text as Latin1 (all code points are below U+0100), /// left-to-right with some non-Latin1 characters or as containing at least /// some right-to-left characters. 
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}

// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;

// Generates `fn $name(&[$unit]) -> bool` that returns `true` iff every code
// unit is below `$bound`, examining one ALU word (`usize`) at a time once the
// read position is aligned. `$mask` names a `usize` constant that has exactly
// the "at or above `$bound`" bits set in every lane of a word.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let unit_size = ::core::mem::size_of::<$unit>();
            // Code units per ALU word and per four-word unrolled step.
            let stride = ALU_ALIGNMENT / unit_size;
            let unroll = 4 * stride;
            let len = buffer.len();
            let mut offset = 0usize;
            let mut accu = 0usize;
            if len >= stride {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                let until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                if until_alignment + stride <= len {
                    // Unit-by-unit over the unaligned prefix (possibly empty).
                    for &unit in &buffer[..until_alignment] {
                        accu |= unit as usize;
                    }
                    offset = until_alignment;
                    if accu >= $bound {
                        return false;
                    }
                    let len_minus_stride = len - stride;
                    if offset + unroll <= len {
                        let len_minus_unroll = len - unroll;
                        // Four aligned words per iteration, failing fast.
                        while offset <= len_minus_unroll {
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe { *(src.add(offset + stride) as *const usize) }
                                | unsafe { *(src.add(offset + 2 * stride) as *const usize) }
                                | unsafe { *(src.add(offset + 3 * stride) as *const usize) };
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += unroll;
                        }
                    }
                    // Remaining whole words; judged together with the tail below.
                    while offset <= len_minus_stride {
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += stride;
                    }
                }
            }
            // Tail shorter than one word (or the whole of a short buffer).
            let folded = buffer[offset..]
                .iter()
                .fold(accu, |acc, &unit| acc | unit as usize);
            (folded & $mask) == 0
        }
    };
}
// Generates `fn $name(&[$unit]) -> bool` that returns `true` iff every code
// unit is below `$bound`, examining one `$simd_ty` vector at a time once the
// read position is aligned. `$splat` is the all-zero vector used to seed the
// accumulator and `$func` is the predicate applied to an OR-accumulated
// vector (it must return `true` when every lane passes).
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let unit_size = ::core::mem::size_of::<$unit>();
            // Code units per SIMD vector and per four-vector unrolled step.
            let stride = SIMD_STRIDE_SIZE / unit_size;
            let unroll = 4 * stride;
            let len = buffer.len();
            let mut offset = 0usize;
            let mut accu = 0usize;
            if len >= stride {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                let until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                if until_alignment + stride <= len {
                    // Unit-by-unit over the unaligned prefix (possibly empty).
                    for &unit in &buffer[..until_alignment] {
                        accu |= unit as usize;
                    }
                    offset = until_alignment;
                    if accu >= $bound {
                        return false;
                    }
                    let len_minus_stride = len - stride;
                    if offset + unroll <= len {
                        let len_minus_unroll = len - unroll;
                        // Four aligned vectors per iteration, failing fast.
                        while offset <= len_minus_unroll {
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe { *(src.add(offset + stride) as *const $simd_ty) }
                                | unsafe { *(src.add(offset + 2 * stride) as *const $simd_ty) }
                                | unsafe { *(src.add(offset + 3 * stride) as *const $simd_ty) };
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += unroll;
                        }
                    }
                    // Remaining whole vectors, checked once after accumulation.
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += stride;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Tail shorter than one vector (or the whole of a short buffer).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu < $bound
        }
    };
}
offset += SIMD_STRIDE_SIZE / unit_size; } if !$func(simd_accu) { return false; } } } for &unit in &buffer[offset..] { accu |= unit as usize; } accu < $bound } }; } cfg_if! { if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { use crate::simd_funcs::*; use packed_simd::u8x16; use packed_simd::u16x8; const SIMD_ALIGNMENT: usize = 16; const SIMD_ALIGNMENT_MASK: usize = 15; by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii); by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin); by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1); #[inline(always)] fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize { // This function is a mess, because it simultaneously tries to do // only aligned SIMD (perhaps misguidedly) and needs to deal with // the last code unit in a SIMD stride being part of a valid // surrogate pair. 
let unit_size = ::core::mem::size_of::(); let src = buffer.as_ptr(); let len = buffer.len(); let mut offset = 0usize; 'outer: loop { let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) & SIMD_ALIGNMENT_MASK) / unit_size; if until_alignment == 0 { if offset + SIMD_STRIDE_SIZE / unit_size > len { break; } } else { let offset_plus_until_alignment = offset + until_alignment; let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1; if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len { break; } let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]); if up_to < until_alignment { return offset + up_to; } if last_valid_low { offset = offset_plus_until_alignment_plus_one; continue; } offset = offset_plus_until_alignment; } let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size; loop { let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size; if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) { if offset_plus_stride == len { break 'outer; } let offset_plus_stride_plus_one = offset_plus_stride + 1; let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]); if up_to < SIMD_STRIDE_SIZE / unit_size { return offset + up_to; } if last_valid_low { offset = offset_plus_stride_plus_one; continue 'outer; } } offset = offset_plus_stride; if offset > len_minus_stride { break 'outer; } } } let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]); offset + up_to } } else { by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK); by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK); by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK); #[inline(always)] fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize { let (up_to, _) = utf16_valid_up_to_alu(buffer); up_to } } } /// The second return value is true iff the last code unit of the slice was /// 
/// Scans `buffer` one code unit at a time and returns how many leading code
/// units form valid UTF-16 (the index of the first unpaired surrogate, or the
/// length of the slice if there is none).
///
/// The second return value is true iff the last code unit of the slice was
/// reached and turned out to be a low surrogate that is part of a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let len = buffer.len();
    let mut pos = 0usize;
    while pos < len {
        match buffer[pos] {
            0xD800..=0xDBFF => {
                // High surrogate: valid only if immediately followed by a
                // low surrogate.
                match buffer.get(pos + 1) {
                    Some(&low) if low.wrapping_sub(0xDC00) <= (0xDFFF - 0xDC00) => {
                        pos += 2;
                        if pos == len {
                            // The slice ended exactly on the low half of a pair.
                            return (pos, true);
                        }
                    }
                    // End of input or something other than a low surrogate.
                    _ => return (pos, false),
                }
            }
            // Low surrogate that was not preceded by a high surrogate.
            0xDC00..=0xDFFF => return (pos, false),
            // Not a surrogate.
            _ => pos += 1,
        }
    }
    (pos, false)
}
{ if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { #[inline(always)] fn is_str_latin1_impl(buffer: &str) -> Option { let mut offset = 0usize; let bytes = buffer.as_bytes(); let len = bytes.len(); if len >= SIMD_STRIDE_SIZE { let src = bytes.as_ptr(); let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & SIMD_ALIGNMENT_MASK; if until_alignment + SIMD_STRIDE_SIZE <= len { while until_alignment != 0 { if bytes[offset] > 0xC3 { return Some(offset); } offset += 1; until_alignment -= 1; } let len_minus_stride = len - SIMD_STRIDE_SIZE; loop { if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) { // TODO: Ensure this compiles away when inlined into `is_str_latin1()`. while bytes[offset] & 0xC0 == 0x80 { offset += 1; } return Some(offset); } offset += SIMD_STRIDE_SIZE; if offset > len_minus_stride { break; } } } } for i in offset..len { if bytes[i] > 0xC3 { return Some(i); } } None } } else { #[inline(always)] fn is_str_latin1_impl(buffer: &str) -> Option { let mut bytes = buffer.as_bytes(); let mut total = 0; loop { if let Some((byte, offset)) = validate_ascii(bytes) { total += offset; if byte > 0xC3 { return Some(total); } bytes = &bytes[offset + 2..]; total += 2; } else { return None; } } } } } #[inline(always)] fn is_utf8_latin1_impl(buffer: &[u8]) -> Option { let mut bytes = buffer; let mut total = 0; loop { if let Some((byte, offset)) = validate_ascii(bytes) { total += offset; if in_inclusive_range8(byte, 0xC2, 0xC3) { let next = offset + 1; if next == bytes.len() { return Some(total); } if bytes[next] & 0xC0 != 0x80 { return Some(total); } bytes = &bytes[offset + 2..]; total += 2; } else { return Some(total); } } else { return None; } } } cfg_if! 
// `is_utf16_bidi_impl`: returns `true` iff some code unit of `buffer` is
// classified as right-to-left by `is_utf16_code_unit_bidi()` (or by the
// vector predicate `is_u16x8_bidi()` in the SIMD variant).
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    // Unit by unit until the read position is aligned.
                    while until_alignment != 0 {
                        if is_utf16_code_unit_bidi(buffer[offset]) {
                            return true;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    // One aligned vector at a time.
                    loop {
                        if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
                            return true;
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail shorter than one vector (or the whole of a short buffer).
            for &u in &buffer[offset..] {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    } else {
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            for &u in buffer {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    }
}

// `check_utf16_for_latin1_and_bidi_impl`: scans for the first code unit above
// U+00FF; if there is none the result is `Latin1`. From the first such unit
// onwards only the bidi check is performed, yielding `Bidi` or `LeftToRight`.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // This transition isn't optimal, since the alignment is
                            // recomputed, but not tweaking further today.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
                        if !simd_is_latin1(s) {
                            // Non-Latin1 seen: from here on only look for bidi,
                            // starting with the vector already loaded.
                            loop {
                                if is_u16x8_bidi(s) {
                                    return Latin1Bidi::Bidi;
                                }
                                offset += SIMD_STRIDE_SIZE / 2;
                                if offset > len_minus_stride {
                                    for &u in &buffer[offset..] {
                                        if is_utf16_code_unit_bidi(u) {
                                            return Latin1Bidi::Bidi;
                                        }
                                    }
                                    return Latin1Bidi::LeftToRight;
                                }
                                s = unsafe { *(src.add(offset) as *const u16x8) };
                            }
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail: Latin1 check that switches to the bidi check at the first
            // code unit above U+00FF.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    } else {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= ALU_ALIGNMENT / 2 {
                let src = buffer.as_ptr();
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) & ALU_ALIGNMENT_MASK) / 2;
                if until_alignment + ALU_ALIGNMENT / 2 <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
                    // One aligned ALU word at a time; any bit under
                    // `LATIN1_MASK` means a code unit above U+00FF.
                    loop {
                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += ALU_ALIGNMENT / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail: Latin1 check that switches to the bidi check at the first
            // code unit above U+00FF.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    }
}
/// Checks whether the buffer is all-ASCII.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_ascii(buffer: &[u8]) -> bool {
    is_ascii_impl(buffer)
}

/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    is_basic_latin_impl(buffer)
}

/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.)
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    is_utf8_latin1_impl(buffer).is_none()
}

/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.)
pub fn is_str_latin1(buffer: &str) -> bool {
    is_str_latin1_impl(buffer).is_none()
}

/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    is_utf16_latin1_impl(buffer)
}
/// Checks whether a potentially-invalid UTF-8 buffer contains code points
/// that trigger right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input is invalid UTF-8 or the input contains an
/// RTL character. Returns `false` if the input is valid UTF-8 and contains
/// no RTL characters.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
#[inline]
pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
    // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
    // than UTF-8 validation followed by `is_str_bidi()` for German,
    // Russian and Japanese. However, this is considerably slower for Thai.
    // Chances are that the compiler makes some branch predictions that are
    // unfortunate for Thai. Not spending the time to manually optimize
    // further at this time, since it's unclear if this variant even has
    // use cases. However, this is worth revisiting once Rust gets the
    // ability to annotate relative priorities of match arms.

    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    let mut src = buffer;
    'outer: loop {
        if let Some((mut byte, mut read)) = validate_ascii(src) {
            // Check for the longest sequence to avoid checking twice for the
            // multi-byte sequences.
            if read + 4 <= src.len() {
                // Inside this loop `read + 4 <= src.len()` holds at the top of
                // every iteration (it is re-checked at the bottom), which keeps
                // the unchecked reads at `read + 1..=read + 3` in bounds.
                // NOTE(review): the unchecked `UTF8_DATA.table` lookups at
                // `byte as usize + 0x80` assume the table (defined outside
                // this chunk) covers that index for every lead byte matched
                // here — confirm against its definition.
                'inner: loop {
                    // At this point, `byte` is not included in `read`.
                    match byte {
                        0..=0x7F => {
                            // ASCII: go back to SIMD.
                            read += 1;
                            src = &src[read..];
                            continue 'outer;
                        }
                        0xC2..=0xD5 => {
                            // Two-byte
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            read += 2;
                        }
                        0xD6 => {
                            // Two-byte
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            // XXX consider folding the above and below checks
                            if second > 0x8F {
                                return true;
                            }
                            read += 2;
                        }
                        // two-byte starting with 0xD7 and above is bidi
                        0xE1 | 0xE3..=0xEC | 0xEE => {
                            // Three-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xE2 => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xEF => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                if second == 0xAC {
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                if second == 0xB9 {
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xE0 => {
                            // Three-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            // XXX can this be folded into the above validity check
                            if second < 0xA4 {
                                return true;
                            }
                            read += 3;
                        }
                        0xED => {
                            // Three-byte special upper bound
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xF1..=0xF4 => {
                            // Four-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            read += 4;
                        }
                        0xF0 => {
                            // Four-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            if unlikely(second == 0x90 || second == 0x9E) {
                                let third = src[read + 2];
                                if third >= 0xA0 {
                                    return true;
                                }
                            }
                            read += 4;
                        }
                        _ => {
                            // Invalid lead or bidi-only lead
                            return true;
                        }
                    }
                    if read + 4 > src.len() {
                        if read == src.len() {
                            return false;
                        }
                        byte = src[read];
                        break 'inner;
                    }
                    byte = src[read];
                    continue 'inner;
                }
            }
            // We can't have a complete 4-byte sequence, but we could still have
            // a complete shorter sequence.

            // At this point, `byte` is not included in `read`.
            match byte {
                0..=0x7F => {
                    // ASCII: go back to SIMD.
                    read += 1;
                    src = &src[read..];
                    continue 'outer;
                }
                0xC2..=0xD5 => {
                    // Two-byte
                    let new_read = read + 2;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                0xD6 => {
                    // Two-byte, potentially bidi
                    let new_read = read + 2;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    // XXX consider folding the above and below checks
                    if second > 0x8F {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                // two-byte starting with 0xD7 and above is bidi
                //
                // The three-byte arms below do not advance: fewer than four
                // bytes remain, so a complete three-byte sequence here is the
                // end of the input and control falls through to the
                // `return false` after the match.
                0xE1 | 0xE3..=0xEC | 0xEE => {
                    // Three-byte normal
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                0xE2 => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    if second == 0x80 {
                        if third == 0x8F || third == 0xAB || third == 0xAE {
                            return true;
                        }
                    } else if second == 0x81 {
                        if third == 0xA7 {
                            return true;
                        }
                    }
                }
                0xEF => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    if in_inclusive_range8(second, 0xAC, 0xB7) {
                        if second == 0xAC {
                            if third > 0x9C {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                        if second == 0xB9 {
                            if third > 0xAF {
                                return true;
                            }
                        } else if second == 0xBB {
                            if third != 0xBF {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    }
                }
                0xE0 => {
                    // Three-byte special lower bound, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    // XXX can this be folded into the above validity check
                    if second < 0xA4 {
                        return true;
                    }
                }
                0xED => {
                    // Three-byte special upper bound
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                _ => {
                    // Invalid lead, 4-byte lead or 2-byte bidi-only lead
                    return true;
                }
            }
            return false;
        } else {
            return false;
        }
    }
}
/// Checks whether a valid UTF-8 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline]
pub fn is_str_bidi(buffer: &str) -> bool {
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    let mut bytes = buffer.as_bytes();
    'outer: loop {
        // TODO: Instead of just validating ASCII using SIMD, use SIMD
        // to check for non-ASCII lead bytes, too, to quickly conclude
        // that the vector consist entirely of CJK and below-Hebrew
        // code points.
        // Unfortunately, scripts above Arabic but below CJK share
        // lead bytes with RTL.
        if let Some((mut byte, mut read)) = validate_ascii(bytes) {
            // The input is a `&str`, i.e. already valid UTF-8, so only the
            // lead byte (and, near block edges, the second/third byte) needs
            // to be examined; sequence lengths follow from the lead byte.
            'inner: loop {
                // At this point, `byte` is not included in `read`.
                if byte < 0xE0 {
                    if byte >= 0x80 {
                        // Two-byte
                        // Adding `unlikely` here improved throughput on
                        // Russian plain text by 33%!
                        if unlikely(byte >= 0xD6) {
                            if byte == 0xD6 {
                                let second = bytes[read + 1];
                                if second > 0x8F {
                                    return true;
                                }
                            } else {
                                return true;
                            }
                        }
                        read += 2;
                    } else {
                        // ASCII: write and go back to SIMD.
                        read += 1;
                        // Intuitively, we should go back to the outer loop only
                        // if byte is 0x30 or above, so as to avoid trashing on
                        // ASCII space, comma and period in non-Latin context.
                        // However, the extra branch seems to cost more than it's
                        // worth.
                        bytes = &bytes[read..];
                        continue 'outer;
                    }
                } else if byte < 0xF0 {
                    // Three-byte
                    // Only the leads 0xE0, 0xE2 and 0xEF can start an RTL
                    // sequence; everything else is skipped without a look.
                    if unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) {
                        let second = bytes[read + 1];
                        if byte == 0xE0 {
                            if second < 0xA4 {
                                return true;
                            }
                        } else if byte == 0xE2 {
                            let third = bytes[read + 2];
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                        } else {
                            debug_assert_eq!(byte, 0xEF);
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                if second == 0xAC {
                                    let third = bytes[read + 2];
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                if second == 0xB9 {
                                    let third = bytes[read + 2];
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    let third = bytes[read + 2];
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                        }
                    }
                    read += 3;
                } else {
                    // Four-byte
                    let second = bytes[read + 1];
                    if unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) {
                        let third = bytes[read + 2];
                        if third >= 0xA0 {
                            return true;
                        }
                    }
                    read += 4;
                }
                // The comparison is always < or == and never >, but including
                // > here to let the compiler assume that < is true if this
                // comparison is false.
                if read >= bytes.len() {
                    return false;
                }
                byte = bytes[read];
                continue 'inner;
            }
        } else {
            return false;
        }
    }
}
/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
/// Returns `false` if the input contains neither RTL characters nor
/// unpaired high surrogates that could be higher halves of RTL characters.
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
    is_utf16_bidi_impl(buffer)
}
BOM) // // Supplementary RTL: // https://www.unicode.org/roadmaps/smp/ // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803) // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B) let code_point = u32::from(c); if code_point < 0x0590 { // Below Hebrew return false; } if in_range32(code_point, 0x0900, 0xFB1D) { // Above Arabic Extended-A and below Hebrew presentation forms if in_inclusive_range32(code_point, 0x200F, 0x2067) { // In the range that contains the RTL controls return code_point == 0x200F || code_point == 0x202B || code_point == 0x202E || code_point == 0x2067; } return false; } if code_point > 0x1EFFF { // Above second astral RTL. (Emoji is here.) return false; } if in_range32(code_point, 0x11000, 0x1E800) { // Between astral RTL blocks return false; } if in_range32(code_point, 0xFEFF, 0x10800) { // Above Arabic Presentations Forms B (excl. BOM) and below first // astral RTL return false; } if in_range32(code_point, 0xFE00, 0xFE70) { // Between Arabic Presentations Forms return false; } true } /// Checks whether a UTF-16 code unit triggers right-to-left processing. /// /// The check is done on a Unicode block basis without regard to assigned /// vs. unassigned code points in the block. Hebrew presentation forms in /// the Alphabetic Presentation Forms block are treated as if they formed /// a block on their own (i.e. it treated as right-to-left). Additionally, /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked /// for. Control characters that are technically bidi controls but do not /// cause right-to-left behavior without the presence of right-to-left /// characters or right-to-left controls are not checked for. As a special /// case, U+FEFF is excluded from Arabic Presentation Forms-B. 
///
/// Since supplementary-plane right-to-left blocks are identifiable from the
/// high surrogate without examining the low surrogate, this function returns
/// `true` for such high surrogates making the function suitable for handling
/// supplementary-plane text without decoding surrogate pairs to scalar
/// values. Obviously, such high surrogates are then reported as right-to-left
/// even if actually unpaired.
#[inline(always)]
pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
    if u < 0x0590 {
        // Everything below Hebrew.
        return false;
    }
    if in_range16(u, 0x0900, 0xD802) {
        // Past Arabic Extended-A and short of the first RTL high surrogate:
        // only the four RTL controls qualify. The range test comes first so
        // that most inputs are rejected with one comparison.
        return in_inclusive_range16(u, 0x200F, 0x2067)
            && (u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067);
    }
    // The remaining left-to-right areas, tested in the same order as before:
    //  * between the astral RTL high surrogates and the Hebrew presentation
    //    forms (emoji is here),
    //  * between the two pairs of RTL high surrogates,
    //  * above Arabic Presentation Forms B (excl. BOM),
    //  * between the two Arabic Presentation Forms ranges.
    let left_to_right = in_range16(u, 0xD83C, 0xFB1D)
        || in_range16(u, 0xD804, 0xD83A)
        || u > 0xFEFE
        || in_range16(u, 0xFE00, 0xFE70);
    !left_to_right
}

/// Classifies a potentially invalid UTF-8 buffer as all-Latin1, as
/// containing code points that trigger right-to-left processing, or as
/// neither.
///
/// Possibly more efficient than running the two checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
    match is_utf8_latin1_impl(buffer) {
        None => Latin1Bidi::Latin1,
        // Only the part from the reported offset onwards is examined for
        // right-to-left content.
        Some(tail_start) if is_utf8_bidi(&buffer[tail_start..]) => Latin1Bidi::Bidi,
        Some(_) => Latin1Bidi::LeftToRight,
    }
}

/// Checks whether a valid UTF-8 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
    // The transition from the latin1 check to the bidi check isn't
    // optimal but not tweaking it to perfection today.
    if let Some(offset) = is_str_latin1_impl(buffer) {
        // Only the part of the string from `offset` onwards is examined for
        // right-to-left content.
        if is_str_bidi(&buffer[offset..]) {
            Latin1Bidi::Bidi
        } else {
            Latin1Bidi::LeftToRight
        }
    } else {
        Latin1Bidi::Latin1
    }
}

/// Checks whether a potentially invalid UTF-16 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
    check_utf16_for_latin1_and_bidi_impl(buffer)
}

/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
/// with the REPLACEMENT CHARACTER.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer _plus one_.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
    // TODO: Can the requirement for dst to be at least one unit longer
    // be eliminated?
    assert!(dst.len() > src.len());
    let mut decoder = Utf8Decoder::new_inner();
    // Running totals across the repeated decoder calls below.
    let mut total_read = 0usize;
    let mut total_written = 0usize;
    loop {
        // Each call resumes from where the previous one stopped.
        let (result, read, written) =
            decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
        total_read += read;
        total_written += written;
        match result {
            DecoderResult::InputEmpty => {
                return total_written;
            }
            DecoderResult::OutputFull => {
                unreachable!("The assert at the top of the function should have caught this.");
            }
            DecoderResult::Malformed(_, _) => {
                // There should always be space for the U+FFFD, because
                // otherwise we'd have gotten OutputFull already.
                dst[total_written] = 0xFFFD;
                total_written += 1;
            }
        }
    }
}

/// Converts valid UTF-8 to valid UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    let bytes = src.as_bytes();
    // `read` indexes into `bytes`, `written` indexes into `dst`. Every
    // branch below advances `written` by no more than it advances `read`
    // (1 -> 1, 2 -> 1, 3 -> 1, 4 -> 2), which together with the assert
    // above is what the unchecked writes to `dst` rely on.
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Bulk-convert an ASCII run; `byte` becomes the first non-ASCII
        // byte that stopped the run.
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    // The whole remainder was ASCII.
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        'inner: loop {
            // At this point, `byte` is not included in `read`.
            //
            // NOTE(review): the unchecked reads of the trail bytes below
            // rely on `src` being a `&str`, i.e. valid UTF-8, so that every
            // lead byte is followed by the trail bytes its value implies.
            if byte < 0xE0 {
                if byte >= 0x80 {
                    // Two-byte
                    let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                    let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                    unsafe { *(dst.get_unchecked_mut(written)) = point };
                    read += 2;
                    written += 1;
                } else {
                    // ASCII: write and go back to SIMD.
                    unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                    read += 1;
                    written += 1;
                    // Intuitively, we should go back to the outer loop only
                    // if byte is 0x30 or above, so as to avoid thrashing on
                    // ASCII space, comma and period in non-Latin context.
                    // However, the extra branch seems to cost more than it's
                    // worth.
                    continue 'outer;
                }
            } else if byte < 0xF0 {
                // Three-byte
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                unsafe { *(dst.get_unchecked_mut(written)) = point };
                read += 3;
                written += 1;
            } else {
                // Four-byte
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // Surrogate pair. 0xD7C0 is 0xD800 - (0x10000 >> 10), which
                // folds the subtraction of 0x10000 into the high surrogate
                // base.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;
            }
            // The comparison is always < or == and never >, but including
            // > here to let the compiler assume that < is true if this
            // comparison is false.
            if read >= src.len() {
                return written;
            }
            byte = bytes[read];
            continue 'inner;
        }
    }
}

/// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written or `None` if the input was invalid.
///
/// When the input was invalid, some output may have been written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option { assert!( dst.len() >= src.len(), "Destination must not be shorter than the source." ); let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst); if read == src.len() { return Some(written); } None } /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced /// with the REPLACEMENT CHARACTER with potentially insufficient output /// space. /// /// Returns the number of code units read and the number of bytes written. /// /// Guarantees that the bytes in the destination beyond the number of /// bytes claimed as written by the second item of the return tuple /// are left unmodified. /// /// Not all code units are read if there isn't enough output space. /// /// Note that this method isn't designed for general streamability but for /// not allocating memory for the worst case up front. Specifically, /// if the input starts with or ends with an unpaired surrogate, those are /// replaced with the REPLACEMENT CHARACTER. /// /// Matches the semantics of `TextEncoder.encodeInto()` from the /// Encoding Standard. /// /// # Safety /// /// If you want to convert into a `&mut str`, use /// `convert_utf16_to_str_partial()` instead of using this function /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. #[inline(always)] pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) { // The two functions called below are marked `inline(never)` to make // transitions from the hot part (first function) into the cold part // (second function) go through a return and another call to discouge // the CPU from speculating from the hot code into the cold code. // Letting the transitions be mere intra-function jumps, even to // basic blocks out-of-lined to the end of the function would wipe // away a quarter of Arabic encode performance on Haswell! 
let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst); if likely(read == src.len()) { return (read, written); } let (tail_read, tail_written) = convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]); (read + tail_read, written + tail_written) } /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced /// with the REPLACEMENT CHARACTER. /// /// The length of the destination buffer must be at least the length of the /// source buffer times three. /// /// Returns the number of bytes written. /// /// # Panics /// /// Panics if the destination buffer is shorter than stated above. /// /// # Safety /// /// If you want to convert into a `&mut str`, use `convert_utf16_to_str()` /// instead of using this function together with the `unsafe` method /// `as_bytes_mut()` on `&mut str`. #[inline(always)] pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize { assert!(dst.len() >= src.len() * 3); let (read, written) = convert_utf16_to_utf8_partial(src, dst); debug_assert_eq!(read, src.len()); written } /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced /// with the REPLACEMENT CHARACTER such that the validity of the output is /// signaled using the Rust type system with potentially insufficient output /// space. /// /// Returns the number of code units read and the number of bytes written. /// /// Not all code units are read if there isn't enough output space. /// /// Note that this method isn't designed for general streamability but for /// not allocating memory for the worst case up front. Specifically, /// if the input starts with or ends with an unpaired surrogate, those are /// replaced with the REPLACEMENT CHARACTER. 
pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
    let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
    let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
    // Zero the run of UTF-8 continuation bytes (0b10xx_xxxx) that directly
    // follows the freshly written output, so that the bytes after the
    // output do not start with orphaned continuation bytes.
    bytes
        .iter_mut()
        .skip(written)
        .take_while(|byte| (**byte & 0xC0) == 0x80)
        .for_each(|byte| *byte = 0);
    (read, written)
}

/// Converts potentially-invalid UTF-16 to valid UTF-8, substituting the
/// REPLACEMENT CHARACTER for errors, with the validity of the output
/// expressed through the Rust type system.
///
/// The destination buffer must be at least three times as long as the
/// source buffer.
///
/// Returns the number of bytes written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
#[inline(always)]
pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
    assert!(dst.len() >= src.len() * 3);
    let (consumed, produced) = convert_utf16_to_str_partial(src, dst);
    debug_assert_eq!(consumed, src.len());
    produced
}

/// Converts bytes, each of whose unsigned value is taken to be a Unicode
/// code point (i.e. U+0000 to U+00FF, inclusive), to UTF-16.
///
/// The destination buffer must be at least as long as the source buffer.
///
/// The number of `u16`s written equals the length of the source buffer.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    // TODO: On aarch64, the safe version autovectorizes to the same
    // unpacking instructions as this code, and yet the autovectorized
    // version is faster.
    let count = src.len();
    unsafe {
        unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), count);
    }
}

/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
/// output space.
///
/// Returns the number of bytes read and the number of bytes written.
///
/// If the output isn't large enough, not all input is consumed.
///
/// # Safety
///
/// If you want to convert into a `&mut str`, use
/// `convert_latin1_to_str_partial()` instead of using this function
/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
    let src_len = src.len();
    let src_ptr = src.as_ptr();
    let dst_ptr = dst.as_mut_ptr();
    let dst_len = dst.len();
    let mut total_read = 0usize;
    let mut total_written = 0usize;
    loop {
        // src can't advance more than dst
        let src_left = src_len - total_read;
        let dst_left = dst_len - total_written;
        // Copy an ASCII run, limited to what fits in both buffers.
        let min_left = ::core::cmp::min(src_left, dst_left);
        if let Some((non_ascii, consumed)) = unsafe {
            ascii_to_ascii(
                src_ptr.add(total_read),
                dst_ptr.add(total_written),
                min_left,
            )
        } {
            total_read += consumed;
            total_written += consumed;
            // A non-ASCII byte takes two bytes of output; stop without
            // consuming it if they don't both fit.
            if total_written.checked_add(2).unwrap() > dst_len {
                return (total_read, total_written);
            }
            total_read += 1; // consume `non_ascii`
            // Two-byte UTF-8 sequence: lead byte carries the top two bits,
            // trail byte the low six.
            dst[total_written] = (non_ascii >> 6) | 0xC0;
            total_written += 1;
            dst[total_written] = (non_ascii & 0x3F) | 0x80;
            total_written += 1;
            continue;
        }
        // The whole run of `min_left` bytes was ASCII and has been copied.
        return (total_read + min_left, total_written + min_left);
    }
}

/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer times two.
///
/// Returns the number of bytes written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
///
/// # Safety
///
/// Note that this function may write garbage beyond the number of bytes
/// indicated by the return value, so using a `&mut str` interpreted as
/// `&mut [u8]` as the destination is not safe. If you want to convert into
/// a `&mut str`, use `convert_latin1_to_str()` instead of this function.
#[inline] pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize { assert!( dst.len() >= src.len() * 2, "Destination must not be shorter than the source times two." ); let (read, written) = convert_latin1_to_utf8_partial(src, dst); debug_assert_eq!(read, src.len()); written } /// Converts bytes whose unsigned value is interpreted as Unicode code point /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the /// output is signaled using the Rust type system with potentially insufficient /// output space. /// /// Returns the number of bytes read and the number of bytes written. /// /// If the output isn't large enough, not all input is consumed. #[inline] pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) { let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() }; let (read, written) = convert_latin1_to_utf8_partial(src, bytes); let len = bytes.len(); let mut trail = written; let max = ::core::cmp::min(len, trail + MAX_STRIDE_SIZE); while trail < max { bytes[trail] = 0; trail += 1; } while trail < len && ((bytes[trail] & 0xC0) == 0x80) { bytes[trail] = 0; trail += 1; } (read, written) } /// Converts bytes whose unsigned value is interpreted as Unicode code point /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the /// output is signaled using the Rust type system. /// /// The length of the destination buffer must be at least the length of the /// source buffer times two. /// /// Returns the number of bytes written. /// /// # Panics /// /// Panics if the destination buffer is shorter than stated above. #[inline] pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize { assert!( dst.len() >= src.len() * 2, "Destination must not be shorter than the source times two." 
); let (read, written) = convert_latin1_to_str_partial(src, dst); debug_assert_eq!(read, src.len()); written } /// If the input is valid UTF-8 representing only Unicode code points from /// U+0000 to U+00FF, inclusive, converts the input into output that /// represents the value of each code point as the unsigned byte value of /// each output byte. /// /// If the input does not fulfill the condition stated above, this function /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise /// does something that is memory-safe without any promises about any /// properties of the output. In particular, callers shouldn't assume the /// output to be the same across crate versions or CPU architectures and /// should not assume that non-ASCII input can't map to ASCII output. /// /// The length of the destination buffer must be at least the length of the /// source buffer. /// /// Returns the number of bytes written. /// /// # Panics /// /// Panics if the destination buffer is shorter than stated above. /// /// If debug assertions are enabled (and not fuzzing) and the input is /// not in the range U+0000 to U+00FF, inclusive. pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize { assert!( dst.len() >= src.len(), "Destination must not be shorter than the source." 
); non_fuzz_debug_assert!(is_utf8_latin1(src)); let src_len = src.len(); let src_ptr = src.as_ptr(); let dst_ptr = dst.as_mut_ptr(); let mut total_read = 0usize; let mut total_written = 0usize; loop { // dst can't advance more than src let src_left = src_len - total_read; if let Some((non_ascii, consumed)) = unsafe { ascii_to_ascii( src_ptr.add(total_read), dst_ptr.add(total_written), src_left, ) } { total_read += consumed + 1; total_written += consumed; if total_read == src_len { return total_written; } let trail = src[total_read]; total_read += 1; dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F); total_written += 1; continue; } return total_written + src_left; } } /// If the input is valid UTF-16 representing only Unicode code points from /// U+0000 to U+00FF, inclusive, converts the input into output that /// represents the value of each code point as the unsigned byte value of /// each output byte. /// /// If the input does not fulfill the condition stated above, does something /// that is memory-safe without any promises about any properties of the /// output and will probably assert in debug builds in future versions. /// In particular, callers shouldn't assume the output to be the same across /// crate versions or CPU architectures and should not assume that non-ASCII /// input can't map to ASCII output. /// /// The length of the destination buffer must be at least the length of the /// source buffer. /// /// The number of bytes written equals the length of the source buffer. /// /// # Panics /// /// Panics if the destination buffer is shorter than stated above. /// /// (Probably in future versions if debug assertions are enabled (and not /// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.) pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) { assert!( dst.len() >= src.len(), "Destination must not be shorter than the source." 
); // non_fuzz_debug_assert!(is_utf16_latin1(src)); unsafe { pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len()); } } /// Converts bytes whose unsigned value is interpreted as Unicode code point /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8. /// /// Borrows if input is ASCII-only. Performs a single heap allocation /// otherwise. /// /// Only available if the `alloc` feature is enabled (enabled by default). #[cfg(feature = "alloc")] pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> { let up_to = ascii_valid_up_to(bytes); // >= makes later things optimize better than == if up_to >= bytes.len() { debug_assert_eq!(up_to, bytes.len()); let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) }; return Cow::Borrowed(s); } let (head, tail) = bytes.split_at(up_to); let capacity = head.len() + tail.len() * 2; let mut vec = Vec::with_capacity(capacity); unsafe { vec.set_len(capacity); } (&mut vec[..up_to]).copy_from_slice(head); let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]); vec.truncate(up_to + written); Cow::Owned(unsafe { String::from_utf8_unchecked(vec) }) } /// If the input is valid UTF-8 representing only Unicode code points from /// U+0000 to U+00FF, inclusive, converts the input into output that /// represents the value of each code point as the unsigned byte value of /// each output byte. /// /// If the input does not fulfill the condition stated above, this function /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise /// does something that is memory-safe without any promises about any /// properties of the output. In particular, callers shouldn't assume the /// output to be the same across crate versions or CPU architectures and /// should not assume that non-ASCII input can't map to ASCII output. /// /// Borrows if input is ASCII-only. Performs a single heap allocation /// otherwise. /// /// Only available if the `alloc` feature is enabled (enabled by default). 
#[cfg(feature = "alloc")] pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> { let bytes = string.as_bytes(); let up_to = ascii_valid_up_to(bytes); // >= makes later things optimize better than == if up_to >= bytes.len() { debug_assert_eq!(up_to, bytes.len()); return Cow::Borrowed(bytes); } let (head, tail) = bytes.split_at(up_to); let capacity = bytes.len(); let mut vec = Vec::with_capacity(capacity); unsafe { vec.set_len(capacity); } (&mut vec[..up_to]).copy_from_slice(head); let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]); vec.truncate(up_to + written); Cow::Owned(vec) } /// Returns the index of the first unpaired surrogate or, if the input is /// valid UTF-16 in its entirety, the length of the input. pub fn utf16_valid_up_to(buffer: &[u16]) -> usize { utf16_valid_up_to_impl(buffer) } /// Returns the index of first byte that starts an invalid byte /// sequence or a non-Latin1 byte sequence, or the length of the /// string if there are neither. pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize { is_utf8_latin1_impl(buffer).unwrap_or(buffer.len()) } /// Returns the index of first byte that starts a non-Latin1 byte /// sequence, or the length of the string if there are none. pub fn str_latin1_up_to(buffer: &str) -> usize { is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len()) } /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. #[inline] pub fn ensure_utf16_validity(buffer: &mut [u16]) { let mut offset = 0; loop { offset += utf16_valid_up_to(&buffer[offset..]); if offset == buffer.len() { return; } buffer[offset] = 0xFFFD; offset += 1; } } /// Copies ASCII from source to destination up to the first non-ASCII byte /// (or the end of the input if it is ASCII in its entirety). /// /// The length of the destination buffer must be at least the length of the /// source buffer. /// /// Returns the number of bytes written. /// /// # Panics /// /// Panics if the destination buffer is shorter than stated above. 
pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize { assert!( dst.len() >= src.len(), "Destination must not be shorter than the source." ); if let Some((_, consumed)) = unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) } { consumed } else { src.len() } } /// Copies ASCII from source to destination zero-extending it to UTF-16 up to /// the first non-ASCII byte (or the end of the input if it is ASCII in its /// entirety). /// /// The length of the destination buffer must be at least the length of the /// source buffer. /// /// Returns the number of `u16`s written. /// /// # Panics /// /// Panics if the destination buffer is shorter than stated above. pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize { assert!( dst.len() >= src.len(), "Destination must not be shorter than the source." ); if let Some((_, consumed)) = unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) } { consumed } else { src.len() } } /// Copies Basic Latin from source to destination narrowing it to ASCII up to /// the first non-Basic Latin code unit (or the end of the input if it is /// Basic Latin in its entirety). /// /// The length of the destination buffer must be at least the length of the /// source buffer. /// /// Returns the number of bytes written. /// /// # Panics /// /// Panics if the destination buffer is shorter than stated above. pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize { assert!( dst.len() >= src.len(), "Destination must not be shorter than the source." ); if let Some((_, consumed)) = unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) } { consumed } else { src.len() } } // Any copyright to the test code below this comment is dedicated to the // Public Domain. 
http://creativecommons.org/publicdomain/zero/1.0/ #[cfg(all(test, feature = "alloc"))] mod tests { use super::*; #[test] fn test_is_ascii_success() { let mut src: Vec = Vec::with_capacity(128); src.resize(128, 0); for i in 0..src.len() { src[i] = i as u8; } for i in 0..src.len() { assert!(is_ascii(&src[i..])); } } #[test] fn test_is_ascii_fail() { let mut src: Vec = Vec::with_capacity(128); src.resize(128, 0); for i in 0..src.len() { src[i] = i as u8; } for i in 0..src.len() { let tail = &mut src[i..]; for j in 0..tail.len() { tail[j] = 0xA0; assert!(!is_ascii(tail)); } } } #[test] fn test_is_basic_latin_success() { let mut src: Vec = Vec::with_capacity(128); src.resize(128, 0); for i in 0..src.len() { src[i] = i as u16; } for i in 0..src.len() { assert!(is_basic_latin(&src[i..])); } } #[test] fn test_is_basic_latin_fail() { let mut src: Vec = Vec::with_capacity(128); src.resize(128, 0); for i in 0..src.len() { src[i] = i as u16; } for i in 0..src.len() { let tail = &mut src[i..]; for j in 0..tail.len() { tail[j] = 0xA0; assert!(!is_basic_latin(tail)); } } } #[test] fn test_is_utf16_latin1_success() { let mut src: Vec = Vec::with_capacity(256); src.resize(256, 0); for i in 0..src.len() { src[i] = i as u16; } for i in 0..src.len() { assert!(is_utf16_latin1(&src[i..])); assert_eq!( check_utf16_for_latin1_and_bidi(&src[i..]), Latin1Bidi::Latin1 ); } } #[test] fn test_is_utf16_latin1_fail() { let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow let mut src: Vec = Vec::with_capacity(len); src.resize(len, 0); for i in 0..src.len() { src[i] = i as u16; } for i in 0..src.len() { let tail = &mut src[i..]; for j in 0..tail.len() { tail[j] = 0x100 + j as u16; assert!(!is_utf16_latin1(tail)); assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1); } } } #[test] fn test_is_str_latin1_success() { let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow let mut src: Vec = Vec::with_capacity(len); src.resize(len, 0); for i in 0..src.len() { 
src[i] = i as u16; } for i in 0..src.len() { let s = String::from_utf16(&src[i..]).unwrap(); assert!(is_str_latin1(&s[..])); assert_eq!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1); } } #[test] fn test_is_str_latin1_fail() { let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow let mut src: Vec = Vec::with_capacity(len); src.resize(len, 0); for i in 0..src.len() { src[i] = i as u16; } for i in 0..src.len() { let tail = &mut src[i..]; for j in 0..tail.len() { tail[j] = 0x100 + j as u16; let s = String::from_utf16(tail).unwrap(); assert!(!is_str_latin1(&s[..])); assert_ne!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1); } } } #[test] fn test_is_utf8_latin1_success() { let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow let mut src: Vec = Vec::with_capacity(len); src.resize(len, 0); for i in 0..src.len() { src[i] = i as u16; } for i in 0..src.len() { let s = String::from_utf16(&src[i..]).unwrap(); assert!(is_utf8_latin1(s.as_bytes())); assert_eq!( check_utf8_for_latin1_and_bidi(s.as_bytes()), Latin1Bidi::Latin1 ); } } #[test] fn test_is_utf8_latin1_fail() { let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow let mut src: Vec = Vec::with_capacity(len); src.resize(len, 0); for i in 0..src.len() { src[i] = i as u16; } for i in 0..src.len() { let tail = &mut src[i..]; for j in 0..tail.len() { tail[j] = 0x100 + j as u16; let s = String::from_utf16(tail).unwrap(); assert!(!is_utf8_latin1(s.as_bytes())); assert_ne!( check_utf8_for_latin1_and_bidi(s.as_bytes()), Latin1Bidi::Latin1 ); } } } #[test] fn test_is_utf8_latin1_invalid() { assert!(!is_utf8_latin1(b"\xC3")); assert!(!is_utf8_latin1(b"a\xC3")); assert!(!is_utf8_latin1(b"\xFF")); assert!(!is_utf8_latin1(b"a\xFF")); assert!(!is_utf8_latin1(b"\xC3\xFF")); assert!(!is_utf8_latin1(b"a\xC3\xFF")); } #[test] fn test_convert_utf8_to_utf16() { let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; let mut dst: Vec = Vec::with_capacity(src.len() + 1); 
dst.resize(src.len() + 1, 0); let len = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]); dst.truncate(len); let reference: Vec = src.encode_utf16().collect(); assert_eq!(dst, reference); } #[test] fn test_convert_str_to_utf16() { let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; let mut dst: Vec = Vec::with_capacity(src.len()); dst.resize(src.len(), 0); let len = convert_str_to_utf16(src, &mut dst[..]); dst.truncate(len); let reference: Vec = src.encode_utf16().collect(); assert_eq!(dst, reference); } #[test] fn test_convert_utf16_to_utf8_partial() { let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; let src: Vec = reference.encode_utf16().collect(); let mut dst: Vec = Vec::with_capacity(src.len() * 3 + 1); dst.resize(src.len() * 3 + 1, 0); let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]); let len = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]); dst.truncate(len); assert_eq!(dst, reference.as_bytes()); } #[test] fn test_convert_utf16_to_utf8() { let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz"; let src: Vec = reference.encode_utf16().collect(); let mut dst: Vec = Vec::with_capacity(src.len() * 3 + 1); dst.resize(src.len() * 3 + 1, 0); let len = convert_utf16_to_utf8(&src[..], &mut dst[..]); dst.truncate(len); assert_eq!(dst, reference.as_bytes()); } #[test] fn test_convert_latin1_to_utf16() { let mut src: Vec = Vec::with_capacity(256); src.resize(256, 0); let mut reference: Vec = Vec::with_capacity(256); reference.resize(256, 0); for i in 0..256 { src[i] = i as u8; reference[i] = i as u16; } let mut dst: Vec = Vec::with_capacity(src.len()); dst.resize(src.len(), 0); convert_latin1_to_utf16(&src[..], &mut dst[..]); assert_eq!(dst, reference); } #[test] fn test_convert_latin1_to_utf8_partial() { let mut dst = [0u8, 2]; let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]); assert_eq!(read, 1); assert_eq!(written, 1); } #[test] fn 
test_convert_latin1_to_utf8() { let mut src: Vec = Vec::with_capacity(256); src.resize(256, 0); let mut reference: Vec = Vec::with_capacity(256); reference.resize(256, 0); for i in 0..256 { src[i] = i as u8; reference[i] = i as u16; } let s = String::from_utf16(&reference[..]).unwrap(); let mut dst: Vec = Vec::with_capacity(src.len() * 2); dst.resize(src.len() * 2, 0); let len = convert_latin1_to_utf8(&src[..], &mut dst[..]); dst.truncate(len); assert_eq!(&dst[..], s.as_bytes()); } #[test] fn test_convert_utf8_to_latin1_lossy() { let mut reference: Vec = Vec::with_capacity(256); reference.resize(256, 0); let mut src16: Vec = Vec::with_capacity(256); src16.resize(256, 0); for i in 0..256 { src16[i] = i as u16; reference[i] = i as u8; } let src = String::from_utf16(&src16[..]).unwrap(); let mut dst: Vec = Vec::with_capacity(src.len()); dst.resize(src.len(), 0); let len = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]); dst.truncate(len); assert_eq!(dst, reference); } #[cfg(all(debug_assertions, not(fuzzing)))] #[test] #[should_panic] fn test_convert_utf8_to_latin1_lossy_panics() { let mut dst = [0u8; 16]; let _ = convert_utf8_to_latin1_lossy("\u{100}".as_bytes(), &mut dst[..]); } #[test] fn test_convert_utf16_to_latin1_lossy() { let mut src: Vec = Vec::with_capacity(256); src.resize(256, 0); let mut reference: Vec = Vec::with_capacity(256); reference.resize(256, 0); for i in 0..256 { src[i] = i as u16; reference[i] = i as u8; } let mut dst: Vec = Vec::with_capacity(src.len()); dst.resize(src.len(), 0); convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]); assert_eq!(dst, reference); } #[test] // #[should_panic] fn test_convert_utf16_to_latin1_lossy_panics() { let mut dst = [0u8; 16]; let _ = convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]); } #[test] fn test_utf16_valid_up_to() { let valid = vec![ 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16, 0xD83Du16, 0xDCA9u16, 0x00B6u16, ]; 
assert_eq!(utf16_valid_up_to(&valid[..]), 16); let lone_high = vec![ 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16, 0xD83Du16, 0x00B6u16, ]; assert_eq!(utf16_valid_up_to(&lone_high[..]), 14); let lone_low = vec![ 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16, 0xDCA9u16, 0x00B6u16, ]; assert_eq!(utf16_valid_up_to(&lone_low[..]), 14); let lone_high_at_end = vec![ 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16, 0x00B6u16, 0xD83Du16, ]; assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15); } #[test] fn test_ensure_utf16_validity() { let mut src = vec![ 0u16, 0xD83Du16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, ]; let reference = vec![ 0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, ]; ensure_utf16_validity(&mut src[..]); assert_eq!(src, reference); } #[test] fn test_is_char_bidi() { assert!(!is_char_bidi('a')); assert!(!is_char_bidi('\u{03B1}')); assert!(!is_char_bidi('\u{3041}')); assert!(!is_char_bidi('\u{1F4A9}')); assert!(!is_char_bidi('\u{FE00}')); assert!(!is_char_bidi('\u{202C}')); assert!(!is_char_bidi('\u{FEFF}')); assert!(is_char_bidi('\u{0590}')); assert!(is_char_bidi('\u{08FF}')); assert!(is_char_bidi('\u{061C}')); assert!(is_char_bidi('\u{FB50}')); assert!(is_char_bidi('\u{FDFF}')); assert!(is_char_bidi('\u{FE70}')); assert!(is_char_bidi('\u{FEFE}')); assert!(is_char_bidi('\u{200F}')); assert!(is_char_bidi('\u{202B}')); assert!(is_char_bidi('\u{202E}')); assert!(is_char_bidi('\u{2067}')); assert!(is_char_bidi('\u{10800}')); assert!(is_char_bidi('\u{10FFF}')); assert!(is_char_bidi('\u{1E800}')); assert!(is_char_bidi('\u{1EFFF}')); 
} #[test] fn test_is_utf16_code_unit_bidi() { assert!(!is_utf16_code_unit_bidi(0x0062)); assert!(!is_utf16_code_unit_bidi(0x03B1)); assert!(!is_utf16_code_unit_bidi(0x3041)); assert!(!is_utf16_code_unit_bidi(0xD801)); assert!(!is_utf16_code_unit_bidi(0xFE00)); assert!(!is_utf16_code_unit_bidi(0x202C)); assert!(!is_utf16_code_unit_bidi(0xFEFF)); assert!(is_utf16_code_unit_bidi(0x0590)); assert!(is_utf16_code_unit_bidi(0x08FF)); assert!(is_utf16_code_unit_bidi(0x061C)); assert!(is_utf16_code_unit_bidi(0xFB1D)); assert!(is_utf16_code_unit_bidi(0xFB50)); assert!(is_utf16_code_unit_bidi(0xFDFF)); assert!(is_utf16_code_unit_bidi(0xFE70)); assert!(is_utf16_code_unit_bidi(0xFEFE)); assert!(is_utf16_code_unit_bidi(0x200F)); assert!(is_utf16_code_unit_bidi(0x202B)); assert!(is_utf16_code_unit_bidi(0x202E)); assert!(is_utf16_code_unit_bidi(0x2067)); assert!(is_utf16_code_unit_bidi(0xD802)); assert!(is_utf16_code_unit_bidi(0xD803)); assert!(is_utf16_code_unit_bidi(0xD83A)); assert!(is_utf16_code_unit_bidi(0xD83B)); } #[test] fn test_is_str_bidi() { assert!(!is_str_bidi("abcdefghijklmnopaabcdefghijklmnop")); assert!(!is_str_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop")); assert!(!is_str_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop")); assert!(!is_str_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop")); assert!(!is_str_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop")); assert!(!is_str_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop")); assert!(!is_str_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop")); 
assert!(is_str_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop")); assert!(is_str_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop")); } #[test] fn test_is_utf8_bidi() { assert!(!is_utf8_bidi( "abcdefghijklmnopaabcdefghijklmnop".as_bytes() )); assert!(!is_utf8_bidi( "abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes() )); assert!(!is_utf8_bidi( "abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes() )); assert!(!is_utf8_bidi( "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes() )); assert!(!is_utf8_bidi( "abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes() )); assert!(!is_utf8_bidi( "abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes() )); assert!(!is_utf8_bidi( "abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( 
"abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes() )); assert!(is_utf8_bidi( "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes() )); } #[test] fn test_is_utf16_bidi() { assert!(!is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(!is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(!is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(!is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(!is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(!is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(!is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 
0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); assert!(is_utf16_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ])); } #[test] fn test_check_str_for_latin1_and_bidi() { assert_ne!( check_str_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop"), Latin1Bidi::Bidi ); assert_ne!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_ne!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"), 
Latin1Bidi::Bidi ); assert_ne!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_ne!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_ne!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_ne!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"), Latin1Bidi::Bidi ); assert_eq!( 
check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"), Latin1Bidi::Bidi ); } #[test] fn test_check_utf8_for_latin1_and_bidi() { assert_ne!( check_utf8_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_ne!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_ne!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_ne!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_ne!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_ne!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_ne!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( 
check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); assert_eq!( check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()), Latin1Bidi::Bidi ); } #[test] fn test_check_utf16_for_latin1_and_bidi() { assert_ne!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_ne!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_ne!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_ne!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_ne!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_ne!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_ne!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 
0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 
0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); assert_eq!( check_utf16_for_latin1_and_bidi(&[ 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, ]), Latin1Bidi::Bidi ); } #[inline(always)] pub fn reference_is_char_bidi(c: char) -> bool { match c { '\u{0590}'..='\u{08FF}' | '\u{FB1D}'..='\u{FDFF}' | '\u{FE70}'..='\u{FEFE}' | '\u{10800}'..='\u{10FFF}' | '\u{1E800}'..='\u{1EFFF}' | '\u{200F}' | '\u{202B}' | '\u{202E}' | '\u{2067}' => true, _ => false, } } #[inline(always)] pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool { match u { 0x0590..=0x08FF | 0xFB1D..=0xFDFF | 0xFE70..=0xFEFE | 0xD802 | 0xD803 | 0xD83A | 0xD83B | 0x200F | 0x202B | 0x202E | 0x2067 => true, _ => false, } } #[test] #[cfg_attr(miri, ignore)] // Miri is too slow fn test_is_char_bidi_thoroughly() { for i in 0..0xD800u32 { let c: char = ::core::char::from_u32(i).unwrap(); assert_eq!(is_char_bidi(c), reference_is_char_bidi(c)); } for i in 0xE000..0x110000u32 { let c: char = ::core::char::from_u32(i).unwrap(); assert_eq!(is_char_bidi(c), reference_is_char_bidi(c)); } } #[test] #[cfg_attr(miri, 
ignore)] // Miri is too slow fn test_is_utf16_code_unit_bidi_thoroughly() { for i in 0..0x10000u32 { let u = i as u16; assert_eq!( is_utf16_code_unit_bidi(u), reference_is_utf16_code_unit_bidi(u) ); } } #[test] #[cfg_attr(miri, ignore)] // Miri is too slow fn test_is_str_bidi_thoroughly() { let mut buf = [0; 4]; for i in 0..0xD800u32 { let c: char = ::core::char::from_u32(i).unwrap(); assert_eq!( is_str_bidi(c.encode_utf8(&mut buf[..])), reference_is_char_bidi(c) ); } for i in 0xE000..0x110000u32 { let c: char = ::core::char::from_u32(i).unwrap(); assert_eq!( is_str_bidi(c.encode_utf8(&mut buf[..])), reference_is_char_bidi(c) ); } } #[test] #[cfg_attr(miri, ignore)] // Miri is too slow fn test_is_utf8_bidi_thoroughly() { let mut buf = [0; 8]; for i in 0..0xD800u32 { let c: char = ::core::char::from_u32(i).unwrap(); let expect = reference_is_char_bidi(c); { let len = { let bytes = c.encode_utf8(&mut buf[..]).as_bytes(); assert_eq!(is_utf8_bidi(bytes), expect); bytes.len() }; { let tail = &mut buf[len..]; for b in tail.iter_mut() { *b = 0; } } } assert_eq!(is_utf8_bidi(&buf[..]), expect); } for i in 0xE000..0x110000u32 { let c: char = ::core::char::from_u32(i).unwrap(); let expect = reference_is_char_bidi(c); { let len = { let bytes = c.encode_utf8(&mut buf[..]).as_bytes(); assert_eq!(is_utf8_bidi(bytes), expect); bytes.len() }; { let tail = &mut buf[len..]; for b in tail.iter_mut() { *b = 0; } } } assert_eq!(is_utf8_bidi(&buf[..]), expect); } } #[test] #[cfg_attr(miri, ignore)] // Miri is too slow fn test_is_utf16_bidi_thoroughly() { let mut buf = [0; 32]; for i in 0..0x10000u32 { let u = i as u16; buf[15] = u; assert_eq!( is_utf16_bidi(&buf[..]), reference_is_utf16_code_unit_bidi(u) ); } } #[test] fn test_is_utf8_bidi_edge_cases() { assert!(!is_utf8_bidi(b"\xD5\xBF\x61")); assert!(!is_utf8_bidi(b"\xD6\x80\x61")); assert!(!is_utf8_bidi(b"abc")); assert!(is_utf8_bidi(b"\xD5\xBF\xC2")); assert!(is_utf8_bidi(b"\xD6\x80\xC2")); assert!(is_utf8_bidi(b"ab\xC2")); } 
// `decode_latin1` on ASCII-only input is expected to hand back a borrow of
// the input; input containing 0xE4 is expected to read as "a" + U+00E4.
#[test]
fn test_decode_latin1() {
    if let Cow::Borrowed(text) = decode_latin1(b"ab") {
        assert_eq!(text, "ab");
    } else {
        unreachable!("Should have borrowed");
    }

    let widened = decode_latin1(b"a\xE4");
    assert_eq!(widened, "a\u{E4}");
}

// `encode_latin1_lossy` on ASCII-only input is expected to hand back a borrow
// of the input; U+00E4 is expected to come out as the single byte 0xE4.
#[test]
fn test_encode_latin1_lossy() {
    if let Cow::Borrowed(bytes) = encode_latin1_lossy("ab") {
        assert_eq!(bytes, b"ab");
    } else {
        unreachable!("Should have borrowed");
    }

    let narrowed = encode_latin1_lossy("a\u{E4}");
    assert_eq!(narrowed, &b"a\xE4"[..]);
}

// Walks `convert_utf8_to_utf16_without_replacement` through a sequence of
// inputs that all write into the same buffer, checking the returned count and
// the first three code units after every step, and finally checks that an
// input cut off after three bytes of a four-byte sequence yields `None`.
#[test]
fn test_convert_utf8_to_utf16_without_replacement() {
    // Converts `input` into the first `capacity` units of `out`, expecting
    // `Some(written)`, then compares the first three units against `head`.
    fn convert_and_check(
        out: &mut [u16; 5],
        input: &[u8],
        capacity: usize,
        written: usize,
        head: [u16; 3],
    ) {
        assert_eq!(
            convert_utf8_to_utf16_without_replacement(input, &mut out[..capacity]),
            Some(written)
        );
        assert_eq!(out[0], head[0]);
        assert_eq!(out[1], head[1]);
        assert_eq!(out[2], head[2]);
    }

    // The buffer is shared on purpose: the expected values below rely on
    // units that a step does not write keeping what an earlier step left.
    let mut buf = [0u16; 5];

    // Two ASCII bytes.
    convert_and_check(
        &mut buf,
        b"ab",
        2,
        2,
        [u16::from(b'a'), u16::from(b'b'), 0],
    );
    // Two-byte sequence followed by ASCII.
    convert_and_check(&mut buf, b"\xC3\xA4c", 3, 2, [0xE4, u16::from(b'c'), 0]);
    // Three-byte sequence alone; the second unit is expected to still hold
    // the 'c' from the previous step.
    convert_and_check(
        &mut buf,
        b"\xE2\x98\x83",
        3,
        1,
        [0x2603, u16::from(b'c'), 0],
    );
    // Three-byte sequence followed by ASCII.
    convert_and_check(
        &mut buf,
        b"\xE2\x98\x83d",
        4,
        2,
        [0x2603, u16::from(b'd'), 0],
    );
    // Three-byte sequence followed by a two-byte sequence.
    convert_and_check(&mut buf, b"\xE2\x98\x83\xC3\xA4", 5, 2, [0x2603, 0xE4, 0]);
    // Four-byte sequence; expected as the pair 0xD83D, 0xDCCE.
    convert_and_check(&mut buf, b"\xF0\x9F\x93\x8E", 4, 2, [0xD83D, 0xDCCE, 0]);
    // Four-byte sequence followed by ASCII.
    convert_and_check(
        &mut buf,
        b"\xF0\x9F\x93\x8Ee",
        5,
        3,
        [0xD83D, 0xDCCE, u16::from(b'e')],
    );

    // Only the first three bytes of the four-byte sequence used above.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
        None
    );
}
// Closes the enclosing block whose opening brace lies above this chunk.
}