diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:22:09 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:22:09 +0000 |
commit | 43a97878ce14b72f0981164f87f2e35e14151312 (patch) | |
tree | 620249daf56c0258faa40cbdcf9cfba06de2a846 /third_party/rust/encoding_rs/src/utf_8.rs | |
parent | Initial commit. (diff) | |
download | firefox-43a97878ce14b72f0981164f87f2e35e14151312.tar.xz firefox-43a97878ce14b72f0981164f87f2e35e14151312.zip |
Adding upstream version 110.0.1.upstream/110.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/encoding_rs/src/utf_8.rs')
-rw-r--r-- | third_party/rust/encoding_rs/src/utf_8.rs | 1631 |
1 files changed, 1631 insertions, 0 deletions
diff --git a/third_party/rust/encoding_rs/src/utf_8.rs b/third_party/rust/encoding_rs/src/utf_8.rs new file mode 100644 index 0000000000..2f72cc091d --- /dev/null +++ b/third_party/rust/encoding_rs/src/utf_8.rs @@ -0,0 +1,1631 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::*; +use crate::ascii::ascii_to_basic_latin; +use crate::ascii::basic_latin_to_ascii; +use crate::ascii::validate_ascii; +use crate::handles::*; +use crate::mem::convert_utf16_to_utf8_partial; +use crate::variant::*; + +cfg_if! { + if #[cfg(feature = "simd-accel")] { + use ::core::intrinsics::unlikely; + use ::core::intrinsics::likely; + } else { + #[inline(always)] + // Unsafe to match the intrinsic, which is needlessly unsafe. + unsafe fn unlikely(b: bool) -> bool { + b + } + #[inline(always)] + // Unsafe to match the intrinsic, which is needlessly unsafe. + unsafe fn likely(b: bool) -> bool { + b + } + } +} + +#[repr(align(64))] // Align to cache lines +pub struct Utf8Data { + pub table: [u8; 384], +} + +// BEGIN GENERATED CODE. PLEASE DO NOT EDIT. +// Instead, please regenerate using generate-encoding-data.py + +pub static UTF8_DATA: Utf8Data = Utf8Data { + table: [ + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148, + 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + ], +}; + +// END GENERATED CODE + +pub fn utf8_valid_up_to(src: &[u8]) -> usize { + let mut read = 0; + 'outer: loop { + let mut byte = { + let src_remaining = &src[read..]; + match validate_ascii(src_remaining) { + None => { + return src.len(); + } + Some((non_ascii, consumed)) => { + read += consumed; + non_ascii + } + } + }; + // Check for the longest sequence to avoid checking twice for the + // multi-byte sequences. This can't overflow with 64-bit address space, + // because full 64 bits aren't in use. In the 32-bit PAE case, for this + // to overflow would mean that the source slice would be so large that + // the address space of the process would not have space for any code. + // Therefore, the slice cannot be so long that this would overflow. + if unsafe { likely(read + 4 <= src.len()) } { + 'inner: loop { + // At this point, `byte` is not included in `read`, because we + // don't yet know that a) the UTF-8 sequence is valid and b) that there + // is output space if it is an astral sequence. + // Inspecting the lead byte directly is faster than what the + // std lib does! + if unsafe { likely(in_inclusive_range8(byte, 0xC2, 0xDF)) } { + // Two-byte + let second = unsafe { *(src.get_unchecked(read + 1)) }; + if !in_inclusive_range8(second, 0x80, 0xBF) { + break 'outer; + } + read += 2; + + // Next lead (manually inlined) + if unsafe { likely(read + 4 <= src.len()) } { + byte = unsafe { *(src.get_unchecked(read)) }; + if byte < 0x80 { + read += 1; + continue 'outer; + } + continue 'inner; + } + break 'inner; + } + if unsafe { likely(byte < 0xF0) } { + 'three: loop { + // Three-byte + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) + | (third >> 6)) + != 2 + { + break 'outer; + } + read += 3; + + // Next lead (manually inlined) + if unsafe { likely(read + 4 <= src.len()) } { + byte = unsafe { *(src.get_unchecked(read)) }; + if in_inclusive_range8(byte, 0xE0, 0xEF) { + continue 'three; + } + if unsafe { likely(byte < 0x80) } { + read += 1; + continue 'outer; + } + continue 'inner; + } + break 'inner; + } + } + // Four-byte + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + let fourth = unsafe { *(src.get_unchecked(read + 3)) }; + if (u16::from( + UTF8_DATA.table[usize::from(second)] + & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }, + ) | u16::from(third >> 6) + | (u16::from(fourth & 0xC0) << 2)) + != 0x202 + { + break 'outer; + } + read += 4; + + // Next lead + if unsafe { likely(read + 4 <= src.len()) } { + byte = unsafe { *(src.get_unchecked(read)) }; + if byte < 0x80 { + read += 1; + continue 'outer; + } + continue 'inner; + } + break 'inner; + } + } + // We can't have a complete 4-byte sequence, but we could still have + // one to three shorter sequences. + 'tail: loop { + // >= is better for bound check elision than == + if read >= src.len() { + break 'outer; + } + byte = src[read]; + // At this point, `byte` is not included in `read`, because we + // don't yet know that a) the UTF-8 sequence is valid and b) that there + // is output space if it is an astral sequence. + // Inspecting the lead byte directly is faster than what the + // std lib does! + if byte < 0x80 { + read += 1; + continue 'tail; + } + if in_inclusive_range8(byte, 0xC2, 0xDF) { + // Two-byte + let new_read = read + 2; + if new_read > src.len() { + break 'outer; + } + let second = src[read + 1]; + if !in_inclusive_range8(second, 0x80, 0xBF) { + break 'outer; + } + read += 2; + continue 'tail; + } + // We need to exclude valid four byte lead bytes, because + // `UTF8_DATA.second_mask` covers + if byte < 0xF0 { + // Three-byte + let new_read = read + 3; + if new_read > src.len() { + break 'outer; + } + let second = src[read + 1]; + let third = src[read + 2]; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) + | (third >> 6)) + != 2 + { + break 'outer; + } + read += 3; + // `'tail` handles sequences shorter than 4, so + // there can't be another sequence after this one. + break 'outer; + } + break 'outer; + } + } + read +} + +#[cfg_attr(feature = "cargo-clippy", allow(never_loop, cyclomatic_complexity))] +pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usize, usize) { + let mut read = 0; + let mut written = 0; + 'outer: loop { + let mut byte = { + let src_remaining = &src[read..]; + let dst_remaining = &mut dst[written..]; + let length = ::core::cmp::min(src_remaining.len(), dst_remaining.len()); + match unsafe { + ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + } { + None => { + read += length; + written += length; + break 'outer; + } + Some((non_ascii, consumed)) => { + read += consumed; + written += consumed; + non_ascii + } + } + }; + // Check for the longest sequence to avoid checking twice for the + // multi-byte sequences. This can't overflow with 64-bit address space, + // because full 64 bits aren't in use. In the 32-bit PAE case, for this + // to overflow would mean that the source slice would be so large that + // the address space of the process would not have space for any code. + // Therefore, the slice cannot be so long that this would overflow. + if unsafe { likely(read + 4 <= src.len()) } { + 'inner: loop { + // At this point, `byte` is not included in `read`, because we + // don't yet know that a) the UTF-8 sequence is valid and b) that there + // is output space if it is an astral sequence. + // We know, thanks to `ascii_to_basic_latin` that there is output + // space for at least one UTF-16 code unit, so no need to check + // for output space in the BMP cases. + // Inspecting the lead byte directly is faster than what the + // std lib does! + if unsafe { likely(in_inclusive_range8(byte, 0xC2, 0xDF)) } { + // Two-byte + let second = unsafe { *(src.get_unchecked(read + 1)) }; + if !in_inclusive_range8(second, 0x80, 0xBF) { + break 'outer; + } + unsafe { + *(dst.get_unchecked_mut(written)) = + ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F) + }; + read += 2; + written += 1; + + // Next lead (manually inlined) + if written == dst.len() { + break 'outer; + } + if unsafe { likely(read + 4 <= src.len()) } { + byte = unsafe { *(src.get_unchecked(read)) }; + if byte < 0x80 { + unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) }; + read += 1; + written += 1; + continue 'outer; + } + continue 'inner; + } + break 'inner; + } + if unsafe { likely(byte < 0xF0) } { + 'three: loop { + // Three-byte + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) + | (third >> 6)) + != 2 + { + break 'outer; + } + let point = ((u16::from(byte) & 0xF) << 12) + | ((u16::from(second) & 0x3F) << 6) + | (u16::from(third) & 0x3F); + unsafe { *(dst.get_unchecked_mut(written)) = point }; + read += 3; + written += 1; + + // Next lead (manually inlined) + if written == dst.len() { + break 'outer; + } + if unsafe { likely(read + 4 <= src.len()) } { + byte = unsafe { *(src.get_unchecked(read)) }; + if in_inclusive_range8(byte, 0xE0, 0xEF) { + continue 'three; + } + if unsafe { likely(byte < 0x80) } { + unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) }; + read += 1; + written += 1; + continue 'outer; + } + continue 'inner; + } + break 'inner; + } + } + // Four-byte + if written + 1 == dst.len() { + break 'outer; + } + let second = unsafe { *(src.get_unchecked(read + 1)) }; + let third = unsafe { *(src.get_unchecked(read + 2)) }; + let fourth = unsafe { *(src.get_unchecked(read + 3)) }; + if (u16::from( + UTF8_DATA.table[usize::from(second)] + & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }, + ) | u16::from(third >> 6) + | (u16::from(fourth & 0xC0) << 2)) + != 0x202 + { + break 'outer; + } + let point = ((u32::from(byte) & 0x7) << 18) + | ((u32::from(second) & 0x3F) << 12) + | ((u32::from(third) & 0x3F) << 6) + | (u32::from(fourth) & 0x3F); + unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 }; + unsafe { + *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16 + }; + read += 4; + written += 2; + + // Next lead + if written == dst.len() { + break 'outer; + } + if unsafe { likely(read + 4 <= src.len()) } { + byte = unsafe { *(src.get_unchecked(read)) }; + if byte < 0x80 { + unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) }; + read += 1; + written += 1; + continue 'outer; + } + continue 'inner; + } + break 'inner; + } + } + // We can't have a complete 4-byte sequence, but we could still have + // one to three shorter sequences. + 'tail: loop { + // >= is better for bound check elision than == + if read >= src.len() || written >= dst.len() { + break 'outer; + } + byte = src[read]; + // At this point, `byte` is not included in `read`, because we + // don't yet know that a) the UTF-8 sequence is valid and b) that there + // is output space if it is an astral sequence. + // Inspecting the lead byte directly is faster than what the + // std lib does! + if byte < 0x80 { + dst[written] = u16::from(byte); + read += 1; + written += 1; + continue 'tail; + } + if in_inclusive_range8(byte, 0xC2, 0xDF) { + // Two-byte + let new_read = read + 2; + if new_read > src.len() { + break 'outer; + } + let second = src[read + 1]; + if !in_inclusive_range8(second, 0x80, 0xBF) { + break 'outer; + } + dst[written] = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F); + read += 2; + written += 1; + continue 'tail; + } + // We need to exclude valid four byte lead bytes, because + // `UTF8_DATA.second_mask` covers + if byte < 0xF0 { + // Three-byte + let new_read = read + 3; + if new_read > src.len() { + break 'outer; + } + let second = src[read + 1]; + let third = src[read + 2]; + if ((UTF8_DATA.table[usize::from(second)] + & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) + | (third >> 6)) + != 2 + { + break 'outer; + } + let point = ((u16::from(byte) & 0xF) << 12) + | ((u16::from(second) & 0x3F) << 6) + | (u16::from(third) & 0x3F); + dst[written] = point; + read += 3; + written += 1; + // `'tail` handles sequences shorter than 4, so + // there can't be another sequence after this one. + break 'outer; + } + break 'outer; + } + } + (read, written) +} + +pub struct Utf8Decoder { + code_point: u32, + bytes_seen: usize, // 1, 2 or 3: counts continuations only + bytes_needed: usize, // 1, 2 or 3: counts continuations only + lower_boundary: u8, + upper_boundary: u8, +} + +impl Utf8Decoder { + pub fn new_inner() -> Utf8Decoder { + Utf8Decoder { + code_point: 0, + bytes_seen: 0, + bytes_needed: 0, + lower_boundary: 0x80u8, + upper_boundary: 0xBFu8, + } + } + + pub fn new() -> VariantDecoder { + VariantDecoder::Utf8(Utf8Decoder::new_inner()) + } + + pub fn in_neutral_state(&self) -> bool { + self.bytes_needed == 0 + } + + fn extra_from_state(&self) -> usize { + if self.bytes_needed == 0 { + 0 + } else { + self.bytes_seen + 1 + } + } + + pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { + byte_length.checked_add(1 + self.extra_from_state()) + } + + pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { + byte_length.checked_add(3 + self.extra_from_state()) + } + + pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { + checked_add( + 3, + checked_mul(3, byte_length.checked_add(self.extra_from_state())), + ) + } + + decoder_functions!( + {}, + { + // This is the fast path. The rest runs only at the + // start and end for partial sequences. + if self.bytes_needed == 0 { + dest.copy_utf8_up_to_invalid_from(&mut source); + } + }, + { + if self.bytes_needed != 0 { + let bad_bytes = (self.bytes_seen + 1) as u8; + self.code_point = 0; + self.bytes_needed = 0; + self.bytes_seen = 0; + return ( + DecoderResult::Malformed(bad_bytes, 0), + src_consumed, + dest.written(), + ); + } + }, + { + if self.bytes_needed == 0 { + if b < 0x80u8 { + destination_handle.write_ascii(b); + continue; + } + if b < 0xC2u8 { + return ( + DecoderResult::Malformed(1, 0), + unread_handle.consumed(), + destination_handle.written(), + ); + } + if b < 0xE0u8 { + self.bytes_needed = 1; + self.code_point = u32::from(b) & 0x1F; + continue; + } + if b < 0xF0u8 { + if b == 0xE0u8 { + self.lower_boundary = 0xA0u8; + } else if b == 0xEDu8 { + self.upper_boundary = 0x9Fu8; + } + self.bytes_needed = 2; + self.code_point = u32::from(b) & 0xF; + continue; + } + if b < 0xF5u8 { + if b == 0xF0u8 { + self.lower_boundary = 0x90u8; + } else if b == 0xF4u8 { + self.upper_boundary = 0x8Fu8; + } + self.bytes_needed = 3; + self.code_point = u32::from(b) & 0x7; + continue; + } + return ( + DecoderResult::Malformed(1, 0), + unread_handle.consumed(), + destination_handle.written(), + ); + } + // self.bytes_needed != 0 + if !(b >= self.lower_boundary && b <= self.upper_boundary) { + let bad_bytes = (self.bytes_seen + 1) as u8; + self.code_point = 0; + self.bytes_needed = 0; + self.bytes_seen = 0; + self.lower_boundary = 0x80u8; + self.upper_boundary = 0xBFu8; + return ( + DecoderResult::Malformed(bad_bytes, 0), + unread_handle.unread(), + destination_handle.written(), + ); + } + self.lower_boundary = 0x80u8; + self.upper_boundary = 0xBFu8; + self.code_point = (self.code_point << 6) | (u32::from(b) & 0x3F); + self.bytes_seen += 1; + if self.bytes_seen != self.bytes_needed { + continue; + } + if self.bytes_needed == 3 { + destination_handle.write_astral(self.code_point); + } else { + destination_handle.write_bmp_excl_ascii(self.code_point as u16); + } + self.code_point = 0; + self.bytes_needed = 0; + self.bytes_seen = 0; + continue; + }, + self, + src_consumed, + dest, + source, + b, + destination_handle, + unread_handle, + check_space_astral + ); +} + +#[cfg_attr(feature = "cargo-clippy", allow(never_loop))] +#[inline(never)] +pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usize, usize) { + let mut read = 0; + let mut written = 0; + 'outer: loop { + let mut unit = { + let src_remaining = &src[read..]; + let dst_remaining = &mut dst[written..]; + let length = if dst_remaining.len() < src_remaining.len() { + dst_remaining.len() + } else { + src_remaining.len() + }; + match unsafe { + basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) + } { + None => { + read += length; + written += length; + return (read, written); + } + Some((non_ascii, consumed)) => { + read += consumed; + written += consumed; + non_ascii + } + } + }; + 'inner: loop { + // The following loop is only broken out of as a goto forward. + loop { + // Unfortunately, this check isn't enough for the compiler to elide + // the bound checks on writes to dst, which is why they are manually + // elided, which makes a measurable difference. + if written.checked_add(4).unwrap() > dst.len() { + return (read, written); + } + read += 1; + if unit < 0x800 { + unsafe { + *(dst.get_unchecked_mut(written)) = (unit >> 6) as u8 | 0xC0u8; + written += 1; + *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8; + written += 1; + } + break; + } + let unit_minus_surrogate_start = unit.wrapping_sub(0xD800); + if unsafe { likely(unit_minus_surrogate_start > (0xDFFF - 0xD800)) } { + unsafe { + *(dst.get_unchecked_mut(written)) = (unit >> 12) as u8 | 0xE0u8; + written += 1; + *(dst.get_unchecked_mut(written)) = ((unit & 0xFC0) >> 6) as u8 | 0x80u8; + written += 1; + *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8; + written += 1; + } + break; + } + if unsafe { likely(unit_minus_surrogate_start <= (0xDBFF - 0xD800)) } { + // high surrogate + // read > src.len() is impossible, but using + // >= instead of == allows the compiler to elide a bound check. + if read >= src.len() { + debug_assert_eq!(read, src.len()); + // Unpaired surrogate at the end of the buffer. + unsafe { + *(dst.get_unchecked_mut(written)) = 0xEFu8; + written += 1; + *(dst.get_unchecked_mut(written)) = 0xBFu8; + written += 1; + *(dst.get_unchecked_mut(written)) = 0xBDu8; + written += 1; + } + return (read, written); + } + let second = src[read]; + let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00); + if unsafe { likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) } { + // The next code unit is a low surrogate. Advance position. + read += 1; + let astral = (u32::from(unit) << 10) + u32::from(second) + - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32); + unsafe { + *(dst.get_unchecked_mut(written)) = (astral >> 18) as u8 | 0xF0u8; + written += 1; + *(dst.get_unchecked_mut(written)) = + ((astral & 0x3F000u32) >> 12) as u8 | 0x80u8; + written += 1; + *(dst.get_unchecked_mut(written)) = + ((astral & 0xFC0u32) >> 6) as u8 | 0x80u8; + written += 1; + *(dst.get_unchecked_mut(written)) = (astral & 0x3F) as u8 | 0x80u8; + written += 1; + } + break; + } + // The next code unit is not a low surrogate. Don't advance + // position and treat the high surrogate as unpaired. + // Fall through + } + // Unpaired low surrogate + unsafe { + *(dst.get_unchecked_mut(written)) = 0xEFu8; + written += 1; + *(dst.get_unchecked_mut(written)) = 0xBFu8; + written += 1; + *(dst.get_unchecked_mut(written)) = 0xBDu8; + written += 1; + } + break; + } + // Now see if the next unit is Basic Latin + // read > src.len() is impossible, but using + // >= instead of == allows the compiler to elide a bound check. + if read >= src.len() { + debug_assert_eq!(read, src.len()); + return (read, written); + } + unit = src[read]; + if unsafe { unlikely(unit < 0x80) } { + // written > dst.len() is impossible, but using + // >= instead of == allows the compiler to elide a bound check. + if written >= dst.len() { + debug_assert_eq!(written, dst.len()); + return (read, written); + } + dst[written] = unit as u8; + read += 1; + written += 1; + // Mysteriously, adding a punctuation check here makes + // the expected benificiary cases *slower*! + continue 'outer; + } + continue 'inner; + } + } +} + +#[inline(never)] +pub fn convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize, usize) { + // Everything below is cold code! + let mut read = 0; + let mut written = 0; + let mut unit = src[read]; + // We now have up to 3 output slots, so an astral character + // will not fit. + if unit < 0x800 { + loop { + if unit < 0x80 { + if written >= dst.len() { + return (read, written); + } + read += 1; + dst[written] = unit as u8; + written += 1; + } else if unit < 0x800 { + if written + 2 > dst.len() { + return (read, written); + } + read += 1; + dst[written] = (unit >> 6) as u8 | 0xC0u8; + written += 1; + dst[written] = (unit & 0x3F) as u8 | 0x80u8; + written += 1; + } else { + return (read, written); + } + // read > src.len() is impossible, but using + // >= instead of == allows the compiler to elide a bound check. + if read >= src.len() { + debug_assert_eq!(read, src.len()); + return (read, written); + } + unit = src[read]; + } + } + // Could be an unpaired surrogate, but we'll need 3 output + // slots in any case. + if written + 3 > dst.len() { + return (read, written); + } + read += 1; + let unit_minus_surrogate_start = unit.wrapping_sub(0xD800); + if unit_minus_surrogate_start <= (0xDFFF - 0xD800) { + // Got surrogate + if unit_minus_surrogate_start <= (0xDBFF - 0xD800) { + // Got high surrogate + if read >= src.len() { + // Unpaired high surrogate + unit = 0xFFFD; + } else { + let second = src[read]; + if in_inclusive_range16(second, 0xDC00, 0xDFFF) { + // Valid surrogate pair, but we know it won't fit. + read -= 1; + return (read, written); + } + // Unpaired high + unit = 0xFFFD; + } + } else { + // Unpaired low + unit = 0xFFFD; + } + } + dst[written] = (unit >> 12) as u8 | 0xE0u8; + written += 1; + dst[written] = ((unit & 0xFC0) >> 6) as u8 | 0x80u8; + written += 1; + dst[written] = (unit & 0x3F) as u8 | 0x80u8; + written += 1; + debug_assert_eq!(written, dst.len()); + (read, written) +} + +pub struct Utf8Encoder; + +impl Utf8Encoder { + pub fn new(encoding: &'static Encoding) -> Encoder { + Encoder::new(encoding, VariantEncoder::Utf8(Utf8Encoder)) + } + + pub fn max_buffer_length_from_utf16_without_replacement( + &self, + u16_length: usize, + ) -> Option<usize> { + u16_length.checked_mul(3) + } + + pub fn max_buffer_length_from_utf8_without_replacement( + &self, + byte_length: usize, + ) -> Option<usize> { + Some(byte_length) + } + + pub fn encode_from_utf16_raw( + &mut self, + src: &[u16], + dst: &mut [u8], + _last: bool, + ) -> (EncoderResult, usize, usize) { + let (read, written) = convert_utf16_to_utf8_partial(src, dst); + ( + if read == src.len() { + EncoderResult::InputEmpty + } else { + EncoderResult::OutputFull + }, + read, + written, + ) + } + + pub fn encode_from_utf8_raw( + &mut self, + src: &str, + dst: &mut [u8], + _last: bool, + ) -> (EncoderResult, usize, usize) { + let bytes = src.as_bytes(); + let mut to_write = bytes.len(); + if to_write <= dst.len() { + (&mut dst[..to_write]).copy_from_slice(bytes); + return (EncoderResult::InputEmpty, to_write, to_write); + } + to_write = dst.len(); + // Move back until we find a UTF-8 sequence boundary. + while (bytes[to_write] & 0xC0) == 0x80 { + to_write -= 1; + } + (&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]); + (EncoderResult::OutputFull, to_write, to_write) + } +} + +// Any copyright to the test code below this comment is dedicated to the +// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use super::super::testing::*; + use super::super::*; + + // fn decode_utf8_to_utf16(bytes: &[u8], expect: &[u16]) { + // decode_to_utf16_without_replacement(UTF_8, bytes, expect); + // } + + fn decode_utf8_to_utf8(bytes: &[u8], expect: &str) { + decode_to_utf8(UTF_8, bytes, expect); + } + + fn decode_valid_utf8(string: &str) { + decode_utf8_to_utf8(string.as_bytes(), string); + } + + fn encode_utf8_from_utf16(string: &[u16], expect: &[u8]) { + encode_from_utf16(UTF_8, string, expect); + } + + fn encode_utf8_from_utf8(string: &str, expect: &[u8]) { + encode_from_utf8(UTF_8, string, expect); + } + + fn encode_utf8_from_utf16_with_output_limit( + string: &[u16], + expect: &str, + limit: usize, + expect_result: EncoderResult, + ) { + let mut dst = Vec::new(); + { + dst.resize(limit, 0u8); + let mut encoder = UTF_8.new_encoder(); + let (result, read, written) = + encoder.encode_from_utf16_without_replacement(string, &mut dst, false); + assert_eq!(result, expect_result); + if expect_result == EncoderResult::InputEmpty { + assert_eq!(read, string.len()); + } + assert_eq!(&dst[..written], expect.as_bytes()); + } + { + dst.resize(64, 0u8); + for (i, elem) in dst.iter_mut().enumerate() { + *elem = i as u8; + } + let mut encoder = UTF_8.new_encoder(); + let (_, _, mut j) = + encoder.encode_from_utf16_without_replacement(string, &mut dst, false); + while j < dst.len() { + assert_eq!(usize::from(dst[j]), j); + j += 1; + } + } + } + + #[test] + fn test_utf8_decode() { + // Empty + decode_valid_utf8(""); + // ASCII + decode_valid_utf8("ab"); + // Low BMP + decode_valid_utf8("a\u{E4}Z"); + // High BMP + decode_valid_utf8("a\u{2603}Z"); + // Astral + decode_valid_utf8("a\u{1F4A9}Z"); + // Low BMP with last byte missing + decode_utf8_to_utf8(b"a\xC3Z", "a\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\xC3", "a\u{FFFD}"); + // High BMP with last byte missing + decode_utf8_to_utf8(b"a\xE2\x98Z", "a\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\xE2\x98", "a\u{FFFD}"); + // Astral with last byte missing + decode_utf8_to_utf8(b"a\xF0\x9F\x92Z", "a\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\xF0\x9F\x92", "a\u{FFFD}"); + // Lone highest continuation + decode_utf8_to_utf8(b"a\xBFZ", "a\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\xBF", "a\u{FFFD}"); + // Two lone highest continuations + decode_utf8_to_utf8(b"a\xBF\xBFZ", "a\u{FFFD}\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\xBF\xBF", "a\u{FFFD}\u{FFFD}"); + // Low BMP followed by lowest lone continuation + decode_utf8_to_utf8(b"a\xC3\xA4\x80Z", "a\u{E4}\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\xC3\xA4\x80", "a\u{E4}\u{FFFD}"); + // Low BMP followed by highest lone continuation + decode_utf8_to_utf8(b"a\xC3\xA4\xBFZ", "a\u{E4}\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\xC3\xA4\xBF", "a\u{E4}\u{FFFD}"); + // High BMP followed by lowest lone continuation + decode_utf8_to_utf8(b"a\xE2\x98\x83\x80Z", "a\u{2603}\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\xE2\x98\x83\x80", "a\u{2603}\u{FFFD}"); + // High BMP followed by highest lone continuation + decode_utf8_to_utf8(b"a\xE2\x98\x83\xBFZ", "a\u{2603}\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\xE2\x98\x83\xBF", "a\u{2603}\u{FFFD}"); + // Astral followed by lowest lone continuation + decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80Z", "a\u{1F4A9}\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80", "a\u{1F4A9}\u{FFFD}"); + // Astral followed by highest lone continuation + decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBFZ", "a\u{1F4A9}\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBF", "a\u{1F4A9}\u{FFFD}"); + + // Boundary conditions + // Lowest single-byte + decode_valid_utf8("Z\x00"); + decode_valid_utf8("Z\x00Z"); + // Lowest single-byte as two-byte overlong sequence + decode_utf8_to_utf8(b"a\xC0\x80", "a\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xC0\x80Z", "a\u{FFFD}\u{FFFD}Z"); + // Lowest single-byte as three-byte overlong sequence + decode_utf8_to_utf8(b"a\xE0\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xE0\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // Lowest single-byte as four-byte overlong sequence + decode_utf8_to_utf8(b"a\xF0\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xF0\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // One below lowest single-byte + decode_utf8_to_utf8(b"a\xFF", "a\u{FFFD}"); + decode_utf8_to_utf8(b"a\xFFZ", "a\u{FFFD}Z"); + // Highest single-byte + decode_valid_utf8("a\x7F"); + decode_valid_utf8("a\x7FZ"); + // Highest single-byte as two-byte overlong sequence + decode_utf8_to_utf8(b"a\xC1\xBF", "a\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xC1\xBFZ", "a\u{FFFD}\u{FFFD}Z"); + // Highest single-byte as three-byte overlong sequence + decode_utf8_to_utf8(b"a\xE0\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xE0\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // Highest single-byte as four-byte overlong sequence + decode_utf8_to_utf8(b"a\xF0\x80\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xF0\x80\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // One past highest single byte (also lone continuation) + decode_utf8_to_utf8(b"a\x80Z", "a\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\x80", "a\u{FFFD}"); + // Two lone continuations + decode_utf8_to_utf8(b"a\x80\x80Z", "a\u{FFFD}\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\x80\x80", "a\u{FFFD}\u{FFFD}"); + // Three lone continuations + decode_utf8_to_utf8(b"a\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}"); + // Four lone continuations + decode_utf8_to_utf8(b"a\x80\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z"); + decode_utf8_to_utf8(b"a\x80\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"); + // Lowest two-byte + decode_utf8_to_utf8(b"a\xC2\x80", "a\u{0080}"); + decode_utf8_to_utf8(b"a\xC2\x80Z", "a\u{0080}Z"); + // Lowest two-byte as three-byte overlong sequence + decode_utf8_to_utf8(b"a\xE0\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xE0\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // Lowest two-byte as four-byte overlong sequence + decode_utf8_to_utf8(b"a\xF0\x80\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xF0\x80\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // Lead one below lowest two-byte + decode_utf8_to_utf8(b"a\xC1\x80", "a\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xC1\x80Z", "a\u{FFFD}\u{FFFD}Z"); + // Trail one below lowest two-byte + decode_utf8_to_utf8(b"a\xC2\x7F", "a\u{FFFD}\u{007F}"); + decode_utf8_to_utf8(b"a\xC2\x7FZ", "a\u{FFFD}\u{007F}Z"); + // Highest two-byte + decode_utf8_to_utf8(b"a\xDF\xBF", "a\u{07FF}"); + decode_utf8_to_utf8(b"a\xDF\xBFZ", "a\u{07FF}Z"); + // Highest two-byte as three-byte overlong sequence + decode_utf8_to_utf8(b"a\xE0\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xE0\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // Highest two-byte as four-byte overlong sequence + decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // Lowest three-byte + decode_utf8_to_utf8(b"a\xE0\xA0\x80", "a\u{0800}"); + decode_utf8_to_utf8(b"a\xE0\xA0\x80Z", "a\u{0800}Z"); + // Lowest three-byte as four-byte overlong sequence + decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // Highest below surrogates + decode_utf8_to_utf8(b"a\xED\x9F\xBF", "a\u{D7FF}"); + decode_utf8_to_utf8(b"a\xED\x9F\xBFZ", "a\u{D7FF}Z"); + // Highest below surrogates as four-byte overlong sequence + decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // First surrogate + decode_utf8_to_utf8(b"a\xED\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xED\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // First surrogate as four-byte overlong sequence + decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // Last surrogate + decode_utf8_to_utf8(b"a\xED\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xED\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // Last surrogate as four-byte overlong sequence + decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // Lowest above surrogates + decode_utf8_to_utf8(b"a\xEE\x80\x80", "a\u{E000}"); + decode_utf8_to_utf8(b"a\xEE\x80\x80Z", "a\u{E000}Z"); + // Lowest above surrogates as four-byte overlong sequence + decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // Highest three-byte + decode_utf8_to_utf8(b"a\xEF\xBF\xBF", "a\u{FFFF}"); + decode_utf8_to_utf8(b"a\xEF\xBF\xBFZ", "a\u{FFFF}Z"); + // Highest three-byte as four-byte overlong sequence + decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z"); + // Lowest four-byte + decode_utf8_to_utf8(b"a\xF0\x90\x80\x80", "a\u{10000}"); + decode_utf8_to_utf8(b"a\xF0\x90\x80\x80Z", "a\u{10000}Z"); + // Highest four-byte + decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBF", "a\u{10FFFF}"); + decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBFZ", "a\u{10FFFF}Z"); + // One past highest four-byte + decode_utf8_to_utf8(b"a\xF4\x90\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xF4\x90\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z"); + + // Highest four-byte with last byte replaced with 0xFF + decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFF", "a\u{FFFD}\u{FFFD}"); + decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFFZ", "a\u{FFFD}\u{FFFD}Z"); + } + + #[test] + fn test_utf8_encode() { + // Empty + encode_utf8_from_utf16(&[], b""); + encode_utf8_from_utf8("", b""); + + encode_utf8_from_utf16(&[0x0000], "\u{0000}".as_bytes()); + encode_utf8_from_utf16(&[0x007F], "\u{007F}".as_bytes()); + encode_utf8_from_utf16(&[0x0080], "\u{0080}".as_bytes()); + encode_utf8_from_utf16(&[0x07FF], "\u{07FF}".as_bytes()); + encode_utf8_from_utf16(&[0x0800], "\u{0800}".as_bytes()); + encode_utf8_from_utf16(&[0xD7FF], "\u{D7FF}".as_bytes()); + encode_utf8_from_utf16(&[0xD800], "\u{FFFD}".as_bytes()); + encode_utf8_from_utf16(&[0xD800, 0x0062], "\u{FFFD}\u{0062}".as_bytes()); + encode_utf8_from_utf16(&[0xDFFF], "\u{FFFD}".as_bytes()); + encode_utf8_from_utf16(&[0xDFFF, 0x0062], "\u{FFFD}\u{0062}".as_bytes()); + encode_utf8_from_utf16(&[0xE000], "\u{E000}".as_bytes()); + encode_utf8_from_utf16(&[0xFFFF], "\u{FFFF}".as_bytes()); + encode_utf8_from_utf16(&[0xD800, 0xDC00], "\u{10000}".as_bytes()); + encode_utf8_from_utf16(&[0xDBFF, 0xDFFF], "\u{10FFFF}".as_bytes()); + encode_utf8_from_utf16(&[0xDC00, 0xDEDE], "\u{FFFD}\u{FFFD}".as_bytes()); + } + + #[test] + fn test_encode_utf8_from_utf16_with_output_limit() { + encode_utf8_from_utf16_with_output_limit(&[0x0062], "\u{62}", 1, EncoderResult::InputEmpty); + encode_utf8_from_utf16_with_output_limit(&[0x00A7], "\u{A7}", 2, EncoderResult::InputEmpty); + encode_utf8_from_utf16_with_output_limit( + &[0x2603], + "\u{2603}", + 3, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0xD83D, 0xDCA9], + "\u{1F4A9}", + 4, + EncoderResult::InputEmpty, + ); + + encode_utf8_from_utf16_with_output_limit(&[0x00A7], "", 1, EncoderResult::OutputFull); + encode_utf8_from_utf16_with_output_limit(&[0x2603], "", 2, EncoderResult::OutputFull); + encode_utf8_from_utf16_with_output_limit( + &[0xD83D, 0xDCA9], + "", + 3, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x0062], + "\u{63}\u{62}", + 2, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00A7], + "\u{63}\u{A7}", + 3, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x2603], + "\u{63}\u{2603}", + 4, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0xD83D, 0xDCA9], + "\u{63}\u{1F4A9}", + 5, + EncoderResult::InputEmpty, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00A7], + "\u{63}", + 2, + EncoderResult::OutputFull, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x2603], + "\u{63}", + 3, + EncoderResult::OutputFull, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0xD83D, 0xDCA9], + "\u{63}", + 4, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x00B6, 0x0062], + "\u{B6}\u{62}", + 3, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x00B6, 0x00A7], + "\u{B6}\u{A7}", + 4, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x00B6, 0x2603], + "\u{B6}\u{2603}", + 5, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x00B6, 0xD83D, 0xDCA9], + "\u{B6}\u{1F4A9}", + 6, + EncoderResult::InputEmpty, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x00B6, 0x00A7], + "\u{B6}", + 3, + EncoderResult::OutputFull, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x00B6, 0x2603], + "\u{B6}", + 4, + EncoderResult::OutputFull, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x00B6, 0xD83D, 0xDCA9], + "\u{B6}", + 5, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x0062], + "\u{263A}\u{62}", + 4, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x00A7], + "\u{263A}\u{A7}", + 5, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x2603], + "\u{263A}\u{2603}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0xD83D, 0xDCA9], + "\u{263A}\u{1F4A9}", + 7, + EncoderResult::InputEmpty, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x00A7], + "\u{263A}", + 4, + EncoderResult::OutputFull, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x2603], + "\u{263A}", + 5, + EncoderResult::OutputFull, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0xD83D, 0xDCA9], + "\u{263A}", + 6, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0xD83D, 0xDE0E, 0x0062], + "\u{1F60E}\u{62}", + 5, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0xD83D, 0xDE0E, 0x00A7], + "\u{1F60E}\u{A7}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0xD83D, 0xDE0E, 0x2603], + "\u{1F60E}\u{2603}", + 7, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9], + "\u{1F60E}\u{1F4A9}", + 8, + EncoderResult::InputEmpty, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0xD83D, 0xDE0E, 0x00A7], + "\u{1F60E}", + 5, + EncoderResult::OutputFull, + ); + encode_utf8_from_utf16_with_output_limit( + &[0xD83D, 0xDE0E, 0x2603], + "\u{1F60E}", + 6, + EncoderResult::OutputFull, + ); + encode_utf8_from_utf16_with_output_limit( + &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9], + "\u{1F60E}", + 7, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0x0062, 0x0062], + "\u{63}\u{B6}\u{62}\u{62}", + 5, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0x0062, 0x0062], + "\u{63}\u{B6}\u{62}", + 4, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062], + "\u{63}\u{B6}\u{62}\u{62}\u{62}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062], + "\u{63}\u{B6}\u{62}\u{62}", + 5, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x0062, 0x0062], + "\u{263A}\u{62}\u{62}", + 5, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x0062, 0x0062], + "\u{263A}\u{62}", + 4, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x0062, 0x0062, 0x0062], + "\u{263A}\u{62}\u{62}\u{62}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x0062, 0x0062, 0x0062], + "\u{263A}\u{62}\u{62}", + 5, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0x00A7], + "\u{63}\u{B6}\u{A7}", + 5, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0x00A7], + "\u{63}\u{B6}", + 4, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0x00A7, 0x0062], + "\u{63}\u{B6}\u{A7}\u{62}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0x00A7, 0x0062], + "\u{63}\u{B6}\u{A7}", + 5, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x00A7, 0x0062], + "\u{263A}\u{A7}\u{62}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x00A7, 0x0062], + "\u{263A}\u{A7}", + 5, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0x0062, 0x00A7], + "\u{63}\u{B6}\u{62}\u{A7}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0x0062, 0x00A7], + "\u{63}\u{B6}\u{62}", + 5, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x0062, 0x00A7], + "\u{263A}\u{62}\u{A7}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x0062, 0x00A7], + "\u{263A}\u{62}", + 5, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0x2603], + "\u{63}\u{B6}\u{2603}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0x2603], + "\u{63}\u{B6}", + 5, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x2603], + "\u{263A}\u{2603}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0x2603], + "\u{263A}", + 5, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0xD83D], + "\u{63}\u{B6}\u{FFFD}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0xD83D], + "\u{63}\u{B6}", + 5, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0xD83D], + "\u{263A}\u{FFFD}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0xD83D], + "\u{263A}", + 5, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0xDCA9], + "\u{63}\u{B6}\u{FFFD}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x0063, 0x00B6, 0xDCA9], + "\u{63}\u{B6}", + 5, + EncoderResult::OutputFull, + ); + + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0xDCA9], + "\u{263A}\u{FFFD}", + 6, + EncoderResult::InputEmpty, + ); + encode_utf8_from_utf16_with_output_limit( + &[0x263A, 0xDCA9], + "\u{263A}", + 5, + EncoderResult::OutputFull, + ); + } + + #[test] + fn test_utf8_max_length_from_utf16() { + let mut encoder = UTF_8.new_encoder(); + let mut output = [0u8; 13]; + let input = &[0x2C9Fu16, 0x2CA9u16, 0x2CA3u16, 0x2C9Fu16]; + let needed = encoder + .max_buffer_length_from_utf16_without_replacement(input.len()) + .unwrap(); + let (result, _, _) = + encoder.encode_from_utf16_without_replacement(input, &mut output[..needed], true); + assert_eq!(result, EncoderResult::InputEmpty); + } + + #[test] + fn test_decode_bom_prefixed_split_byte_triple() { + let mut output = [0u16; 20]; + let mut decoder = UTF_8.new_decoder(); + { + let needed = decoder.max_utf16_buffer_length(1).unwrap(); + let (result, read, written, had_errors) = + decoder.decode_to_utf16(b"\xEF", &mut output[..needed], false); + assert_eq!(result, CoderResult::InputEmpty); + assert_eq!(read, 1); + assert_eq!(written, 0); + assert!(!had_errors); + } + { + let needed = decoder.max_utf16_buffer_length(1).unwrap(); + let (result, read, written, had_errors) = + decoder.decode_to_utf16(b"\xBF", &mut output[..needed], false); + assert_eq!(result, CoderResult::InputEmpty); + assert_eq!(read, 1); + assert_eq!(written, 0); + assert!(!had_errors); + } + { + let needed = decoder.max_utf16_buffer_length(1).unwrap(); + let (result, read, written, had_errors) = + decoder.decode_to_utf16(b"\xBE", &mut output[..needed], true); + assert_eq!(result, CoderResult::InputEmpty); + assert_eq!(read, 1); + assert_eq!(written, 1); + assert!(!had_errors); + assert_eq!(output[0], 0xFFFE); + } + } + + #[test] + fn test_decode_bom_prefixed_split_byte_pair() { + let mut output = [0u16; 20]; + let mut decoder = UTF_8.new_decoder(); + { + let needed = decoder.max_utf16_buffer_length(1).unwrap(); + let (result, read, written, had_errors) = + decoder.decode_to_utf16(b"\xEF", &mut output[..needed], false); + assert_eq!(result, CoderResult::InputEmpty); + assert_eq!(read, 1); + assert_eq!(written, 0); + assert!(!had_errors); + } + { + let needed = decoder.max_utf16_buffer_length(1).unwrap(); + let (result, read, written, had_errors) = + decoder.decode_to_utf16(b"\xBC", &mut output[..needed], true); + assert_eq!(result, CoderResult::InputEmpty); + assert_eq!(read, 1); + assert_eq!(written, 1); + assert!(had_errors); + assert_eq!(output[0], 0xFFFD); + } + } + + #[test] + fn test_decode_bom_prefix() { + let mut output = [0u16; 20]; + let mut decoder = UTF_8.new_decoder(); + { + let needed = decoder.max_utf16_buffer_length(1).unwrap(); + let (result, read, written, had_errors) = + decoder.decode_to_utf16(b"\xEF", &mut output[..needed], true); + assert_eq!(result, CoderResult::InputEmpty); + assert_eq!(read, 1); + assert_eq!(written, 1); + assert!(had_errors); + assert_eq!(output[0], 0xFFFD); + } + } + + #[test] + fn test_tail() { + let mut output = [0u16; 1]; + let mut decoder = UTF_8.new_decoder_without_bom_handling(); + { + let (result, read, written, had_errors) = + decoder.decode_to_utf16("\u{E4}a".as_bytes(), &mut output[..], false); + assert_eq!(result, CoderResult::OutputFull); + assert_eq!(read, 2); + assert_eq!(written, 1); + assert!(!had_errors); + assert_eq!(output[0], 0x00E4); + } + } +} |