diff options
Diffstat (limited to 'third_party/rust/encoding_rs/src/big5.rs')
-rw-r--r-- | third_party/rust/encoding_rs/src/big5.rs | 427 |
1 files changed, 427 insertions, 0 deletions
diff --git a/third_party/rust/encoding_rs/src/big5.rs b/third_party/rust/encoding_rs/src/big5.rs new file mode 100644 index 0000000000..5c72c5ef9c --- /dev/null +++ b/third_party/rust/encoding_rs/src/big5.rs @@ -0,0 +1,427 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::*; +use crate::data::*; +use crate::handles::*; +use crate::variant::*; +// Rust 1.14.0 requires the following despite the asterisk above. +use super::in_inclusive_range32; + +pub struct Big5Decoder { + lead: Option<u8>, +} + +impl Big5Decoder { + pub fn new() -> VariantDecoder { + VariantDecoder::Big5(Big5Decoder { lead: None }) + } + + pub fn in_neutral_state(&self) -> bool { + self.lead.is_none() + } + + fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> { + byte_length.checked_add(match self.lead { + None => 0, + Some(_) => 1, + }) + } + + pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { + // If there is a lead but the next byte isn't a valid trail, an + // error is generated for the lead (+1). Then another iteration checks + // space, which needs +1 to account for the possibility of astral + // output or combining pair. + checked_add(1, self.plus_one_if_lead(byte_length)) + } + + pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { + // No need to account for REPLACEMENT CHARACTERS. + // Cases: + // ASCII: 1 to 1 + // Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4 + // lead set and first byte is trail: 1 to 4 worst case + // + // When checking for space for the last byte: + // no lead: the last byte must be ASCII (or fatal error): 1 to 1 + // lead set: space for 4 bytes was already checked when reading the + // lead, hence the last lead and the last trail together are worst + // case 2 to 4. + // + // If lead set and the input is a single trail byte, the worst-case + // output is 4, so we need to add one before multiplying if lead is + // set. + // + // Finally, add two so that if input is non-zero, the output is at + // least 4. + checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length))) + } + + pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { + // If there is a lead but the next byte isn't a valid trail, an + // error is generated for the lead (+(1*3)). Then another iteration + // checks space, which needs +3 to account for the possibility of astral + // output or combining pair. In between start and end, the worst case + // is that every byte is bad: *3. + checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length))) + } + + ascii_compatible_two_byte_decoder_functions!( + { + // If lead is between 0x81 and 0xFE, inclusive, + // subtract offset 0x81. + let non_ascii_minus_offset = + non_ascii.wrapping_sub(0x81); + if non_ascii_minus_offset > (0xFE - 0x81) { + return (DecoderResult::Malformed(1, 0), + source.consumed(), + handle.written()); + } + non_ascii_minus_offset + }, + { + // If trail is between 0x40 and 0x7E, inclusive, + // subtract offset 0x40. Else if trail is + // between 0xA1 and 0xFE, inclusive, subtract + // offset 0x62. + // TODO: Find out which range is more probable. + let mut trail_minus_offset = + byte.wrapping_sub(0x40); + if trail_minus_offset > (0x7E - 0x40) { + let trail_minus_range_start = + byte.wrapping_sub(0xA1); + if trail_minus_range_start > + (0xFE - 0xA1) { + if byte < 0x80 { + return (DecoderResult::Malformed(1, 0), + unread_handle_trail.unread(), + handle.written()); + } + return (DecoderResult::Malformed(2, 0), + unread_handle_trail.consumed(), + handle.written()); + } + trail_minus_offset = byte - 0x62; + } + let pointer = lead_minus_offset as usize * + 157usize + + trail_minus_offset as usize; + let rebased_pointer = pointer.wrapping_sub(942); + let low_bits = big5_low_bits(rebased_pointer); + if low_bits == 0 { + match pointer { + 1133 => { + handle.write_big5_combination(0x00CAu16, + 0x0304u16) + } + 1135 => { + handle.write_big5_combination(0x00CAu16, + 0x030Cu16) + } + 1164 => { + handle.write_big5_combination(0x00EAu16, + 0x0304u16) + } + 1166 => { + handle.write_big5_combination(0x00EAu16, + 0x030Cu16) + } + _ => { + if byte < 0x80 { + return (DecoderResult::Malformed(1, 0), + unread_handle_trail.unread(), + handle.written()); + } + return (DecoderResult::Malformed(2, 0), + unread_handle_trail.consumed(), + handle.written()); + } + } + } else if big5_is_astral(rebased_pointer) { + handle.write_astral(u32::from(low_bits) | + 0x20000u32) + } else { + handle.write_bmp_excl_ascii(low_bits) + } + }, + self, + non_ascii, + byte, + lead_minus_offset, + unread_handle_trail, + source, + handle, + 'outermost, + copy_ascii_from_check_space_astral, + check_space_astral, + false); +} + +pub struct Big5Encoder; + +impl Big5Encoder { + pub fn new(encoding: &'static Encoding) -> Encoder { + Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder)) + } + + pub fn max_buffer_length_from_utf16_without_replacement( + &self, + u16_length: usize, + ) -> Option<usize> { + // Astral: 2 to 2 + // ASCII: 1 to 1 + // Other: 1 to 2 + u16_length.checked_mul(2) + } + + pub fn max_buffer_length_from_utf8_without_replacement( + &self, + byte_length: usize, + ) -> Option<usize> { + // Astral: 4 to 2 + // Upper BMP: 3 to 2 + // Lower BMP: 2 to 2 + // ASCII: 1 to 1 + byte_length.checked_add(1) + } + + ascii_compatible_encoder_functions!( + { + // For simplicity, unified ideographs + // in the pointer range 11206...11212 are handled + // as Level 1 Hanzi. + if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) { + handle.write_two(lead, trail) + } else { + let pointer = if let Some(pointer) = big5_box_encode(bmp) { + pointer + } else if let Some(pointer) = big5_other_encode(bmp) { + pointer + } else { + return ( + EncoderResult::unmappable_from_bmp(bmp), + source.consumed(), + handle.written(), + ); + }; + let lead = pointer / 157 + 0x81; + let remainder = pointer % 157; + let trail = if remainder < 0x3F { + remainder + 0x40 + } else { + remainder + 0x62 + }; + handle.write_two(lead as u8, trail as u8) + } + }, + { + if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) { + if let Some(rebased_pointer) = big5_astral_encode(astral as u16) { + // big5_astral_encode returns rebased pointer, + // so adding 0x87 instead of 0x81. + let lead = rebased_pointer / 157 + 0x87; + let remainder = rebased_pointer % 157; + let trail = if remainder < 0x3F { + remainder + 0x40 + } else { + remainder + 0x62 + }; + handle.write_two(lead as u8, trail as u8) + } else { + return ( + EncoderResult::Unmappable(astral), + source.consumed(), + handle.written(), + ); + } + } else { + return ( + EncoderResult::Unmappable(astral), + source.consumed(), + handle.written(), + ); + } + }, + bmp, + astral, + self, + source, + handle, + copy_ascii_to_check_space_two, + check_space_two, + false + ); +} + +// Any copyright to the test code below this comment is dedicated to the +// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use super::super::testing::*; + use super::super::*; + + fn decode_big5(bytes: &[u8], expect: &str) { + decode(BIG5, bytes, expect); + } + + fn encode_big5(string: &str, expect: &[u8]) { + encode(BIG5, string, expect); + } + + #[test] + fn test_big5_decode() { + // Empty + decode_big5(b"", &""); + + // ASCII + decode_big5(&[0x61u8, 0x62u8], &"\u{0061}\u{0062}"); + + // Edge cases + decode_big5(&[0x87u8, 0x40u8], &"\u{43F0}"); + decode_big5(&[0xFEu8, 0xFEu8], &"\u{79D4}"); + decode_big5(&[0xFEu8, 0xFDu8], &"\u{2910D}"); + decode_big5(&[0x88u8, 0x62u8], &"\u{00CA}\u{0304}"); + decode_big5(&[0x88u8, 0x64u8], &"\u{00CA}\u{030C}"); + decode_big5(&[0x88u8, 0x66u8], &"\u{00CA}"); + decode_big5(&[0x88u8, 0xA3u8], &"\u{00EA}\u{0304}"); + decode_big5(&[0x88u8, 0xA5u8], &"\u{00EA}\u{030C}"); + decode_big5(&[0x88u8, 0xA7u8], &"\u{00EA}"); + decode_big5(&[0x99u8, 0xD4u8], &"\u{8991}"); + decode_big5(&[0x99u8, 0xD5u8], &"\u{27967}"); + decode_big5(&[0x99u8, 0xD6u8], &"\u{8A29}"); + + // Edge cases surrounded with ASCII + decode_big5( + &[0x61u8, 0x87u8, 0x40u8, 0x62u8], + &"\u{0061}\u{43F0}\u{0062}", + ); + decode_big5( + &[0x61u8, 0xFEu8, 0xFEu8, 0x62u8], + &"\u{0061}\u{79D4}\u{0062}", + ); + decode_big5( + &[0x61u8, 0xFEu8, 0xFDu8, 0x62u8], + &"\u{0061}\u{2910D}\u{0062}", + ); + decode_big5( + &[0x61u8, 0x88u8, 0x62u8, 0x62u8], + &"\u{0061}\u{00CA}\u{0304}\u{0062}", + ); + decode_big5( + &[0x61u8, 0x88u8, 0x64u8, 0x62u8], + &"\u{0061}\u{00CA}\u{030C}\u{0062}", + ); + decode_big5( + &[0x61u8, 0x88u8, 0x66u8, 0x62u8], + &"\u{0061}\u{00CA}\u{0062}", + ); + decode_big5( + &[0x61u8, 0x88u8, 0xA3u8, 0x62u8], + &"\u{0061}\u{00EA}\u{0304}\u{0062}", + ); + decode_big5( + &[0x61u8, 0x88u8, 0xA5u8, 0x62u8], + &"\u{0061}\u{00EA}\u{030C}\u{0062}", + ); + decode_big5( + &[0x61u8, 0x88u8, 0xA7u8, 0x62u8], + &"\u{0061}\u{00EA}\u{0062}", + ); + decode_big5( + &[0x61u8, 0x99u8, 0xD4u8, 0x62u8], + &"\u{0061}\u{8991}\u{0062}", + ); + decode_big5( + &[0x61u8, 0x99u8, 0xD5u8, 0x62u8], + &"\u{0061}\u{27967}\u{0062}", + ); + decode_big5( + &[0x61u8, 0x99u8, 0xD6u8, 0x62u8], + &"\u{0061}\u{8A29}\u{0062}", + ); + + // Bad sequences + decode_big5(&[0x80u8, 0x61u8], &"\u{FFFD}\u{0061}"); + decode_big5(&[0xFFu8, 0x61u8], &"\u{FFFD}\u{0061}"); + decode_big5(&[0xFEu8, 0x39u8], &"\u{FFFD}\u{0039}"); + decode_big5(&[0x87u8, 0x66u8], &"\u{FFFD}\u{0066}"); + decode_big5(&[0x81u8, 0x40u8], &"\u{FFFD}\u{0040}"); + decode_big5(&[0x61u8, 0x81u8], &"\u{0061}\u{FFFD}"); + } + + #[test] + fn test_big5_encode() { + // Empty + encode_big5("", b""); + + // ASCII + encode_big5("\u{0061}\u{0062}", b"\x61\x62"); + + if !cfg!(miri) { + // Miri is too slow + // Edge cases + encode_big5("\u{9EA6}\u{0061}", b"麦\x61"); + encode_big5("\u{2626B}\u{0061}", b"𦉫\x61"); + encode_big5("\u{3000}", b"\xA1\x40"); + encode_big5("\u{20AC}", b"\xA3\xE1"); + encode_big5("\u{4E00}", b"\xA4\x40"); + encode_big5("\u{27607}", b"\xC8\xA4"); + encode_big5("\u{FFE2}", b"\xC8\xCD"); + encode_big5("\u{79D4}", b"\xFE\xFE"); + + // Not in index + encode_big5("\u{2603}\u{0061}", b"☃\x61"); + } + + // duplicate low bits + encode_big5("\u{203B5}", b"\xFD\x6A"); + encode_big5("\u{25605}", b"\xFE\x46"); + + // prefer last + encode_big5("\u{2550}", b"\xF9\xF9"); + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_big5_decode_all() { + let input = include_bytes!("test_data/big5_in.txt"); + let expectation = include_str!("test_data/big5_in_ref.txt"); + let (cow, had_errors) = BIG5.decode_without_bom_handling(input); + assert!(had_errors, "Should have had errors."); + assert_eq!(&cow[..], expectation); + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_big5_encode_all() { + let input = include_str!("test_data/big5_out.txt"); + let expectation = include_bytes!("test_data/big5_out_ref.txt"); + let (cow, encoding, had_errors) = BIG5.encode(input); + assert!(!had_errors, "Should not have had errors."); + assert_eq!(encoding, BIG5); + assert_eq!(&cow[..], &expectation[..]); + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_big5_encode_from_two_low_surrogates() { + let expectation = b"��"; + let mut output = [0u8; 40]; + let mut encoder = BIG5.new_encoder(); + let (result, read, written, had_errors) = + encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true); + assert_eq!(result, CoderResult::InputEmpty); + assert_eq!(read, 2); + assert_eq!(written, expectation.len()); + assert!(had_errors); + assert_eq!(&output[..written], expectation); + } +} |