diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:22:09 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:22:09 +0000 |
commit | 43a97878ce14b72f0981164f87f2e35e14151312 (patch) | |
tree | 620249daf56c0258faa40cbdcf9cfba06de2a846 /third_party/rust/encoding_rs/src/gb18030.rs | |
parent | Initial commit. (diff) | |
download | firefox-43a97878ce14b72f0981164f87f2e35e14151312.tar.xz firefox-43a97878ce14b72f0981164f87f2e35e14151312.zip |
Adding upstream version 110.0.1.upstream/110.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/encoding_rs/src/gb18030.rs')
-rw-r--r-- | third_party/rust/encoding_rs/src/gb18030.rs | 767 |
1 files changed, 767 insertions, 0 deletions
diff --git a/third_party/rust/encoding_rs/src/gb18030.rs b/third_party/rust/encoding_rs/src/gb18030.rs new file mode 100644 index 0000000000..a0b3bd7f03 --- /dev/null +++ b/third_party/rust/encoding_rs/src/gb18030.rs @@ -0,0 +1,767 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use super::*; +use crate::data::*; +use crate::handles::*; +use crate::variant::*; +// Rust 1.14.0 requires the following despite the asterisk above. +use super::in_inclusive_range16; +use super::in_range16; + +enum Gb18030Pending { + None, + One(u8), + Two(u8, u8), + Three(u8, u8, u8), +} + +impl Gb18030Pending { + fn is_none(&self) -> bool { + match *self { + Gb18030Pending::None => true, + _ => false, + } + } + + fn count(&self) -> usize { + match *self { + Gb18030Pending::None => 0, + Gb18030Pending::One(_) => 1, + Gb18030Pending::Two(_, _) => 2, + Gb18030Pending::Three(_, _, _) => 3, + } + } +} + +pub struct Gb18030Decoder { + first: Option<u8>, + second: Option<u8>, + third: Option<u8>, + pending: Gb18030Pending, + pending_ascii: Option<u8>, +} + +impl Gb18030Decoder { + pub fn new() -> VariantDecoder { + VariantDecoder::Gb18030(Gb18030Decoder { + first: None, + second: None, + third: None, + pending: Gb18030Pending::None, + pending_ascii: None, + }) + } + + pub fn in_neutral_state(&self) -> bool { + self.first.is_none() + && self.second.is_none() + && self.third.is_none() + && self.pending.is_none() + && self.pending_ascii.is_none() + } + + fn extra_from_state(&self, byte_length: usize) -> Option<usize> { + byte_length.checked_add( + self.pending.count() + + match self.first { + None => 0, + Some(_) => 1, + } + + match self.second { + None => 0, + Some(_) => 1, + } + + match self.third { + None => 0, + Some(_) => 1, + } + + match self.pending_ascii { + None => 0, + Some(_) => 1, + }, + ) + } + + pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { + // ASCII: 1 to 1 (worst case) + // gbk: 2 to 1 + // ranges: 4 to 1 or 4 to 2 + checked_add(1, self.extra_from_state(byte_length)) + } + + pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { + // ASCII: 1 to 1 + // gbk: 2 to 2 or 2 to 3 + // ranges: 4 to 2, 4 to 3 or 4 to 4 + // 0x80: 1 to 3 (worst case) + self.max_utf8_buffer_length(byte_length) + } + + pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { + checked_add(1, checked_mul(3, self.extra_from_state(byte_length))) + } + + gb18030_decoder_functions!( + { + // If first is between 0x81 and 0xFE, inclusive, + // subtract offset 0x81. + let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81); + if non_ascii_minus_offset > (0xFE - 0x81) { + if non_ascii == 0x80 { + handle.write_upper_bmp(0x20ACu16); + continue 'outermost; + } + return (DecoderResult::Malformed(1, 0), + source.consumed(), + handle.written()); + } + non_ascii_minus_offset + }, + { + // Two-byte (or error) + if first_minus_offset >= 0x20 { + // Not the gbk ideograph range above GB2312 + let trail_minus_offset = second.wrapping_sub(0xA1); + if trail_minus_offset <= (0xFE - 0xA1) { + // GB2312 + let hanzi_lead = first_minus_offset.wrapping_sub(0x2F); + if hanzi_lead < (0x77 - 0x2F) { + // Level 1 Hanzi, Level 2 Hanzi + // or one of the 5 PUA code + // points in between. + let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize; + let upper_bmp = GB2312_HANZI[hanzi_pointer]; + handle.write_upper_bmp(upper_bmp) + } else if first_minus_offset == 0x20 { + // Symbols (starting with ideographic space) + let bmp = GB2312_SYMBOLS[trail_minus_offset as usize]; + handle.write_bmp_excl_ascii(bmp) + } else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) { + handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize]) + } else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() { + handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize]) + } else if first_minus_offset > 0x76 { + // Bottom PUA + let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16; + handle.write_upper_bmp(pua) + } else { + let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16); + handle.write_bmp_excl_ascii(bmp) + } + } else { + // gbk range on the left + let mut trail_minus_offset = second.wrapping_sub(0x40); + if trail_minus_offset > (0x7E - 0x40) { + let trail_minus_range_start = second.wrapping_sub(0x80); + if trail_minus_range_start > (0xA0 - 0x80) { + if second < 0x80 { + return (DecoderResult::Malformed(1, 0), + unread_handle_second.unread(), + handle.written()); + } + return (DecoderResult::Malformed(2, 0), + unread_handle_second.consumed(), + handle.written()); + } + trail_minus_offset = second - 0x41; + } + // Zero-base lead + let left_lead = first_minus_offset - 0x20; + let left_pointer = left_lead as usize * (190 - 94) + + trail_minus_offset as usize; + let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94)); + if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) { + let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16); + handle.write_upper_bmp(upper_bmp) + } else if left_pointer < ((0x29 - 0x20) * (190 - 94)) { + let bmp = gbk_other_decode(left_pointer as u16); + handle.write_bmp_excl_ascii(bmp) + } else { + let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5); + let upper_bmp = GBK_BOTTOM[bottom_pointer]; + handle.write_upper_bmp(upper_bmp) + } + } + } else { + // gbk ideograph range above GB2312 + let mut trail_minus_offset = second.wrapping_sub(0x40); + if trail_minus_offset > (0x7E - 0x40) { + let trail_minus_range_start = second.wrapping_sub(0x80); + if trail_minus_range_start > (0xFE - 0x80) { + if second < 0x80 { + return (DecoderResult::Malformed(1, 0), + unread_handle_second.unread(), + handle.written()); + } + return (DecoderResult::Malformed(2, 0), + unread_handle_second.consumed(), + handle.written()); + } + trail_minus_offset = second - 0x41; + } + let pointer = first_minus_offset as usize * 190usize + + trail_minus_offset as usize; + let upper_bmp = gbk_top_ideograph_decode(pointer as u16); + handle.write_upper_bmp(upper_bmp) + } + }, + { + // If third is between 0x81 and 0xFE, inclusive, + // subtract offset 0x81. + let third_minus_offset = third.wrapping_sub(0x81); + if third_minus_offset > (0xFE - 0x81) { + // We have an error. Let's inline what's going + // to happen when `second` is + // reprocessed. (`third` gets unread.) + // `second` is guaranteed ASCII, so let's + // put it in `pending_ascii`. Recompute + // `second` from `second_minus_offset`. + self.pending_ascii = Some(second_minus_offset + 0x30); + // Now unread `third` and designate the previous + // `first` as being in error. + return (DecoderResult::Malformed(1, 1), + unread_handle_third.unread(), + handle.written()); + } + third_minus_offset + }, + { + // If fourth is between 0x30 and 0x39, inclusive, + // subtract offset 0x30. + // + // If we have an error, we'll inline what's going + // to happen when `second` and `third` are + // reprocessed. (`fourth` gets unread.) + // `second` is guaranteed ASCII, so let's + // put it in `pending_ascii`. Recompute + // `second` from `second_minus_offset` to + // make this block reusable when `second` + // is not in scope. + // + // `third` is guaranteed to be in the range + // that makes it become the new `self.first`. + // + // `fourth` gets unread and the previous + // `first` gets designates as being in error. + let fourth_minus_offset = fourth.wrapping_sub(0x30); + if fourth_minus_offset > (0x39 - 0x30) { + self.pending_ascii = Some(second_minus_offset + 0x30); + self.pending = Gb18030Pending::One(third_minus_offset); + return (DecoderResult::Malformed(1, 2), + unread_handle_fourth.unread(), + handle.written()); + } + let pointer = (first_minus_offset as usize * (10 * 126 * 10)) + + (second_minus_offset as usize * (10 * 126)) + + (third_minus_offset as usize * 10) + + fourth_minus_offset as usize; + if pointer <= 39419 { + // BMP + if pointer == 7457 { + handle.write_upper_bmp(0xE7C7) + } else { + handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16)) + } + } else if pointer >= 189_000 && pointer <= 1_237_575 { + // Astral + handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32) + } else { + return (DecoderResult::Malformed(4, 0), + unread_handle_fourth.consumed(), + handle.written()); + } + }, + self, + non_ascii, + first_minus_offset, + second, + second_minus_offset, + unread_handle_second, + third, + third_minus_offset, + unread_handle_third, + fourth, + fourth_minus_offset, + unread_handle_fourth, + source, + handle, + 'outermost); +} + +// XXX Experiment with inline directives +fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> { + // Try ideographic punctuation first as it's the most likely case. + // Throwing in the check for full-width currencies and tilde is probably + // more size-efficient here than elsewhere. + if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) { + if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) { + return Some((0xA1, pos + 0xA1)); + } + } + // Ext A + if in_range16(bmp, 0x3400, 0x4E00) { + return position(&GBK_BOTTOM[21..100], bmp).map(|pos| { + ( + 0xFE, + pos + if pos < (0x3F - 16) { + 0x40 + 16 + } else { + 0x41 + 16 + }, + ) + }); + } + // Compatibility ideographs + if in_range16(bmp, 0xF900, 0xFB00) { + return position(&GBK_BOTTOM[0..21], bmp).map(|pos| { + if pos < 5 { + // end of second to last row + (0xFD, pos + (190 - 94 - 5 + 0x41)) + } else { + // last row + (0xFE, pos + (0x40 - 5)) + } + }); + } + // Handle everything below U+02CA, which is in GBK_OTHER. + if bmp < 0x02CA { + if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 { + // Pinyin except U+1E3F + if let Some(pos) = position(&GB2312_PINYIN[..], bmp) { + return Some((0xA8, pos + 0xA1)); + } + } else if in_inclusive_range16(bmp, 0x00A4, 0x00F7) + || in_inclusive_range16(bmp, 0x02C7, 0x02C9) + { + // Diacritics and Latin 1 symbols + if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) { + return Some((0xA1, pos + 0xA1 + 3)); + } + } + return None; + } + if bmp >= 0xE794 { + // Various brackets, all in PUA or full-width regions + if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) { + return Some((0xA6, pos + (0x9F - 0x60 + 0xA1))); + } + } else if bmp == 0x1E3F { + // The one Pinyin placed elsewhere on the BMP + return Some((0xA8, 0x7B - 0x60 + 0xA1)); + } else if in_range16(bmp, 0xA000, 0xD800) { + // Since Korean has usage in China, let's spend a branch to fast-track + // Hangul. + return None; + } + // GB2312 other (except bottom PUA and PUA between Hanzi levels). + if let Some(other_pointer) = gb2312_other_encode(bmp) { + let other_lead = other_pointer as usize / 94; + let other_trail = other_pointer as usize % 94; + return Some((0xA2 + other_lead, 0xA1 + other_trail)); + } + // At this point, we've handled all mappable characters above U+02D9 but + // below U+2010. Let's check for that range in order to let lower BMP + // characters used for minority languages in China avoid the subsequent + // search that deals mainly with various symbols. + if in_range16(bmp, 0x02DA, 0x2010) { + return None; + } + // GBK other (except radicals and PUA in GBK_BOTTOM). + if let Some(other_pointer) = gbk_other_encode(bmp) { + let other_lead = other_pointer as usize / (190 - 94); + let other_trail = other_pointer as usize % (190 - 94); + let offset = if other_trail < 0x3F { 0x40 } else { 0x41 }; + return Some((other_lead + (0x81 + 0x20), other_trail + offset)); + } + // CJK Radicals Supplement or PUA in GBK_BOTTOM + if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) { + if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) { + let trail = pos + 16; + let offset = if trail < 0x3F { 0x40 } else { 0x41 }; + return Some((0xFE, trail + offset)); + } + } + // GB2312 bottom PUA + let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234); + if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) { + let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94; + let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94; + return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail)); + } + // PUA between Hanzi Levels + let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810); + if bmp_minus_pua_between_hanzi < 5 { + return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize)); + } + None +} + +#[cfg(not(feature = "fast-gb-hanzi-encode"))] +#[inline(always)] +fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) { + if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) { + (lead, trail) + } else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) { + let hanzi_lead = (hanzi_pointer / 94) + (0xD8); + let hanzi_trail = (hanzi_pointer % 94) + 0xA1; + (hanzi_lead as u8, hanzi_trail as u8) + } else { + let (lead, gbk_trail) = if bmp < 0x72DC { + // Above GB2312 + let pointer = gbk_top_ideograph_encode(bmp) as usize; + let lead = (pointer / 190) + 0x81; + let gbk_trail = pointer % 190; + (lead, gbk_trail) + } else { + // To the left of GB2312 + let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize; + let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29); + let gbk_trail = gbk_left_ideograph_pointer % (190 - 94); + (lead, gbk_trail) + }; + let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 }; + (lead as u8, (gbk_trail + offset) as u8) + } +} + +#[cfg(feature = "fast-gb-hanzi-encode")] +#[inline(always)] +fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) { + gbk_hanzi_encode(bmp_minus_unified_start) +} + +pub struct Gb18030Encoder { + extended: bool, +} + +impl Gb18030Encoder { + pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder { + Encoder::new( + encoding, + VariantEncoder::Gb18030(Gb18030Encoder { + extended: extended_range, + }), + ) + } + + pub fn max_buffer_length_from_utf16_without_replacement( + &self, + u16_length: usize, + ) -> Option<usize> { + if self.extended { + u16_length.checked_mul(4) + } else { + // Need to add, because space check is done with the four-byte + // assumption. + checked_add(2, u16_length.checked_mul(2)) + } + } + + pub fn max_buffer_length_from_utf8_without_replacement( + &self, + byte_length: usize, + ) -> Option<usize> { + if self.extended { + // 1 to 1 + // 2 to 2 + // 3 to 2 + // 2 to 4 (worst) + // 3 to 4 + // 4 to 4 + checked_add(2, byte_length.checked_mul(2)) + } else { + // 1 to 1 + // 2 to 2 + // 3 to 2 + // Need to add, because space check is done with the four-byte + // assumption. + byte_length.checked_add(3) + } + } + + ascii_compatible_encoder_functions!( + { + let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00); + if bmp_minus_unified_start < (0x9FA6 - 0x4E00) { + // CJK Unified Ideographs + // Can't fail now, since all are + // mapped. + let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start); + handle.write_two(lead, trail) + } else if bmp == 0xE5E5 { + // It's not optimal to check for the unmappable + // and for euro at this stage, but getting + // the out of the way makes the rest of the + // code less messy. + return ( + EncoderResult::unmappable_from_bmp(bmp), + source.consumed(), + handle.written(), + ); + } else if bmp == 0x20AC && !self.extended { + handle.write_one(0x80u8) + } else { + match gbk_encode_non_unified(bmp) { + Some((lead, trail)) => handle.write_two(lead as u8, trail as u8), + None => { + if !self.extended { + return ( + EncoderResult::unmappable_from_bmp(bmp), + source.consumed(), + handle.written(), + ); + } + let range_pointer = gb18030_range_encode(bmp); + let first = range_pointer / (10 * 126 * 10); + let rem_first = range_pointer % (10 * 126 * 10); + let second = rem_first / (10 * 126); + let rem_second = rem_first % (10 * 126); + let third = rem_second / 10; + let fourth = rem_second % 10; + handle.write_four( + (first + 0x81) as u8, + (second + 0x30) as u8, + (third + 0x81) as u8, + (fourth + 0x30) as u8, + ) + } + } + } + }, + { + if !self.extended { + return ( + EncoderResult::Unmappable(astral), + source.consumed(), + handle.written(), + ); + } + let range_pointer = astral as usize + (189_000usize - 0x1_0000usize); + let first = range_pointer / (10 * 126 * 10); + let rem_first = range_pointer % (10 * 126 * 10); + let second = rem_first / (10 * 126); + let rem_second = rem_first % (10 * 126); + let third = rem_second / 10; + let fourth = rem_second % 10; + handle.write_four( + (first + 0x81) as u8, + (second + 0x30) as u8, + (third + 0x81) as u8, + (fourth + 0x30) as u8, + ) + }, + bmp, + astral, + self, + source, + handle, + copy_ascii_to_check_space_four, + check_space_four, + false + ); +} + +// Any copyright to the test code below this comment is dedicated to the +// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use super::super::testing::*; + use super::super::*; + + fn decode_gb18030(bytes: &[u8], expect: &str) { + decode(GB18030, bytes, expect); + } + + fn encode_gb18030(string: &str, expect: &[u8]) { + encode(GB18030, string, expect); + } + + fn encode_gbk(string: &str, expect: &[u8]) { + encode(GBK, string, expect); + } + + #[test] + fn test_gb18030_decode() { + // Empty + decode_gb18030(b"", &""); + + // ASCII + decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}"); + + // euro + decode_gb18030(b"\x80", "\u{20AC}"); + decode_gb18030(b"\xA2\xE3", "\u{20AC}"); + + // two bytes + decode_gb18030(b"\x81\x40", "\u{4E02}"); + decode_gb18030(b"\x81\x7E", "\u{4E8A}"); + decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}"); + decode_gb18030(b"\x81\x80", "\u{4E90}"); + decode_gb18030(b"\x81\xFE", "\u{4FA2}"); + decode_gb18030(b"\xFE\x40", "\u{FA0C}"); + decode_gb18030(b"\xFE\x7E", "\u{E843}"); + decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}"); + decode_gb18030(b"\xFE\x80", "\u{4723}"); + decode_gb18030(b"\xFE\xFE", "\u{E4C5}"); + + // The difference from the original GB18030 + decode_gb18030(b"\xA3\xA0", "\u{3000}"); + decode_gb18030(b"\xA1\xA1", "\u{3000}"); + + // 0xFF + decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}"); + decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} ! + decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} ! + decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}"); + decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}"); + decode_gb18030( + b"\xFF\x32\x9A\x33\x00", + "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}", + ); + + // Four bytes + decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}"); + decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}"); + decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}"); + decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}"); + decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}"); + decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}"); + decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}"); + decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} ! + decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}"); + } + + #[test] + fn test_gb18030_encode() { + // Empty + encode_gb18030("", b""); + + // ASCII + encode_gb18030("\u{0061}\u{0062}", b"\x61\x62"); + + // euro + encode_gb18030("\u{20AC}", b"\xA2\xE3"); + + // two bytes + encode_gb18030("\u{4E02}", b"\x81\x40"); + encode_gb18030("\u{4E8A}", b"\x81\x7E"); + if !cfg!(miri) { + // Miri is too slow + encode_gb18030("\u{4E90}", b"\x81\x80"); + encode_gb18030("\u{4FA2}", b"\x81\xFE"); + encode_gb18030("\u{FA0C}", b"\xFE\x40"); + encode_gb18030("\u{E843}", b"\xFE\x7E"); + encode_gb18030("\u{4723}", b"\xFE\x80"); + encode_gb18030("\u{E4C5}", b"\xFE\xFE"); + } + + // The difference from the original GB18030 + encode_gb18030("\u{E5E5}", b""); + encode_gb18030("\u{3000}", b"\xA1\xA1"); + + // Four bytes + encode_gb18030("\u{0080}", b"\x81\x30\x81\x30"); + encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37"); + if !cfg!(miri) { + // Miri is too slow + encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30"); + encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33"); + encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35"); + } + + // Edge cases + encode_gb18030("\u{00F7}", b"\xA1\xC2"); + } + + #[test] + fn test_gbk_encode() { + // Empty + encode_gbk("", b""); + + // ASCII + encode_gbk("\u{0061}\u{0062}", b"\x61\x62"); + + // euro + encode_gbk("\u{20AC}", b"\x80"); + + // two bytes + encode_gbk("\u{4E02}", b"\x81\x40"); + encode_gbk("\u{4E8A}", b"\x81\x7E"); + if !cfg!(miri) { + // Miri is too slow + encode_gbk("\u{4E90}", b"\x81\x80"); + encode_gbk("\u{4FA2}", b"\x81\xFE"); + encode_gbk("\u{FA0C}", b"\xFE\x40"); + encode_gbk("\u{E843}", b"\xFE\x7E"); + encode_gbk("\u{4723}", b"\xFE\x80"); + encode_gbk("\u{E4C5}", b"\xFE\xFE"); + } + + // The difference from the original gb18030 + encode_gbk("\u{E5E5}", b""); + encode_gbk("\u{3000}", b"\xA1\xA1"); + + // Four bytes + encode_gbk("\u{0080}", b"€"); + encode_gbk("\u{E7C7}", b""); + if !cfg!(miri) { + // Miri is too slow + encode_gbk("\u{2603}", b"☃"); + encode_gbk("\u{1F4A9}", b"💩"); + encode_gbk("\u{10FFFF}", b""); + } + + // Edge cases + encode_gbk("\u{00F7}", b"\xA1\xC2"); + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_gb18030_decode_all() { + let input = include_bytes!("test_data/gb18030_in.txt"); + let expectation = include_str!("test_data/gb18030_in_ref.txt"); + let (cow, had_errors) = GB18030.decode_without_bom_handling(input); + assert!(!had_errors, "Should not have had errors."); + assert_eq!(&cow[..], expectation); + } + + #[test] + #[cfg_attr(miri, ignore)] // Miri is too slow + fn test_gb18030_encode_all() { + let input = include_str!("test_data/gb18030_out.txt"); + let expectation = include_bytes!("test_data/gb18030_out_ref.txt"); + let (cow, encoding, had_errors) = GB18030.encode(input); + assert!(!had_errors, "Should not have had errors."); + assert_eq!(encoding, GB18030); + assert_eq!(&cow[..], &expectation[..]); + } + + #[test] + fn test_gb18030_encode_from_utf16_max_length() { + let mut output = [0u8; 20]; + let mut encoder = GB18030.new_encoder(); + { + let needed = encoder + .max_buffer_length_from_utf16_without_replacement(1) + .unwrap(); + let (result, read, written) = encoder.encode_from_utf16_without_replacement( + &[0x3000], + &mut output[..needed], + true, + ); + assert_eq!(result, EncoderResult::InputEmpty); + assert_eq!(read, 1); + assert_eq!(written, 2); + assert_eq!(output[0], 0xA1); + assert_eq!(output[1], 0xA1); + } + } +} |