// Copyright Mozilla Foundation. See the COPYRIGHT // file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use super::*; use crate::data::*; use crate::handles::*; use crate::variant::*; // Rust 1.14.0 requires the following despite the asterisk above. use super::in_inclusive_range16; use super::in_range16; enum Gb18030Pending { None, One(u8), Two(u8, u8), Three(u8, u8, u8), } impl Gb18030Pending { fn is_none(&self) -> bool { match *self { Gb18030Pending::None => true, _ => false, } } fn count(&self) -> usize { match *self { Gb18030Pending::None => 0, Gb18030Pending::One(_) => 1, Gb18030Pending::Two(_, _) => 2, Gb18030Pending::Three(_, _, _) => 3, } } } pub struct Gb18030Decoder { first: Option, second: Option, third: Option, pending: Gb18030Pending, pending_ascii: Option, } impl Gb18030Decoder { pub fn new() -> VariantDecoder { VariantDecoder::Gb18030(Gb18030Decoder { first: None, second: None, third: None, pending: Gb18030Pending::None, pending_ascii: None, }) } pub fn in_neutral_state(&self) -> bool { self.first.is_none() && self.second.is_none() && self.third.is_none() && self.pending.is_none() && self.pending_ascii.is_none() } fn extra_from_state(&self, byte_length: usize) -> Option { byte_length.checked_add( self.pending.count() + match self.first { None => 0, Some(_) => 1, } + match self.second { None => 0, Some(_) => 1, } + match self.third { None => 0, Some(_) => 1, } + match self.pending_ascii { None => 0, Some(_) => 1, }, ) } pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option { // ASCII: 1 to 1 (worst case) // gbk: 2 to 1 // ranges: 4 to 1 or 4 to 2 checked_add(1, self.extra_from_state(byte_length)) } pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option { // ASCII: 1 to 1 // gbk: 2 to 2 or 2 to 3 // ranges: 4 to 2, 4 to 3 or 4 to 4 // 0x80: 1 to 3 (worst case) self.max_utf8_buffer_length(byte_length) } pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option { checked_add(1, checked_mul(3, self.extra_from_state(byte_length))) } gb18030_decoder_functions!( { // If first is between 0x81 and 0xFE, inclusive, // subtract offset 0x81. let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81); if non_ascii_minus_offset > (0xFE - 0x81) { if non_ascii == 0x80 { handle.write_upper_bmp(0x20ACu16); continue 'outermost; } return (DecoderResult::Malformed(1, 0), source.consumed(), handle.written()); } non_ascii_minus_offset }, { // Two-byte (or error) if first_minus_offset >= 0x20 { // Not the gbk ideograph range above GB2312 let trail_minus_offset = second.wrapping_sub(0xA1); if trail_minus_offset <= (0xFE - 0xA1) { // GB2312 let hanzi_lead = first_minus_offset.wrapping_sub(0x2F); if hanzi_lead < (0x77 - 0x2F) { // Level 1 Hanzi, Level 2 Hanzi // or one of the 5 PUA code // points in between. let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize; let upper_bmp = GB2312_HANZI[hanzi_pointer]; handle.write_upper_bmp(upper_bmp) } else if first_minus_offset == 0x20 { // Symbols (starting with ideographic space) let bmp = GB2312_SYMBOLS[trail_minus_offset as usize]; handle.write_bmp_excl_ascii(bmp) } else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) { handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize]) } else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() { handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize]) } else if first_minus_offset > 0x76 { // Bottom PUA let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16; handle.write_upper_bmp(pua) } else { let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16); handle.write_bmp_excl_ascii(bmp) } } else { // gbk range on the left let mut trail_minus_offset = second.wrapping_sub(0x40); if trail_minus_offset > (0x7E - 0x40) { let trail_minus_range_start = second.wrapping_sub(0x80); if trail_minus_range_start > (0xA0 - 0x80) { if second < 0x80 { return (DecoderResult::Malformed(1, 0), unread_handle_second.unread(), handle.written()); } return (DecoderResult::Malformed(2, 0), unread_handle_second.consumed(), handle.written()); } trail_minus_offset = second - 0x41; } // Zero-base lead let left_lead = first_minus_offset - 0x20; let left_pointer = left_lead as usize * (190 - 94) + trail_minus_offset as usize; let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94)); if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) { let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16); handle.write_upper_bmp(upper_bmp) } else if left_pointer < ((0x29 - 0x20) * (190 - 94)) { let bmp = gbk_other_decode(left_pointer as u16); handle.write_bmp_excl_ascii(bmp) } else { let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5); let upper_bmp = GBK_BOTTOM[bottom_pointer]; handle.write_upper_bmp(upper_bmp) } } } else { // gbk ideograph range above GB2312 let mut trail_minus_offset = second.wrapping_sub(0x40); if trail_minus_offset > (0x7E - 0x40) { let trail_minus_range_start = second.wrapping_sub(0x80); if trail_minus_range_start > (0xFE - 0x80) { if second < 0x80 { return (DecoderResult::Malformed(1, 0), unread_handle_second.unread(), handle.written()); } return (DecoderResult::Malformed(2, 0), unread_handle_second.consumed(), handle.written()); } trail_minus_offset = second - 0x41; } let pointer = first_minus_offset as usize * 190usize + trail_minus_offset as usize; let upper_bmp = gbk_top_ideograph_decode(pointer as u16); handle.write_upper_bmp(upper_bmp) } }, { // If third is between 0x81 and 0xFE, inclusive, // subtract offset 0x81. let third_minus_offset = third.wrapping_sub(0x81); if third_minus_offset > (0xFE - 0x81) { // We have an error. Let's inline what's going // to happen when `second` is // reprocessed. (`third` gets unread.) // `second` is guaranteed ASCII, so let's // put it in `pending_ascii`. Recompute // `second` from `second_minus_offset`. self.pending_ascii = Some(second_minus_offset + 0x30); // Now unread `third` and designate the previous // `first` as being in error. return (DecoderResult::Malformed(1, 1), unread_handle_third.unread(), handle.written()); } third_minus_offset }, { // If fourth is between 0x30 and 0x39, inclusive, // subtract offset 0x30. // // If we have an error, we'll inline what's going // to happen when `second` and `third` are // reprocessed. (`fourth` gets unread.) // `second` is guaranteed ASCII, so let's // put it in `pending_ascii`. Recompute // `second` from `second_minus_offset` to // make this block reusable when `second` // is not in scope. // // `third` is guaranteed to be in the range // that makes it become the new `self.first`. // // `fourth` gets unread and the previous // `first` gets designates as being in error. let fourth_minus_offset = fourth.wrapping_sub(0x30); if fourth_minus_offset > (0x39 - 0x30) { self.pending_ascii = Some(second_minus_offset + 0x30); self.pending = Gb18030Pending::One(third_minus_offset); return (DecoderResult::Malformed(1, 2), unread_handle_fourth.unread(), handle.written()); } let pointer = (first_minus_offset as usize * (10 * 126 * 10)) + (second_minus_offset as usize * (10 * 126)) + (third_minus_offset as usize * 10) + fourth_minus_offset as usize; if pointer <= 39419 { // BMP if pointer == 7457 { handle.write_upper_bmp(0xE7C7) } else { handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16)) } } else if pointer >= 189_000 && pointer <= 1_237_575 { // Astral handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32) } else { return (DecoderResult::Malformed(4, 0), unread_handle_fourth.consumed(), handle.written()); } }, self, non_ascii, first_minus_offset, second, second_minus_offset, unread_handle_second, third, third_minus_offset, unread_handle_third, fourth, fourth_minus_offset, unread_handle_fourth, source, handle, 'outermost); } // XXX Experiment with inline directives fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> { // Try ideographic punctuation first as it's the most likely case. // Throwing in the check for full-width currencies and tilde is probably // more size-efficient here than elsewhere. if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) { if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) { return Some((0xA1, pos + 0xA1)); } } // Ext A if in_range16(bmp, 0x3400, 0x4E00) { return position(&GBK_BOTTOM[21..100], bmp).map(|pos| { ( 0xFE, pos + if pos < (0x3F - 16) { 0x40 + 16 } else { 0x41 + 16 }, ) }); } // Compatibility ideographs if in_range16(bmp, 0xF900, 0xFB00) { return position(&GBK_BOTTOM[0..21], bmp).map(|pos| { if pos < 5 { // end of second to last row (0xFD, pos + (190 - 94 - 5 + 0x41)) } else { // last row (0xFE, pos + (0x40 - 5)) } }); } // Handle everything below U+02CA, which is in GBK_OTHER. if bmp < 0x02CA { if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 { // Pinyin except U+1E3F if let Some(pos) = position(&GB2312_PINYIN[..], bmp) { return Some((0xA8, pos + 0xA1)); } } else if in_inclusive_range16(bmp, 0x00A4, 0x00F7) || in_inclusive_range16(bmp, 0x02C7, 0x02C9) { // Diacritics and Latin 1 symbols if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) { return Some((0xA1, pos + 0xA1 + 3)); } } return None; } if bmp >= 0xE794 { // Various brackets, all in PUA or full-width regions if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) { return Some((0xA6, pos + (0x9F - 0x60 + 0xA1))); } } else if bmp == 0x1E3F { // The one Pinyin placed elsewhere on the BMP return Some((0xA8, 0x7B - 0x60 + 0xA1)); } else if in_range16(bmp, 0xA000, 0xD800) { // Since Korean has usage in China, let's spend a branch to fast-track // Hangul. return None; } // GB2312 other (except bottom PUA and PUA between Hanzi levels). if let Some(other_pointer) = gb2312_other_encode(bmp) { let other_lead = other_pointer as usize / 94; let other_trail = other_pointer as usize % 94; return Some((0xA2 + other_lead, 0xA1 + other_trail)); } // At this point, we've handled all mappable characters above U+02D9 but // below U+2010. Let's check for that range in order to let lower BMP // characters used for minority languages in China avoid the subsequent // search that deals mainly with various symbols. if in_range16(bmp, 0x02DA, 0x2010) { return None; } // GBK other (except radicals and PUA in GBK_BOTTOM). if let Some(other_pointer) = gbk_other_encode(bmp) { let other_lead = other_pointer as usize / (190 - 94); let other_trail = other_pointer as usize % (190 - 94); let offset = if other_trail < 0x3F { 0x40 } else { 0x41 }; return Some((other_lead + (0x81 + 0x20), other_trail + offset)); } // CJK Radicals Supplement or PUA in GBK_BOTTOM if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) { if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) { let trail = pos + 16; let offset = if trail < 0x3F { 0x40 } else { 0x41 }; return Some((0xFE, trail + offset)); } } // GB2312 bottom PUA let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234); if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) { let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94; let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94; return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail)); } // PUA between Hanzi Levels let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810); if bmp_minus_pua_between_hanzi < 5 { return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize)); } None } #[cfg(not(feature = "fast-gb-hanzi-encode"))] #[inline(always)] fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) { if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) { (lead, trail) } else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) { let hanzi_lead = (hanzi_pointer / 94) + (0xD8); let hanzi_trail = (hanzi_pointer % 94) + 0xA1; (hanzi_lead as u8, hanzi_trail as u8) } else { let (lead, gbk_trail) = if bmp < 0x72DC { // Above GB2312 let pointer = gbk_top_ideograph_encode(bmp) as usize; let lead = (pointer / 190) + 0x81; let gbk_trail = pointer % 190; (lead, gbk_trail) } else { // To the left of GB2312 let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize; let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29); let gbk_trail = gbk_left_ideograph_pointer % (190 - 94); (lead, gbk_trail) }; let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 }; (lead as u8, (gbk_trail + offset) as u8) } } #[cfg(feature = "fast-gb-hanzi-encode")] #[inline(always)] fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) { gbk_hanzi_encode(bmp_minus_unified_start) } pub struct Gb18030Encoder { extended: bool, } impl Gb18030Encoder { pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder { Encoder::new( encoding, VariantEncoder::Gb18030(Gb18030Encoder { extended: extended_range, }), ) } pub fn max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option { if self.extended { u16_length.checked_mul(4) } else { // Need to add, because space check is done with the four-byte // assumption. checked_add(2, u16_length.checked_mul(2)) } } pub fn max_buffer_length_from_utf8_without_replacement( &self, byte_length: usize, ) -> Option { if self.extended { // 1 to 1 // 2 to 2 // 3 to 2 // 2 to 4 (worst) // 3 to 4 // 4 to 4 checked_add(2, byte_length.checked_mul(2)) } else { // 1 to 1 // 2 to 2 // 3 to 2 // Need to add, because space check is done with the four-byte // assumption. byte_length.checked_add(3) } } ascii_compatible_encoder_functions!( { let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00); if bmp_minus_unified_start < (0x9FA6 - 0x4E00) { // CJK Unified Ideographs // Can't fail now, since all are // mapped. let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start); handle.write_two(lead, trail) } else if bmp == 0xE5E5 { // It's not optimal to check for the unmappable // and for euro at this stage, but getting // the out of the way makes the rest of the // code less messy. return ( EncoderResult::unmappable_from_bmp(bmp), source.consumed(), handle.written(), ); } else if bmp == 0x20AC && !self.extended { handle.write_one(0x80u8) } else { match gbk_encode_non_unified(bmp) { Some((lead, trail)) => handle.write_two(lead as u8, trail as u8), None => { if !self.extended { return ( EncoderResult::unmappable_from_bmp(bmp), source.consumed(), handle.written(), ); } let range_pointer = gb18030_range_encode(bmp); let first = range_pointer / (10 * 126 * 10); let rem_first = range_pointer % (10 * 126 * 10); let second = rem_first / (10 * 126); let rem_second = rem_first % (10 * 126); let third = rem_second / 10; let fourth = rem_second % 10; handle.write_four( (first + 0x81) as u8, (second + 0x30) as u8, (third + 0x81) as u8, (fourth + 0x30) as u8, ) } } } }, { if !self.extended { return ( EncoderResult::Unmappable(astral), source.consumed(), handle.written(), ); } let range_pointer = astral as usize + (189_000usize - 0x1_0000usize); let first = range_pointer / (10 * 126 * 10); let rem_first = range_pointer % (10 * 126 * 10); let second = rem_first / (10 * 126); let rem_second = rem_first % (10 * 126); let third = rem_second / 10; let fourth = rem_second % 10; handle.write_four( (first + 0x81) as u8, (second + 0x30) as u8, (third + 0x81) as u8, (fourth + 0x30) as u8, ) }, bmp, astral, self, source, handle, copy_ascii_to_check_space_four, check_space_four, false ); } // Any copyright to the test code below this comment is dedicated to the // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ #[cfg(all(test, feature = "alloc"))] mod tests { use super::super::testing::*; use super::super::*; fn decode_gb18030(bytes: &[u8], expect: &str) { decode(GB18030, bytes, expect); } fn encode_gb18030(string: &str, expect: &[u8]) { encode(GB18030, string, expect); } fn encode_gbk(string: &str, expect: &[u8]) { encode(GBK, string, expect); } #[test] fn test_gb18030_decode() { // Empty decode_gb18030(b"", &""); // ASCII decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}"); // euro decode_gb18030(b"\x80", "\u{20AC}"); decode_gb18030(b"\xA2\xE3", "\u{20AC}"); // two bytes decode_gb18030(b"\x81\x40", "\u{4E02}"); decode_gb18030(b"\x81\x7E", "\u{4E8A}"); decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}"); decode_gb18030(b"\x81\x80", "\u{4E90}"); decode_gb18030(b"\x81\xFE", "\u{4FA2}"); decode_gb18030(b"\xFE\x40", "\u{FA0C}"); decode_gb18030(b"\xFE\x7E", "\u{E843}"); decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}"); decode_gb18030(b"\xFE\x80", "\u{4723}"); decode_gb18030(b"\xFE\xFE", "\u{E4C5}"); // The difference from the original GB18030 decode_gb18030(b"\xA3\xA0", "\u{3000}"); decode_gb18030(b"\xA1\xA1", "\u{3000}"); // 0xFF decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}"); decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} ! decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} ! decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}"); decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}"); decode_gb18030( b"\xFF\x32\x9A\x33\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}", ); // Four bytes decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}"); decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}"); decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}"); decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}"); decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}"); decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}"); decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}"); decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} ! decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}"); } #[test] fn test_gb18030_encode() { // Empty encode_gb18030("", b""); // ASCII encode_gb18030("\u{0061}\u{0062}", b"\x61\x62"); // euro encode_gb18030("\u{20AC}", b"\xA2\xE3"); // two bytes encode_gb18030("\u{4E02}", b"\x81\x40"); encode_gb18030("\u{4E8A}", b"\x81\x7E"); if !cfg!(miri) { // Miri is too slow encode_gb18030("\u{4E90}", b"\x81\x80"); encode_gb18030("\u{4FA2}", b"\x81\xFE"); encode_gb18030("\u{FA0C}", b"\xFE\x40"); encode_gb18030("\u{E843}", b"\xFE\x7E"); encode_gb18030("\u{4723}", b"\xFE\x80"); encode_gb18030("\u{E4C5}", b"\xFE\xFE"); } // The difference from the original GB18030 encode_gb18030("\u{E5E5}", b""); encode_gb18030("\u{3000}", b"\xA1\xA1"); // Four bytes encode_gb18030("\u{0080}", b"\x81\x30\x81\x30"); encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37"); if !cfg!(miri) { // Miri is too slow encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30"); encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33"); encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35"); } // Edge cases encode_gb18030("\u{00F7}", b"\xA1\xC2"); } #[test] fn test_gbk_encode() { // Empty encode_gbk("", b""); // ASCII encode_gbk("\u{0061}\u{0062}", b"\x61\x62"); // euro encode_gbk("\u{20AC}", b"\x80"); // two bytes encode_gbk("\u{4E02}", b"\x81\x40"); encode_gbk("\u{4E8A}", b"\x81\x7E"); if !cfg!(miri) { // Miri is too slow encode_gbk("\u{4E90}", b"\x81\x80"); encode_gbk("\u{4FA2}", b"\x81\xFE"); encode_gbk("\u{FA0C}", b"\xFE\x40"); encode_gbk("\u{E843}", b"\xFE\x7E"); encode_gbk("\u{4723}", b"\xFE\x80"); encode_gbk("\u{E4C5}", b"\xFE\xFE"); } // The difference from the original gb18030 encode_gbk("\u{E5E5}", b""); encode_gbk("\u{3000}", b"\xA1\xA1"); // Four bytes encode_gbk("\u{0080}", b"€"); encode_gbk("\u{E7C7}", b""); if !cfg!(miri) { // Miri is too slow encode_gbk("\u{2603}", b"☃"); encode_gbk("\u{1F4A9}", b"💩"); encode_gbk("\u{10FFFF}", b"􏿿"); } // Edge cases encode_gbk("\u{00F7}", b"\xA1\xC2"); } #[test] #[cfg_attr(miri, ignore)] // Miri is too slow fn test_gb18030_decode_all() { let input = include_bytes!("test_data/gb18030_in.txt"); let expectation = include_str!("test_data/gb18030_in_ref.txt"); let (cow, had_errors) = GB18030.decode_without_bom_handling(input); assert!(!had_errors, "Should not have had errors."); assert_eq!(&cow[..], expectation); } #[test] #[cfg_attr(miri, ignore)] // Miri is too slow fn test_gb18030_encode_all() { let input = include_str!("test_data/gb18030_out.txt"); let expectation = include_bytes!("test_data/gb18030_out_ref.txt"); let (cow, encoding, had_errors) = GB18030.encode(input); assert!(!had_errors, "Should not have had errors."); assert_eq!(encoding, GB18030); assert_eq!(&cow[..], &expectation[..]); } #[test] fn test_gb18030_encode_from_utf16_max_length() { let mut output = [0u8; 20]; let mut encoder = GB18030.new_encoder(); { let needed = encoder .max_buffer_length_from_utf16_without_replacement(1) .unwrap(); let (result, read, written) = encoder.encode_from_utf16_without_replacement( &[0x3000], &mut output[..needed], true, ); assert_eq!(result, EncoderResult::InputEmpty); assert_eq!(read, 1); assert_eq!(written, 2); assert_eq!(output[0], 0xA1); assert_eq!(output[1], 0xA1); } } }