diff options
Diffstat (limited to 'third_party/rust/encoding_rs/src/macros.rs')
-rw-r--r-- | third_party/rust/encoding_rs/src/macros.rs | 1622 |
1 files changed, 1622 insertions, 0 deletions
diff --git a/third_party/rust/encoding_rs/src/macros.rs b/third_party/rust/encoding_rs/src/macros.rs new file mode 100644 index 0000000000..d3bb5e9ed0 --- /dev/null +++ b/third_party/rust/encoding_rs/src/macros.rs @@ -0,0 +1,1622 @@ +// Copyright Mozilla Foundation. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +macro_rules! decoder_function { + ($preamble:block, + $loop_preable:block, + $eof:block, + $body:block, + $slf:ident, + $src_consumed:ident, + $dest:ident, + $source:ident, + $b:ident, + $destination_handle:ident, + $unread_handle:ident, + $destination_check:ident, + $name:ident, + $code_unit:ty, + $dest_struct:ident) => ( + pub fn $name(&mut $slf, + src: &[u8], + dst: &mut [$code_unit], + last: bool) + -> (DecoderResult, usize, usize) { + let mut $source = ByteSource::new(src); + let mut $dest = $dest_struct::new(dst); + loop { // TODO: remove this loop + { + // Start non-boilerplate + $preamble + // End non-boilerplate + } + loop { + { + $loop_preable + } + match $source.check_available() { + Space::Full($src_consumed) => { + if last { + // Start non-boilerplate + $eof + // End non-boilerplate + } + return (DecoderResult::InputEmpty, $src_consumed, $dest.written()); + } + Space::Available(source_handle) => { + match $dest.$destination_check() { + Space::Full(dst_written) => { + return (DecoderResult::OutputFull, + source_handle.consumed(), + dst_written); + } + Space::Available($destination_handle) => { + let ($b, $unread_handle) = source_handle.read(); + // Start non-boilerplate + $body + // End non-boilerplate + } + } + } + } + } + } + }); +} + +macro_rules! decoder_functions { + ( + $preamble:block, + $loop_preable:block, + $eof:block, + $body:block, + $slf:ident, + $src_consumed:ident, + $dest:ident, + $source:ident, + $b:ident, + $destination_handle:ident, + $unread_handle:ident, + $destination_check:ident + ) => { + decoder_function!( + $preamble, + $loop_preable, + $eof, + $body, + $slf, + $src_consumed, + $dest, + $source, + $b, + $destination_handle, + $unread_handle, + $destination_check, + decode_to_utf8_raw, + u8, + Utf8Destination + ); + decoder_function!( + $preamble, + $loop_preable, + $eof, + $body, + $slf, + $src_consumed, + $dest, + $source, + $b, + $destination_handle, + $unread_handle, + $destination_check, + decode_to_utf16_raw, + u16, + Utf16Destination + ); + }; +} + +macro_rules! ascii_compatible_two_byte_decoder_function { + ($lead:block, + $trail:block, + $slf:ident, + $non_ascii:ident, + $byte:ident, + $lead_minus_offset:ident, + $unread_handle_trail:ident, + $source:ident, + $handle:ident, + $outermost:tt, + $copy_ascii:ident, + $destination_check:ident, + $name:ident, + $code_unit:ty, + $dest_struct:ident, + $ascii_punctuation:expr) => ( + pub fn $name(&mut $slf, + src: &[u8], + dst: &mut [$code_unit], + last: bool) + -> (DecoderResult, usize, usize) { + let mut $source = ByteSource::new(src); + let mut dest_prolog = $dest_struct::new(dst); + let dest = match $slf.lead { + Some(lead) => { + let $lead_minus_offset = lead; + $slf.lead = None; + // Since we don't have `goto` we could use to jump into the trail + // handling part of the main loop, we need to repeat trail handling + // here. + match $source.check_available() { + Space::Full(src_consumed_prolog) => { + if last { + return (DecoderResult::Malformed(1, 0), + src_consumed_prolog, + dest_prolog.written()); + } + return (DecoderResult::InputEmpty, src_consumed_prolog, dest_prolog.written()); + } + Space::Available(source_handle_prolog) => { + match dest_prolog.$destination_check() { + Space::Full(dst_written_prolog) => { + return (DecoderResult::OutputFull, + source_handle_prolog.consumed(), + dst_written_prolog); + } + Space::Available($handle) => { + let ($byte, $unread_handle_trail) = source_handle_prolog.read(); + // Start non-boilerplate + $trail + // End non-boilerplate + } + } + } + } + }, + None => { + &mut dest_prolog + } + }; + $outermost: loop { + match dest.$copy_ascii(&mut $source) { + CopyAsciiResult::Stop(ret) => return ret, + CopyAsciiResult::GoOn((mut $non_ascii, mut $handle)) => { + 'middle: loop { + let dest_again = { + let $lead_minus_offset = { + // Start non-boilerplate + $lead + // End non-boilerplate + }; + match $source.check_available() { + Space::Full(src_consumed_trail) => { + if last { + return (DecoderResult::Malformed(1, 0), + src_consumed_trail, + $handle.written()); + } + $slf.lead = Some($lead_minus_offset); + return (DecoderResult::InputEmpty, + src_consumed_trail, + $handle.written()); + } + Space::Available(source_handle_trail) => { + let ($byte, $unread_handle_trail) = source_handle_trail.read(); + // Start non-boilerplate + $trail + // End non-boilerplate + } + } + }; + match $source.check_available() { + Space::Full(src_consumed) => { + return (DecoderResult::InputEmpty, + src_consumed, + dest_again.written()); + } + Space::Available(source_handle) => { + match dest_again.$destination_check() { + Space::Full(dst_written) => { + return (DecoderResult::OutputFull, + source_handle.consumed(), + dst_written); + } + Space::Available(mut destination_handle) => { + let (mut b, unread_handle) = source_handle.read(); + let source_again = unread_handle.commit(); + 'innermost: loop { + if b > 127 { + $non_ascii = b; + $handle = destination_handle; + continue 'middle; + } + // Testing on Haswell says that we should write the + // byte unconditionally instead of trying to unread it + // to make it part of the next SIMD stride. + let dest_again_again = + destination_handle.write_ascii(b); + if $ascii_punctuation && b < 60 { + // We've got punctuation + match source_again.check_available() { + Space::Full(src_consumed_again) => { + return (DecoderResult::InputEmpty, + src_consumed_again, + dest_again_again.written()); + } + Space::Available(source_handle_again) => { + match dest_again_again.$destination_check() { + Space::Full(dst_written_again) => { + return (DecoderResult::OutputFull, + source_handle_again.consumed(), + dst_written_again); + } + Space::Available(destination_handle_again) => { + { + let (b_again, _unread_handle_again) = + source_handle_again.read(); + b = b_again; + destination_handle = destination_handle_again; + continue 'innermost; + } + } + } + } + } + } + // We've got markup or ASCII text + continue $outermost; + } + } + } + } + } + } + } + } + } + }); +} + +macro_rules! ascii_compatible_two_byte_decoder_functions { + ( + $lead:block, + $trail:block, + $slf:ident, + $non_ascii:ident, + $byte:ident, + $lead_minus_offset:ident, + $unread_handle_trail:ident, + $source:ident, + $handle:ident, + $outermost:tt, + $copy_ascii:ident, + $destination_check:ident, + $ascii_punctuation:expr + ) => { + ascii_compatible_two_byte_decoder_function!( + $lead, + $trail, + $slf, + $non_ascii, + $byte, + $lead_minus_offset, + $unread_handle_trail, + $source, + $handle, + $outermost, + $copy_ascii, + $destination_check, + decode_to_utf8_raw, + u8, + Utf8Destination, + $ascii_punctuation + ); + ascii_compatible_two_byte_decoder_function!( + $lead, + $trail, + $slf, + $non_ascii, + $byte, + $lead_minus_offset, + $unread_handle_trail, + $source, + $handle, + $outermost, + $copy_ascii, + $destination_check, + decode_to_utf16_raw, + u16, + Utf16Destination, + $ascii_punctuation + ); + }; +} + +macro_rules! gb18030_decoder_function { + ($first_body:block, + $second_body:block, + $third_body:block, + $fourth_body:block, + $slf:ident, + $non_ascii:ident, + $first_minus_offset:ident, + $second:ident, + $second_minus_offset:ident, + $unread_handle_second:ident, + $third:ident, + $third_minus_offset:ident, + $unread_handle_third:ident, + $fourth:ident, + $fourth_minus_offset:ident, + $unread_handle_fourth:ident, + $source:ident, + $handle:ident, + $outermost:tt, + $name:ident, + $code_unit:ty, + $dest_struct:ident) => ( + #[cfg_attr(feature = "cargo-clippy", allow(never_loop))] + pub fn $name(&mut $slf, + src: &[u8], + dst: &mut [$code_unit], + last: bool) + -> (DecoderResult, usize, usize) { + let mut $source = ByteSource::new(src); + let mut dest = $dest_struct::new(dst); + { + if let Some(ascii) = $slf.pending_ascii { + match dest.check_space_bmp() { + Space::Full(_) => { + return (DecoderResult::OutputFull, 0, 0); + } + Space::Available(pending_ascii_handle) => { + $slf.pending_ascii = None; + pending_ascii_handle.write_ascii(ascii); + } + } + } + } + while !$slf.pending.is_none() { + match $source.check_available() { + Space::Full(src_consumed) => { + if last { + // Start non-boilerplate + let count = $slf.pending.count(); + $slf.pending = Gb18030Pending::None; + return (DecoderResult::Malformed(count as u8, 0), + src_consumed, + dest.written()); + // End non-boilerplate + } + return (DecoderResult::InputEmpty, src_consumed, dest.written()); + } + Space::Available(source_handle) => { + match dest.check_space_astral() { + Space::Full(dst_written) => { + return (DecoderResult::OutputFull, + source_handle.consumed(), + dst_written); + } + Space::Available($handle) => { + let (byte, unread_handle) = source_handle.read(); + match $slf.pending { + Gb18030Pending::One($first_minus_offset) => { + $slf.pending = Gb18030Pending::None; + let $second = byte; + let $unread_handle_second = unread_handle; + // If second is between 0x40 and 0x7E, + // inclusive, subtract offset 0x40. Else if + // second is between 0x80 and 0xFE, inclusive, + // subtract offset 0x41. In both cases, + // handle as a two-byte sequence. + // Else if second is between 0x30 and 0x39, + // inclusive, subtract offset 0x30 and + // handle as a four-byte sequence. + let $second_minus_offset = $second.wrapping_sub(0x30); + // It's not optimal to do this check first, + // but this results in more readable code. + if $second_minus_offset > (0x39 - 0x30) { + // Start non-boilerplate + $second_body + // End non-boilerplate + } else { + // Four-byte! + $slf.pending = Gb18030Pending::Two($first_minus_offset, + $second_minus_offset); + $handle.commit() + } + } + Gb18030Pending::Two($first_minus_offset, $second_minus_offset) => { + $slf.pending = Gb18030Pending::None; + let $third = byte; + let $unread_handle_third = unread_handle; + let $third_minus_offset = { + // Start non-boilerplate + $third_body + // End non-boilerplate + }; + $slf.pending = Gb18030Pending::Three($first_minus_offset, + $second_minus_offset, + $third_minus_offset); + $handle.commit() + } + Gb18030Pending::Three($first_minus_offset, + $second_minus_offset, + $third_minus_offset) => { + $slf.pending = Gb18030Pending::None; + let $fourth = byte; + let $unread_handle_fourth = unread_handle; + // Start non-boilerplate + $fourth_body + // End non-boilerplate + } + Gb18030Pending::None => unreachable!("Checked in loop condition"), + }; + } + } + } + } + } + $outermost: loop { + match dest.copy_ascii_from_check_space_astral(&mut $source) { + CopyAsciiResult::Stop(ret) => return ret, + CopyAsciiResult::GoOn((mut $non_ascii, mut $handle)) => { + 'middle: loop { + let dest_again = { + let $first_minus_offset = { + // Start non-boilerplate + $first_body + // End non-boilerplate + }; + match $source.check_available() { + Space::Full(src_consumed_trail) => { + if last { + return (DecoderResult::Malformed(1, 0), + src_consumed_trail, + $handle.written()); + } + $slf.pending = Gb18030Pending::One($first_minus_offset); + return (DecoderResult::InputEmpty, + src_consumed_trail, + $handle.written()); + } + Space::Available(source_handle_trail) => { + let ($second, $unread_handle_second) = source_handle_trail.read(); + // Start non-boilerplate + // If second is between 0x40 and 0x7E, + // inclusive, subtract offset 0x40. Else if + // second is between 0x80 and 0xFE, inclusive, + // subtract offset 0x41. In both cases, + // handle as a two-byte sequence. + // Else if second is between 0x30 and 0x39, + // inclusive, subtract offset 0x30 and + // handle as a four-byte sequence. + let $second_minus_offset = $second.wrapping_sub(0x30); + // It's not optimal to do this check first, + // but this results in more readable code. + if $second_minus_offset > (0x39 - 0x30) { + // Start non-boilerplate + $second_body + // End non-boilerplate + } else { + // Four-byte! + match $unread_handle_second.commit().check_available() { + Space::Full(src_consumed_third) => { + if last { + return (DecoderResult::Malformed(2, 0), + src_consumed_third, + $handle.written()); + } + $slf.pending = + Gb18030Pending::Two($first_minus_offset, + $second_minus_offset); + return (DecoderResult::InputEmpty, + src_consumed_third, + $handle.written()); + } + Space::Available(source_handle_third) => { + let ($third, $unread_handle_third) = + source_handle_third.read(); + let $third_minus_offset = { + // Start non-boilerplate + $third_body + // End non-boilerplate + }; + match $unread_handle_third.commit() + .check_available() { + Space::Full(src_consumed_fourth) => { + if last { + return (DecoderResult::Malformed(3, 0), + src_consumed_fourth, + $handle.written()); + } + $slf.pending = Gb18030Pending::Three($first_minus_offset, $second_minus_offset, $third_minus_offset); + return (DecoderResult::InputEmpty, + src_consumed_fourth, + $handle.written()); + } + Space::Available(source_handle_fourth) => { + let ($fourth, $unread_handle_fourth) = + source_handle_fourth.read(); + // Start non-boilerplate + $fourth_body + // End non-boilerplate + } + } + } + } + } + // End non-boilerplate + } + } + }; + match $source.check_available() { + Space::Full(src_consumed) => { + return (DecoderResult::InputEmpty, + src_consumed, + dest_again.written()); + } + Space::Available(source_handle) => { + match dest_again.check_space_astral() { + Space::Full(dst_written) => { + return (DecoderResult::OutputFull, + source_handle.consumed(), + dst_written); + } + Space::Available(destination_handle) => { + let (b, _) = source_handle.read(); + loop { + if b > 127 { + $non_ascii = b; + $handle = destination_handle; + continue 'middle; + } + // Testing on Haswell says that we should write the + // byte unconditionally instead of trying to unread it + // to make it part of the next SIMD stride. + destination_handle.write_ascii(b); + // We've got markup or ASCII text + continue $outermost; + } + } + } + } + } + } + } + } + } + }); +} + +macro_rules! gb18030_decoder_functions { + ( + $first_body:block, + $second_body:block, + $third_body:block, + $fourth_body:block, + $slf:ident, + $non_ascii:ident, + $first_minus_offset:ident, + $second:ident, + $second_minus_offset:ident, + $unread_handle_second:ident, + $third:ident, + $third_minus_offset:ident, + $unread_handle_third:ident, + $fourth:ident, + $fourth_minus_offset:ident, + $unread_handle_fourth:ident, + $source:ident, + $handle:ident, + $outermost:tt + ) => { + gb18030_decoder_function!( + $first_body, + $second_body, + $third_body, + $fourth_body, + $slf, + $non_ascii, + $first_minus_offset, + $second, + $second_minus_offset, + $unread_handle_second, + $third, + $third_minus_offset, + $unread_handle_third, + $fourth, + $fourth_minus_offset, + $unread_handle_fourth, + $source, + $handle, + $outermost, + decode_to_utf8_raw, + u8, + Utf8Destination + ); + gb18030_decoder_function!( + $first_body, + $second_body, + $third_body, + $fourth_body, + $slf, + $non_ascii, + $first_minus_offset, + $second, + $second_minus_offset, + $unread_handle_second, + $third, + $third_minus_offset, + $unread_handle_third, + $fourth, + $fourth_minus_offset, + $unread_handle_fourth, + $source, + $handle, + $outermost, + decode_to_utf16_raw, + u16, + Utf16Destination + ); + }; +} + +macro_rules! euc_jp_decoder_function { + ($jis0802_trail_body:block, + $jis0812_lead_body:block, + $jis0812_trail_body:block, + $half_width_katakana_body:block, + $slf:ident, + $non_ascii:ident, + $jis0208_lead_minus_offset:ident, + $byte:ident, + $unread_handle_trail:ident, + $jis0212_lead_minus_offset:ident, + $lead:ident, + $unread_handle_jis0212:ident, + $source:ident, + $handle:ident, + $name:ident, + $code_unit:ty, + $dest_struct:ident) => ( + #[cfg_attr(feature = "cargo-clippy", allow(never_loop))] + pub fn $name(&mut $slf, + src: &[u8], + dst: &mut [$code_unit], + last: bool) + -> (DecoderResult, usize, usize) { + let mut $source = ByteSource::new(src); + let mut dest = $dest_struct::new(dst); + while !$slf.pending.is_none() { + match $source.check_available() { + Space::Full(src_consumed) => { + if last { + // Start non-boilerplate + let count = $slf.pending.count(); + $slf.pending = EucJpPending::None; + return (DecoderResult::Malformed(count as u8, 0), + src_consumed, + dest.written()); + // End non-boilerplate + } + return (DecoderResult::InputEmpty, src_consumed, dest.written()); + } + Space::Available(source_handle) => { + match dest.check_space_bmp() { + Space::Full(dst_written) => { + return (DecoderResult::OutputFull, + source_handle.consumed(), + dst_written); + } + Space::Available($handle) => { + let ($byte, $unread_handle_trail) = source_handle.read(); + match $slf.pending { + EucJpPending::Jis0208Lead($jis0208_lead_minus_offset) => { + $slf.pending = EucJpPending::None; + // Start non-boilerplate + $jis0802_trail_body + // End non-boilerplate + } + EucJpPending::Jis0212Shift => { + $slf.pending = EucJpPending::None; + let $lead = $byte; + let $unread_handle_jis0212 = $unread_handle_trail; + let $jis0212_lead_minus_offset = { + // Start non-boilerplate + $jis0812_lead_body + // End non-boilerplate + }; + $slf.pending = + EucJpPending::Jis0212Lead($jis0212_lead_minus_offset); + $handle.commit() + } + EucJpPending::Jis0212Lead($jis0212_lead_minus_offset) => { + $slf.pending = EucJpPending::None; + // Start non-boilerplate + $jis0812_trail_body + // End non-boilerplate + } + EucJpPending::HalfWidthKatakana => { + $slf.pending = EucJpPending::None; + // Start non-boilerplate + $half_width_katakana_body + // End non-boilerplate + } + EucJpPending::None => unreachable!("Checked in loop condition"), + }; + } + } + } + } + } + 'outermost: loop { + match dest.copy_ascii_from_check_space_bmp(&mut $source) { + CopyAsciiResult::Stop(ret) => return ret, + CopyAsciiResult::GoOn((mut $non_ascii, mut $handle)) => { + 'middle: loop { + let dest_again = { + // If lead is between 0xA1 and 0xFE, inclusive, + // subtract 0xA1. Else if lead is 0x8E, handle the + // next byte as half-width Katakana. Else if lead is + // 0x8F, expect JIS 0212. + let $jis0208_lead_minus_offset = $non_ascii.wrapping_sub(0xA1); + if $jis0208_lead_minus_offset <= (0xFE - 0xA1) { + // JIS 0208 + match $source.check_available() { + Space::Full(src_consumed_trail) => { + if last { + return (DecoderResult::Malformed(1, 0), + src_consumed_trail, + $handle.written()); + } + $slf.pending = + EucJpPending::Jis0208Lead($jis0208_lead_minus_offset); + return (DecoderResult::InputEmpty, + src_consumed_trail, + $handle.written()); + } + Space::Available(source_handle_trail) => { + let ($byte, $unread_handle_trail) = + source_handle_trail.read(); + // Start non-boilerplate + $jis0802_trail_body + // End non-boilerplate + } + } + } else if $non_ascii == 0x8F { + match $source.check_available() { + Space::Full(src_consumed_jis0212) => { + if last { + return (DecoderResult::Malformed(1, 0), + src_consumed_jis0212, + $handle.written()); + } + $slf.pending = EucJpPending::Jis0212Shift; + return (DecoderResult::InputEmpty, + src_consumed_jis0212, + $handle.written()); + } + Space::Available(source_handle_jis0212) => { + let ($lead, $unread_handle_jis0212) = + source_handle_jis0212.read(); + let $jis0212_lead_minus_offset = { + // Start non-boilerplate + $jis0812_lead_body + // End non-boilerplate + }; + match $unread_handle_jis0212.commit().check_available() { + Space::Full(src_consumed_trail) => { + if last { + return (DecoderResult::Malformed(2, 0), + src_consumed_trail, + $handle.written()); + } + $slf.pending = EucJpPending::Jis0212Lead($jis0212_lead_minus_offset); + return (DecoderResult::InputEmpty, + src_consumed_trail, + $handle.written()); + } + Space::Available(source_handle_trail) => { + let ($byte, $unread_handle_trail) = + source_handle_trail.read(); + // Start non-boilerplate + $jis0812_trail_body + // End non-boilerplate + } + } + } + } + } else if $non_ascii == 0x8E { + match $source.check_available() { + Space::Full(src_consumed_trail) => { + if last { + return (DecoderResult::Malformed(1, 0), + src_consumed_trail, + $handle.written()); + } + $slf.pending = EucJpPending::HalfWidthKatakana; + return (DecoderResult::InputEmpty, + src_consumed_trail, + $handle.written()); + } + Space::Available(source_handle_trail) => { + let ($byte, $unread_handle_trail) = + source_handle_trail.read(); + // Start non-boilerplate + $half_width_katakana_body + // End non-boilerplate + } + } + } else { + return (DecoderResult::Malformed(1, 0), + $source.consumed(), + $handle.written()); + } + }; + match $source.check_available() { + Space::Full(src_consumed) => { + return (DecoderResult::InputEmpty, + src_consumed, + dest_again.written()); + } + Space::Available(source_handle) => { + match dest_again.check_space_bmp() { + Space::Full(dst_written) => { + return (DecoderResult::OutputFull, + source_handle.consumed(), + dst_written); + } + Space::Available(destination_handle) => { + let (b, _) = source_handle.read(); + loop { + if b > 127 { + $non_ascii = b; + $handle = destination_handle; + continue 'middle; + } + // Testing on Haswell says that we should write the + // byte unconditionally instead of trying to unread it + // to make it part of the next SIMD stride. + destination_handle.write_ascii(b); + // We've got markup or ASCII text + continue 'outermost; + } + } + } + } + } + } + } + } + } + }); +} + +macro_rules! euc_jp_decoder_functions { + ( + $jis0802_trail_body:block, + $jis0812_lead_body:block, + $jis0812_trail_body:block, + $half_width_katakana_body:block, + $slf:ident, + $non_ascii:ident, + $jis0208_lead_minus_offset:ident, + $byte:ident, + $unread_handle_trail:ident, + $jis0212_lead_minus_offset:ident, + $lead:ident, + $unread_handle_jis0212:ident, + $source:ident, + $handle:ident + ) => { + euc_jp_decoder_function!( + $jis0802_trail_body, + $jis0812_lead_body, + $jis0812_trail_body, + $half_width_katakana_body, + $slf, + $non_ascii, + $jis0208_lead_minus_offset, + $byte, + $unread_handle_trail, + $jis0212_lead_minus_offset, + $lead, + $unread_handle_jis0212, + $source, + $handle, + decode_to_utf8_raw, + u8, + Utf8Destination + ); + euc_jp_decoder_function!( + $jis0802_trail_body, + $jis0812_lead_body, + $jis0812_trail_body, + $half_width_katakana_body, + $slf, + $non_ascii, + $jis0208_lead_minus_offset, + $byte, + $unread_handle_trail, + $jis0212_lead_minus_offset, + $lead, + $unread_handle_jis0212, + $source, + $handle, + decode_to_utf16_raw, + u16, + Utf16Destination + ); + }; +} + +macro_rules! encoder_function { + ($eof:block, + $body:block, + $slf:ident, + $src_consumed:ident, + $source:ident, + $dest:ident, + $c:ident, + $destination_handle:ident, + $unread_handle:ident, + $destination_check:ident, + $name:ident, + $input:ty, + $source_struct:ident) => ( + pub fn $name(&mut $slf, + src: &$input, + dst: &mut [u8], + last: bool) + -> (EncoderResult, usize, usize) { + let mut $source = $source_struct::new(src); + let mut $dest = ByteDestination::new(dst); + loop { + match $source.check_available() { + Space::Full($src_consumed) => { + if last { + // Start non-boilerplate + $eof + // End non-boilerplate + } + return (EncoderResult::InputEmpty, $src_consumed, $dest.written()); + } + Space::Available(source_handle) => { + match $dest.$destination_check() { + Space::Full(dst_written) => { + return (EncoderResult::OutputFull, + source_handle.consumed(), + dst_written); + } + Space::Available($destination_handle) => { + let ($c, $unread_handle) = source_handle.read(); + // Start non-boilerplate + $body + // End non-boilerplate + } + } + } + } + } + }); +} + +macro_rules! encoder_functions { + ( + $eof:block, + $body:block, + $slf:ident, + $src_consumed:ident, + $source:ident, + $dest:ident, + $c:ident, + $destination_handle:ident, + $unread_handle:ident, + $destination_check:ident + ) => { + encoder_function!( + $eof, + $body, + $slf, + $src_consumed, + $source, + $dest, + $c, + $destination_handle, + $unread_handle, + $destination_check, + encode_from_utf8_raw, + str, + Utf8Source + ); + encoder_function!( + $eof, + $body, + $slf, + $src_consumed, + $source, + $dest, + $c, + $destination_handle, + $unread_handle, + $destination_check, + encode_from_utf16_raw, + [u16], + Utf16Source + ); + }; +} + +macro_rules! ascii_compatible_encoder_function { + ($bmp_body:block, + $astral_body:block, + $bmp:ident, + $astral:ident, + $slf:ident, + $source:ident, + $handle:ident, + $copy_ascii:ident, + $destination_check:ident, + $name:ident, + $input:ty, + $source_struct:ident, + $ascii_punctuation:expr) => ( + pub fn $name(&mut $slf, + src: &$input, + dst: &mut [u8], + _last: bool) + -> (EncoderResult, usize, usize) { + let mut $source = $source_struct::new(src); + let mut dest = ByteDestination::new(dst); + 'outermost: loop { + match $source.$copy_ascii(&mut dest) { + CopyAsciiResult::Stop(ret) => return ret, + CopyAsciiResult::GoOn((mut non_ascii, mut $handle)) => { + 'middle: loop { + let dest_again = match non_ascii { + NonAscii::BmpExclAscii($bmp) => { + // Start non-boilerplate + $bmp_body + // End non-boilerplate + } + NonAscii::Astral($astral) => { + // Start non-boilerplate + $astral_body + // End non-boilerplate + } + }; + match $source.check_available() { + Space::Full(src_consumed) => { + return (EncoderResult::InputEmpty, + src_consumed, + dest_again.written()); + } + Space::Available(source_handle) => { + match dest_again.$destination_check() { + Space::Full(dst_written) => { + return (EncoderResult::OutputFull, + source_handle.consumed(), + dst_written); + } + Space::Available(mut destination_handle) => { + let (mut c, unread_handle) = source_handle.read_enum(); + let source_again = unread_handle.commit(); + 'innermost: loop { + let ascii = match c { + Unicode::NonAscii(non_ascii_again) => { + non_ascii = non_ascii_again; + $handle = destination_handle; + continue 'middle; + } + Unicode::Ascii(a) => a, + }; + // Testing on Haswell says that we should write the + // byte unconditionally instead of trying to unread it + // to make it part of the next SIMD stride. + let dest_again_again = + destination_handle.write_one(ascii); + if $ascii_punctuation && ascii < 60 { + // We've got punctuation + match source_again.check_available() { + Space::Full(src_consumed_again) => { + return (EncoderResult::InputEmpty, + src_consumed_again, + dest_again_again.written()); + } + Space::Available(source_handle_again) => { + match dest_again_again.$destination_check() { + Space::Full(dst_written_again) => { + return (EncoderResult::OutputFull, + source_handle_again.consumed(), + dst_written_again); + } + Space::Available(destination_handle_again) => { + { + let (c_again, _unread_handle_again) = + source_handle_again.read_enum(); + c = c_again; + destination_handle = destination_handle_again; + continue 'innermost; + } + } + } + } + } + } + // We've got markup or ASCII text + continue 'outermost; + } + } + } + } + } + } + } + } + } + }); +} + +macro_rules! ascii_compatible_encoder_functions { + ( + $bmp_body:block, + $astral_body:block, + $bmp:ident, + $astral:ident, + $slf:ident, + $source:ident, + $handle:ident, + $copy_ascii:ident, + $destination_check:ident, + $ascii_punctuation:expr + ) => { + ascii_compatible_encoder_function!( + $bmp_body, + $astral_body, + $bmp, + $astral, + $slf, + $source, + $handle, + $copy_ascii, + $destination_check, + encode_from_utf8_raw, + str, + Utf8Source, + $ascii_punctuation + ); + ascii_compatible_encoder_function!( + $bmp_body, + $astral_body, + $bmp, + $astral, + $slf, + $source, + $handle, + $copy_ascii, + $destination_check, + encode_from_utf16_raw, + [u16], + Utf16Source, + $ascii_punctuation + ); + }; +} + +macro_rules! ascii_compatible_bmp_encoder_function { + ( + $bmp_body:block, + $bmp:ident, + $slf:ident, + $source:ident, + $handle:ident, + $copy_ascii:ident, + $destination_check:ident, + $name:ident, + $input:ty, + $source_struct:ident, + $ascii_punctuation:expr + ) => { + ascii_compatible_encoder_function!( + $bmp_body, + { + return ( + EncoderResult::Unmappable(astral), + $source.consumed(), + $handle.written(), + ); + }, + $bmp, + astral, + $slf, + $source, + $handle, + $copy_ascii, + $destination_check, + $name, + $input, + $source_struct, + $ascii_punctuation + ); + }; +} + +macro_rules! ascii_compatible_bmp_encoder_functions { + ( + $bmp_body:block, + $bmp:ident, + $slf:ident, + $source:ident, + $handle:ident, + $copy_ascii:ident, + $destination_check:ident, + $ascii_punctuation:expr + ) => { + ascii_compatible_encoder_functions!( + $bmp_body, + { + return ( + EncoderResult::Unmappable(astral), + $source.consumed(), + $handle.written(), + ); + }, + $bmp, + astral, + $slf, + $source, + $handle, + $copy_ascii, + $destination_check, + $ascii_punctuation + ); + }; +} + +macro_rules! public_decode_function{ + ($(#[$meta:meta])*, + $decode_to_utf:ident, + $decode_to_utf_raw:ident, + $decode_to_utf_checking_end:ident, + $decode_to_utf_after_one_potential_bom_byte:ident, + $decode_to_utf_after_two_potential_bom_bytes:ident, + $decode_to_utf_checking_end_with_offset:ident, + $code_unit:ty) => ( + $(#[$meta])* + pub fn $decode_to_utf(&mut self, + src: &[u8], + dst: &mut [$code_unit], + last: bool) + -> (DecoderResult, usize, usize) { + let mut offset = 0usize; + loop { + match self.life_cycle { + // The common case. (Post-sniffing.) + DecoderLifeCycle::Converting => { + return self.$decode_to_utf_checking_end(src, dst, last); + } + // The rest is all BOM sniffing! + DecoderLifeCycle::AtStart => { + debug_assert_eq!(offset, 0usize); + if src.is_empty() { + return (DecoderResult::InputEmpty, 0, 0); + } + match src[0] { + 0xEFu8 => { + self.life_cycle = DecoderLifeCycle::SeenUtf8First; + offset += 1; + continue; + } + 0xFEu8 => { + self.life_cycle = DecoderLifeCycle::SeenUtf16BeFirst; + offset += 1; + continue; + } + 0xFFu8 => { + self.life_cycle = DecoderLifeCycle::SeenUtf16LeFirst; + offset += 1; + continue; + } + _ => { + self.life_cycle = DecoderLifeCycle::Converting; + continue; + } + } + } + DecoderLifeCycle::AtUtf8Start => { + debug_assert_eq!(offset, 0usize); + if src.is_empty() { + return (DecoderResult::InputEmpty, 0, 0); + } + match src[0] { + 0xEFu8 => { + self.life_cycle = DecoderLifeCycle::SeenUtf8First; + offset += 1; + continue; + } + _ => { + self.life_cycle = DecoderLifeCycle::Converting; + continue; + } + } + } + DecoderLifeCycle::AtUtf16BeStart => { + debug_assert_eq!(offset, 0usize); + if src.is_empty() { + return (DecoderResult::InputEmpty, 0, 0); + } + match src[0] { + 0xFEu8 => { + self.life_cycle = DecoderLifeCycle::SeenUtf16BeFirst; + offset += 1; + continue; + } + _ => { + self.life_cycle = DecoderLifeCycle::Converting; + continue; + } + } + } + DecoderLifeCycle::AtUtf16LeStart => { + debug_assert_eq!(offset, 0usize); + if src.is_empty() { + return (DecoderResult::InputEmpty, 0, 0); + } + match src[0] { + 0xFFu8 => { + self.life_cycle = DecoderLifeCycle::SeenUtf16LeFirst; + offset += 1; + continue; + } + _ => { + self.life_cycle = DecoderLifeCycle::Converting; + continue; + } + } + } + DecoderLifeCycle::SeenUtf8First => { + if offset >= src.len() { + if last { + return self.$decode_to_utf_after_one_potential_bom_byte(src, + dst, + last, + offset, + 0xEFu8); + } + return (DecoderResult::InputEmpty, offset, 0); + } + if src[offset] == 0xBBu8 { + self.life_cycle = DecoderLifeCycle::SeenUtf8Second; + offset += 1; + continue; + } + return self.$decode_to_utf_after_one_potential_bom_byte(src, + dst, + last, + offset, + 0xEFu8); + } + DecoderLifeCycle::SeenUtf8Second => { + if offset >= src.len() { + if last { + return self.$decode_to_utf_after_two_potential_bom_bytes(src, + dst, + last, + offset); + } + return (DecoderResult::InputEmpty, offset, 0); + } + if src[offset] == 0xBFu8 { + self.life_cycle = DecoderLifeCycle::Converting; + offset += 1; + if self.encoding != UTF_8 { + self.encoding = UTF_8; + self.variant = UTF_8.new_variant_decoder(); + } + return self.$decode_to_utf_checking_end_with_offset(src, + dst, + last, + offset); + } + return self.$decode_to_utf_after_two_potential_bom_bytes(src, + dst, + last, + offset); + } + DecoderLifeCycle::SeenUtf16BeFirst => { + if offset >= src.len() { + if last { + return self.$decode_to_utf_after_one_potential_bom_byte(src, + dst, + last, + offset, + 0xFEu8); + } + return (DecoderResult::InputEmpty, offset, 0); + } + if src[offset] == 0xFFu8 { + self.life_cycle = DecoderLifeCycle::Converting; + offset += 1; + if self.encoding != UTF_16BE { + self.encoding = UTF_16BE; + self.variant = UTF_16BE.new_variant_decoder(); + } + return self.$decode_to_utf_checking_end_with_offset(src, + dst, + last, + offset); + } + return self.$decode_to_utf_after_one_potential_bom_byte(src, + dst, + last, + offset, + 0xFEu8); + } + DecoderLifeCycle::SeenUtf16LeFirst => { + if offset >= src.len() { + if last { + return self.$decode_to_utf_after_one_potential_bom_byte(src, + dst, + last, + offset, + 0xFFu8); + } + return (DecoderResult::InputEmpty, offset, 0); + } + if src[offset] == 0xFEu8 { + self.life_cycle = DecoderLifeCycle::Converting; + offset += 1; + if self.encoding != UTF_16LE { + self.encoding = UTF_16LE; + self.variant = UTF_16LE.new_variant_decoder(); + } + return self.$decode_to_utf_checking_end_with_offset(src, + dst, + last, + offset); + } + return self.$decode_to_utf_after_one_potential_bom_byte(src, + dst, + last, + offset, + 0xFFu8); + } + DecoderLifeCycle::ConvertingWithPendingBB => { + debug_assert_eq!(offset, 0usize); + return self.$decode_to_utf_after_one_potential_bom_byte(src, + dst, + last, + 0usize, + 0xBBu8); + } + DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."), + } + } + } + + fn $decode_to_utf_after_one_potential_bom_byte(&mut self, + src: &[u8], + dst: &mut [$code_unit], + last: bool, + offset: usize, + first_byte: u8) + -> (DecoderResult, usize, usize) { + self.life_cycle = DecoderLifeCycle::Converting; + if offset == 0usize { + // First byte was seen previously. + let first = [first_byte]; + let mut out_read = 0usize; + let (mut first_result, _, mut first_written) = + self.variant + .$decode_to_utf_raw(&first[..], dst, false); + match first_result { + DecoderResult::InputEmpty => { + let (result, read, written) = + self.$decode_to_utf_checking_end(src, &mut dst[first_written..], last); + first_result = result; + out_read = read; // Overwrite, don't add! + first_written += written; + } + DecoderResult::Malformed(_, _) => { + // Wasn't read from `src`!, leave out_read to 0 + } + DecoderResult::OutputFull => { + panic!("Output buffer must have been too small."); + } + } + return (first_result, out_read, first_written); + } + debug_assert_eq!(offset, 1usize); + // The first byte is in `src`, so no need to push it separately. + self.$decode_to_utf_checking_end(src, dst, last) + } + + fn $decode_to_utf_after_two_potential_bom_bytes(&mut self, + src: &[u8], + dst: &mut [$code_unit], + last: bool, + offset: usize) + -> (DecoderResult, usize, usize) { + self.life_cycle = DecoderLifeCycle::Converting; + if offset == 0usize { + // The first two bytes are not in the current buffer.. + let ef_bb = [0xEFu8, 0xBBu8]; + let (mut first_result, mut first_read, mut first_written) = + self.variant + .$decode_to_utf_raw(&ef_bb[..], dst, false); + match first_result { + DecoderResult::InputEmpty => { + let (result, read, written) = + self.$decode_to_utf_checking_end(src, &mut dst[first_written..], last); + first_result = result; + first_read = read; // Overwrite, don't add! + first_written += written; + } + DecoderResult::Malformed(_, _) => { + if first_read == 1usize { + // The first byte was malformed. We need to handle + // the second one, which isn't in `src`, later. + self.life_cycle = DecoderLifeCycle::ConvertingWithPendingBB; + } + first_read = 0usize; // Wasn't read from `src`! + } + DecoderResult::OutputFull => { + panic!("Output buffer must have been too small."); + } + } + return (first_result, first_read, first_written); + } + if offset == 1usize { + // The first byte isn't in the current buffer but the second one + // is. + return self.$decode_to_utf_after_one_potential_bom_byte(src, + dst, + last, + 0usize, + 0xEFu8); + + } + debug_assert_eq!(offset, 2usize); + // The first two bytes are in `src`, so no need to push them separately. + self.$decode_to_utf_checking_end(src, dst, last) + } + + /// Calls `$decode_to_utf_checking_end` with `offset` bytes omitted from + /// the start of `src` but adjusting the return values to show those bytes + /// as having been consumed. + fn $decode_to_utf_checking_end_with_offset(&mut self, + src: &[u8], + dst: &mut [$code_unit], + last: bool, + offset: usize) + -> (DecoderResult, usize, usize) { + debug_assert_eq!(self.life_cycle, DecoderLifeCycle::Converting); + let (result, read, written) = self.$decode_to_utf_checking_end(&src[offset..], dst, last); + (result, read + offset, written) + } + + /// Calls through to the delegate and adjusts life cycle iff `last` is + /// `true` and result is `DecoderResult::InputEmpty`. + fn $decode_to_utf_checking_end(&mut self, + src: &[u8], + dst: &mut [$code_unit], + last: bool) + -> (DecoderResult, usize, usize) { + debug_assert_eq!(self.life_cycle, DecoderLifeCycle::Converting); + let (result, read, written) = self.variant + .$decode_to_utf_raw(src, dst, last); + if last { + if let DecoderResult::InputEmpty = result { + self.life_cycle = DecoderLifeCycle::Finished; + } + } + (result, read, written) + }); +} |