mod lossy; mod read; pub use lossy::LossyDecoder; pub use read::{BufReadDecoder, BufReadDecoderError}; use std::cmp; use std::error::Error; use std::fmt; use std::str; /// The replacement character, U+FFFD. In lossy decoding, insert it for every decoding error. pub const REPLACEMENT_CHARACTER: &'static str = "\u{FFFD}"; #[derive(Debug, Copy, Clone)] pub enum DecodeError<'a> { /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`, /// then call `decode()` again with `remaining_input`. Invalid { valid_prefix: &'a str, invalid_sequence: &'a [u8], remaining_input: &'a [u8], }, /// Call the `incomplete_suffix.try_complete` method with more input when available. /// If no more input is available, this is an invalid byte sequence. Incomplete { valid_prefix: &'a str, incomplete_suffix: Incomplete, }, } impl<'a> fmt::Display for DecodeError<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { DecodeError::Invalid { valid_prefix, invalid_sequence, remaining_input, } => write!( f, "found invalid byte sequence {invalid_sequence:02x?} after \ {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \ unprocessed bytes", invalid_sequence = invalid_sequence, valid_byte_count = valid_prefix.len(), unprocessed_byte_count = remaining_input.len() ), DecodeError::Incomplete { valid_prefix, incomplete_suffix, } => write!( f, "found incomplete byte sequence {incomplete_suffix:02x?} after \ {valid_byte_count} bytes", incomplete_suffix = incomplete_suffix, valid_byte_count = valid_prefix.len() ), } } } impl<'a> Error for DecodeError<'a> {} #[derive(Debug, Copy, Clone)] pub struct Incomplete { pub buffer: [u8; 4], pub buffer_len: u8, } pub fn decode(input: &[u8]) -> Result<&str, DecodeError> { let error = match str::from_utf8(input) { Ok(valid) => return Ok(valid), Err(error) => error, }; // FIXME: separate function from here to guide inlining? let (valid, after_valid) = input.split_at(error.valid_up_to()); let valid = unsafe { str::from_utf8_unchecked(valid) }; match error.error_len() { Some(invalid_sequence_length) => { let (invalid, rest) = after_valid.split_at(invalid_sequence_length); Err(DecodeError::Invalid { valid_prefix: valid, invalid_sequence: invalid, remaining_input: rest }) } None => { Err(DecodeError::Incomplete { valid_prefix: valid, incomplete_suffix: Incomplete::new(after_valid), }) } } } impl Incomplete { pub fn empty() -> Self { Incomplete { buffer: [0, 0, 0, 0], buffer_len: 0, } } pub fn is_empty(&self) -> bool { self.buffer_len == 0 } pub fn new(bytes: &[u8]) -> Self { let mut buffer = [0, 0, 0, 0]; let len = bytes.len(); buffer[..len].copy_from_slice(bytes); Incomplete { buffer: buffer, buffer_len: len as u8, } } /// * `None`: still incomplete, call `try_complete` again with more input. /// If no more input is available, this is invalid byte sequence. /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`. /// To keep decoding, pass `remaining_input` to `decode()`. pub fn try_complete<'input>(&mut self, input: &'input [u8]) -> Option<(Result<&str, &[u8]>, &'input [u8])> { let (consumed, opt_result) = self.try_complete_offsets(input); let result = opt_result?; let remaining_input = &input[consumed..]; let result_bytes = self.take_buffer(); let result = match result { Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }), Err(()) => Err(result_bytes), }; Some((result, remaining_input)) } fn take_buffer(&mut self) -> &[u8] { let len = self.buffer_len as usize; self.buffer_len = 0; &self.buffer[..len as usize] } /// (consumed_from_input, None): not enough input /// (consumed_from_input, Some(Err(()))): error bytes in buffer /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option>) { let initial_buffer_len = self.buffer_len as usize; let copied_from_input; { let unwritten = &mut self.buffer[initial_buffer_len..]; copied_from_input = cmp::min(unwritten.len(), input.len()); unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]); } let spliced = &self.buffer[..initial_buffer_len + copied_from_input]; match str::from_utf8(spliced) { Ok(_) => { self.buffer_len = spliced.len() as u8; (copied_from_input, Some(Ok(()))) } Err(error) => { let valid_up_to = error.valid_up_to(); if valid_up_to > 0 { let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap(); self.buffer_len = valid_up_to as u8; (consumed, Some(Ok(()))) } else { match error.error_len() { Some(invalid_sequence_length) => { let consumed = invalid_sequence_length .checked_sub(initial_buffer_len).unwrap(); self.buffer_len = invalid_sequence_length as u8; (consumed, Some(Err(()))) } None => { self.buffer_len = spliced.len() as u8; (copied_from_input, None) } } } } } } }