use std::io::{self, BufRead}; use std::error::Error; use std::fmt; use std::str; use super::*; /// Wraps a `std::io::BufRead` buffered byte stream and decode it as UTF-8. pub struct BufReadDecoder { buf_read: B, bytes_consumed: usize, incomplete: Incomplete, } #[derive(Debug)] pub enum BufReadDecoderError<'a> { /// Represents one UTF-8 error in the byte stream. /// /// In lossy decoding, each such error should be replaced with U+FFFD. /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.) InvalidByteSequence(&'a [u8]), /// An I/O error from the underlying byte stream Io(io::Error), } impl<'a> BufReadDecoderError<'a> { /// Replace UTF-8 errors with U+FFFD pub fn lossy(self) -> Result<&'static str, io::Error> { match self { BufReadDecoderError::Io(error) => Err(error), BufReadDecoderError::InvalidByteSequence(_) => Ok(REPLACEMENT_CHARACTER), } } } impl<'a> fmt::Display for BufReadDecoderError<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { BufReadDecoderError::InvalidByteSequence(bytes) => { write!(f, "invalid byte sequence: {:02x?}", bytes) } BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err), } } } impl<'a> Error for BufReadDecoderError<'a> { fn source(&self) -> Option<&(dyn Error + 'static)> { match *self { BufReadDecoderError::InvalidByteSequence(_) => None, BufReadDecoderError::Io(ref err) => Some(err), } } } impl BufReadDecoder { /// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`. pub fn read_to_string_lossy(buf_read: B) -> io::Result { let mut decoder = Self::new(buf_read); let mut string = String::new(); while let Some(result) = decoder.next_lossy() { string.push_str(result?) } Ok(string) } pub fn new(buf_read: B) -> Self { Self { buf_read, bytes_consumed: 0, incomplete: Incomplete::empty(), } } /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD. pub fn next_lossy(&mut self) -> Option> { self.next_strict().map(|result| result.or_else(|e| e.lossy())) } /// Decode and consume the next chunk of UTF-8 input. /// /// This method is intended to be called repeatedly until it returns `None`, /// which represents EOF from the underlying byte stream. /// This is similar to `Iterator::next`, /// except that decoded chunks borrow the decoder (~iterator) /// so they need to be handled or copied before the next chunk can start decoding. pub fn next_strict(&mut self) -> Option> { enum BytesSource { BufRead(usize), Incomplete, } macro_rules! try_io { ($io_result: expr) => { match $io_result { Ok(value) => value, Err(error) => return Some(Err(BufReadDecoderError::Io(error))) } } } let (source, result) = loop { if self.bytes_consumed > 0 { self.buf_read.consume(self.bytes_consumed); self.bytes_consumed = 0; } let buf = try_io!(self.buf_read.fill_buf()); // Force loop iteration to go through an explicit `continue` enum Unreachable {} let _: Unreachable = if self.incomplete.is_empty() { if buf.is_empty() { return None // EOF } match str::from_utf8(buf) { Ok(_) => { break (BytesSource::BufRead(buf.len()), Ok(())) } Err(error) => { let valid_up_to = error.valid_up_to(); if valid_up_to > 0 { break (BytesSource::BufRead(valid_up_to), Ok(())) } match error.error_len() { Some(invalid_sequence_length) => { break (BytesSource::BufRead(invalid_sequence_length), Err(())) } None => { self.bytes_consumed = buf.len(); self.incomplete = Incomplete::new(buf); // need more input bytes continue } } } } } else { if buf.is_empty() { break (BytesSource::Incomplete, Err(())) // EOF with incomplete code point } let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf); self.bytes_consumed = consumed; match opt_result { None => { // need more input bytes continue } Some(result) => { break (BytesSource::Incomplete, result) } } }; }; let bytes = match source { BytesSource::BufRead(byte_count) => { self.bytes_consumed = byte_count; let buf = try_io!(self.buf_read.fill_buf()); &buf[..byte_count] } BytesSource::Incomplete => { self.incomplete.take_buffer() } }; match result { Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })), Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))), } } }