/*! Utilities for working with I/O using byte strings. This module currently only exports a single trait, `BufReadExt`, which provides facilities for conveniently and efficiently working with lines as byte strings. More APIs may be added in the future. */ use std::io; use crate::ext_slice::ByteSlice; use crate::ext_vec::ByteVec; /// An extention trait for /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html) /// which provides convenience APIs for dealing with byte strings. pub trait BufReadExt: io::BufRead { /// Returns an iterator over the lines of this reader, where each line /// is represented as a byte string. /// /// Each item yielded by this iterator is a `io::Result>`, where /// an error is yielded if there was a problem reading from the underlying /// reader. /// /// On success, the next line in the iterator is returned. The line does /// *not* contain a trailing `\n` or `\r\n`. /// /// # Examples /// /// Basic usage: /// /// ``` /// use std::io; /// /// use bstr::io::BufReadExt; /// /// # fn example() -> Result<(), io::Error> { /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor"); /// /// let mut lines = vec![]; /// for result in cursor.byte_lines() { /// let line = result?; /// lines.push(line); /// } /// assert_eq!(lines.len(), 3); /// assert_eq!(lines[0], "lorem".as_bytes()); /// assert_eq!(lines[1], "ipsum".as_bytes()); /// assert_eq!(lines[2], "dolor".as_bytes()); /// # Ok(()) }; example().unwrap() /// ``` fn byte_lines(self) -> ByteLines where Self: Sized, { ByteLines { buf: self } } /// Returns an iterator over byte-terminated records of this reader, where /// each record is represented as a byte string. /// /// Each item yielded by this iterator is a `io::Result>`, where /// an error is yielded if there was a problem reading from the underlying /// reader. /// /// On success, the next record in the iterator is returned. The record /// does *not* contain its trailing terminator. /// /// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in /// that it has no special handling for `\r`. /// /// # Examples /// /// Basic usage: /// /// ``` /// use std::io; /// /// use bstr::io::BufReadExt; /// /// # fn example() -> Result<(), io::Error> { /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor"); /// /// let mut records = vec![]; /// for result in cursor.byte_records(b'\x00') { /// let record = result?; /// records.push(record); /// } /// assert_eq!(records.len(), 3); /// assert_eq!(records[0], "lorem".as_bytes()); /// assert_eq!(records[1], "ipsum".as_bytes()); /// assert_eq!(records[2], "dolor".as_bytes()); /// # Ok(()) }; example().unwrap() /// ``` fn byte_records(self, terminator: u8) -> ByteRecords where Self: Sized, { ByteRecords { terminator, buf: self } } /// Executes the given closure on each line in the underlying reader. /// /// If the closure returns an error (or if the underlying reader returns an /// error), then iteration is stopped and the error is returned. If false /// is returned, then iteration is stopped and no error is returned. /// /// The closure given is called on exactly the same values as yielded by /// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines) /// iterator. Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes. /// /// This routine is useful for iterating over lines as quickly as /// possible. Namely, a single allocation is reused for each line. /// /// # Examples /// /// Basic usage: /// /// ``` /// use std::io; /// /// use bstr::io::BufReadExt; /// /// # fn example() -> Result<(), io::Error> { /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor"); /// /// let mut lines = vec![]; /// cursor.for_byte_line(|line| { /// lines.push(line.to_vec()); /// Ok(true) /// })?; /// assert_eq!(lines.len(), 3); /// assert_eq!(lines[0], "lorem".as_bytes()); /// assert_eq!(lines[1], "ipsum".as_bytes()); /// assert_eq!(lines[2], "dolor".as_bytes()); /// # Ok(()) }; example().unwrap() /// ``` fn for_byte_line(self, mut for_each_line: F) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result, { self.for_byte_line_with_terminator(|line| { for_each_line(&trim_line_slice(&line)) }) } /// Executes the given closure on each byte-terminated record in the /// underlying reader. /// /// If the closure returns an error (or if the underlying reader returns an /// error), then iteration is stopped and the error is returned. If false /// is returned, then iteration is stopped and no error is returned. /// /// The closure given is called on exactly the same values as yielded by /// the [`byte_records`](trait.BufReadExt.html#method.byte_records) /// iterator. Namely, records do _not_ contain a trailing terminator byte. /// /// This routine is useful for iterating over records as quickly as /// possible. Namely, a single allocation is reused for each record. /// /// # Examples /// /// Basic usage: /// /// ``` /// use std::io; /// /// use bstr::io::BufReadExt; /// /// # fn example() -> Result<(), io::Error> { /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor"); /// /// let mut records = vec![]; /// cursor.for_byte_record(b'\x00', |record| { /// records.push(record.to_vec()); /// Ok(true) /// })?; /// assert_eq!(records.len(), 3); /// assert_eq!(records[0], "lorem".as_bytes()); /// assert_eq!(records[1], "ipsum".as_bytes()); /// assert_eq!(records[2], "dolor".as_bytes()); /// # Ok(()) }; example().unwrap() /// ``` fn for_byte_record( self, terminator: u8, mut for_each_record: F, ) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result, { self.for_byte_record_with_terminator(terminator, |chunk| { for_each_record(&trim_record_slice(&chunk, terminator)) }) } /// Executes the given closure on each line in the underlying reader. /// /// If the closure returns an error (or if the underlying reader returns an /// error), then iteration is stopped and the error is returned. If false /// is returned, then iteration is stopped and no error is returned. /// /// Unlike /// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line), /// the lines given to the closure *do* include the line terminator, if one /// exists. /// /// This routine is useful for iterating over lines as quickly as /// possible. Namely, a single allocation is reused for each line. /// /// This is identical to `for_byte_record_with_terminator` with a /// terminator of `\n`. /// /// # Examples /// /// Basic usage: /// /// ``` /// use std::io; /// /// use bstr::io::BufReadExt; /// /// # fn example() -> Result<(), io::Error> { /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor"); /// /// let mut lines = vec![]; /// cursor.for_byte_line_with_terminator(|line| { /// lines.push(line.to_vec()); /// Ok(true) /// })?; /// assert_eq!(lines.len(), 3); /// assert_eq!(lines[0], "lorem\n".as_bytes()); /// assert_eq!(lines[1], "ipsum\r\n".as_bytes()); /// assert_eq!(lines[2], "dolor".as_bytes()); /// # Ok(()) }; example().unwrap() /// ``` fn for_byte_line_with_terminator( self, for_each_line: F, ) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result, { self.for_byte_record_with_terminator(b'\n', for_each_line) } /// Executes the given closure on each byte-terminated record in the /// underlying reader. /// /// If the closure returns an error (or if the underlying reader returns an /// error), then iteration is stopped and the error is returned. If false /// is returned, then iteration is stopped and no error is returned. /// /// Unlike /// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record), /// the lines given to the closure *do* include the record terminator, if /// one exists. /// /// This routine is useful for iterating over records as quickly as /// possible. Namely, a single allocation is reused for each record. /// /// # Examples /// /// Basic usage: /// /// ``` /// use std::io; /// /// use bstr::B; /// use bstr::io::BufReadExt; /// /// # fn example() -> Result<(), io::Error> { /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor"); /// /// let mut records = vec![]; /// cursor.for_byte_record_with_terminator(b'\x00', |record| { /// records.push(record.to_vec()); /// Ok(true) /// })?; /// assert_eq!(records.len(), 3); /// assert_eq!(records[0], B(b"lorem\x00")); /// assert_eq!(records[1], B("ipsum\x00")); /// assert_eq!(records[2], B("dolor")); /// # Ok(()) }; example().unwrap() /// ``` fn for_byte_record_with_terminator( mut self, terminator: u8, mut for_each_record: F, ) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result, { let mut bytes = vec![]; let mut res = Ok(()); let mut consumed = 0; 'outer: loop { // Lend out complete record slices from our buffer { let mut buf = self.fill_buf()?; while let Some(index) = buf.find_byte(terminator) { let (record, rest) = buf.split_at(index + 1); buf = rest; consumed += record.len(); match for_each_record(&record) { Ok(false) => break 'outer, Err(err) => { res = Err(err); break 'outer; } _ => (), } } // Copy the final record fragment to our local buffer. This // saves read_until() from re-scanning a buffer we know // contains no remaining terminators. bytes.extend_from_slice(&buf); consumed += buf.len(); } self.consume(consumed); consumed = 0; // N.B. read_until uses a different version of memchr that may // be slower than the memchr crate that bstr uses. However, this // should only run for a fairly small number of records, assuming a // decent buffer size. self.read_until(terminator, &mut bytes)?; if bytes.is_empty() || !for_each_record(&bytes)? { break; } bytes.clear(); } self.consume(consumed); res } } impl BufReadExt for B {} /// An iterator over lines from an instance of /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html). /// /// This iterator is generally created by calling the /// [`byte_lines`](trait.BufReadExt.html#method.byte_lines) /// method on the /// [`BufReadExt`](trait.BufReadExt.html) /// trait. #[derive(Debug)] pub struct ByteLines { buf: B, } /// An iterator over records from an instance of /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html). /// /// A byte record is any sequence of bytes terminated by a particular byte /// chosen by the caller. For example, NUL separated byte strings are said to /// be NUL-terminated byte records. /// /// This iterator is generally created by calling the /// [`byte_records`](trait.BufReadExt.html#method.byte_records) /// method on the /// [`BufReadExt`](trait.BufReadExt.html) /// trait. #[derive(Debug)] pub struct ByteRecords { buf: B, terminator: u8, } impl Iterator for ByteLines { type Item = io::Result>; fn next(&mut self) -> Option>> { let mut bytes = vec![]; match self.buf.read_until(b'\n', &mut bytes) { Err(e) => Some(Err(e)), Ok(0) => None, Ok(_) => { trim_line(&mut bytes); Some(Ok(bytes)) } } } } impl Iterator for ByteRecords { type Item = io::Result>; fn next(&mut self) -> Option>> { let mut bytes = vec![]; match self.buf.read_until(self.terminator, &mut bytes) { Err(e) => Some(Err(e)), Ok(0) => None, Ok(_) => { trim_record(&mut bytes, self.terminator); Some(Ok(bytes)) } } } } fn trim_line(line: &mut Vec) { if line.last_byte() == Some(b'\n') { line.pop_byte(); if line.last_byte() == Some(b'\r') { line.pop_byte(); } } } fn trim_line_slice(mut line: &[u8]) -> &[u8] { if line.last_byte() == Some(b'\n') { line = &line[..line.len() - 1]; if line.last_byte() == Some(b'\r') { line = &line[..line.len() - 1]; } } line } fn trim_record(record: &mut Vec, terminator: u8) { if record.last_byte() == Some(terminator) { record.pop_byte(); } } fn trim_record_slice(mut record: &[u8], terminator: u8) -> &[u8] { if record.last_byte() == Some(terminator) { record = &record[..record.len() - 1]; } record } #[cfg(test)] mod tests { use super::BufReadExt; use crate::bstring::BString; fn collect_lines>(slice: B) -> Vec { let mut lines = vec![]; slice .as_ref() .for_byte_line(|line| { lines.push(BString::from(line.to_vec())); Ok(true) }) .unwrap(); lines } fn collect_lines_term>(slice: B) -> Vec { let mut lines = vec![]; slice .as_ref() .for_byte_line_with_terminator(|line| { lines.push(BString::from(line.to_vec())); Ok(true) }) .unwrap(); lines } #[test] fn lines_without_terminator() { assert_eq!(collect_lines(""), Vec::::new()); assert_eq!(collect_lines("\n"), vec![""]); assert_eq!(collect_lines("\n\n"), vec!["", ""]); assert_eq!(collect_lines("a\nb\n"), vec!["a", "b"]); assert_eq!(collect_lines("a\nb"), vec!["a", "b"]); assert_eq!(collect_lines("abc\nxyz\n"), vec!["abc", "xyz"]); assert_eq!(collect_lines("abc\nxyz"), vec!["abc", "xyz"]); assert_eq!(collect_lines("\r\n"), vec![""]); assert_eq!(collect_lines("\r\n\r\n"), vec!["", ""]); assert_eq!(collect_lines("a\r\nb\r\n"), vec!["a", "b"]); assert_eq!(collect_lines("a\r\nb"), vec!["a", "b"]); assert_eq!(collect_lines("abc\r\nxyz\r\n"), vec!["abc", "xyz"]); assert_eq!(collect_lines("abc\r\nxyz"), vec!["abc", "xyz"]); assert_eq!(collect_lines("abc\rxyz"), vec!["abc\rxyz"]); } #[test] fn lines_with_terminator() { assert_eq!(collect_lines_term(""), Vec::::new()); assert_eq!(collect_lines_term("\n"), vec!["\n"]); assert_eq!(collect_lines_term("\n\n"), vec!["\n", "\n"]); assert_eq!(collect_lines_term("a\nb\n"), vec!["a\n", "b\n"]); assert_eq!(collect_lines_term("a\nb"), vec!["a\n", "b"]); assert_eq!(collect_lines_term("abc\nxyz\n"), vec!["abc\n", "xyz\n"]); assert_eq!(collect_lines_term("abc\nxyz"), vec!["abc\n", "xyz"]); assert_eq!(collect_lines_term("\r\n"), vec!["\r\n"]); assert_eq!(collect_lines_term("\r\n\r\n"), vec!["\r\n", "\r\n"]); assert_eq!(collect_lines_term("a\r\nb\r\n"), vec!["a\r\n", "b\r\n"]); assert_eq!(collect_lines_term("a\r\nb"), vec!["a\r\n", "b"]); assert_eq!( collect_lines_term("abc\r\nxyz\r\n"), vec!["abc\r\n", "xyz\r\n"] ); assert_eq!(collect_lines_term("abc\r\nxyz"), vec!["abc\r\n", "xyz"]); assert_eq!(collect_lines_term("abc\rxyz"), vec!["abc\rxyz"]); } }