use anstyle_parse::state::state_change; use anstyle_parse::state::Action; use anstyle_parse::state::State; /// Strip ANSI escapes from a `&str`, returning the printable content /// /// This can be used to take output from a program that includes escape sequences and write it /// somewhere that does not easily support them, such as a log file. /// /// For non-contiguous data, see [`StripStr`]. /// /// # Example /// /// ```rust /// use std::io::Write as _; /// /// let styled_text = "\x1b[32mfoo\x1b[m bar"; /// let plain_str = anstream::adapter::strip_str(&styled_text).to_string(); /// assert_eq!(plain_str, "foo bar"); /// ``` #[inline] pub fn strip_str(data: &str) -> StrippedStr<'_> { StrippedStr::new(data) } /// See [`strip_str`] #[derive(Default, Clone, Debug, PartialEq, Eq)] pub struct StrippedStr<'s> { bytes: &'s [u8], state: State, } impl<'s> StrippedStr<'s> { #[inline] fn new(data: &'s str) -> Self { Self { bytes: data.as_bytes(), state: State::Ground, } } /// Create a [`String`] of the printable content #[inline] #[allow(clippy::inherent_to_string_shadow_display)] // Single-allocation implementation pub fn to_string(&self) -> String { use std::fmt::Write as _; let mut stripped = String::with_capacity(self.bytes.len()); let _ = write!(&mut stripped, "{}", self); stripped } } impl<'s> std::fmt::Display for StrippedStr<'s> { /// **Note:** this does *not* exhaust the [`Iterator`] #[inline] fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let iter = Self { bytes: self.bytes, state: self.state, }; for printable in iter { printable.fmt(f)?; } Ok(()) } } impl<'s> Iterator for StrippedStr<'s> { type Item = &'s str; #[inline] fn next(&mut self) -> Option { next_str(&mut self.bytes, &mut self.state) } } /// Incrementally strip non-contiguous data #[derive(Default, Clone, Debug, PartialEq, Eq)] pub struct StripStr { state: State, } impl StripStr { /// Initial state pub fn new() -> Self { Default::default() } /// Strip the next segment of data pub fn strip_next<'s>(&'s mut self, data: &'s str) -> StripStrIter<'s> { StripStrIter { bytes: data.as_bytes(), state: &mut self.state, } } } /// See [`StripStr`] #[derive(Debug, PartialEq, Eq)] pub struct StripStrIter<'s> { bytes: &'s [u8], state: &'s mut State, } impl<'s> Iterator for StripStrIter<'s> { type Item = &'s str; #[inline] fn next(&mut self) -> Option { next_str(&mut self.bytes, self.state) } } #[inline] fn next_str<'s>(bytes: &mut &'s [u8], state: &mut State) -> Option<&'s str> { let offset = bytes.iter().copied().position(|b| { let (next_state, action) = state_change(*state, b); if next_state != State::Anywhere { *state = next_state; } is_printable_str(action, b) }); let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len())); *bytes = next; *state = State::Ground; let offset = bytes.iter().copied().position(|b| { let (_next_state, action) = state_change(State::Ground, b); !is_printable_str(action, b) }); let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len())); *bytes = next; if printable.is_empty() { None } else { let printable = unsafe { from_utf8_unchecked( printable, "`bytes` was validated as UTF-8, the parser preserves UTF-8 continuations", ) }; Some(printable) } } #[inline] unsafe fn from_utf8_unchecked<'b>(bytes: &'b [u8], safety_justification: &'static str) -> &'b str { if cfg!(debug_assertions) { // Catch problems more quickly when testing std::str::from_utf8(bytes).expect(safety_justification) } else { std::str::from_utf8_unchecked(bytes) } } #[inline] fn is_printable_str(action: Action, byte: u8) -> bool { // VT320 considered 0x7f to be `Print`able but we expect to be working in UTF-8 systems and not // ISO Latin-1, making it DEL and non-printable const DEL: u8 = 0x7f; (action == Action::Print && byte != DEL) || action == Action::BeginUtf8 // since we know the input is valid UTF-8, the only thing we can do with // continuations is to print them || is_utf8_continuation(byte) || (action == Action::Execute && byte.is_ascii_whitespace()) } #[inline] fn is_utf8_continuation(b: u8) -> bool { matches!(b, 0x80..=0xbf) } /// Strip ANSI escapes from bytes, returning the printable content /// /// This can be used to take output from a program that includes escape sequences and write it /// somewhere that does not easily support them, such as a log file. /// /// # Example /// /// ```rust /// use std::io::Write as _; /// /// let styled_text = "\x1b[32mfoo\x1b[m bar"; /// let plain_str = anstream::adapter::strip_bytes(styled_text.as_bytes()).into_vec(); /// assert_eq!(plain_str.as_slice(), &b"foo bar"[..]); /// ``` #[inline] pub fn strip_bytes(data: &[u8]) -> StrippedBytes<'_> { StrippedBytes::new(data) } /// See [`strip_bytes`] #[derive(Default, Clone, Debug, PartialEq, Eq)] pub struct StrippedBytes<'s> { bytes: &'s [u8], state: State, utf8parser: Utf8Parser, } impl<'s> StrippedBytes<'s> { /// See [`strip_bytes`] #[inline] pub fn new(bytes: &'s [u8]) -> Self { Self { bytes, state: State::Ground, utf8parser: Default::default(), } } /// Strip the next slice of bytes /// /// Used when the content is in several non-contiguous slices /// /// # Panic /// /// May panic if it is not exhausted / empty #[inline] pub fn extend(&mut self, bytes: &'s [u8]) { debug_assert!( self.is_empty(), "current bytes must be processed to ensure we end at the right state" ); self.bytes = bytes; } /// Report the bytes has been exhausted #[inline] pub fn is_empty(&self) -> bool { self.bytes.is_empty() } /// Create a [`Vec`] of the printable content #[inline] pub fn into_vec(self) -> Vec { let mut stripped = Vec::with_capacity(self.bytes.len()); for printable in self { stripped.extend(printable); } stripped } } impl<'s> Iterator for StrippedBytes<'s> { type Item = &'s [u8]; #[inline] fn next(&mut self) -> Option { next_bytes(&mut self.bytes, &mut self.state, &mut self.utf8parser) } } /// Incrementally strip non-contiguous data #[derive(Default, Clone, Debug, PartialEq, Eq)] pub struct StripBytes { state: State, utf8parser: Utf8Parser, } impl StripBytes { /// Initial state pub fn new() -> Self { Default::default() } /// Strip the next segment of data pub fn strip_next<'s>(&'s mut self, bytes: &'s [u8]) -> StripBytesIter<'s> { StripBytesIter { bytes, state: &mut self.state, utf8parser: &mut self.utf8parser, } } } /// See [`StripBytes`] #[derive(Debug, PartialEq, Eq)] pub struct StripBytesIter<'s> { bytes: &'s [u8], state: &'s mut State, utf8parser: &'s mut Utf8Parser, } impl<'s> Iterator for StripBytesIter<'s> { type Item = &'s [u8]; #[inline] fn next(&mut self) -> Option { next_bytes(&mut self.bytes, self.state, self.utf8parser) } } #[inline] fn next_bytes<'s>( bytes: &mut &'s [u8], state: &mut State, utf8parser: &mut Utf8Parser, ) -> Option<&'s [u8]> { let offset = bytes.iter().copied().position(|b| { if *state == State::Utf8 { true } else { let (next_state, action) = state_change(*state, b); if next_state != State::Anywhere { *state = next_state; } is_printable_bytes(action, b) } }); let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len())); *bytes = next; let offset = bytes.iter().copied().position(|b| { if *state == State::Utf8 { if utf8parser.add(b) { *state = State::Ground; } false } else { let (next_state, action) = state_change(State::Ground, b); if next_state != State::Anywhere { *state = next_state; } if *state == State::Utf8 { utf8parser.add(b); false } else { !is_printable_bytes(action, b) } } }); let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len())); *bytes = next; if printable.is_empty() { None } else { Some(printable) } } #[derive(Default, Clone, Debug, PartialEq, Eq)] pub struct Utf8Parser { utf8_parser: utf8parse::Parser, } impl Utf8Parser { fn add(&mut self, byte: u8) -> bool { let mut b = false; let mut receiver = VtUtf8Receiver(&mut b); self.utf8_parser.advance(&mut receiver, byte); b } } struct VtUtf8Receiver<'a>(&'a mut bool); impl<'a> utf8parse::Receiver for VtUtf8Receiver<'a> { fn codepoint(&mut self, _: char) { *self.0 = true; } fn invalid_sequence(&mut self) { *self.0 = true; } } #[inline] fn is_printable_bytes(action: Action, byte: u8) -> bool { // VT320 considered 0x7f to be `Print`able but we expect to be working in UTF-8 systems and not // ISO Latin-1, making it DEL and non-printable const DEL: u8 = 0x7f; // Continuations aren't included as they may also be control codes, requiring more context (action == Action::Print && byte != DEL) || action == Action::BeginUtf8 || (action == Action::Execute && byte.is_ascii_whitespace()) } #[cfg(test)] mod test { use super::*; use proptest::prelude::*; /// Model based off full parser fn parser_strip(bytes: &[u8]) -> String { #[derive(Default)] struct Strip(String); impl Strip { fn with_capacity(capacity: usize) -> Self { Self(String::with_capacity(capacity)) } } impl anstyle_parse::Perform for Strip { fn print(&mut self, c: char) { self.0.push(c); } fn execute(&mut self, byte: u8) { if byte.is_ascii_whitespace() { self.0.push(byte as char); } } } let mut stripped = Strip::with_capacity(bytes.len()); let mut parser = anstyle_parse::Parser::::new(); for byte in bytes { parser.advance(&mut stripped, *byte); } stripped.0 } /// Model verifying incremental parsing fn strip_char(mut s: &str) -> String { let mut result = String::new(); let mut state = StripStr::new(); while !s.is_empty() { let mut indices = s.char_indices(); indices.next(); // current let offset = indices.next().map(|(i, _)| i).unwrap_or_else(|| s.len()); let (current, remainder) = s.split_at(offset); for printable in state.strip_next(current) { result.push_str(printable); } s = remainder; } result } /// Model verifying incremental parsing fn strip_byte(s: &[u8]) -> Vec { let mut result = Vec::new(); let mut state = StripBytes::default(); for start in 0..s.len() { let current = &s[start..=start]; for printable in state.strip_next(current) { result.extend(printable); } } result } #[test] fn test_strip_bytes_multibyte() { let bytes = [240, 145, 141, 139]; let expected = parser_strip(&bytes); let actual = String::from_utf8(strip_bytes(&bytes).into_vec()).unwrap(); assert_eq!(expected, actual); } #[test] fn test_strip_byte_multibyte() { let bytes = [240, 145, 141, 139]; let expected = parser_strip(&bytes); let actual = String::from_utf8(strip_byte(&bytes).to_vec()).unwrap(); assert_eq!(expected, actual); } #[test] fn test_strip_str_del() { let input = std::str::from_utf8(&[0x7f]).unwrap(); let expected = ""; let actual = strip_str(input).to_string(); assert_eq!(expected, actual); } #[test] fn test_strip_byte_del() { let bytes = [0x7f]; let expected = ""; let actual = String::from_utf8(strip_byte(&bytes).to_vec()).unwrap(); assert_eq!(expected, actual); } proptest! { #[test] #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253 fn strip_str_no_escapes(s in "\\PC*") { let expected = parser_strip(s.as_bytes()); let actual = strip_str(&s).to_string(); assert_eq!(expected, actual); } #[test] #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253 fn strip_char_no_escapes(s in "\\PC*") { let expected = parser_strip(s.as_bytes()); let actual = strip_char(&s); assert_eq!(expected, actual); } #[test] #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253 fn strip_bytes_no_escapes(s in "\\PC*") { dbg!(&s); dbg!(s.as_bytes()); let expected = parser_strip(s.as_bytes()); let actual = String::from_utf8(strip_bytes(s.as_bytes()).into_vec()).unwrap(); assert_eq!(expected, actual); } #[test] #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253 fn strip_byte_no_escapes(s in "\\PC*") { dbg!(&s); dbg!(s.as_bytes()); let expected = parser_strip(s.as_bytes()); let actual = String::from_utf8(strip_byte(s.as_bytes()).to_vec()).unwrap(); assert_eq!(expected, actual); } } }