use std::{ borrow::Cow, iter::{FusedIterator, Peekable}, str::CharIndices, }; #[derive(Debug, Clone, Copy)] enum State { Start, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, Trap, } impl Default for State { fn default() -> Self { Self::Start } } impl State { fn is_final(&self) -> bool { #[allow(clippy::match_like_matches_macro)] match self { Self::S3 | Self::S5 | Self::S6 | Self::S7 | Self::S8 | Self::S9 | Self::S11 => true, _ => false, } } fn is_trapped(&self) -> bool { #[allow(clippy::match_like_matches_macro)] match self { Self::Trap => true, _ => false, } } fn transition(&mut self, c: char) { *self = match c { '\u{1b}' | '\u{9b}' => match self { Self::Start => Self::S1, _ => Self::Trap, }, '(' | ')' => match self { Self::S1 => Self::S2, Self::S2 | Self::S4 => Self::S4, _ => Self::Trap, }, ';' => match self { Self::S1 | Self::S2 | Self::S4 => Self::S4, Self::S5 | Self::S6 | Self::S7 | Self::S8 | Self::S10 => Self::S10, _ => Self::Trap, }, '[' | '#' | '?' => match self { Self::S1 | Self::S2 | Self::S4 => Self::S4, _ => Self::Trap, }, '0'..='2' => match self { Self::S1 | Self::S4 => Self::S5, Self::S2 => Self::S3, Self::S5 => Self::S6, Self::S6 => Self::S7, Self::S7 => Self::S8, Self::S8 => Self::S9, Self::S10 => Self::S5, _ => Self::Trap, }, '3'..='9' => match self { Self::S1 | Self::S4 => Self::S5, Self::S2 => Self::S5, Self::S5 => Self::S6, Self::S6 => Self::S7, Self::S7 => Self::S8, Self::S8 => Self::S9, Self::S10 => Self::S5, _ => Self::Trap, }, 'A'..='P' | 'R' | 'Z' | 'c' | 'f'..='n' | 'q' | 'r' | 'y' | '=' | '>' | '<' => { match self { Self::S1 | Self::S2 | Self::S4 | Self::S5 | Self::S6 | Self::S7 | Self::S8 | Self::S10 => Self::S11, _ => Self::Trap, } } _ => Self::Trap, }; } } #[derive(Debug)] struct Matches<'a> { s: &'a str, it: Peekable>, } impl<'a> Matches<'a> { fn new(s: &'a str) -> Self { let it = s.char_indices().peekable(); Self { s, it } } } #[derive(Debug)] struct Match<'a> { text: &'a str, start: usize, end: usize, } impl<'a> Match<'a> { #[inline] pub fn as_str(&self) -> &'a str { &self.text[self.start..self.end] } } impl<'a> Iterator for Matches<'a> { type Item = Match<'a>; fn next(&mut self) -> Option { find_ansi_code_exclusive(&mut self.it).map(|(start, end)| Match { text: self.s, start, end, }) } } impl<'a> FusedIterator for Matches<'a> {} fn find_ansi_code_exclusive(it: &mut Peekable) -> Option<(usize, usize)> { 'outer: loop { if let (start, '\u{1b}') | (start, '\u{9b}') = it.peek()? { let start = *start; let mut state = State::default(); let mut maybe_end = None; loop { let item = it.peek(); if let Some((idx, c)) = item { state.transition(*c); if state.is_final() { maybe_end = Some(*idx); } } // The match is greedy so run till we hit the trap state no matter what. A valid // match is just one that was final at some point if state.is_trapped() || item.is_none() { match maybe_end { Some(end) => { // All possible final characters are a single byte so it's safe to make // the end exclusive by just adding one return Some((start, end + 1)); } // The character we are peeking right now might be the start of a match so // we want to continue the loop without popping off that char None => continue 'outer, } } it.next(); } } it.next(); } } /// Helper function to strip ansi codes. pub fn strip_ansi_codes(s: &str) -> Cow { let mut char_it = s.char_indices().peekable(); match find_ansi_code_exclusive(&mut char_it) { Some(_) => { let stripped: String = AnsiCodeIterator::new(s) .filter_map(|(text, is_ansi)| if is_ansi { None } else { Some(text) }) .collect(); Cow::Owned(stripped) } None => Cow::Borrowed(s), } } /// An iterator over ansi codes in a string. /// /// This type can be used to scan over ansi codes in a string. /// It yields tuples in the form `(s, is_ansi)` where `s` is a slice of /// the original string and `is_ansi` indicates if the slice contains /// ansi codes or string values. pub struct AnsiCodeIterator<'a> { s: &'a str, pending_item: Option<(&'a str, bool)>, last_idx: usize, cur_idx: usize, iter: Matches<'a>, } impl<'a> AnsiCodeIterator<'a> { /// Creates a new ansi code iterator. pub fn new(s: &'a str) -> AnsiCodeIterator<'a> { AnsiCodeIterator { s, pending_item: None, last_idx: 0, cur_idx: 0, iter: Matches::new(s), } } /// Returns the string slice up to the current match. pub fn current_slice(&self) -> &str { &self.s[..self.cur_idx] } /// Returns the string slice from the current match to the end. pub fn rest_slice(&self) -> &str { &self.s[self.cur_idx..] } } impl<'a> Iterator for AnsiCodeIterator<'a> { type Item = (&'a str, bool); fn next(&mut self) -> Option<(&'a str, bool)> { if let Some(pending_item) = self.pending_item.take() { self.cur_idx += pending_item.0.len(); Some(pending_item) } else if let Some(m) = self.iter.next() { let s = &self.s[self.last_idx..m.start]; self.last_idx = m.end; if s.is_empty() { self.cur_idx = m.end; Some((m.as_str(), true)) } else { self.cur_idx = m.start; self.pending_item = Some((m.as_str(), true)); Some((s, false)) } } else if self.last_idx < self.s.len() { let rv = &self.s[self.last_idx..]; self.cur_idx = self.s.len(); self.last_idx = self.s.len(); Some((rv, false)) } else { None } } } impl<'a> FusedIterator for AnsiCodeIterator<'a> {} #[cfg(test)] mod tests { use super::*; use lazy_static::lazy_static; use proptest::prelude::*; use regex::Regex; // The manual dfa `State` is a handwritten translation from the previously used regex. That // regex is kept here and used to ensure that the new matches are the same as the old lazy_static! { static ref STRIP_ANSI_RE: Regex = Regex::new( r"[\x1b\x9b]([()][012AB]|[\[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-PRZcf-nqry=><])", ) .unwrap(); } impl<'a, 'b> PartialEq> for regex::Match<'b> { fn eq(&self, other: &Match<'a>) -> bool { self.start() == other.start && self.end() == other.end } } proptest! { #[test] fn dfa_matches_old_regex(s in r"([\x1b\x9b]?.*){0,5}") { let old_matches: Vec<_> = STRIP_ANSI_RE.find_iter(&s).collect(); let new_matches: Vec<_> = Matches::new(&s).collect(); assert_eq!(old_matches, new_matches); } } #[test] fn dfa_matches_regex_on_small_strings() { // To make sure the test runs in a reasonable time this is a slimmed down list of // characters to reduce the groups that are only used with each other along with one // arbitrarily chosen character not used in the regex (' ') const POSSIBLE_BYTES: &[u8] = &[b' ', 0x1b, 0x9b, b'(', b'0', b'[', b';', b'3', b'C']; fn check_all_strings_of_len(len: usize) { _check_all_strings_of_len(len, &mut Vec::with_capacity(len)); } fn _check_all_strings_of_len(len: usize, chunk: &mut Vec) { if len == 0 { if let Ok(s) = std::str::from_utf8(chunk) { let old_matches: Vec<_> = STRIP_ANSI_RE.find_iter(s).collect(); let new_matches: Vec<_> = Matches::new(s).collect(); assert_eq!(old_matches, new_matches); } return; } for b in POSSIBLE_BYTES { chunk.push(*b); _check_all_strings_of_len(len - 1, chunk); chunk.pop(); } } for str_len in 0..=6 { check_all_strings_of_len(str_len); } } #[test] fn complex_data() { let s = std::fs::read_to_string( std::path::Path::new("tests") .join("data") .join("sample_zellij_session.log"), ) .unwrap(); let old_matches: Vec<_> = STRIP_ANSI_RE.find_iter(&s).collect(); let new_matches: Vec<_> = Matches::new(&s).collect(); assert_eq!(old_matches, new_matches); } #[test] fn state_machine() { let ansi_code = "\x1b)B"; let mut state = State::default(); assert!(!state.is_final()); for c in ansi_code.chars() { state.transition(c); } assert!(state.is_final()); state.transition('A'); assert!(state.is_trapped()); } #[test] fn back_to_back_entry_char() { let s = "\x1b\x1bf"; let matches: Vec<_> = Matches::new(s).map(|m| m.as_str()).collect(); assert_eq!(&["\x1bf"], matches.as_slice()); } #[test] fn early_paren_can_use_many_chars() { let s = "\x1b(C"; let matches: Vec<_> = Matches::new(s).map(|m| m.as_str()).collect(); assert_eq!(&[s], matches.as_slice()); } #[test] fn long_run_of_digits() { let s = "\u{1b}00000"; let matches: Vec<_> = Matches::new(s).map(|m| m.as_str()).collect(); assert_eq!(&[s], matches.as_slice()); } #[test] fn test_ansi_iter_re_vt100() { let s = "\x1b(0lpq\x1b)Benglish"; let mut iter = AnsiCodeIterator::new(s); assert_eq!(iter.next(), Some(("\x1b(0", true))); assert_eq!(iter.next(), Some(("lpq", false))); assert_eq!(iter.next(), Some(("\x1b)B", true))); assert_eq!(iter.next(), Some(("english", false))); } #[test] fn test_ansi_iter_re() { use crate::style; let s = format!("Hello {}!", style("World").red().force_styling(true)); let mut iter = AnsiCodeIterator::new(&s); assert_eq!(iter.next(), Some(("Hello ", false))); assert_eq!(iter.current_slice(), "Hello "); assert_eq!(iter.rest_slice(), "\x1b[31mWorld\x1b[0m!"); assert_eq!(iter.next(), Some(("\x1b[31m", true))); assert_eq!(iter.current_slice(), "Hello \x1b[31m"); assert_eq!(iter.rest_slice(), "World\x1b[0m!"); assert_eq!(iter.next(), Some(("World", false))); assert_eq!(iter.current_slice(), "Hello \x1b[31mWorld"); assert_eq!(iter.rest_slice(), "\x1b[0m!"); assert_eq!(iter.next(), Some(("\x1b[0m", true))); assert_eq!(iter.current_slice(), "Hello \x1b[31mWorld\x1b[0m"); assert_eq!(iter.rest_slice(), "!"); assert_eq!(iter.next(), Some(("!", false))); assert_eq!(iter.current_slice(), "Hello \x1b[31mWorld\x1b[0m!"); assert_eq!(iter.rest_slice(), ""); assert_eq!(iter.next(), None); } #[test] fn test_ansi_iter_re_on_multi() { use crate::style; let s = format!("{}", style("a").red().bold().force_styling(true)); let mut iter = AnsiCodeIterator::new(&s); assert_eq!(iter.next(), Some(("\x1b[31m", true))); assert_eq!(iter.current_slice(), "\x1b[31m"); assert_eq!(iter.rest_slice(), "\x1b[1ma\x1b[0m"); assert_eq!(iter.next(), Some(("\x1b[1m", true))); assert_eq!(iter.current_slice(), "\x1b[31m\x1b[1m"); assert_eq!(iter.rest_slice(), "a\x1b[0m"); assert_eq!(iter.next(), Some(("a", false))); assert_eq!(iter.current_slice(), "\x1b[31m\x1b[1ma"); assert_eq!(iter.rest_slice(), "\x1b[0m"); assert_eq!(iter.next(), Some(("\x1b[0m", true))); assert_eq!(iter.current_slice(), "\x1b[31m\x1b[1ma\x1b[0m"); assert_eq!(iter.rest_slice(), ""); assert_eq!(iter.next(), None); } }