/*! Provides routines for interpolating capture group references. That is, if a replacement string contains references like `$foo` or `${foo1}`, then they are replaced with the corresponding capture values for the groups named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}` is supported as well, with `1` corresponding to a capture group index and not a name. This module provides the free functions [`string`] and [`bytes`], which interpolate Rust Unicode strings and byte strings, respectively. # Format These routines support two different kinds of capture references: unbraced and braced. For the unbraced format, the format supported is `$ref` where `name` can be any character in the class `[0-9A-Za-z_]`. `ref` is always the longest possible parse. So for example, `$1a` corresponds to the capture group named `1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then it is treated as a capture group index itself and not a name. For the braced format, the format supported is `${ref}` where `ref` can be any sequence of bytes except for `}`. If no closing brace occurs, then it is not considered a capture reference. As with the unbraced format, if `ref` matches `^[0-9]+$`, then it is treated as a capture group index and not a name. The braced format is useful for exerting precise control over the name of the capture reference. For example, `${1}a` corresponds to the capture group reference `1` followed by the letter `a`, where as `$1a` (as mentioned above) corresponds to the capture group reference `1a`. The braced format is also useful for expressing capture group names that use characters not supported by the unbraced format. For example, `${foo[bar].baz}` refers to the capture group named `foo[bar].baz`. If a capture group reference is found and it does not refer to a valid capture group, then it will be replaced with the empty string. To write a literal `$`, use `$$`. To be clear, and as exhibited via the type signatures in the routines in this module, it is impossible for a replacement string to be invalid. A replacement string may not have the intended semantics, but the interpolation procedure itself can never fail. */ use alloc::{string::String, vec::Vec}; use crate::util::memchr::memchr; /// Accepts a replacement string and interpolates capture references with their /// corresponding values. /// /// `append` should be a function that appends the string value of a capture /// group at a particular index to the string given. If the capture group /// index is invalid, then nothing should be appended. /// /// `name_to_index` should be a function that maps a capture group name to a /// capture group index. If the given name doesn't exist, then `None` should /// be returned. /// /// Finally, `dst` is where the final interpolated contents should be written. /// If `replacement` contains no capture group references, then `dst` will be /// equivalent to `replacement`. /// /// See the [module documentation](self) for details about the format /// supported. /// /// # Example /// /// ``` /// use regex_automata::util::interpolate; /// /// let mut dst = String::new(); /// interpolate::string( /// "foo $bar baz", /// |index, dst| { /// if index == 0 { /// dst.push_str("BAR"); /// } /// }, /// |name| { /// if name == "bar" { /// Some(0) /// } else { /// None /// } /// }, /// &mut dst, /// ); /// assert_eq!("foo BAR baz", dst); /// ``` pub fn string( mut replacement: &str, mut append: impl FnMut(usize, &mut String), mut name_to_index: impl FnMut(&str) -> Option, dst: &mut String, ) { while !replacement.is_empty() { match memchr(b'$', replacement.as_bytes()) { None => break, Some(i) => { dst.push_str(&replacement[..i]); replacement = &replacement[i..]; } } // Handle escaping of '$'. if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { dst.push_str("$"); replacement = &replacement[2..]; continue; } debug_assert!(!replacement.is_empty()); let cap_ref = match find_cap_ref(replacement.as_bytes()) { Some(cap_ref) => cap_ref, None => { dst.push_str("$"); replacement = &replacement[1..]; continue; } }; replacement = &replacement[cap_ref.end..]; match cap_ref.cap { Ref::Number(i) => append(i, dst), Ref::Named(name) => { if let Some(i) = name_to_index(name) { append(i, dst); } } } } dst.push_str(replacement); } /// Accepts a replacement byte string and interpolates capture references with /// their corresponding values. /// /// `append` should be a function that appends the byte string value of a /// capture group at a particular index to the byte string given. If the /// capture group index is invalid, then nothing should be appended. /// /// `name_to_index` should be a function that maps a capture group name to a /// capture group index. If the given name doesn't exist, then `None` should /// be returned. /// /// Finally, `dst` is where the final interpolated contents should be written. /// If `replacement` contains no capture group references, then `dst` will be /// equivalent to `replacement`. /// /// See the [module documentation](self) for details about the format /// supported. /// /// # Example /// /// ``` /// use regex_automata::util::interpolate; /// /// let mut dst = vec![]; /// interpolate::bytes( /// b"foo $bar baz", /// |index, dst| { /// if index == 0 { /// dst.extend_from_slice(b"BAR"); /// } /// }, /// |name| { /// if name == "bar" { /// Some(0) /// } else { /// None /// } /// }, /// &mut dst, /// ); /// assert_eq!(&b"foo BAR baz"[..], dst); /// ``` pub fn bytes( mut replacement: &[u8], mut append: impl FnMut(usize, &mut Vec), mut name_to_index: impl FnMut(&str) -> Option, dst: &mut Vec, ) { while !replacement.is_empty() { match memchr(b'$', replacement) { None => break, Some(i) => { dst.extend_from_slice(&replacement[..i]); replacement = &replacement[i..]; } } // Handle escaping of '$'. if replacement.get(1).map_or(false, |&b| b == b'$') { dst.push(b'$'); replacement = &replacement[2..]; continue; } debug_assert!(!replacement.is_empty()); let cap_ref = match find_cap_ref(replacement) { Some(cap_ref) => cap_ref, None => { dst.push(b'$'); replacement = &replacement[1..]; continue; } }; replacement = &replacement[cap_ref.end..]; match cap_ref.cap { Ref::Number(i) => append(i, dst), Ref::Named(name) => { if let Some(i) = name_to_index(name) { append(i, dst); } } } } dst.extend_from_slice(replacement); } /// `CaptureRef` represents a reference to a capture group inside some text. /// The reference is either a capture group name or a number. /// /// It is also tagged with the position in the text following the /// capture reference. #[derive(Clone, Copy, Debug, Eq, PartialEq)] struct CaptureRef<'a> { cap: Ref<'a>, end: usize, } /// A reference to a capture group in some text. /// /// e.g., `$2`, `$foo`, `${foo}`. #[derive(Clone, Copy, Debug, Eq, PartialEq)] enum Ref<'a> { Named(&'a str), Number(usize), } impl<'a> From<&'a str> for Ref<'a> { fn from(x: &'a str) -> Ref<'a> { Ref::Named(x) } } impl From for Ref<'static> { fn from(x: usize) -> Ref<'static> { Ref::Number(x) } } /// Parses a possible reference to a capture group name in the given text, /// starting at the beginning of `replacement`. /// /// If no such valid reference could be found, None is returned. /// /// Note that this returns a "possible" reference because this routine doesn't /// know whether the reference is to a valid group or not. If it winds up not /// being a valid reference, then it should be replaced with the empty string. fn find_cap_ref(replacement: &[u8]) -> Option> { let mut i = 0; let rep: &[u8] = replacement; if rep.len() <= 1 || rep[0] != b'$' { return None; } i += 1; if rep[i] == b'{' { return find_cap_ref_braced(rep, i + 1); } let mut cap_end = i; while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { cap_end += 1; } if cap_end == i { return None; } // We just verified that the range 0..cap_end is valid ASCII, so it must // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 // check via an unchecked conversion or by parsing the number straight from // &[u8]. let cap = core::str::from_utf8(&rep[i..cap_end]) .expect("valid UTF-8 capture name"); Some(CaptureRef { cap: match cap.parse::() { Ok(i) => Ref::Number(i), Err(_) => Ref::Named(cap), }, end: cap_end, }) } /// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening /// brace has been found at `i-1` in `rep`. This then looks for a closing /// brace and returns the capture reference within the brace. fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option> { assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]); let start = i; while rep.get(i).map_or(false, |&b| b != b'}') { i += 1; } if !rep.get(i).map_or(false, |&b| b == b'}') { return None; } // When looking at braced names, we don't put any restrictions on the name, // so it's possible it could be invalid UTF-8. But a capture group name // can never be invalid UTF-8, so if we have invalid UTF-8, then we can // safely return None. let cap = match core::str::from_utf8(&rep[start..i]) { Err(_) => return None, Ok(cap) => cap, }; Some(CaptureRef { cap: match cap.parse::() { Ok(i) => Ref::Number(i), Err(_) => Ref::Named(cap), }, end: i + 1, }) } /// Returns true if and only if the given byte is allowed in a capture name /// written in non-brace form. fn is_valid_cap_letter(b: u8) -> bool { match b { b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, _ => false, } } #[cfg(test)] mod tests { use alloc::{string::String, vec, vec::Vec}; use super::{find_cap_ref, CaptureRef}; macro_rules! find { ($name:ident, $text:expr) => { #[test] fn $name() { assert_eq!(None, find_cap_ref($text.as_bytes())); } }; ($name:ident, $text:expr, $capref:expr) => { #[test] fn $name() { assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); } }; } macro_rules! c { ($name_or_number:expr, $pos:expr) => { CaptureRef { cap: $name_or_number.into(), end: $pos } }; } find!(find_cap_ref1, "$foo", c!("foo", 4)); find!(find_cap_ref2, "${foo}", c!("foo", 6)); find!(find_cap_ref3, "$0", c!(0, 2)); find!(find_cap_ref4, "$5", c!(5, 2)); find!(find_cap_ref5, "$10", c!(10, 3)); // See https://github.com/rust-lang/regex/pull/585 // for more on characters following numbers find!(find_cap_ref6, "$42a", c!("42a", 4)); find!(find_cap_ref7, "${42}a", c!(42, 5)); find!(find_cap_ref8, "${42"); find!(find_cap_ref9, "${42 "); find!(find_cap_ref10, " $0 "); find!(find_cap_ref11, "$"); find!(find_cap_ref12, " "); find!(find_cap_ref13, ""); find!(find_cap_ref14, "$1-$2", c!(1, 2)); find!(find_cap_ref15, "$1_$2", c!("1_", 3)); find!(find_cap_ref16, "$x-$y", c!("x", 2)); find!(find_cap_ref17, "$x_$y", c!("x_", 3)); find!(find_cap_ref18, "${#}", c!("#", 4)); find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); find!(find_cap_ref20, "${¾}", c!("¾", 5)); find!(find_cap_ref21, "${¾a}", c!("¾a", 6)); find!(find_cap_ref22, "${a¾}", c!("a¾", 6)); find!(find_cap_ref23, "${☃}", c!("☃", 6)); find!(find_cap_ref24, "${a☃}", c!("a☃", 7)); find!(find_cap_ref25, "${☃a}", c!("☃a", 7)); find!(find_cap_ref26, "${名字}", c!("名字", 9)); fn interpolate_string( mut name_to_index: Vec<(&'static str, usize)>, caps: Vec<&'static str>, replacement: &str, ) -> String { name_to_index.sort_by_key(|x| x.0); let mut dst = String::new(); super::string( replacement, |i, dst| { if let Some(&s) = caps.get(i) { dst.push_str(s); } }, |name| -> Option { name_to_index .binary_search_by_key(&name, |x| x.0) .ok() .map(|i| name_to_index[i].1) }, &mut dst, ); dst } fn interpolate_bytes( mut name_to_index: Vec<(&'static str, usize)>, caps: Vec<&'static str>, replacement: &str, ) -> String { name_to_index.sort_by_key(|x| x.0); let mut dst = vec![]; super::bytes( replacement.as_bytes(), |i, dst| { if let Some(&s) = caps.get(i) { dst.extend_from_slice(s.as_bytes()); } }, |name| -> Option { name_to_index .binary_search_by_key(&name, |x| x.0) .ok() .map(|i| name_to_index[i].1) }, &mut dst, ); String::from_utf8(dst).unwrap() } macro_rules! interp { ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => { #[test] fn $name() { assert_eq!( $expected, interpolate_string($map, $caps, $hay), "interpolate::string failed", ); assert_eq!( $expected, interpolate_bytes($map, $caps, $hay), "interpolate::bytes failed", ); } }; } interp!( interp1, vec![("foo", 2)], vec!["", "", "xxx"], "test $foo test", "test xxx test", ); interp!( interp2, vec![("foo", 2)], vec!["", "", "xxx"], "test$footest", "test", ); interp!( interp3, vec![("foo", 2)], vec!["", "", "xxx"], "test${foo}test", "testxxxtest", ); interp!( interp4, vec![("foo", 2)], vec!["", "", "xxx"], "test$2test", "test", ); interp!( interp5, vec![("foo", 2)], vec!["", "", "xxx"], "test${2}test", "testxxxtest", ); interp!( interp6, vec![("foo", 2)], vec!["", "", "xxx"], "test $$foo test", "test $foo test", ); interp!( interp7, vec![("foo", 2)], vec!["", "", "xxx"], "test $foo", "test xxx", ); interp!( interp8, vec![("foo", 2)], vec!["", "", "xxx"], "$foo test", "xxx test", ); interp!( interp9, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test $bar$foo", "test yyyxxx", ); interp!( interp10, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test $ test", "test $ test", ); interp!( interp11, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test ${} test", "test test", ); interp!( interp12, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test ${ } test", "test test", ); interp!( interp13, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test ${a b} test", "test test", ); interp!( interp14, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test ${a} test", "test test", ); // This is a funny case where a braced reference is never closed, but // within the unclosed braced reference, there is an unbraced reference. // In this case, the braced reference is just treated literally and the // unbraced reference is found. interp!( interp15, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test ${wat $bar ok", "test ${wat yyy ok", ); }