diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-07 05:48:48 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-07 05:48:48 +0000 |
commit | ef24de24a82fe681581cc130f342363c47c0969a (patch) | |
tree | 0d494f7e1a38b95c92426f58fe6eaa877303a86c /vendor/proptest/src/string.rs | |
parent | Releasing progress-linux version 1.74.1+dfsg1-1~progress7.99u1. (diff) | |
download | rustc-ef24de24a82fe681581cc130f342363c47c0969a.tar.xz rustc-ef24de24a82fe681581cc130f342363c47c0969a.zip |
Merging upstream version 1.75.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/proptest/src/string.rs')
-rw-r--r-- | vendor/proptest/src/string.rs | 210 |
1 files changed, 144 insertions, 66 deletions
diff --git a/vendor/proptest/src/string.rs b/vendor/proptest/src/string.rs index 8777388f5..935cb21ca 100644 --- a/vendor/proptest/src/string.rs +++ b/vendor/proptest/src/string.rs @@ -16,14 +16,8 @@ use core::mem; use core::ops::RangeInclusive; use core::u32; -use regex_syntax::hir::{ - self, Hir, - HirKind::*, - Literal::*, - RepetitionKind::{self, *}, - RepetitionRange::*, -}; -use regex_syntax::{Error as ParseError, Parser}; +use regex_syntax::hir::{self, Hir, HirKind::*, Repetition}; +use regex_syntax::{Error as ParseError, ParserBuilder}; use crate::bool; use crate::char; @@ -33,7 +27,7 @@ use crate::test_runner::*; /// Wraps the regex that forms the `Strategy` for `String` so that a sensible /// `Default` can be given. The default is a string of non-control characters. -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct StringParam(&'static str); impl From<StringParam> for &'static str { @@ -150,7 +144,8 @@ impl StrategyFromRegex for Vec<u8> { /// If you don't need error handling and aren't limited by setup time, it is /// also possible to directly use a `&str` as a strategy with the same effect. pub fn string_regex(regex: &str) -> ParseResult<String> { - string_regex_parsed(&regex_to_hir(regex)?) + let hir = ParserBuilder::new().build().parse(regex)?; + string_regex_parsed(&hir) } /// Like `string_regex()`, but allows providing a pre-parsed expression. @@ -167,8 +162,20 @@ pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> { /// Creates a strategy which generates byte strings matching the given regular /// expression. +/// +/// By default, the byte strings generated by this strategy _will_ be valid +/// UTF-8. If you wish to generate byte strings that aren't (necessarily) +/// valid UTF-8, wrap your regex (or some subsection of it) in `(?-u: ... )`. +/// You may want to turn on the `s` flag as well (`(?s-u: ... 
)`) so that `.` +/// will generate newline characters (byte value `0x0A`). See the +/// [`regex` crate's documentation](https://docs.rs/regex/*/regex/#opt-out-of-unicode-support) +/// for more information. pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> { - bytes_regex_parsed(&regex_to_hir(regex)?) + let hir = ParserBuilder::new() + .utf8(false) + .build() + .parse(regex)?; + bytes_regex_parsed(&hir) } /// Like `bytes_regex()`, but allows providing a pre-parsed expression. @@ -176,11 +183,7 @@ pub fn bytes_regex_parsed(expr: &Hir) -> ParseResult<Vec<u8>> { match expr.kind() { Empty => Ok(Just(vec![]).sboxed()), - Literal(lit) => Ok(Just(match lit { - Unicode(scalar) => to_bytes(*scalar), - Byte(byte) => vec![*byte], - }) - .sboxed()), + Literal(lit) => Ok(Just(lit.0.to_vec()).sboxed()), Class(class) => Ok(match class { hir::Class::Unicode(class) => { @@ -192,19 +195,13 @@ pub fn bytes_regex_parsed(expr: &Hir) -> ParseResult<Vec<u8>> { } }), - Repetition(rep) => Ok(vec( - bytes_regex_parsed(&rep.hir)?, - to_range(rep.kind.clone())?, - ) - .prop_map(|parts| { - parts.into_iter().fold(vec![], |mut acc, child| { - acc.extend(child); - acc - }) - }) - .sboxed()), + Repetition(rep) => { + Ok(vec(bytes_regex_parsed(&rep.sub)?, to_range(rep)?) 
+ .prop_map(|parts| parts.concat()) + .sboxed()) + } - Group(group) => bytes_regex_parsed(&group.hir).map(|v| v.0), + Capture(capture) => bytes_regex_parsed(&capture.sub).map(|v| v.0), Concat(subs) => { let subs = ConcatIter { @@ -232,12 +229,8 @@ pub fn bytes_regex_parsed(expr: &Hir) -> ParseResult<Vec<u8>> { Ok(Union::try_new(subs.iter().map(bytes_regex_parsed))?.sboxed()) } - Anchor(_) => { - unsupported("line/text anchors not supported for string generation") - } - - WordBoundary(_) => unsupported( - "word boundary tests not supported for string generation", + Look(_) => unsupported( + "anchors/boundaries not supported for string generation", ), } .map(RegexGeneratorStrategy) @@ -298,8 +291,7 @@ impl<'a, I: Iterator<Item = &'a Hir>> Iterator for ConcatIter<'a, I> { while let Some(next) = self.iter.next() { match next.kind() { // A literal. Accumulate: - Literal(Unicode(scalar)) => self.buf.extend(to_bytes(*scalar)), - Literal(Byte(byte)) => self.buf.push(*byte), + Literal(literal) => self.buf.extend_from_slice(&literal.0), // Encountered a non-literal. 
_ => { return if !self.buf.is_empty() { @@ -324,31 +316,35 @@ impl<'a, I: Iterator<Item = &'a Hir>> Iterator for ConcatIter<'a, I> { } } -fn to_range(kind: RepetitionKind) -> Result<SizeRange, Error> { - Ok(match kind { - ZeroOrOne => size_range(0..=1), - ZeroOrMore => size_range(0..=32), - OneOrMore => size_range(1..=32), - Range(range) => match range { - Exactly(count) if u32::MAX == count => { - return unsupported( - "Cannot have repetition of exactly u32::MAX", - ) - } - Exactly(count) => size_range(count as usize), - AtLeast(min) => { - let max = if min < u32::MAX as u32 / 2 { - min as usize * 2 - } else { - u32::MAX as usize - }; - size_range((min as usize)..max) - } - Bounded(_, max) if u32::MAX == max => { - return unsupported("Cannot have repetition max of u32::MAX") - } - Bounded(min, max) => size_range((min as usize)..(max as usize + 1)), - }, +fn to_range(rep: &Repetition) -> Result<SizeRange, Error> { + Ok(match (rep.min, rep.max) { + // Zero or one + (0, Some(1)) => size_range(0..=1), + // Zero or more + (0, None) => size_range(0..=32), + // One or more + (1, None) => size_range(1..=32), + // Exact count of u32::MAX + (u32::MAX, Some(u32::MAX)) => { + return unsupported("Cannot have repetition of exactly u32::MAX"); + } + // Exact count + (min, Some(max)) if min == max => size_range(min as usize), + // At least min + (min, None) => { + let max = if min < u32::MAX as u32 / 2 { + min as usize * 2 + } else { + u32::MAX as usize + }; + size_range((min as usize)..max) + } + // Bounded range with max of u32::MAX + (_, Some(u32::MAX)) => { + return unsupported("Cannot have repetition max of u32::MAX") + } + // Bounded range + (min, Some(max)) => size_range((min as usize)..(max as usize + 1)), }) } @@ -357,10 +353,6 @@ fn to_bytes(khar: char) -> Vec<u8> { khar.encode_utf8(&mut buf).as_bytes().to_owned() } -fn regex_to_hir(pattern: &str) -> Result<Hir, Error> { - Ok(Parser::new().parse(pattern)?) 
-} - fn unsupported<T>(error: &'static str) -> Result<T, Error> { Err(Error::UnsupportedRegex(error)) } @@ -370,9 +362,17 @@ mod test { use std::collections::HashSet; use regex::Regex; + use regex::bytes::Regex as BytesRegex; use super::*; + fn printable_ascii(v: &[u8]) -> String { + v.iter() + .flat_map(|c| std::ascii::escape_default(*c)) + .map(|c| char::from_u32(c.into()).unwrap()) + .collect() + } + fn do_test( pattern: &str, min_distinct: usize, @@ -396,6 +396,29 @@ mod test { ); } + fn do_test_bytes( + pattern: &str, + min_distinct: usize, + max_distinct: usize, + iterations: usize, + ) { + let generated = generate_byte_values_matching_regex(pattern, iterations); + assert!( + generated.len() >= min_distinct, + "Expected to generate at least {} strings, but only \ + generated {}", + min_distinct, + generated.len() + ); + assert!( + generated.len() <= max_distinct, + "Expected to generate at most {} strings, but \ + generated {}", + max_distinct, + generated.len() + ); + } + fn generate_values_matching_regex( pattern: &str, iterations: usize, @@ -432,6 +455,42 @@ mod test { generated } + fn generate_byte_values_matching_regex( + pattern: &str, + iterations: usize, + ) -> HashSet<Vec<u8>> { + let rx = BytesRegex::new(pattern).unwrap(); + let mut generated = HashSet::new(); + + let strategy = bytes_regex(pattern).unwrap(); + let mut runner = TestRunner::deterministic(); + for _ in 0..iterations { + let mut value = strategy.new_tree(&mut runner).unwrap(); + + loop { + let s = value.current(); + let ok = if let Some(matsch) = rx.find(&s) { + 0 == matsch.start() && s.len() == matsch.end() + } else { + false + }; + if !ok { + panic!( + "Generated string {:?} which does not match {:?}", + printable_ascii(&s), pattern + ); + } + + generated.insert(s); + + if !value.simplify() { + break; + } + } + } + generated + } + #[test] fn test_case_insensitive_produces_all_available_values() { let mut expected: HashSet<String> = HashSet::new(); @@ -445,6 +504,7 @@ mod test { 
#[test] fn test_literal() { do_test("foo", 1, 1, 8); + do_test_bytes("foo", 1, 1, 8); } #[test] @@ -455,36 +515,43 @@ mod test { #[test] fn test_alternation() { do_test("foo|bar|baz", 3, 3, 16); + do_test_bytes("foo|bar|baz", 3, 3, 16); } #[test] - fn test_repitition() { + fn test_repetition() { do_test("a{0,8}", 9, 9, 64); + do_test_bytes("a{0,8}", 9, 9, 64); } #[test] fn test_question() { do_test("a?", 2, 2, 16); + do_test_bytes("a?", 2, 2, 16); } #[test] fn test_star() { do_test("a*", 33, 33, 256); + do_test_bytes("a*", 33, 33, 256); } #[test] fn test_plus() { do_test("a+", 32, 32, 256); + do_test_bytes("a+", 32, 32, 256); } #[test] fn test_n_to_range() { do_test("a{4,}", 4, 4, 64); + do_test_bytes("a{4,}", 4, 4, 64); } #[test] fn test_concatenation() { do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32); + do_test_bytes("(foo|bar)(xyzzy|plugh)", 4, 4, 32); } #[test] @@ -505,6 +572,7 @@ mod test { #[test] fn test_dot_s() { do_test("(?s).", 200, 65536, 256); + do_test_bytes("(?s-u).", 256, 256, 2048); } #[test] @@ -512,6 +580,16 @@ mod test { do_test("\\d+", 1, 65536, 256); } + #[test] + fn test_non_utf8_byte_strings() { + do_test_bytes(r"(?-u)[\xC0-\xFF]\x20", 64, 64, 512); + do_test_bytes(r"(?-u)\x20[\x80-\xBF]", 64, 64, 512); + do_test_bytes(r#"(?x-u) + \xed (( ( \xa0\x80 | \xad\xbf | \xae\x80 | \xaf\xbf ) + ( \xed ( \xb0\x80 | \xbf\xbf ) )? ) + | \xb0\x80 | \xbe\x80 | \xbf\xbf )"#, 15, 15, 120); + } + fn assert_send_and_sync<T: Send + Sync>(_: T) {} #[test] |