1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
|
use std::error::Error;
use regex_automata::{
hybrid::{
dfa::{self, DFA},
regex::Regex,
OverlappingState,
},
nfa::thompson,
HalfMatch, MatchError, MatchKind, MultiMatch,
};
use crate::util::{BunkPrefilter, SubstringPrefilter};
// Tests that too many cache resets cause the lazy DFA to quit.
//
// We only test this on 64-bit because the test is gingerly crafted based on
// implementation details of cache sizes. It's not a great test because of
// that, but it does check some interesting properties around how positions are
// reported when a search "gives up."
#[test]
#[cfg(target_pointer_width = "64")]
fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
// This is a carefully chosen regex. The idea is to pick one that requires
// some decent number of states (hence the bounded repetition). But we
// specifically choose to create a class with an ASCII letter and a
// non-ASCII letter so that we can check that no new states are created
// once the cache is full. Namely, if we fill up the cache on a haystack
// of 'a's, then in order to match one 'β', a new state will need to be
// created since a 'β' is encoded with multiple bytes. Since there's no
// room for this state, the search should quit at the very first position.
let pattern = r"[aβ]{100}";
let dfa = DFA::builder()
.configure(
// Configure it so that we have the minimum cache capacity
// possible. And that if any resets occur, the search quits.
DFA::config()
.skip_cache_capacity_check(true)
.cache_capacity(0)
.minimum_cache_clear_count(Some(0)),
)
.build(pattern)?;
let mut cache = dfa.create_cache();
let haystack = "a".repeat(101).into_bytes();
let err = MatchError::GaveUp { offset: 25 };
assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone()));
assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone()));
assert_eq!(
dfa.find_overlapping_fwd(
&mut cache,
&haystack,
&mut OverlappingState::start()
),
Err(err.clone())
);
let haystack = "β".repeat(101).into_bytes();
let err = MatchError::GaveUp { offset: 0 };
assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
// no need to test that other find routines quit, since we did that above
// OK, if we reset the cache, then we should be able to create more states
// and make more progress with searching for betas.
cache.reset(&dfa);
let err = MatchError::GaveUp { offset: 26 };
assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
// ... switching back to ASCII still makes progress since it just needs to
// set transitions on existing states!
let haystack = "a".repeat(101).into_bytes();
let err = MatchError::GaveUp { offset: 13 };
assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
Ok(())
}
// Tests that quit bytes in the forward direction work correctly.
#[test]
fn quit_fwd() -> Result<(), Box<dyn Error>> {
let dfa = DFA::builder()
.configure(DFA::config().quit(b'x', true))
.build("[[:word:]]+$")?;
let mut cache = dfa.create_cache();
assert_eq!(
dfa.find_earliest_fwd(&mut cache, b"abcxyz"),
Err(MatchError::Quit { byte: b'x', offset: 3 })
);
assert_eq!(
dfa.find_leftmost_fwd(&mut cache, b"abcxyz"),
Err(MatchError::Quit { byte: b'x', offset: 3 })
);
assert_eq!(
dfa.find_overlapping_fwd(
&mut cache,
b"abcxyz",
&mut OverlappingState::start()
),
Err(MatchError::Quit { byte: b'x', offset: 3 })
);
Ok(())
}
// Tests that quit bytes in the reverse direction work correctly.
#[test]
fn quit_rev() -> Result<(), Box<dyn Error>> {
let dfa = DFA::builder()
.configure(DFA::config().quit(b'x', true))
.thompson(thompson::Config::new().reverse(true))
.build("^[[:word:]]+")?;
let mut cache = dfa.create_cache();
assert_eq!(
dfa.find_earliest_rev(&mut cache, b"abcxyz"),
Err(MatchError::Quit { byte: b'x', offset: 3 })
);
assert_eq!(
dfa.find_leftmost_rev(&mut cache, b"abcxyz"),
Err(MatchError::Quit { byte: b'x', offset: 3 })
);
Ok(())
}
// Tests that if we heuristically enable Unicode word boundaries but then
// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
// will panic.
#[test]
#[should_panic]
fn quit_panics() {
DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
}
// This tests an intesting case where even if the Unicode word boundary option
// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
// word boundaries to be enabled.
#[test]
fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
let mut config = DFA::config();
for b in 0x80..=0xFF {
config = config.quit(b, true);
}
let dfa = DFA::builder().configure(config).build(r"\b")?;
let mut cache = dfa.create_cache();
let expected = HalfMatch::must(0, 1);
assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected)));
Ok(())
}
// Tests that we can provide a prefilter to a Regex, and the search reports
// correct results.
#[test]
fn prefilter_works() -> Result<(), Box<dyn Error>> {
let mut re = Regex::new(r"a[0-9]+").unwrap();
re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
let mut cache = re.create_cache();
let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
let matches: Vec<(usize, usize)> = re
.find_leftmost_iter(&mut cache, text)
.map(|m| (m.start(), m.end()))
.collect();
assert_eq!(
matches,
vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
);
Ok(())
}
// This test confirms that a prefilter is active by using a prefilter that
// reports false negatives.
#[test]
fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
let text = b"za123";
let mut re = Regex::new(r"a[0-9]+").unwrap();
let mut cache = re.create_cache();
re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
assert_eq!(
re.find_leftmost(&mut cache, b"za123"),
Some(MultiMatch::must(0, 1, 5))
);
assert_eq!(
re.find_leftmost(&mut cache, b"a123"),
Some(MultiMatch::must(0, 0, 4))
);
re.set_prefilter(Some(Box::new(BunkPrefilter::new())));
assert_eq!(re.find_leftmost(&mut cache, b"za123"), None);
// This checks that the prefilter is used when first starting the search,
// instead of waiting until at least one transition has occurred.
assert_eq!(re.find_leftmost(&mut cache, b"a123"), None);
Ok(())
}
|