summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-automata/tests/hybrid/api.rs
blob: e82d808e34b3015136437f00014e521fff632f84 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
use std::error::Error;

use regex_automata::{
    hybrid::dfa::{OverlappingState, DFA},
    nfa::thompson,
    HalfMatch, Input, MatchError,
};

// Tests that too many cache resets cause the lazy DFA to quit.
//
// We only test this on 64-bit because the test is gingerly crafted based on
// implementation details of cache sizes. It's not a great test because of
// that, but it does check some interesting properties around how positions are
// reported when a search "gives up."
//
// NOTE: If you change something in lazy DFA implementation that causes this
// test to fail by reporting different "gave up" positions, then it's generally
// okay to update the positions in the test below as long as you're sure your
// changes are correct. Namely, it is expected that if there are changes in the
// cache size (or changes in how big things are inside the cache), then its
// utilization may change slightly and thus impact where a search gives up.
// Precisely where a search gives up is not an API guarantee, so changing the
// offsets here is OK.
#[test]
#[cfg(target_pointer_width = "64")]
#[cfg(not(miri))]
fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
    // This is a carefully chosen regex. The idea is to pick one that requires
    // some decent number of states (hence the bounded repetition). But we
    // specifically choose to create a class with an ASCII letter and a
    // non-ASCII letter so that we can check that no new states are created
    // once the cache is full. Namely, if we fill up the cache on a haystack
    // of 'a's, then in order to match one 'β', a new state will need to be
    // created since a 'β' is encoded with multiple bytes.
    //
    // So we proceed by "filling" up the cache by searching a haystack of just
    // 'a's. The cache won't have enough room to add enough states to find the
    // match (because of the bounded repetition), which should result in it
    // giving up before it finds a match.
    //
    // Since there's now no more room to create states, we search a haystack
    // of 'β' and confirm that it gives up immediately.
    let pattern = r"[aβ]{99}";
    let dfa = DFA::builder()
        .configure(
            // Configure it so that we have the minimum cache capacity
            // possible. And that if any resets occur, the search quits.
            DFA::config()
                .skip_cache_capacity_check(true)
                .cache_capacity(0)
                .minimum_cache_clear_count(Some(0)),
        )
        .thompson(thompson::NFA::config())
        .build(pattern)?;
    let mut cache = dfa.create_cache();

    let haystack = "a".repeat(101).into_bytes();
    let err = MatchError::gave_up(25);
    // Notice that we make the same amount of progress in each search! That's
    // because the cache is reused and already has states to handle the first
    // N bytes.
    assert_eq!(
        Err(err.clone()),
        dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
    );
    assert_eq!(
        Err(err.clone()),
        dfa.try_search_overlapping_fwd(
            &mut cache,
            &Input::new(&haystack),
            &mut OverlappingState::start()
        ),
    );

    let haystack = "β".repeat(101).into_bytes();
    let err = MatchError::gave_up(2);
    assert_eq!(
        Err(err),
        dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
    );
    // no need to test that other find routines quit, since we did that above

    // OK, if we reset the cache, then we should be able to create more states
    // and make more progress with searching for betas.
    cache.reset(&dfa);
    let err = MatchError::gave_up(27);
    assert_eq!(
        Err(err),
        dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
    );

    // ... switching back to ASCII still makes progress since it just needs to
    // set transitions on existing states!
    let haystack = "a".repeat(101).into_bytes();
    let err = MatchError::gave_up(13);
    assert_eq!(
        Err(err),
        dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
    );

    Ok(())
}

// Tests that quit bytes in the forward direction work correctly.
#[test]
fn quit_fwd() -> Result<(), Box<dyn Error>> {
    let dfa = DFA::builder()
        .configure(DFA::config().quit(b'x', true))
        .build("[[:word:]]+$")?;
    let mut cache = dfa.create_cache();

    assert_eq!(
        dfa.try_search_fwd(&mut cache, &Input::new("abcxyz")),
        Err(MatchError::quit(b'x', 3)),
    );
    assert_eq!(
        dfa.try_search_overlapping_fwd(
            &mut cache,
            &Input::new(b"abcxyz"),
            &mut OverlappingState::start()
        ),
        Err(MatchError::quit(b'x', 3)),
    );

    Ok(())
}

// Tests that quit bytes in the reverse direction work correctly.
#[test]
fn quit_rev() -> Result<(), Box<dyn Error>> {
    let dfa = DFA::builder()
        .configure(DFA::config().quit(b'x', true))
        .thompson(thompson::Config::new().reverse(true))
        .build("^[[:word:]]+")?;
    let mut cache = dfa.create_cache();

    assert_eq!(
        dfa.try_search_rev(&mut cache, &Input::new("abcxyz")),
        Err(MatchError::quit(b'x', 3)),
    );

    Ok(())
}

// Tests that if we heuristically enable Unicode word boundaries but then
// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
// will panic.
#[test]
#[should_panic]
fn quit_panics() {
    DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
}

// This tests an intesting case where even if the Unicode word boundary option
// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
// word boundaries to be enabled.
#[test]
fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
    let mut config = DFA::config();
    for b in 0x80..=0xFF {
        config = config.quit(b, true);
    }
    let dfa = DFA::builder().configure(config).build(r"\b")?;
    let mut cache = dfa.create_cache();
    let expected = HalfMatch::must(0, 1);
    assert_eq!(
        Ok(Some(expected)),
        dfa.try_search_fwd(&mut cache, &Input::new(" a")),
    );
    Ok(())
}