third_party/rust/regex/testdata/no-unicode.toml


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222

[[test]]
# \xFF can never occur in valid UTF-8. With both `unicode` and `utf8` turned
# off, the pattern is expected to match that single byte.
name = "invalid-utf8-literal1"
regex = '\xFF'
haystack = '\xFF'
matches = [[0, 1]]
unicode = false
utf8 = false
# Presumably tells the harness to decode the `\xFF` text in `haystack` into the
# raw byte 0xFF; the one-byte span [0, 1] above is consistent with that.
unescape = true


[[test]]
# Unicode mode is switched off part-way through the pattern via `(?-u)`.
# The haystack is two 2-byte UTF-8 sequences (\xCE\x93 = U+0393 and
# \xCE\x94 = U+0394) followed by \xFF, which is not valid UTF-8. The expected
# match spans all 5 bytes, including the trailing \xFF.
name = "mixed"
regex = '(?:.+)(?-u)(?:.+)'
haystack = '\xCE\x93\xCE\x94\xFF'
matches = [[0, 5]]
utf8 = false
unescape = true


[[test]]
# Case-insensitive match of a single ASCII letter with `unicode = false`:
# lowercase `a` in the pattern is expected to match uppercase `A`.
name = "case1"
regex = "a"
haystack = "A"
matches = [[0, 1]]
case-insensitive = true
unicode = false

[[test]]
# Case-insensitive ASCII class with `unicode = false`: `[a-z]+` is expected to
# match the whole 5-byte run of mixed-case letters.
name = "case2"
regex = "[a-z]+"
haystack = "AaAaA"
matches = [[0, 5]]
case-insensitive = true
unicode = false

[[test]]
# \u212A is KELVIN SIGN, which takes 3 bytes in UTF-8, so the haystack is
# 2 + 3 + 2 = 7 bytes long. `unicode` is left unset here, and the expected
# match covers the entire haystack, Kelvin sign included.
name = "case3"
regex = "[a-z]+"
haystack = "aA\u212AaA"
matches = [[0, 7]]
case-insensitive = true

[[test]]
# Same pattern and haystack as case3, but with `unicode = false`. The 3-byte
# \u212A (KELVIN SIGN) at bytes 2..5 is expected NOT to match, leaving two
# separate matches over the ASCII letters on either side of it.
name = "case4"
regex = "[a-z]+"
haystack = "aA\u212AaA"
matches = [[0, 2], [5, 7]]
case-insensitive = true
unicode = false


[[test]]
# δ (U+03B4) is 2 bytes in UTF-8. With `unicode` left unset, the negated class
# is expected to match it as a single unit spanning both bytes.
name = "negate1"
regex = "[^a]"
haystack = "δ"
matches = [[0, 2]]

[[test]]
# Same pattern and haystack as negate1, but with `unicode` and `utf8` both
# off: the expected result is two one-byte matches, one for each byte of the
# 2-byte encoding of δ.
name = "negate2"
regex = "[^a]"
haystack = "δ"
matches = [[0, 1], [1, 2]]
unicode = false
utf8 = false


[[test]]
# The haystack starts with \xFF, a byte that is not valid UTF-8. The match for
# `a` is expected at byte 1, i.e. the search must get past the invalid byte.
# Both `unicode` and `utf8` are off here.
name = "dotstar-prefix1"
regex = "a"
haystack = '\xFFa'
matches = [[1, 2]]
unicode = false
utf8 = false
unescape = true

[[test]]
# Same pattern, haystack and expected match as dotstar-prefix1, except that
# `unicode` is left unset; only `utf8` is turned off.
name = "dotstar-prefix2"
regex = "a"
haystack = '\xFFa'
matches = [[1, 2]]
utf8 = false
unescape = true


[[test]]
# NUL bytes appear in both the pattern and the haystack. The expected match is
# 4 bytes long: `foo` followed by the terminating \x00.
name = "null-bytes1"
regex = '[^\x00]+\x00'
haystack = 'foo\x00'
matches = [[0, 4]]
unicode = false
utf8 = false
unescape = true


[[test]]
# With `unicode = false`, `\w+` is expected to match only the ASCII `a` and to
# stop before the 2-byte δ that follows.
name = "word-ascii"
regex = '\w+'
haystack = "aδ"
matches = [[0, 1]]
unicode = false

[[test]]
# Same pattern and haystack as word-ascii with `unicode` left unset: the
# expected match covers `a` (1 byte) plus δ (2 bytes), 3 bytes in total.
name = "word-unicode"
regex = '\w+'
haystack = "aδ"
matches = [[0, 3]]

[[test]]
# २ and ३ are Devanagari digits, each 3 bytes in UTF-8, so ASCII `1` sits at
# byte 0 and ASCII `9` at byte 7. With `unicode = false`, only those two ASCII
# digits are expected to match, as separate matches.
name = "decimal-ascii"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 1], [7, 8]]
unicode = false

[[test]]
# Same pattern and haystack as decimal-ascii with `unicode` left unset: the
# Devanagari digits are expected to match too, giving a single match over all
# 8 bytes (1 + 3 + 3 + 1).
name = "decimal-unicode"
regex = '\d+'
haystack = "1२३9"
matches = [[0, 8]]

[[test]]
# \u1680 is OGHAM SPACE MARK (3 bytes in UTF-8). With `unicode = false`, `\s+`
# is expected to match only the leading ASCII space.
name = "space-ascii"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 1]]
unicode = false

[[test]]
# Same pattern and haystack as space-ascii with `unicode` left unset: the
# expected match covers the ASCII space plus the 3-byte \u1680, 4 bytes total.
name = "space-unicode"
regex = '\s+'
haystack = " \u1680"
matches = [[0, 4]]


[[test]]
# See: https://github.com/rust-lang/regex/issues/484
#
# ☃ (U+2603) is 3 bytes in UTF-8. With `utf8 = false`, iterating the empty
# pattern is expected to report an empty match at every byte offset 0..=3,
# including offsets 1 and 2, which fall inside the encoded codepoint.
name = "iter1-bytes"
regex = ''
haystack = "☃"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
utf8 = false

[[test]]
# See: https://github.com/rust-lang/regex/issues/484
#
# Same pattern and haystack as iter1-bytes with `utf8` left unset: empty
# matches are expected only at offsets 0 and 3, the boundaries on either side
# of the 3-byte ☃, and never inside it.
name = "iter1-utf8"
regex = ''
haystack = "☃"
matches = [[0, 0], [3, 3]]

[[test]]
# See: https://github.com/rust-lang/regex/issues/484
# Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8.
#
# Presumably the haystack is 3 bytes once unescaped (`b`, 0xFF, `r`); the four
# expected empty matches at offsets 0..=3 are consistent with that.
name = "iter2-bytes"
regex = ''
haystack = 'b\xFFr'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
unescape = true
utf8 = false


# These test that unanchored prefixes can munch through invalid UTF-8 even when
# utf8 is enabled.
#
# This test actually reflects an interesting simplification in how the Thompson
# NFA is constructed. It used to be that the NFA could be built with an
# unanchored prefix that either matched any byte or _only_ matched valid UTF-8.
# But the latter turns out to be pretty precarious when it comes to prefilters,
# because if you search a haystack that contains invalid UTF-8 but have an
# unanchored prefix that requires UTF-8, then prefilters are no longer a valid
# optimization because you actually have to check that everything is valid
# UTF-8.
#
# Originally, I had thought that we needed a valid UTF-8 unanchored prefix in
# order to guarantee that we only match at valid UTF-8 boundaries. But this
# isn't actually true! There are really only two things to consider here:
#
# 1) Will a regex match split an encoded codepoint? No. Because by construction,
# we ensure that a MATCH state can only be reached by following valid UTF-8
# (assuming all of the UTF-8 modes are enabled).
#
# 2) Will a regex match arbitrary bytes that aren't valid UTF-8? Again, no,
# assuming all of the UTF-8 modes are enabled.
[[test]]
# The haystack is `a` with an invalid \xFF byte on each side; the expected
# match is just the `a` at bytes 1..2.
#
# NOTE(review): the comment preceding this test says it covers the case where
# utf8 is enabled, yet the test sets `utf8 = false` — confirm which one is
# intended before changing either.
name = "unanchored-invalid-utf8-match-100"
regex = '[a-z]'
haystack = '\xFFa\xFF'
matches = [[1, 2]]
unescape = true
utf8 = false

# This test shows that we can still prevent a match from occurring by requiring
# that valid UTF-8 match by inserting our own unanchored prefix. Thus, if the
# behavior of not munching through invalid UTF-8 anywhere is needed, then it
# can be achieved thusly.
[[test]]
# The pattern supplies its own anchored, lazy `(?s:.)*?` prefix in place of an
# implicit unanchored one. For the same haystack as
# unanchored-invalid-utf8-match-100, no match at all is expected.
#
# NOTE(review): this test sets `utf8 = false`, while the section comment
# further up speaks of utf8 being enabled — confirm which one is intended.
name = "unanchored-invalid-utf8-nomatch"
regex = '^(?s:.)*?[a-z]'
haystack = '\xFFa\xFF'
matches = []
unescape = true
utf8 = false

# This is a tricky test that makes sure we don't accidentally do a kind of
# unanchored search when we've requested that a regex engine not report
# empty matches that split a codepoint. This test caught a regression during
# development where the code for skipping over bad empty matches would do so
# even if the search should have been anchored. This is ultimately what led to
# making 'anchored' an 'Input' option, so that it was always clear what kind
# of search was being performed. (Before that, whether a search was anchored
# or not was a config knob on the regex engine.) This did wind up making DFAs
# a little more complex to configure (with their 'StartKind' knob), but it
# generally smoothed out everything else.
#
# Great example of a test whose failure motivated a sweeping API refactoring.
[[test]]
# The haystack is `a` (1 byte) + ☃ (3 bytes) + `z` (1 byte), so offsets 2 and 3
# fall inside the encoded ☃. Only the empty matches at 0 and 1 are expected:
# with `anchored = true`, iteration is expected to end there rather than skip
# ahead to the next codepoint boundary at offset 4.
name = "anchored-iter-empty-utf8"
regex = ''
haystack = 'a☃z'
matches = [[0, 0], [1, 1]]
unescape = false
utf8 = true
anchored = true