diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /third_party/rust/regex/testdata/utf8.toml | |
parent | Initial commit. (diff) | |
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip |
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/regex/testdata/utf8.toml')
-rw-r--r-- | third_party/rust/regex/testdata/utf8.toml | 399 |
1 files changed, 399 insertions, 0 deletions
diff --git a/third_party/rust/regex/testdata/utf8.toml b/third_party/rust/regex/testdata/utf8.toml new file mode 100644 index 0000000000..39e284b382 --- /dev/null +++ b/third_party/rust/regex/testdata/utf8.toml @@ -0,0 +1,399 @@ +# These test the UTF-8 modes expose by regex-automata. Namely, when utf8 is +# true, then we promise that the haystack is valid UTF-8. (Otherwise behavior +# is unspecified.) This also corresponds to building the regex engine with the +# following two guarantees: +# +# 1) For any non-empty match reported, its span is guaranteed to correspond to +# valid UTF-8. +# 2) All empty or zero-width matches reported must never split a UTF-8 +# encoded codepoint. If the haystack has invalid UTF-8, then this results in +# unspecified behavior. +# +# The (2) is in particular what we focus our testing on since (1) is generally +# guaranteed by regex-syntax's AST-to-HIR translator and is well tested there. +# The thing with (2) is that it can't be described in the HIR, so the regex +# engines have to handle that case. Thus, we test it here. +# +# Note that it is possible to build a regex that has property (1) but not +# (2), and vice versa. This is done by building the HIR with 'utf8=true' but +# building the Thompson NFA with 'utf8=false'. We don't test that here because +# the harness doesn't expose a way to enable or disable UTF-8 mode with that +# granularity. Instead, those combinations are lightly tested via doc examples. +# That's not to say that (1) without (2) is uncommon. Indeed, ripgrep uses it +# because it cannot guarantee that its haystack is valid UTF-8. + +# This tests that an empty regex doesn't split a codepoint. +[[test]] +name = "empty-utf8yes" +regex = '' +haystack = '☃' +matches = [[0, 0], [3, 3]] +unicode = true +utf8 = true + +# Tests the overlapping case of the above. +[[test]] +name = "empty-utf8yes-overlapping" +regex = '' +haystack = '☃' +matches = [[0, 0], [3, 3]] +unicode = true +utf8 = true +match-kind = "all" +search-kind = "overlapping" + +# This tests that an empty regex DOES split a codepoint when utf=false. +[[test]] +name = "empty-utf8no" +regex = '' +haystack = '☃' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] +unicode = true +utf8 = false + +# Tests the overlapping case of the above. +[[test]] +name = "empty-utf8no-overlapping" +regex = '' +haystack = '☃' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] +unicode = true +utf8 = false +match-kind = "all" +search-kind = "overlapping" + +# This tests that an empty regex doesn't split a codepoint, even if we give +# it bounds entirely within the codepoint. +# +# This is one of the trickier cases and is what motivated the current UTF-8 +# mode design. In particular, at one point, this test failed the 'is_match' +# variant of the test but not 'find'. This is because the 'is_match' code path +# is specifically optimized for "was a match found" rather than "where is the +# match." In the former case, you don't really care about the empty-vs-non-empty +# matches, and thus, the codepoint splitting filtering logic wasn't getting +# applied. (In multiple ways across multiple regex engines.) In this way, you +# can wind up with a situation where 'is_match' says "yes," but 'find' says, +# "I didn't find anything." Which is... not great. +# +# I could have decided to say that providing boundaries that themselves split +# a codepoint would have unspecified behavior. But I couldn't quite convince +# myself that such boundaries were the only way to get an inconsistency between +# 'is_match' and 'find'. +# +# Note that I also tried to come up with a test like this that fails without +# using `bounds`. Specifically, a test where 'is_match' and 'find' disagree. +# But I couldn't do it, and I'm tempted to conclude it is impossible. The +# fundamental problem is that you need to simultaneously produce an empty match +# that splits a codepoint while *not* matching before or after the codepoint. +[[test]] +name = "empty-utf8yes-bounds" +regex = '' +haystack = '𝛃' +bounds = [1, 3] +matches = [] +unicode = true +utf8 = true + +# Tests the overlapping case of the above. +[[test]] +name = "empty-utf8yes-bounds-overlapping" +regex = '' +haystack = '𝛃' +bounds = [1, 3] +matches = [] +unicode = true +utf8 = true +match-kind = "all" +search-kind = "overlapping" + +# This tests that an empty regex splits a codepoint when the bounds are +# entirely within the codepoint. +[[test]] +name = "empty-utf8no-bounds" +regex = '' +haystack = '𝛃' +bounds = [1, 3] +matches = [[1, 1], [2, 2], [3, 3]] +unicode = true +utf8 = false + +# Tests the overlapping case of the above. +[[test]] +name = "empty-utf8no-bounds-overlapping" +regex = '' +haystack = '𝛃' +bounds = [1, 3] +matches = [[1, 1], [2, 2], [3, 3]] +unicode = true +utf8 = false +match-kind = "all" +search-kind = "overlapping" + +# In this test, we anchor the search. Since the start position is also a UTF-8 +# boundary, we get a match. +[[test]] +name = "empty-utf8yes-anchored" +regex = '' +haystack = '𝛃' +matches = [[0, 0]] +anchored = true +unicode = true +utf8 = true + +# Tests the overlapping case of the above. +[[test]] +name = "empty-utf8yes-anchored-overlapping" +regex = '' +haystack = '𝛃' +matches = [[0, 0]] +anchored = true +unicode = true +utf8 = true +match-kind = "all" +search-kind = "overlapping" + +# Same as above, except with UTF-8 mode disabled. It almost doesn't change the +# result, except for the fact that since this is an anchored search and we +# always find all matches, the test harness will keep reporting matches until +# none are found. Because it's anchored, matches will be reported so long as +# they are directly adjacent. Since with UTF-8 mode the next anchored search +# after the match at [0, 0] fails, iteration stops (and doesn't find the last +# match at [4, 4]). +[[test]] +name = "empty-utf8no-anchored" +regex = '' +haystack = '𝛃' +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] +anchored = true +unicode = true +utf8 = false + +# Tests the overlapping case of the above. +# +# Note that overlapping anchored searches are a little weird, and it's not +# totally clear what their semantics ought to be. For now, we just test the +# current behavior of our test shim that implements overlapping search. (This +# is one of the reasons why we don't really expose regex-level overlapping +# searches.) +[[test]] +name = "empty-utf8no-anchored-overlapping" +regex = '' +haystack = '𝛃' +matches = [[0, 0]] +anchored = true +unicode = true +utf8 = false +match-kind = "all" +search-kind = "overlapping" + +# In this test, we anchor the search, but also set bounds. The bounds start the +# search in the middle of a codepoint, so there should never be a match. +[[test]] +name = "empty-utf8yes-anchored-bounds" +regex = '' +haystack = '𝛃' +matches = [] +bounds = [1, 3] +anchored = true +unicode = true +utf8 = true + +# Tests the overlapping case of the above. +[[test]] +name = "empty-utf8yes-anchored-bounds-overlapping" +regex = '' +haystack = '𝛃' +matches = [] +bounds = [1, 3] +anchored = true +unicode = true +utf8 = true +match-kind = "all" +search-kind = "overlapping" + +# Same as above, except with UTF-8 mode disabled. Without UTF-8 mode enabled, +# matching within a codepoint is allowed. And remember, as in the anchored test +# above with UTF-8 mode disabled, iteration will report all adjacent matches. +# The matches at [0, 0] and [4, 4] are not included because of the bounds of +# the search. +[[test]] +name = "empty-utf8no-anchored-bounds" +regex = '' +haystack = '𝛃' +bounds = [1, 3] +matches = [[1, 1], [2, 2], [3, 3]] +anchored = true +unicode = true +utf8 = false + +# Tests the overlapping case of the above. +# +# Note that overlapping anchored searches are a little weird, and it's not +# totally clear what their semantics ought to be. For now, we just test the +# current behavior of our test shim that implements overlapping search. (This +# is one of the reasons why we don't really expose regex-level overlapping +# searches.) +[[test]] +name = "empty-utf8no-anchored-bounds-overlapping" +regex = '' +haystack = '𝛃' +bounds = [1, 3] +matches = [[1, 1]] +anchored = true +unicode = true +utf8 = false +match-kind = "all" +search-kind = "overlapping" + +# This tests that we find the match at the end of the string when the bounds +# exclude the first match. +[[test]] +name = "empty-utf8yes-startbound" +regex = '' +haystack = '𝛃' +bounds = [1, 4] +matches = [[4, 4]] +unicode = true +utf8 = true + +# Tests the overlapping case of the above. +[[test]] +name = "empty-utf8yes-startbound-overlapping" +regex = '' +haystack = '𝛃' +bounds = [1, 4] +matches = [[4, 4]] +unicode = true +utf8 = true +match-kind = "all" +search-kind = "overlapping" + +# Same as above, except since UTF-8 mode is disabled, we also find the matches +# inbetween that split the codepoint. +[[test]] +name = "empty-utf8no-startbound" +regex = '' +haystack = '𝛃' +bounds = [1, 4] +matches = [[1, 1], [2, 2], [3, 3], [4, 4]] +unicode = true +utf8 = false + +# Tests the overlapping case of the above. +[[test]] +name = "empty-utf8no-startbound-overlapping" +regex = '' +haystack = '𝛃' +bounds = [1, 4] +matches = [[1, 1], [2, 2], [3, 3], [4, 4]] +unicode = true +utf8 = false +match-kind = "all" +search-kind = "overlapping" + +# This tests that we don't find any matches in an anchored search, even when +# the bounds include a match (at the end). +[[test]] +name = "empty-utf8yes-anchored-startbound" +regex = '' +haystack = '𝛃' +bounds = [1, 4] +matches = [] +anchored = true +unicode = true +utf8 = true + +# Tests the overlapping case of the above. +[[test]] +name = "empty-utf8yes-anchored-startbound-overlapping" +regex = '' +haystack = '𝛃' +bounds = [1, 4] +matches = [] +anchored = true +unicode = true +utf8 = true +match-kind = "all" +search-kind = "overlapping" + +# Same as above, except since UTF-8 mode is disabled, we also find the matches +# inbetween that split the codepoint. Even though this is an anchored search, +# since the matches are adjacent, we find all of them. +[[test]] +name = "empty-utf8no-anchored-startbound" +regex = '' +haystack = '𝛃' +bounds = [1, 4] +matches = [[1, 1], [2, 2], [3, 3], [4, 4]] +anchored = true +unicode = true +utf8 = false + +# Tests the overlapping case of the above. +# +# Note that overlapping anchored searches are a little weird, and it's not +# totally clear what their semantics ought to be. For now, we just test the +# current behavior of our test shim that implements overlapping search. (This +# is one of the reasons why we don't really expose regex-level overlapping +# searches.) +[[test]] +name = "empty-utf8no-anchored-startbound-overlapping" +regex = '' +haystack = '𝛃' +bounds = [1, 4] +matches = [[1, 1]] +anchored = true +unicode = true +utf8 = false +match-kind = "all" +search-kind = "overlapping" + +# This tests that we find the match at the end of the haystack in UTF-8 mode +# when our bounds only include the empty string at the end of the haystack. +[[test]] +name = "empty-utf8yes-anchored-endbound" +regex = '' +haystack = '𝛃' +bounds = [4, 4] +matches = [[4, 4]] +anchored = true +unicode = true +utf8 = true + +# Tests the overlapping case of the above. +[[test]] +name = "empty-utf8yes-anchored-endbound-overlapping" +regex = '' +haystack = '𝛃' +bounds = [4, 4] +matches = [[4, 4]] +anchored = true +unicode = true +utf8 = true +match-kind = "all" +search-kind = "overlapping" + +# Same as above, but with UTF-8 mode disabled. Results remain the same since +# the only possible match does not split a codepoint. +[[test]] +name = "empty-utf8no-anchored-endbound" +regex = '' +haystack = '𝛃' +bounds = [4, 4] +matches = [[4, 4]] +anchored = true +unicode = true +utf8 = false + +# Tests the overlapping case of the above. +[[test]] +name = "empty-utf8no-anchored-endbound-overlapping" +regex = '' +haystack = '𝛃' +bounds = [4, 4] +matches = [[4, 4]] +anchored = true +unicode = true +utf8 = false +match-kind = "all" +search-kind = "overlapping" |