summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex/testdata/no-unicode.toml
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 00:47:55 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 00:47:55 +0000
commit26a029d407be480d791972afb5975cf62c9360a6 (patch)
treef435a8308119effd964b339f76abb83a57c29483 /third_party/rust/regex/testdata/no-unicode.toml
parentInitial commit. (diff)
downloadfirefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/regex/testdata/no-unicode.toml')
-rw-r--r--third_party/rust/regex/testdata/no-unicode.toml222
1 files changed, 222 insertions, 0 deletions
diff --git a/third_party/rust/regex/testdata/no-unicode.toml b/third_party/rust/regex/testdata/no-unicode.toml
new file mode 100644
index 0000000000..0ddac4c96d
--- /dev/null
+++ b/third_party/rust/regex/testdata/no-unicode.toml
@@ -0,0 +1,222 @@
+[[test]]
+name = "invalid-utf8-literal1"
+regex = '\xFF'
+haystack = '\xFF'
+matches = [[0, 1]]
+unicode = false
+utf8 = false
+unescape = true
+
+
+[[test]]
+name = "mixed"
+regex = '(?:.+)(?-u)(?:.+)'
+haystack = '\xCE\x93\xCE\x94\xFF'
+matches = [[0, 5]]
+utf8 = false
+unescape = true
+
+
+[[test]]
+name = "case1"
+regex = "a"
+haystack = "A"
+matches = [[0, 1]]
+case-insensitive = true
+unicode = false
+
+[[test]]
+name = "case2"
+regex = "[a-z]+"
+haystack = "AaAaA"
+matches = [[0, 5]]
+case-insensitive = true
+unicode = false
+
+[[test]]
+name = "case3"
+regex = "[a-z]+"
+haystack = "aA\u212AaA"
+matches = [[0, 7]]
+case-insensitive = true
+
+[[test]]
+name = "case4"
+regex = "[a-z]+"
+haystack = "aA\u212AaA"
+matches = [[0, 2], [5, 7]]
+case-insensitive = true
+unicode = false
+
+
+[[test]]
+name = "negate1"
+regex = "[^a]"
+haystack = "δ"
+matches = [[0, 2]]
+
+[[test]]
+name = "negate2"
+regex = "[^a]"
+haystack = "δ"
+matches = [[0, 1], [1, 2]]
+unicode = false
+utf8 = false
+
+
+[[test]]
+name = "dotstar-prefix1"
+regex = "a"
+haystack = '\xFFa'
+matches = [[1, 2]]
+unicode = false
+utf8 = false
+unescape = true
+
+[[test]]
+name = "dotstar-prefix2"
+regex = "a"
+haystack = '\xFFa'
+matches = [[1, 2]]
+utf8 = false
+unescape = true
+
+
+[[test]]
+name = "null-bytes1"
+regex = '[^\x00]+\x00'
+haystack = 'foo\x00'
+matches = [[0, 4]]
+unicode = false
+utf8 = false
+unescape = true
+
+
+[[test]]
+name = "word-ascii"
+regex = '\w+'
+haystack = "aδ"
+matches = [[0, 1]]
+unicode = false
+
+[[test]]
+name = "word-unicode"
+regex = '\w+'
+haystack = "aδ"
+matches = [[0, 3]]
+
+[[test]]
+name = "decimal-ascii"
+regex = '\d+'
+haystack = "1२३9"
+matches = [[0, 1], [7, 8]]
+unicode = false
+
+[[test]]
+name = "decimal-unicode"
+regex = '\d+'
+haystack = "1२३9"
+matches = [[0, 8]]
+
+[[test]]
+name = "space-ascii"
+regex = '\s+'
+haystack = " \u1680"
+matches = [[0, 1]]
+unicode = false
+
+[[test]]
+name = "space-unicode"
+regex = '\s+'
+haystack = " \u1680"
+matches = [[0, 4]]
+
+
+[[test]]
+# See: https://github.com/rust-lang/regex/issues/484
+name = "iter1-bytes"
+regex = ''
+haystack = "☃"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+utf8 = false
+
+[[test]]
+# See: https://github.com/rust-lang/regex/issues/484
+name = "iter1-utf8"
+regex = ''
+haystack = "☃"
+matches = [[0, 0], [3, 3]]
+
+[[test]]
+# See: https://github.com/rust-lang/regex/issues/484
+# Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8.
+name = "iter2-bytes"
+regex = ''
+haystack = 'b\xFFr'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+unescape = true
+utf8 = false
+
+
+# These test that unanchored prefixes can munch through invalid UTF-8 even when
+# utf8 is enabled.
+#
+# This test actually reflects an interesting simplification in how the Thompson
+# NFA is constructed. It used to be that the NFA could be built with an
+# unanchored prefix that either matched any byte or _only_ matched valid UTF-8.
+# But the latter turns out to be pretty precarious when it comes to prefilters,
+# because if you search a haystack that contains invalid UTF-8 but have an
+# unanchored prefix that requires UTF-8, then prefilters are no longer a valid
+# optimization because you actually have to check that everything is valid
+# UTF-8.
+#
+# Originally, I had thought that we needed a valid UTF-8 unanchored prefix in
+# order to guarantee that we only match at valid UTF-8 boundaries. But this
+# isn't actually true! There are really only two things to consider here:
+#
+# 1) Will a regex match split an encoded codepoint? No. Because by construction,
+# we ensure that a MATCH state can only be reached by following valid UTF-8 (assuming
+# all of the UTF-8 modes are enabled).
+#
+# 2) Will a regex match arbitrary bytes that aren't valid UTF-8? Again, no,
+# assuming all of the UTF-8 modes are enabled.
+[[test]]
+name = "unanchored-invalid-utf8-match-100"
+regex = '[a-z]'
+haystack = '\xFFa\xFF'
+matches = [[1, 2]]
+unescape = true
+utf8 = false
+
+# This test shows that we can still prevent a match from occurring by requiring
+# that valid UTF-8 match by inserting our own unanchored prefix. Thus, if the
+# behavior of not munching through invalid UTF-8 anywhere is needed, then it
+# can be achieved thusly.
+[[test]]
+name = "unanchored-invalid-utf8-nomatch"
+regex = '^(?s:.)*?[a-z]'
+haystack = '\xFFa\xFF'
+matches = []
+unescape = true
+utf8 = false
+
+# This is a tricky test that makes sure we don't accidentally do a kind of
+# unanchored search when we've requested that a regex engine not report
+# empty matches that split a codepoint. This test caught a regression during
+# development where the code for skipping over bad empty matches would do so
+# even if the search should have been anchored. This is ultimately what led to
+# making 'anchored' an 'Input' option, so that it was always clear what kind
+# of search was being performed. (Before that, whether a search was anchored
+# or not was a config knob on the regex engine.) This did wind up making DFAs
+# a little more complex to configure (with their 'StartKind' knob), but it
+# generally smoothed out everything else.
+#
+# Great example of a test whose failure motivated a sweeping API refactoring.
+[[test]]
+name = "anchored-iter-empty-utf8"
+regex = ''
+haystack = 'a☃z'
+matches = [[0, 0], [1, 1]]
+unescape = false
+utf8 = true
+anchored = true