author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-30 18:31:44 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-30 18:31:44 +0000
commit     c23a457e72abe608715ac76f076f47dc42af07a5 (patch)
tree       2772049aaf84b5c9d0ed12ec8d86812f7a7904b6 /vendor/regex-automata
parent     Releasing progress-linux version 1.73.0+dfsg1-1~progress7.99u1. (diff)
download   rustc-c23a457e72abe608715ac76f076f47dc42af07a5.tar.xz
           rustc-c23a457e72abe608715ac76f076f47dc42af07a5.zip
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata')
-rw-r--r--  vendor/regex-automata/.cargo-checksum.json | 2
-rw-r--r--  vendor/regex-automata/COPYING | 3
-rw-r--r--  vendor/regex-automata/Cargo.toml | 168
-rw-r--r--  vendor/regex-automata/LICENSE-APACHE | 201
-rw-r--r--  vendor/regex-automata/LICENSE-MIT | 40
-rw-r--r--  vendor/regex-automata/PLANS.md | 165
-rw-r--r--  vendor/regex-automata/README.md | 307
-rw-r--r--  vendor/regex-automata/TODO | 13
-rw-r--r--  vendor/regex-automata/UNLICENSE | 24
-rw-r--r--  vendor/regex-automata/rustfmt.toml | 2
-rw-r--r--  vendor/regex-automata/src/dfa/accel.rs | 89
-rw-r--r--  vendor/regex-automata/src/dfa/automaton.rs | 1821
-rw-r--r--  vendor/regex-automata/src/dfa/dense.rs | 2639
-rw-r--r--  vendor/regex-automata/src/dfa/determinize.rs | 182
-rw-r--r--  vendor/regex-automata/src/dfa/error.rs | 162
-rw-r--r--  vendor/regex-automata/src/dfa/minimize.rs | 24
-rw-r--r--  vendor/regex-automata/src/dfa/mod.rs | 135
-rw-r--r--  vendor/regex-automata/src/dfa/onepass.rs | 3188
-rw-r--r--  vendor/regex-automata/src/dfa/regex.rs | 1825
-rw-r--r--  vendor/regex-automata/src/dfa/remapper.rs | 242
-rw-r--r--  vendor/regex-automata/src/dfa/search.rs | 891
-rw-r--r--  vendor/regex-automata/src/dfa/search_unsafe.rs | 321
-rw-r--r--  vendor/regex-automata/src/dfa/sparse.rs | 1279
-rw-r--r--  vendor/regex-automata/src/dfa/special.rs | 109
-rw-r--r--  vendor/regex-automata/src/dfa/start.rs | 74
-rw-r--r--  vendor/regex-automata/src/dfa/transducer.rs | 207
-rw-r--r--  vendor/regex-automata/src/hybrid/dfa.rs | 2664
-rw-r--r--  vendor/regex-automata/src/hybrid/error.rs | 35
-rw-r--r--  vendor/regex-automata/src/hybrid/id.rs | 139
-rw-r--r--  vendor/regex-automata/src/hybrid/mod.rs | 73
-rw-r--r--  vendor/regex-automata/src/hybrid/regex.rs | 1925
-rw-r--r--  vendor/regex-automata/src/hybrid/search.rs | 1101
-rw-r--r--  vendor/regex-automata/src/lib.rs | 651
-rw-r--r--  vendor/regex-automata/src/macros.rs | 24
-rw-r--r--  vendor/regex-automata/src/meta/error.rs | 241
-rw-r--r--  vendor/regex-automata/src/meta/limited.rs | 267
-rw-r--r--  vendor/regex-automata/src/meta/literal.rs | 81
-rw-r--r--  vendor/regex-automata/src/meta/mod.rs | 62
-rw-r--r--  vendor/regex-automata/src/meta/regex.rs | 3649
-rw-r--r--  vendor/regex-automata/src/meta/reverse_inner.rs | 220
-rw-r--r--  vendor/regex-automata/src/meta/stopat.rs | 224
-rw-r--r--  vendor/regex-automata/src/meta/strategy.rs | 1908
-rw-r--r--  vendor/regex-automata/src/meta/wrappers.rs | 1348
-rw-r--r--  vendor/regex-automata/src/nfa/mod.rs | 54
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/backtrack.rs | 1884
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/builder.rs | 1337
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/compiler.rs | 2106
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/error.rs | 132
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/literal_trie.rs | 528
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/map.rs | 38
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/mod.rs | 1624
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/nfa.rs | 2101
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/pikevm.rs | 2409
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/range_trie.rs | 342
-rw-r--r--  vendor/regex-automata/src/util/alphabet.rs | 669
-rw-r--r--  vendor/regex-automata/src/util/captures.rs | 2547
-rw-r--r--  vendor/regex-automata/src/util/determinize/mod.rs | 389
-rw-r--r--  vendor/regex-automata/src/util/determinize/state.rs | 191
-rw-r--r--  vendor/regex-automata/src/util/empty.rs | 265
-rw-r--r--  vendor/regex-automata/src/util/escape.rs | 84
-rw-r--r--  vendor/regex-automata/src/util/id.rs | 608
-rw-r--r--  vendor/regex-automata/src/util/int.rs | 252
-rw-r--r--  vendor/regex-automata/src/util/interpolate.rs | 579
-rw-r--r--  vendor/regex-automata/src/util/iter.rs | 1027
-rw-r--r--  vendor/regex-automata/src/util/lazy.rs | 494
-rw-r--r--  vendor/regex-automata/src/util/look.rs | 1748
-rw-r--r--  vendor/regex-automata/src/util/matchtypes.rs | 356
-rw-r--r--  vendor/regex-automata/src/util/memchr.rs | 93
-rw-r--r--  vendor/regex-automata/src/util/mod.rs | 312
-rw-r--r--  vendor/regex-automata/src/util/pool.rs | 1142
-rw-r--r--  vendor/regex-automata/src/util/prefilter.rs | 281
-rw-r--r--  vendor/regex-automata/src/util/prefilter/aho_corasick.rs | 149
-rw-r--r--  vendor/regex-automata/src/util/prefilter/byteset.rs | 58
-rw-r--r--  vendor/regex-automata/src/util/prefilter/memchr.rs | 186
-rw-r--r--  vendor/regex-automata/src/util/prefilter/memmem.rs | 88
-rw-r--r--  vendor/regex-automata/src/util/prefilter/mod.rs | 696
-rw-r--r--  vendor/regex-automata/src/util/prefilter/teddy.rs | 160
-rw-r--r--  vendor/regex-automata/src/util/primitives.rs | 776
-rw-r--r--  vendor/regex-automata/src/util/search.rs | 1969
-rw-r--r--  vendor/regex-automata/src/util/sparse_set.rs | 78
-rw-r--r--  vendor/regex-automata/src/util/start.rs | 341
-rw-r--r--  vendor/regex-automata/src/util/syntax.rs | 252
-rw-r--r--  vendor/regex-automata/src/util/unicode_data/mod.rs | 17
-rw-r--r--  vendor/regex-automata/src/util/unicode_data/perl_word.rs | 781
-rw-r--r--  vendor/regex-automata/src/util/utf8.rs | 196
-rw-r--r--  vendor/regex-automata/src/util/wire.rs (renamed from vendor/regex-automata/src/util/bytes.rs) | 381
-rwxr-xr-x  vendor/regex-automata/test | 95
-rw-r--r--  vendor/regex-automata/tests/data/bytes.toml | 235
-rw-r--r--  vendor/regex-automata/tests/data/crazy.toml | 302
-rw-r--r--  vendor/regex-automata/tests/data/earliest.toml | 48
-rw-r--r--  vendor/regex-automata/tests/data/empty.toml | 113
-rw-r--r--  vendor/regex-automata/tests/data/expensive.toml | 12
-rw-r--r--  vendor/regex-automata/tests/data/flags.toml | 67
-rw-r--r--  vendor/regex-automata/tests/data/fowler/basic.toml | 1638
-rw-r--r--  vendor/regex-automata/tests/data/fowler/dat/README | 24
-rw-r--r--  vendor/regex-automata/tests/data/fowler/dat/basic.dat | 221
-rw-r--r--  vendor/regex-automata/tests/data/fowler/dat/nullsubexpr.dat | 79
-rw-r--r--  vendor/regex-automata/tests/data/fowler/dat/repetition-expensive.dat | 85
-rw-r--r--  vendor/regex-automata/tests/data/fowler/dat/repetition.dat | 83
-rw-r--r--  vendor/regex-automata/tests/data/fowler/nullsubexpr.toml | 405
-rw-r--r--  vendor/regex-automata/tests/data/fowler/repetition-expensive.toml | 341
-rw-r--r--  vendor/regex-automata/tests/data/fowler/repetition-long.toml | 341
-rw-r--r--  vendor/regex-automata/tests/data/fowler/repetition.toml | 397
-rw-r--r--  vendor/regex-automata/tests/data/iter.toml | 119
-rw-r--r--  vendor/regex-automata/tests/data/misc.toml | 99
-rw-r--r--  vendor/regex-automata/tests/data/multiline.toml | 275
-rw-r--r--  vendor/regex-automata/tests/data/no-unicode.toml | 158
-rw-r--r--  vendor/regex-automata/tests/data/overlapping.toml | 126
-rw-r--r--  vendor/regex-automata/tests/data/regression.toml | 423
-rw-r--r--  vendor/regex-automata/tests/data/set.toml | 523
-rw-r--r--  vendor/regex-automata/tests/data/unicode.toml | 514
-rw-r--r--  vendor/regex-automata/tests/data/word-boundary.toml | 771
-rw-r--r--  vendor/regex-automata/tests/dfa/api.rs | 88
-rw-r--r--  vendor/regex-automata/tests/dfa/mod.rs | 6
-rw-r--r--  vendor/regex-automata/tests/dfa/onepass/mod.rs | 2
-rw-r--r--  vendor/regex-automata/tests/dfa/onepass/suite.rs | 197
-rw-r--r--  vendor/regex-automata/tests/dfa/regression.rs (renamed from vendor/regex-automata/tests/regression.rs) | 20
-rw-r--r--  vendor/regex-automata/tests/dfa/suite.rs | 365
-rw-r--r--  vendor/regex-automata/tests/fuzz/dense.rs | 52
-rw-r--r--  vendor/regex-automata/tests/fuzz/mod.rs | 2
-rw-r--r--  vendor/regex-automata/tests/fuzz/sparse.rs | 132
-rw-r--r--  vendor/regex-automata/tests/fuzz/testdata/deserialize_dense_crash-9486fb7c8a93b12c12a62166b43d31640c0208a9 | Bin 0 -> 1894 bytes
-rw-r--r--  vendor/regex-automata/tests/fuzz/testdata/deserialize_dense_minimized-from-9486fb7c8a93b12c12a62166b43d31640c0208a9 | Bin 0 -> 1882 bytes
-rw-r--r--  vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c72b000 | Bin 0 -> 941 bytes
-rw-r--r--  vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf65efa9 | Bin 0 -> 924 bytes
-rw-r--r--  vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd234b98 | Bin 0 -> 933 bytes
-rw-r--r--  vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421505838 | Bin 0 -> 802 bytes
-rw-r--r--  vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011816570 | Bin 0 -> 924 bytes
-rw-r--r--  vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420449e2b | Bin 0 -> 922 bytes
-rw-r--r--  vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd1460eff9 | Bin 0 -> 728 bytes
-rw-r--r--  vendor/regex-automata/tests/gen/README.md | 65
-rw-r--r--  vendor/regex-automata/tests/gen/dense/mod.rs | 22
-rw-r--r--  vendor/regex-automata/tests/gen/dense/multi_pattern_v2.rs | 43
-rw-r--r--  vendor/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.bigendian.dfa | Bin 0 -> 11100 bytes
-rw-r--r--  vendor/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.littleendian.dfa | Bin 0 -> 11100 bytes
-rw-r--r--  vendor/regex-automata/tests/gen/dense/multi_pattern_v2_rev.bigendian.dfa | Bin 0 -> 7584 bytes
-rw-r--r--  vendor/regex-automata/tests/gen/dense/multi_pattern_v2_rev.littleendian.dfa | Bin 0 -> 7584 bytes
-rw-r--r--  vendor/regex-automata/tests/gen/mod.rs | 2
-rw-r--r--  vendor/regex-automata/tests/gen/sparse/mod.rs | 22
-rw-r--r--  vendor/regex-automata/tests/gen/sparse/multi_pattern_v2.rs | 37
-rw-r--r--  vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.bigendian.dfa | Bin 0 -> 3476 bytes
-rw-r--r--  vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.littleendian.dfa | Bin 0 -> 3476 bytes
-rw-r--r--  vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.bigendian.dfa | Bin 0 -> 1920 bytes
-rw-r--r--  vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.littleendian.dfa | Bin 0 -> 1920 bytes
-rw-r--r--  vendor/regex-automata/tests/hybrid/api.rs | 140
-rw-r--r--  vendor/regex-automata/tests/hybrid/mod.rs | 1
-rw-r--r--  vendor/regex-automata/tests/hybrid/suite.rs | 327
-rw-r--r--  vendor/regex-automata/tests/lib.rs | 114
-rw-r--r--  vendor/regex-automata/tests/meta/mod.rs | 2
-rw-r--r--  vendor/regex-automata/tests/meta/suite.rs | 200
-rw-r--r--  vendor/regex-automata/tests/nfa/thompson/backtrack/mod.rs | 2
-rw-r--r--  vendor/regex-automata/tests/nfa/thompson/backtrack/suite.rs | 213
-rw-r--r--  vendor/regex-automata/tests/nfa/thompson/mod.rs | 3
-rw-r--r--  vendor/regex-automata/tests/nfa/thompson/pikevm/api.rs | 191
-rw-r--r--  vendor/regex-automata/tests/nfa/thompson/pikevm/mod.rs | 2
-rw-r--r--  vendor/regex-automata/tests/nfa/thompson/pikevm/suite.rs | 161
-rw-r--r--  vendor/regex-automata/tests/tests.rs | 44
-rw-r--r--  vendor/regex-automata/tests/util.rs | 57
158 files changed, 46399 insertions, 22052 deletions
diff --git a/vendor/regex-automata/.cargo-checksum.json b/vendor/regex-automata/.cargo-checksum.json
index 63e5b1a67..a6d7742a2 100644
--- a/vendor/regex-automata/.cargo-checksum.json
+++ b/vendor/regex-automata/.cargo-checksum.json
@@ -1 +1 @@
-{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"0122506f03800693bb58426493e7faa1ec90c002e542fcbfaf5dbd086e56f2be","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","PLANS.md":"405c695de578604ab6a425709709ba8cb69db0b7fed103f44aad2e2069bef7ac","README.md":"de887d97b46825f6fde7c9b1066619eb9a729178b93492d900bc7c183337dd81","TODO":"296f208a1c13fa55c449452e5e0df7aeee7431c0bc81497a3f0c7d2b01483ddb","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/dfa/accel.rs":"cac45cfa62a3521684aee5583aa519425cc0de57d01a23f511433ad46ca426dc","src/dfa/automaton.rs":"9bd295a5a6e7ee99046703e1f8dc02c621e6ddac85344f7f37bb020b71383118","src/dfa/dense.rs":"4739d4959e415a9191d0c0dd0a07d2cc95ce6243831a806f7103bcfd509e9e2c","src/dfa/determinize.rs":"14666440637e91bf2a16a074e666b92cbdbd0b39b4ce21180be9235af47f541e","src/dfa/error.rs":"1f057e5a2f2ae87031676e5cce70c8226514de181dbcce2e491a930d28751b9e","src/dfa/minimize.rs":"a5e85fe9570307a053592eefb3bf6a72d9fdbcfb8311f5a0dd22e39085e87811","src/dfa/mod.rs":"bb02f594cae82e28f2bddea1453c35d8f38ea7f97fb5ee8cc588f628b1fcc667","src/dfa/regex.rs":"18eded661f818be36ef175acd49198140317ecb36d9171c3ebbabdf17c8fcf86","src/dfa/search.rs":"b3526fc40846c71cf687bf4a2f1f0e682b2615d7e3f62707e9e68bc79f2fe9a3","src/dfa/search_unsafe.rs":"047cd4fcdc4129c704e7269c0af2f71d6b8a64b0de01ad7174843c3fb9fbe013","src/dfa/sparse.rs":"c6c7540347e04c2be9b4e0b1b4eed9dc661707bba7386382805e492b704d113d","src/dfa/special.rs":"51d4254e3fcaa711e0739fecd8ee64392426e3bd4a6a74b37096157dc4dbf107","src/dfa/transducer.rs":"ad5766b1e781a8ec1f6113c4eaa53373c9d29260d357de0f71f7cc19a76f1f21","src/hybrid/dfa.rs":"2f6176a317c64716af2ff210c404e712e5a2eac64ad30617c5fda728e1342be9","src/hybrid/error.rs":"99c1e1a7a8d8e88724faaeee5e56383a05b284b74f33d864124d411c52c77361","src/hybrid/id.rs":"051ef2cfeb613fd20a19b42515ce5de8e812313007db6030fd1aaec13cafbabf","src/hybrid/mod.rs":"4f145341030bc6fd1392fbaf916dc9ba5cd1912562e94b758a6458da17abeef8","src/hybrid/regex.rs":"7c0ca05c9801e91af718b50a2f685d0e61fdaad0e88d8c3c23afe71c0a13bb14","src/hybrid/search.rs":"0eb9f26400c9cd949038c8a4c96b96a7879dac994a86a4cf9ed8837f3328e4d5","src/lib.rs":"06641dff57899f19ab7929404c92e21bc48835a65e3e08f366821c7b9ccfe08f","src/macros.rs":"a73da3a0725a7c0afbaf36cd64a185af12db5707fd7740bf08b188c2ae9872db","src/nfa/mod.rs":"3ec8d362fd16e3cb1742930bae77ba128e592c7f574cd186d342b98f39abd06f","src/nfa/thompson/compiler.rs":"9548c025a9fb9d551af9829cf68251084c3b24e1c5db3b365d6463b07ca02164","src/nfa/thompson/error.rs":"7c0c556cdc52635345a0efcfecce3add05cd91770dd8b9353c854d41a9f4b862","src/nfa/thompson/map.rs":"03f88cd3ee01cb46b542918d8eba7fd927a4409c0cf7080f57a19bbc9733020b","src/nfa/thompson/mod.rs":"0b5b274b29ede0a552728579396d74114bfc049c34576fb3bd9358c306ac9dd3","src/nfa/thompson/pikevm.rs":"cf97a464e3c307ffed65530ebf4d17b1d3a9961525e14a49542373b818f47ad1","src/nfa/thompson/range_trie.rs":"8576bc8a4d9fa3f66c88f15b22b3dbbf26534c17c5e621cbbec40801c8141628","src/util/alphabet.rs":"350829d2abf132486086d1f331826177748106c4d8a1c7cff839a82e04f323df","src/util/bytes.rs":"273dbd419f4d561fa1204990abb0c25fa58045b1d9dfeaa8ea40a747e08bfa59","src/util/determinize/mod.rs":"8539e34529589cc56e53dac6f0d29e150da9291e9b72f28f7821c12153dff1e9","src/util/determinize/state.rs":"ccff32679266cd8f4b18b4bf0beba3563167df53ca4f5dc46061fbc1222ca420","src/util/id.rs":"b6b3efabcdfdc0e56a277b903e4
0c684ba1182547b7e33cc4fbc1ad6ea348664","src/util/lazy.rs":"7ead513dd094d6c30c7196271afbb346b2b3601bbe33488fcd5284d9e8426027","src/util/matchtypes.rs":"24b05d62a95c271029170e73f9ff2bd16f264b6298abf01bcd4660ae2a86a6cd","src/util/mod.rs":"0e054937cc1a84f70dffa4ace1f0111d0b9a177154b423915b411185194a3c8f","src/util/prefilter.rs":"3dcc4f4a75c38fc00435b7ea88cfa9bb3481c8e5655e8325b0f0e1f2b8d1c65f","src/util/sparse_set.rs":"04aac2d8ae2299b85494df85ebafaef2891d36d3b856155cffa3b59fcc8993b4","src/util/start.rs":"2f8c28712bb97265139aefa961cef1b40bb0cbaa73cbbd1e6115ba4cc2bfe196","src/util/syntax.rs":"09f93982215c9bea3200ec2efd21b3d7ec53d5200546eb48a56040eda026db9a","tests/data/bytes.toml":"aee9df19c5cdd52ddac44490c6df6226cef33077a979d6b964ffe73aaf724bbf","tests/data/crazy.toml":"759293076a76d7efe8eb87b3207a0587c7e969637cd985ca985aa15f71dc0c57","tests/data/earliest.toml":"6ba10ea322fc8939ca0b849812b364a0d0b7594a3df1efee62fd03b7d048c399","tests/data/empty.toml":"45f314d2f9c624056665ba80ebcb4626b551a0bc4780d9c7ca160dd5caa6abaf","tests/data/expensive.toml":"d046774120b99f9516fa7893a3e51fa76182133c16e20d4582956125623775fb","tests/data/flags.toml":"b415e2c48a2520bb182a2f795e11229e56b6e2bf93f7177d64e30118b858cef8","tests/data/fowler/basic.toml":"226ea90327f02c62ed673fc747493bc2bb0f647629f08f92ce26a130f653a4fd","tests/data/fowler/dat/README":"441bb1ed49be2b02d99d3f65974313d7d274b154e53bfa3da2a3df0538b24f04","tests/data/fowler/dat/basic.dat":"3756a5bdd6f387ed34731197fbdaab8521b0ae1726250100ba235756cb42b4b1","tests/data/fowler/dat/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","tests/data/fowler/dat/repetition-expensive.dat":"040b869c8c3a02ed0497943af1860094fd8437892811180fb9624a41eb491f23","tests/data/fowler/dat/repetition.dat":"d8fa959c2be524c90f2092f7c41ff3a7b48232b4378cb3b090625e74e76fc625","tests/data/fowler/nullsubexpr.toml":"3e975fc6ca8e14d615ed8483c7a35158d70a8cfcc7676ef15ed68ec5eef63164","tests/data/fowler/repetition-expensive.toml":"9d9203b6c3c380947afb41c717b834eb35746de4f21e124f6f15119a6460b150","tests/data/fowler/repetition-long.toml":"a598f6682e71a8689630edb35d69f43a1305090f77cfc39dff3f60e7284618e2","tests/data/fowler/repetition.toml":"ce1888a6550fce8a7986686684ef3eff762430459d50726bc4918d0e211c3847","tests/data/iter.toml":"d1995a7b65b12aa34b4226c3ca039fcf52dcaf96a6f061064da84e981e6796e0","tests/data/misc.toml":"a32697c95595b0ad28f3c12422caddf79eaba35047f32583f2df1c6b17bc0eaf","tests/data/multiline.toml":"70dabae358d0701132d55b4378d7aa78ae5aa3fabad38ff2a6a91e83b44b78bf","tests/data/no-unicode.toml":"11be343498e0e834b422ead1168204dbaac1fb32a5384e66f0b98cdb63b39057","tests/data/overlapping.toml":"8394b104f24abd62ebed5c4b8b4708db8dba7f973a6fd10f1711d340bf0e5b5c","tests/data/regression.toml":"718d151906584f521b5bb65bae8f03a516da6a0e87312b652b96d63a9a4be64c","tests/data/set.toml":"c2412cf09030ff7ef034e44c2b051e91841f0e2cd990576bb636bd1d1da18827","tests/data/unicode.toml":"af0ee5ba8ec93fbafe4647bbac97287003743db8b7eac3e2d4dfd17f02912328","tests/data/word-boundary.toml":"20cdd14cd0cab146e0fc541dfdf913e764340997db8ab4e2d80f94dd2f9b309d","tests/dfa/api.rs":"9de253770e6bc9b2ca32f1533655740677f245fd61e9188358acb51c6655f98e","tests/dfa/mod.rs":"dfa9fca1b57cdb4c5cba52686922c0c788c0be43c83af2b50a0d244d8012b031","tests/dfa/suite.rs":"2d3007c970a05e2ed7babd120d9a5a4e01b034780fc05b9d905e857a8255ab08","tests/hybrid/api.rs":"c954cdcbbc04ef939ae38d32aae3dee1847c6ea2a36ec6e2a4bedb19aaa861e4","tests/hybrid/mod.rs":"dfa9fca1b57cdb4c5cba52686922c0c788c0be43c83af2b50a0d244d8012b031","test
s/hybrid/suite.rs":"1fd79a8699eb418a28897269daa3e86f7fc792ffa4fe9318c57aabfd10176f38","tests/nfa/mod.rs":"49055c358e38d97e42acb1602c671f97dddf24cafe089490f0e79ed208d74d9b","tests/nfa/thompson/mod.rs":"ab5f818ad62de599a2ddcedfd1774bf51e3245060ab8e3864bb07f146fe81a5a","tests/nfa/thompson/pikevm/api.rs":"af39a4787bb089060ee6b87e5ab1979c1863731ebbd9d1b0ba1ac6e93f6c0633","tests/nfa/thompson/pikevm/mod.rs":"dfa9fca1b57cdb4c5cba52686922c0c788c0be43c83af2b50a0d244d8012b031","tests/nfa/thompson/pikevm/suite.rs":"9d56601bb80a67c935f1f9aa4c4d130e1766e827bc34a62a48fb20297d8af2db","tests/regression.rs":"2d72466e872be88941a59582216823eb95bda461a5b2237b438a1fbfdcf813ac","tests/tests.rs":"7cf459df359f75fad2a44f7929521bcbc6fc78da6576af4306aec5386d35ffe3","tests/util.rs":"97573ea40567a62b54babe14a91b689f1d8ff663e2cb5e77103c7dede443e977"},"package":"e9368763f5a9b804326f3af749e16f9abf378d227bcdee7634b13d8f17793782"} \ No newline at end of file
+{"files":{"Cargo.toml":"374956c91c0582e4437674cdf8e67c0ea70ea2d4a7c2fb49d322727dbfca047f","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"61db25dbf26092fc80e8db89165692e55f9fb86b14e8451ebb28303f45932254","src/dfa/accel.rs":"800dada38f7a1d0fa443821dc04a8611c6cf06ef431e517f16867a27cbb4f27d","src/dfa/automaton.rs":"565ebf211769b4710091c4a15b5733296e9fbbc2a137d6eeb1c521b7b69463a0","src/dfa/dense.rs":"73c9c7662c0b4b7901eb17632187aac10bb24e16a89a4dfe78a7bf17bc98f9f1","src/dfa/determinize.rs":"91b9f69d28bdd064aa86716fe0772e4145050fd458bb7869a28660b4f7b64872","src/dfa/minimize.rs":"b5cadb462b9f24cd4aa7a665e75fb813cd06858a92b8986c9c5ae7fd9a60dfab","src/dfa/mod.rs":"e7210af01805f2f390374cd5b40ee502f9dc7633d6a57d988dcb17dfd93864cb","src/dfa/onepass.rs":"d1b29d531545ce30167d58eb24ac15ba10bce158e73483c09f219d5971c8e83c","src/dfa/regex.rs":"f970028c874e2a156db1591bbdc3915027ffa7f47d66d5bd6e97dace5a6a3d5b","src/dfa/remapper.rs":"ca096abc0f8e45c43a2adf3a7743b8857714ae7411a623edea41cc3ce906a169","src/dfa/search.rs":"237cdb8c6239ece5fe8279c4b6209c8094502cdecc9a4e3f977e469e60fd32ec","src/dfa/sparse.rs":"c3a05451a0019850b538dfd640fb12c92ac127b8a69c55f23489bd42c1c1f289","src/dfa/special.rs":"c2e60de5b98e68c9c45aaffbc67a08f049831a764a1ed29d1d1db0fb68efdce5","src/dfa/start.rs":"46b1dbaf8e4518ddddda6bbe596621aae36f8ba694390483a22355d9d799be8e","src/hybrid/dfa.rs":"861b3602bb9ac8b10abe0eae18a2641b2145fcfc7fb22b250ed2e3a345762f4c","src/hybrid/error.rs":"ffc6e65fd9e4694a67902f3516970e3e6cd6e33a7f59a5ab2ac16f740a049d9c","src/hybrid/id.rs":"6168aad5c81c627494ba0575a24d61fd0ae7efabaaceeadb8ff28472275e2813","src/hybrid/mod.rs":"49abcf332f19d2fe87c0a729b1b7715a87794e64f411f4d2bab9d8a4331d6ace","src/hybrid/regex.rs":"9f40aa2cfa89d7a97f9c9e32cb2ae591f4b6f3d51ddec41308d99ce924e130cf","src/hybrid/search.rs":"2aae7ab24c7e6b8d1a1aa81a2f6081f949e9fa42e960fd3fea29f57db8db9f68","src/lib.rs":"4e831d41057760c5f2f1274a206fa5a42f59dbca8f98ad3e782fe0fba0d6c37f","src/macros.rs":"3e4b39252bfa471fad384160a43f113ebfec7bec46a85d16f006622881dd2081","src/meta/error.rs":"710a6813314b1b11ace1b016a827067fff8b2624d47e15c7f52043bff5ab57da","src/meta/limited.rs":"cf629b08d64cb2e1c17d196a1ad6084f733a41e1c947715d9c0ea99ba7f7657d","src/meta/literal.rs":"52da98bb30995dedd22786e4728cb84e84c6093a284168bd91196b999dd0f6ec","src/meta/mod.rs":"f3b10b96fa08efaba3e4c9b81883cf40aac6e4c1f6ae55a497a534cf5805b46d","src/meta/regex.rs":"12ec35a66b889172439c4abebde5f9fb41e85765d6613f4bf622429e83d47b3c","src/meta/reverse_inner.rs":"945d6c2d4c7538e1609dbd430a096784d22abd33db58b1ba65c9c9af45a7d3c0","src/meta/stopat.rs":"b786cd0bd21f66c6f63df2d4bc2e544cd041d548d8001b4a818be1e0f84b6747","src/meta/strategy.rs":"4ee8d21def7323105e5b1101bdb1e152c5befa870a11f2bf0fa85ffbac5a6609","src/meta/wrappers.rs":"6998ff14226905eded36697f885a8ca7508b50ffb05c4b78348ff0e9463857d5","src/nfa/mod.rs":"1a731e217ed4053714500e84e58cc127f402e4e075f7d0e5b9aea715cd52405a","src/nfa/thompson/backtrack.rs":"e9a986d71aa9b0145d9f871c92f466e1b992592d8ac87f7fde36ede2e8016324","src/nfa/thompson/builder.rs":"77bdd42a7fbdedb8d6756f0161d278e677ab1fbe622ca77115c8b506a2a6db21","src/nfa/thompson/compiler.rs":"9cc351398c2d9ce10ac11a1c285f675bc351ecb816d3f33321513dd6bfcdc335","src/nfa/thompson/error.rs":"78488c2fdb85f819f53cc30bb11c7f96169112da5dd14c351e5cc3bcccf0e10e","src/nfa/thompson/literal_trie.rs":"c2d1d09b44da4648db797386c2410cbf63337afef8cb62e6e78cf34786892a11","src/nfa/thompso
n/map.rs":"96cdf3195f7efb374bcb1791ef5cc12a1cde189ab90402bf01d9b46fb7796b60","src/nfa/thompson/mod.rs":"0651520debd6f023ae1a2c422806aab37f8491e5bb092e20dfdc4fe4179d695c","src/nfa/thompson/nfa.rs":"9782d44b05986370b7f948067977fb20120562e2eca0e4366e35d7d18e81a679","src/nfa/thompson/pikevm.rs":"aaf792832d1bf15fad8a8f0b2e6597170361eb3cbcb9343eb5bd242ff346d750","src/nfa/thompson/range_trie.rs":"c9614074628bb56c9d0a137c1db7e13259a6500e4a46cdc7ddc84bee8f7e928f","src/util/alphabet.rs":"94cd73ce2f4e34e0ae0a146d3efdc85478263afdfefd6dc105e0abf0ec79d82b","src/util/captures.rs":"7aee3aae2836a397c1ad6e4535e0e0d177faf2d99e61476e8fb2710f69763668","src/util/determinize/mod.rs":"32fea73cf4a7a04238c3d3b09ea7afc7fd7c85e87dc115c6152f464ab88bddb2","src/util/determinize/state.rs":"2a0082d5cd2bd47ab75c3f04488655a3c47f1f75075b5d6f9b6e4eeb8980823e","src/util/empty.rs":"13ec7d6cbd1520db5b4c1dae294f4419fa88d39d2bfc16f4ef258473d609f91c","src/util/escape.rs":"5b2731b41a55cb50ab688132bb5640dbd51f14f141adaa864b9db7f0aa092c74","src/util/int.rs":"b7eec0a6cab0798ba66707988fce3ecfc841b93418028a7b1408c5d0f6271351","src/util/interpolate.rs":"5e4e6b6fb6e5a7603e393bf05c609735d86a7d1f54c2436e42111b4e1409b6dd","src/util/iter.rs":"58ae97b4156d7160a46b909f4635d88d10354d9d892d2fcb4c5e18e24cf38f14","src/util/lazy.rs":"e489a96fce952e9d196fd3f5564cf8ea3374eb4aef630ff8f12d82f194ed4336","src/util/look.rs":"e7a5a51f8ed70c2f97edaf3dfbe8859de37b570341447634c6028cb89ff412d7","src/util/memchr.rs":"573109ce4983907083ae0b29a084a324b9b53da369b4d96f7f3a21fd5c8eb5c9","src/util/mod.rs":"16c5fd72263d3a4df994111b81aca36da17f591f4853f21a6a906ac725843f97","src/util/pool.rs":"acc5a4922b276bc9801f5fb58539824460cb69b34a575cecbd7eb56b1d3b4de0","src/util/prefilter/aho_corasick.rs":"c54fa95f4d9e7ab53e2c6463a43f8953df6a440997fc9cd528f225db0dd32582","src/util/prefilter/byteset.rs":"1c80fa432acc23223a75a5181e37c40034764dffe42410e4b77af6f24f48bd5c","src/util/prefilter/memchr.rs":"36c6fe6354b2e729db6830166dd4862e439bc48c9e59258d88e4b6c5654e20ef","src/util/prefilter/memmem.rs":"6f6ed9450b14abf3e4a33d395337e51fbaa9743a0a16aac0009f7680aa60c500","src/util/prefilter/mod.rs":"2818e2e92632aee1c46b0dc01b654e544bfbf460236be86d28a2d836e9fc189a","src/util/prefilter/teddy.rs":"ed54d26858b56e1c8c87e44afae5f63d81ab930787d79e671f3a3513f576e9cd","src/util/primitives.rs":"8a9cc19ef2e1ab183943cdc2d2f095b02252476e32b7e9fff4a06a251749b068","src/util/search.rs":"66bf320ebbe403c119a966f3dfbd53178de0ceebd2ca1922f1ddbb79aed36837","src/util/sparse_set.rs":"3d4aa30b6aa9fc875d36506487a5095dbe8ed528b89e4146a65c7e7497520a4d","src/util/start.rs":"8d2fe005698c0bd3680a0dbfc4a34eebfe2f51081ec1584968383ac4c86fd5fe","src/util/syntax.rs":"720ac0d6600fad33f5967b5afe4e3de2096b857e4cda6fa16ba93b10a8230cab","src/util/unicode_data/mod.rs":"54c3e10bbc393e9881bfac3295815b160f59e69e2056bc29ee7cf0addd8e3cf7","src/util/unicode_data/perl_word.rs":"2e1a5d889598bd4e73af17d3a9f7d6b4cf2f6ab24920a5336e496bb255281e56","src/util/utf8.rs":"7a068009fdf07e693e521b1f0264725c0e6118dbe1eab55da9d0eab21785fcc1","src/util/wire.rs":"bfdf52615c516b6c07db3ce9c333ea61fdc535bd0b79560bbd7f6864ab83946e","test":"39d79ce3532c31a51c0be89a2939816fad0e4868d2b03992c202cbe64dce9f6c","tests/dfa/api.rs":"cc28e366b6bcbfcf379265acd492a92c62743c3f20e7a2b273019679aa9e1291","tests/dfa/mod.rs":"924d8fff500b9b7b140082623023e78007058a87323151cd8e361462945e4f16","tests/dfa/onepass/mod.rs":"d08f4ecb8ec243be584944c9602af1ed3a48a8732dd11cd573b0d1d182171303","tests/dfa/onepass/suite.rs":"6d63ec5469e6876656ae607cdbe07e6a4e17ace7836b67435763c9b1d233438a","t
ests/dfa/regression.rs":"ebcf2645290286aa7531eb2b7951385e5ed8167532437aeca2ad2049768fd796","tests/dfa/suite.rs":"cf08499bc8838d2ff16ea9b20b07ad03c9b89d6efe093f081e2982a21ea6d666","tests/fuzz/dense.rs":"3e1099a0cce61e85abc0ad81bc592e85f497f159ef0e5d1d32bac1936aa6f20c","tests/fuzz/mod.rs":"043773510e02f51def43ee0c2b8b867c53ecc8638c8a9233b2ac098de9c3ac1e","tests/fuzz/sparse.rs":"ba61db4927ab28953037a4b20317399c86d01b4d774e46c020ade19029215e25","tests/fuzz/testdata/deserialize_dense_crash-9486fb7c8a93b12c12a62166b43d31640c0208a9":"8961279a8237c3e318452024dd971b1d5a26b058260c297382a74daca1b7f0d1","tests/fuzz/testdata/deserialize_dense_minimized-from-9486fb7c8a93b12c12a62166b43d31640c0208a9":"c2d52e3dea78d3f159b5b521d433358a7fee45ce20ed1545067d461f45ef66b8","tests/fuzz/testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c72b000":"5b2d273023de3fb04037eaf2e6b4f51cced4c5a08d2e6b44e4be540774f939b9","tests/fuzz/testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf65efa9":"e2e22e2f46a9a75b5c876476442276cf675fe244c5cf918789e4f6b14078fbd9","tests/fuzz/testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd234b98":"24a12712e1f2ba0a40b5782707908a74dd19941dc372ef525d65a7134f91988c","tests/fuzz/testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421505838":"a97f39b2febf9c73535681f7a86201e4b06d5a1ffcf135299c96c1cabfa9f6c4","tests/fuzz/testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011816570":"44fe3ef878d35e2d51c2c17ff89bbbe3a4650e09d0cbbd48625c0f5e4dd0848b","tests/fuzz/testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420449e2b":"d5534be36653b4af6cb94a7c63be58869bb8c204c5c63d67a4d6c986b44bb2e1","tests/fuzz/testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd1460eff9":"77b844898610560afa09f2b8de73a85a0ba9a3b8cee4ff1bbf26b8c97ad4e8a2","tests/gen/README.md":"c3bfdf2f9ced501dd5bd75d01509a34e503efb2dff2f5f7b260580dde5519ed4","tests/gen/dense/mod.rs":"5ae1cfb46212a674118ada2f66f37b25188e84643d406b95eb4665d722344262","tests/gen/dense/multi_pattern_v2.rs":"29b1e9a799adecbdbe7cd05e9748f664c2b915b10b1d2f5d36cfb6453826d1d2","tests/gen/dense/multi_pattern_v2_fwd.bigendian.dfa":"8421d5a1bfc0b6c3bdc8fc90dff591a046b0aaf8e06ef7de7cc293004a35d061","tests/gen/dense/multi_pattern_v2_fwd.littleendian.dfa":"dcf2fd5fd49f5f53cf1ec66f61623402f39401cb3aea30d6677b98bb1e9541bf","tests/gen/dense/multi_pattern_v2_rev.bigendian.dfa":"73c4f20d984e544dfa4cf05f3009d0a9b52fa84bc97b501ea0ccd179e2def4bc","tests/gen/dense/multi_pattern_v2_rev.littleendian.dfa":"74471209f05754e8e20c8a0222a5877b1b15b8b8f33cd8cac89ea65f708b4aff","tests/gen/mod.rs":"043773510e02f51def43ee0c2b8b867c53ecc8638c8a9233b2ac098de9c3ac1e","tests/gen/sparse/mod.rs":"5ae1cfb46212a674118ada2f66f37b25188e84643d406b95eb4665d722344262","tests/gen/sparse/multi_pattern_v2.rs":"e00fb2a510a215460aab84573196b1f51bb65884ff494c2382534c04f6fdbfe9","tests/gen/sparse/multi_pattern_v2_fwd.bigendian.dfa":"3287956bd2003cd69653b125f82aade95d99adbb20229bfdbb4958b8877c0a0b","tests/gen/sparse/multi_pattern_v2_fwd.littleendian.dfa":"bdf285901eaaac4596380115c5bbb20ab2f42f593d8d9e9238a00ed69863f9c9","tests/gen/sparse/multi_pattern_v2_rev.bigendian.dfa":"e466dc085dd68b2d2220932a0e4d28759edd161c1fdad652240aa3825fd85268","tests/gen/sparse/multi_pattern_v2_rev.littleendian.dfa":"80358d0c26c1cc7284065b0075f5b8804d83e673a8a8c8327f93a1c1ff455399","tests/hybrid/api.rs":"4b8592c412e6ad0ce4a27ed1c1496acc92366ccb1c7ec23c6fd0596fc6ebbdfb","tests/hybrid/mod.rs":"4856a49a4d9b5e9e079c2719a5e75c32408b37e9b76cbdea0
57b388a3537af6d","tests/hybrid/suite.rs":"688972275c5ef38cdc5112a1e6e54ccd2bf8290008ae2b17344c6c81e17e3a5a","tests/lib.rs":"5e8a014d53097dba1f865e5e35c35a69cd12f54fad74b5c49a387f8768c30847","tests/meta/mod.rs":"d08f4ecb8ec243be584944c9602af1ed3a48a8732dd11cd573b0d1d182171303","tests/meta/suite.rs":"4c441f9df82508a5e60dd08f266183f772fc9b2b236fbf69cab87650ecf3b424","tests/nfa/mod.rs":"49055c358e38d97e42acb1602c671f97dddf24cafe089490f0e79ed208d74d9b","tests/nfa/thompson/backtrack/mod.rs":"d08f4ecb8ec243be584944c9602af1ed3a48a8732dd11cd573b0d1d182171303","tests/nfa/thompson/backtrack/suite.rs":"4e7baff70fc98b98b8297c6fd6d5818beb20343379e16cdb95bee46207ac4bd6","tests/nfa/thompson/mod.rs":"de9f5bcea1a8d1f03c85c55ad8c0747877d69e344fcd6c6886b0a402f0661291","tests/nfa/thompson/pikevm/mod.rs":"d08f4ecb8ec243be584944c9602af1ed3a48a8732dd11cd573b0d1d182171303","tests/nfa/thompson/pikevm/suite.rs":"263837ebf5b2e1906a06237982ea875386d83567e399b4ec1f669f10b1422599"},"package":"c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"} \ No newline at end of file
diff --git a/vendor/regex-automata/COPYING b/vendor/regex-automata/COPYING
deleted file mode 100644
index bb9c20a09..000000000
--- a/vendor/regex-automata/COPYING
+++ /dev/null
@@ -1,3 +0,0 @@
-This project is dual-licensed under the Unlicense and MIT licenses.
-
-You may use this code under the terms of either license.
diff --git a/vendor/regex-automata/Cargo.toml b/vendor/regex-automata/Cargo.toml
index 153f11fb3..fe4949f0d 100644
--- a/vendor/regex-automata/Cargo.toml
+++ b/vendor/regex-automata/Cargo.toml
@@ -10,20 +10,15 @@
# See Cargo.toml.orig for the original contents.
[package]
-edition = "2018"
+edition = "2021"
name = "regex-automata"
-version = "0.2.0"
-authors = ["Andrew Gallant <jamslam@gmail.com>"]
-exclude = [
- "/.github",
- "/scripts/*",
- "/regex-cli",
- "/regex-test",
+version = "0.3.8"
+authors = [
+ "The Rust Project Developers",
+ "Andrew Gallant <jamslam@gmail.com>",
]
autoexamples = false
-autotests = false
description = "Automata construction and matching using regular expressions."
-homepage = "https://github.com/BurntSushi/regex-automata"
documentation = "https://docs.rs/regex-automata"
readme = "README.md"
keywords = [
@@ -34,55 +29,154 @@ keywords = [
"nfa",
]
categories = ["text-processing"]
-license = "Unlicense/MIT"
-repository = "https://github.com/BurntSushi/regex-automata"
-resolver = "2"
-
-[profile.bench]
-debug = true
-
-[profile.dev]
-opt-level = 3
-debug = true
-
-[profile.release]
-debug = true
-
-[profile.test]
-opt-level = 3
-debug = true
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/rust-lang/regex/tree/master/regex-automata"
[lib]
bench = false
[[test]]
name = "integration"
-path = "tests/tests.rs"
+path = "tests/lib.rs"
-[dependencies.fst]
-version = "0.4.5"
+[dependencies.aho-corasick]
+version = "1.0.0"
optional = true
+default-features = false
[dependencies.log]
version = "0.4.14"
optional = true
[dependencies.memchr]
-version = "2.4.0"
+version = "2.6.0"
+optional = true
default-features = false
[dependencies.regex-syntax]
-version = "0.6.24"
+version = "0.7.4"
optional = true
+default-features = false
+
+[dev-dependencies.anyhow]
+version = "1.0.69"
+
+[dev-dependencies.bstr]
+version = "1.3.0"
+features = ["std"]
+default-features = false
+
+[dev-dependencies.doc-comment]
+version = "0.3.3"
+
+[dev-dependencies.env_logger]
+version = "0.9.3"
+features = [
+ "atty",
+ "humantime",
+ "termcolor",
+]
+default-features = false
+
+[dev-dependencies.quickcheck]
+version = "1.0.3"
+default-features = false
+
+[dev-dependencies.regex-test]
+version = "0.1.0"
[features]
-alloc = ["syntax"]
+alloc = []
default = [
"std",
+ "syntax",
+ "perf",
+ "unicode",
+ "meta",
+ "nfa",
+ "dfa",
+ "hybrid",
+]
+dfa = [
+ "dfa-build",
+ "dfa-search",
+ "dfa-onepass",
+]
+dfa-build = [
+ "nfa-thompson",
+ "dfa-search",
+]
+dfa-onepass = ["nfa-thompson"]
+dfa-search = []
+hybrid = [
"alloc",
+ "nfa-thompson",
+]
+internal-instrument = ["internal-instrument-pikevm"]
+internal-instrument-pikevm = [
+ "logging",
+ "std",
+]
+logging = [
+ "dep:log",
+ "aho-corasick?/logging",
+ "memchr?/logging",
+]
+meta = [
"syntax",
+ "nfa-pikevm",
+]
+nfa = [
+ "nfa-thompson",
+ "nfa-pikevm",
+ "nfa-backtrack",
+]
+nfa-backtrack = ["nfa-thompson"]
+nfa-pikevm = ["nfa-thompson"]
+nfa-thompson = ["alloc"]
+perf = [
+ "perf-inline",
+ "perf-literal",
+]
+perf-inline = []
+perf-literal = [
+ "perf-literal-substring",
+ "perf-literal-multisubstring",
+]
+perf-literal-multisubstring = [
+ "std",
+ "dep:aho-corasick",
+]
+perf-literal-substring = [
+ "aho-corasick?/perf-literal",
+ "dep:memchr",
+]
+std = [
+ "regex-syntax?/std",
+ "memchr?/std",
+ "aho-corasick?/std",
+ "alloc",
+]
+syntax = [
+ "dep:regex-syntax",
+ "alloc",
+]
+unicode = [
+ "unicode-age",
+ "unicode-bool",
+ "unicode-case",
+ "unicode-gencat",
+ "unicode-perl",
+ "unicode-script",
+ "unicode-segment",
+ "unicode-word-boundary",
+ "regex-syntax?/unicode",
]
-logging = ["log"]
-std = []
-syntax = ["regex-syntax"]
-transducer = ["fst"]
+unicode-age = ["regex-syntax?/unicode-age"]
+unicode-bool = ["regex-syntax?/unicode-bool"]
+unicode-case = ["regex-syntax?/unicode-case"]
+unicode-gencat = ["regex-syntax?/unicode-gencat"]
+unicode-perl = ["regex-syntax?/unicode-perl"]
+unicode-script = ["regex-syntax?/unicode-script"]
+unicode-segment = ["regex-syntax?/unicode-segment"]
+unicode-word-boundary = []
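
The rewritten manifest above replaces the coarse 0.2 feature set with a fine-grained feature graph. As a hypothetical illustration (not part of this diff), a downstream consumer that only needs the lazy DFA engine could now depend on the crate like so:

```toml
# Hypothetical downstream manifest: enable only the hybrid (lazy DFA)
# engine plus pattern parsing, skipping the meta, NFA and dense DFA
# engines that the crate's default feature set pulls in.
[dependencies.regex-automata]
version = "0.3.8"
default-features = false
features = ["std", "syntax", "perf", "unicode", "hybrid"]
```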
diff --git a/vendor/regex-automata/LICENSE-APACHE b/vendor/regex-automata/LICENSE-APACHE
new file mode 100644
index 000000000..16fe87b06
--- /dev/null
+++ b/vendor/regex-automata/LICENSE-APACHE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/vendor/regex-automata/LICENSE-MIT b/vendor/regex-automata/LICENSE-MIT
index 3b0a5dc09..39d4bdb5a 100644
--- a/vendor/regex-automata/LICENSE-MIT
+++ b/vendor/regex-automata/LICENSE-MIT
@@ -1,21 +1,25 @@
-The MIT License (MIT)
+Copyright (c) 2014 The Rust Project Developers
-Copyright (c) 2015 Andrew Gallant
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/vendor/regex-automata/PLANS.md b/vendor/regex-automata/PLANS.md
deleted file mode 100644
index 2fa9392ef..000000000
--- a/vendor/regex-automata/PLANS.md
+++ /dev/null
@@ -1,165 +0,0 @@
-pattern_limit should not be defined inside nfa::thompson, but rather at the
-top-level.
-
------
-
-Main problem right now is exemplified by the set60 and set70 failing tests. In
-particular, when finding the starting position while matching multiple regexes
-simultaneously, the reverse search is messed up. The reverse search doesn't
-depend on which regex matched in the forward direction, which means it won't
-always find the correct starting location. Unfortunately, the only way to
-fix this, as far as I can tell, is to add a group of start states for every
-regex in the DFA. Then once we do the reverse search, we need to choose the
-correct start state based on which regex matched in the forward direction.
-
-This is a nasty change.
-
-So it looks like this only applies when doing an overlapping search in reverse
-to find the start of a match. That means we should make this configurable
-but enable it by default for the reverse automata. It should be configurable
-so that folks can construct a regex that doesn't have the ability to do
-overlapping searches correctly. If an overlapping search is attempted with
-a reverse automaton that lacks starting states for each pattern, then the
-implementation should panic.
-
-BUT! It is also convenient to provide this option in general for folks that
-want a DFA that can match any pattern while also being able to match a specific
-pattern.
-
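For context, this is roughly how that option landed in the 0.3 API that this commit vendors. A minimal sketch, with illustrative patterns and the default leftmost-first match semantics assumed:

```rust
use regex_automata::{
    dfa::{dense, Automaton},
    Anchored, Input, PatternID,
};

// With start states compiled for each pattern, one DFA can serve both
// unanchored searches over all patterns and searches anchored to a
// single, specific pattern.
let dfa = dense::Builder::new()
    .configure(dense::Config::new().starts_for_each_pattern(true))
    .build_many(&[r"foo[0-9]+", r"foo"])
    .unwrap();
let hay = "foo123";

// Anchored to pattern 0: the greedy repetition matches through "123".
let m0 = dfa
    .try_search_fwd(&Input::new(hay).anchored(Anchored::Pattern(PatternID::must(0))))
    .unwrap()
    .unwrap();
assert_eq!((0, 6), (m0.pattern().as_usize(), m0.offset()));

// Anchored to pattern 1: the match ends right after "foo".
let m1 = dfa
    .try_search_fwd(&Input::new(hay).anchored(Anchored::Pattern(PatternID::must(1))))
    .unwrap()
    .unwrap();
assert_eq!((1, 3), (m1.pattern().as_usize(), m1.offset()));
```
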
-Straw man:
-
-* Update dense::Config to have a `starts_for_each_pattern` option. It should
- be disabled by default.
-* In `RegexBuilder::build_many_with_size` tweak the reverse DFA configuration
- to have the aforementioned option enabled.
-* It would be interesting to add new APIs to `Regex` that support matching
- specific patterns, but I think this is a complication. If we did want to do
- this, then we should just add it to the `_at` variants and leave the rest of
- the API untouched.
-* Add a `pattern_id: Option<PatternID>` parameter to each of the five
- `*_at` methods on the `dfa::Automaton` trait. A value of `None` retains the
- existing behavior. A `Some` value means that the starting state for that
- specific pattern must be chosen, which in turn implies an anchored search.
- (This means `starts_for_each_pattern` has utility for single-pattern DFAs
- since it makes it possible to build a DFA that can do both unanchored and
- anchored searches.)
-* Thread this new parameter down into the various functions in `dfa::search`
- all the way down into `init_fwd` and `init_rev`. These functions will then
- pass it to `dfa.start_state_{forward,reverse}`.
-* This is where things get gruesome since we now need to completely re-work how
- start states are represented in dense and sparse DFAs _and_ it needs to be
- configurable. It looks like the `Start` type from `dfa::automaton` can
- basically remain unchanged, since it still represents one of the four
- possible starting states that will need to be applied for every pattern.
-* For `dfa::dense`, change `StartList` to `StartTable`. Currently, its only
- header is the state ID count, which is always 4. We'll want to change this
- to the stride and add a new header value that encodes the number of patterns.
- When the number of patterns is zero, then existing behavior is preserved and
- represents the case where `starts_for_each_pattern` is disabled (or in the
- case of an empty DFA). When non-zero, a table of starting state IDs is
- encoded with each row corresponding to the 4 starting states for each
- pattern. Before this table (even if it's empty), the 4 starting states for
- the entire DFA are encoded.
-* For `dfa::sparse`, do the same as above. They are essentially the same right
- now anyway, with the only difference being that sparse DFAs use `&[u8]`
- instead of `&[S]` (because sparse DFAs don't have any alignment
- requirements).
-* Modify `DFA::empty` to accept a `starts_for_each_pattern` bool that, when
- true, creates a start table with the header, the start states for the entire
- DFA and a row of start states for each pattern. When false, no rows are
- added.
-* Expose whether there are starting states for each pattern via a predicate
- on the DFA.
-* Modify the determinizer's `add_starts` method to basically do what it does,
- but also do it for each pattern when the DFA is configured for it. It should
- continue to reuse states as appropriate or not generate new states if they
- aren't needed. This will want to use the `NFA::start_pattern` method, which
- provides the starting NFA state ID for the given pattern.
-* Fix the dense->sparse conversion. At this point, this piece should be fairly
- straight-forward since the sparse representation of starting states is
- basically identical to the dense representation.
-
-At this point, I think the bug should resolve itself.
-
-^^^^ DONE! IT WORKS!
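
To make the start-table layout sketched above concrete, here is an illustrative model; the type and field names are hypothetical, not the crate's actual internals:

```rust
/// Illustrative model of the per-pattern start table described above.
/// `StateID` stands in for the DFA's real state identifier type.
type StateID = u32;

struct StartTable {
    /// Always 4: one start state per starting configuration (the four
    /// variants of the `Start` type).
    stride: usize,
    /// Number of patterns with their own start states. Zero means
    /// `starts_for_each_pattern` was disabled (or the DFA is empty).
    pattern_count: usize,
    /// Row 0 holds the 4 start states for the entire DFA. Rows
    /// 1..=pattern_count hold the 4 anchored start states for each
    /// pattern, in pattern order.
    table: Vec<StateID>,
}

impl StartTable {
    /// Look up a start state, optionally for one specific pattern.
    fn start(&self, config: usize, pattern: Option<usize>) -> StateID {
        assert!(config < self.stride);
        let row = match pattern {
            None => 0,
            Some(pid) => {
                assert!(pid < self.pattern_count, "no per-pattern starts");
                1 + pid
            }
        };
        self.table[row * self.stride + config]
    }
}
```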
-
------
-
-
-Add top-level SyntaxConfig (or some such) that has all of the regex-syntax
-options forwarded, but with automata oriented docs. Then use this for all of
-the engines instead of having to repeat every option for every builder.
-
------
-
-These produce different results. PCRE2 looks correct. Basically, we should be
-using the context around the `at` position correctly, which we aren't doing
-right now. Seems tricky to get right, particularly when confirming the match
-with a reverse DFA.
-
-Maybe our 'at' functions need to take a full range... Sigh. This is indeed what
-RE2 does. GAH.
-
-fn main() {
- let re = regex::Regex::new(r"(?-u)\b\sbar").unwrap();
- let s = "foo bar baz";
- println!("{:?}", re.find_at(s, 3).map(|m| m.as_str()));
-
- let re = pcre2::bytes::Regex::new(r"\b\sbar").unwrap();
- let s = "foo bar baz";
- println!("{:?}", re.find_at(s.as_bytes(), 3).unwrap());
-}
-
-^^^^ This is fixed now, but we still need to find a way to add test coverage
-for "context" searches. It'd be nice to do this automatically, but we'll
-probably just add a new 'context = [start, end]' option.
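
In the 0.3 API that this commit vendors, the fix surfaced as `Input` carrying the full haystack plus the span to search, so look-around at the span boundaries sees the surrounding context. A small sketch of the behavior described in the note above:

```rust
use regex_automata::{meta::Regex, Input};

let re = Regex::new(r"(?-u)\b\sbar").unwrap();
let hay = "foo bar baz";
// Search only the span starting at offset 3, while keeping the full
// haystack as context: the \b at offset 3 still sees the 'o' at
// offset 2, agreeing with the PCRE2 behavior shown above.
let input = Input::new(hay).range(3..);
assert_eq!(Some(3..7), re.find(input).map(|m| m.range()));
```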
-
------
-
-
-* Create regex-test crate, based on glob-test. Try to anticipate the needs for
- the full regex test suite.
- * See if we can clean up tests.
- * Provide a way to mark a test as expensive.
- * Provide a way to test is_match_at and find_at.
- * Test shortest_match_at too? Huge pain. Add tests for it.
- * Port ALL tests from the regex crate. Will probably need a way to mark a
- test as skipped.
- * Document tests better.
-* Find a way to remove byteorder dependency.
-* Reorganize crate API:
- * Have errors contain `Box<Error+Send+Sync>` instead of `String`.
- * Make errors non-exhaustive.
- * Audit `StateID` trait for safety.
- * Brainstorm hard about `DFA` trait and the fact that DenseDFA and SparseDFA
- have inefficient implementations of some methods. Maybe use multiple
- traits? Answer: get rid of premultiply/classes knobs and just enable
- them by default. Should remove a huge amount of code.
- * Check whether `unsafe` is really needed to eliminate bounds checks. Use
- micro-benchmarks and bigger CLI workloads using `regex-automata-debug`.
- * Re-write module docs for `dfa` as they are no longer top-level. Keep most.
- * Retain any pertinent top-level crate docs, but don't rewrite yet.
- * Clean up builders if we can. e.g., Determinizer, minimizer, it's all a mess
- right now.
- * Clean up and add 'always_match' and 'never_match' constructors for every
- regex engine.
- * See about supporting ^, $, \A, \z, \b and \B in DFAs. Do the non-Unicode
- version of \b unfortunately. Carefully scrutinize how the regex crate's
- lazy DFA does it and try to make it comprehensible. Done! Except for the
- part about making it comprehensible.
-* Rethink prefilters?
-* Add `regex-automata-generate` CLI tool. This should just be a copy of
- the `ucd-generate dfa` and `ucd-generate regex` commands.
-
-Then build new public `nfa` sub-module.
- * For Unicode \b, generate \w DFA (forwards and reverse) and embed it into
- source for fast checking. That way, we don't need to ever do explicit UTF-8
- decoding anywhere. Yay.
-
-Then `lazy` sub-module.
-
-Then `onepass`.
-
-Then `jit`.
-
-... and beyond? CRAZY. But it can be done! Build STRONG base layers.
diff --git a/vendor/regex-automata/README.md b/vendor/regex-automata/README.md
index 23e0bffe0..c12b07012 100644
--- a/vendor/regex-automata/README.md
+++ b/vendor/regex-automata/README.md
@@ -1,15 +1,13 @@
regex-automata
==============
-A low level regular expression library that uses deterministic finite automata.
-It supports a rich syntax with Unicode support, has extensive options for
-configuring the best space vs time trade off for your use case and provides
-support for cheap deserialization of automata for use in `no_std` environments.
+This crate exposes a variety of regex engines used by the `regex` crate.
+It provides a vast, sprawling and "expert" level API to each regex engine.
+The regex engines provided by this crate focus heavily on finite automata
+implementations and specifically guarantee worst case `O(m * n)` time
+complexity for all searches. (Where `m ~ len(regex)` and `n ~ len(haystack)`.)
-[![Build status](https://github.com/BurntSushi/regex-automata/workflows/ci/badge.svg)](https://github.com/BurntSushi/regex-automata/actions)
+[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions)
[![Crates.io](https://img.shields.io/crates/v/regex-automata.svg)](https://crates.io/crates/regex-automata)
-![Minimum Supported Rust Version 1.41](https://img.shields.io/badge/rustc-1.41-green)
-
-Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).
### Documentation
@@ -17,206 +15,103 @@ Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).
https://docs.rs/regex-automata
-### Usage
-
-Add this to your `Cargo.toml`:
-
-```toml
-[dependencies]
-regex-automata = "0.1"
-```
-
-**WARNING**: The `master` branch currently contains code for the `0.2` release,
-but this README still targets the `0.1` release. Namely, it is recommended to
-stick with the `0.1` release. The `0.2` release was made prematurely in order
-to unblock some folks.
+### Example
-
-### Example: basic regex searching
-
-This example shows how to compile a regex using the default configuration
-and then use it to find matches in a byte string:
+This example shows how to search for matches of multiple regexes, where each
+regex uses the same capture group names to parse different key-value formats.
```rust
-use regex_automata::Regex;
-
-let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
-let text = b"2018-12-24 2016-10-08";
-let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
-assert_eq!(matches, vec![(0, 10), (11, 21)]);
+use regex_automata::{meta::Regex, PatternID};
+
+let re = Regex::new_many(&[
+ r#"(?m)^(?<key>[[:word:]]+)=(?<val>[[:word:]]+)$"#,
+ r#"(?m)^(?<key>[[:word:]]+)="(?<val>[^"]+)"$"#,
+ r#"(?m)^(?<key>[[:word:]]+)='(?<val>[^']+)'$"#,
+ r#"(?m)^(?<key>[[:word:]]+):\s*(?<val>[[:word:]]+)$"#,
+]).unwrap();
+let hay = r#"
+best_album="Blow Your Face Out"
+best_quote='"then as it was, then again it will be"'
+best_year=1973
+best_simpsons_episode: HOMR
+"#;
+let mut kvs = vec![];
+for caps in re.captures_iter(hay) {
+ // N.B. One could use capture indices '1' and '2' here
+ // as well. Capture indices are local to each pattern.
+ // (Just like names are.)
+ let key = &hay[caps.get_group_by_name("key").unwrap()];
+ let val = &hay[caps.get_group_by_name("val").unwrap()];
+ kvs.push((key, val));
+}
+assert_eq!(kvs, vec![
+ ("best_album", "Blow Your Face Out"),
+ ("best_quote", "\"then as it was, then again it will be\""),
+ ("best_year", "1973"),
+ ("best_simpsons_episode", "HOMR"),
+]);
```
-For more examples and information about the various knobs that can be turned,
-please see the [docs](https://docs.rs/regex-automata/0.1).
-
-
-### Support for `no_std`
-
-This crate comes with a `std` feature that is enabled by default. When the
-`std` feature is enabled, the API of this crate will include the facilities
-necessary for compiling, serializing, deserializing and searching with regular
-expressions. When the `std` feature is disabled, the API of this crate will
-shrink such that it only includes the facilities necessary for deserializing
-and searching with regular expressions.
-
-The intended workflow for `no_std` environments is thus as follows:
-
-* Write a program with the `std` feature that compiles and serializes a
- regular expression. Serialization should only happen after first converting
- the DFAs to use a fixed size state identifier instead of the default `usize`.
- You may also need to serialize both little and big endian versions of each
- DFA. (So that's 4 DFAs in total for each regex.)
-* In your `no_std` environment, follow the examples above for deserializing
- your previously serialized DFAs into regexes. You can then search with them
- as you would any regex.
-
-Deserialization can happen anywhere. For example, with bytes embedded into a
-binary or with a file memory mapped at runtime.
-
-Note that the
-[`ucd-generate`](https://github.com/BurntSushi/ucd-generate)
-tool will do the first step for you with its `dfa` or `regex` sub-commands.
-
-
-### Cargo features
-
-* `std` - **Enabled** by default. This enables the ability to compile finite
- automata. This requires the `regex-syntax` dependency. Without this feature
- enabled, finite automata can only be used for searching (using the approach
- described above).
-* `transducer` - **Disabled** by default. This provides implementations of the
- `Automaton` trait found in the `fst` crate. This permits using finite
- automata generated by this crate to search finite state transducers. This
- requires the `fst` dependency.
-
-
-### Differences with the regex crate
-
-The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
-general purpose regular expression engine. It aims to automatically balance low
-compile times, fast search times and low memory usage, while also providing
-a convenient API for users. In contrast, this crate provides a lower level
-regular expression interface that is a bit less convenient while providing more
-explicit control over memory usage and search times.
-
-Here are some specific negative differences:
-
-* **Compilation can take an exponential amount of time and space** in the size
- of the regex pattern. While most patterns do not exhibit worst case
- exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will
- build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should
- not be compiled with this library. (In the future, the API may expose an
- option to return an error if the DFA gets too big.)
-* This crate does not support sub-match extraction, which can be achieved with
- the regex crate's "captures" API. This may be added in the future, but is
- unlikely.
-* While the regex crate doesn't necessarily sport fast compilation times, the
- regexes in this crate are almost universally slow to compile, especially when
- they contain large Unicode character classes. For example, on my system,
- compiling `\w{3}` with byte classes enabled takes just over 1 second and
- almost 5MB of memory! (Compiling a sparse regex takes about the same time
-  but only uses about 500KB of memory.) Conversely, compiling the same regex
- without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and
- less than 5KB of memory. For this reason, you should only use Unicode
- character classes if you absolutely need them!
-* This crate does not support regex sets.
-* This crate does not support zero-width assertions such as `^`, `$`, `\b` or
- `\B`.
-* As a lower level crate, this library does not do literal optimizations. In
- exchange, you get predictable performance regardless of input. The
- philosophy here is that literal optimizations should be applied at a higher
- level, although there is no easy support for this in the ecosystem yet.
-* There is no `&str` API like in the regex crate. In this crate, all APIs
- operate on `&[u8]`. By default, match indices are guaranteed to fall on
- UTF-8 boundaries, unless `RegexBuilder::allow_invalid_utf8` is enabled.
-
-With some of the downsides out of the way, here are some positive differences:
-
-* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
- deserialized. Deserialization always takes constant time since searching can
- be performed directly on the raw serialized bytes of a DFA.
-* This crate was specifically designed so that the searching phase of a DFA has
- minimal runtime requirements, and can therefore be used in `no_std`
- environments. While `no_std` environments cannot compile regexes, they can
- deserialize pre-compiled regexes.
-* Since this crate builds DFAs ahead of time, it will generally out-perform
- the `regex` crate on equivalent tasks. The performance difference is likely
- not large. However, because of a complex set of optimizations in the regex
- crate (like literal optimizations), an accurate performance comparison may be
- difficult to do.
-* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
- performance a small amount, but uses much less storage space. Potentially
- even less than what the regex crate uses.
-* This crate exposes DFAs directly, such as `DenseDFA` and `SparseDFA`,
- which enables one to do less work in some cases. For example, if you only
- need the end of a match and not the start of a match, then you can use a DFA
- directly without building a `Regex`, which always requires a second DFA to
- find the start of a match.
-* Aside from choosing between dense and sparse DFAs, there are several options
-  for configuring the space usage vs search time trade off. These range
-  from choosing a smaller state identifier representation to premultiplying
-  state identifiers and splitting a DFA's alphabet into
- equivalence classes. Finally, DFA minimization is also provided, but can
- increase compilation times dramatically.
-
-
-### Future work
-
-* Look into being smarter about generating NFA states for large Unicode
- character classes. These can create a lot of additional work for both the
- determinizer and the minimizer, and I suspect this is the key thing we'll
- want to improve if we want to make DFA compile times faster. I *believe*
-  it's possible to build minimal or nearly minimal NFAs for the
- special case of Unicode character classes by leveraging Daciuk's algorithms
- for building minimal automata in linear time for sets of strings. See
- https://blog.burntsushi.net/transducers/#construction for more details. The
- key adaptation I think we need to make is to modify the algorithm to operate
- on byte ranges instead of enumerating every codepoint in the set. Otherwise,
- it might not be worth doing.
-* Add support for regex sets. It should be possible to do this by "simply"
- introducing more match states. I think we can also report the positions at
- each match, similar to how Aho-Corasick works. I think the long pole in the
- tent here is probably the API design work and arranging it so that we don't
- introduce extra overhead into the non-regex-set case without duplicating a
- lot of code. It seems doable.
-* Stretch goal: support capturing groups by implementing "tagged" DFA
- (transducers). Laurikari's paper is the usual reference here, but Trofimovich
- has a much more thorough treatment here:
- https://re2c.org/2017_trofimovich_tagged_deterministic_finite_automata_with_lookahead.pdf
- I've only read the paper once. I suspect it will require at least a few more
- read throughs before I understand it.
- See also: https://re2c.org
-* Possibly less ambitious goal: can we select a portion of Trofimovich's work
- to make small fixed length look-around work? It would be really nice to
- support ^, $ and \b, especially the Unicode variant of \b and CRLF aware $.
-* Experiment with code generating Rust code. There is an early experiment in
- src/codegen.rs that is thoroughly bit-rotted. At the time, I was
-  experimenting with whether or not codegen would significantly decrease the
-  size
- of a DFA, since if you squint hard enough, it's kind of like a sparse
- representation. However, it didn't shrink as much as I thought it would, so
- I gave up. The other problem is that Rust doesn't support gotos, so I don't
- even know whether the "match on each state" in a loop thing will be fast
- enough. Either way, it's probably a good option to have. For one thing, it
-  would be endian independent, whereas the serialization format of the DFAs in
-  this crate is endian dependent (so you need two versions of every DFA, but
- you only need to compile one of them for any given arch).
-* Experiment with unrolling the match loops and fill out the benchmarks.
-* Add some kind of streaming API. I believe users of the library can already
- implement something for this outside of the crate, but it would be good to
- provide an official API. The key thing here is figuring out the API. I
- suspect we might want to support several variants.
-* Make a decision on whether or not there is room for literal optimizations
- in this crate. My original intent was to not let this crate sink down into
- that very very very deep rabbit hole. But instead, we might want to provide
- some way for literal optimizations to hook into the match routines. The right
- path forward here is to probably build something outside of the crate and
- then see about integrating it. After all, users can implement their own
- match routines just as efficiently as what the crate provides.
-* A key downside of DFAs is that they can take up a lot of memory and can be
- quite costly to build. Their worst case compilation time is O(2^n), where
- n is the number of NFA states. A paper by Yang and Prasanna (2011) actually
-  seems to provide a way to characterize state blow up such that it is
-  detectable.
- If we could know whether a regex will exhibit state explosion or not, then
- we could make an intelligent decision about whether to ahead-of-time compile
- a DFA.
+
+### Safety
+
+**I welcome audits of `unsafe` code.**
+
+This crate tries to be extremely conservative in its use of `unsafe`, but does
+use it in a few spots. In general, I am very open to removing uses of `unsafe`
+if it doesn't result in measurable performance regressions and doesn't result
+in significantly more complex code.
+
+Below is an outline of how `unsafe` is used in this crate.
+
+* `util::pool::Pool` makes use of `unsafe` to implement a fast path for
+accessing an element of the pool. The fast path applies to the first thread
+that uses the pool. In effect, the fast path is fast because it avoids a
+mutex lock. `unsafe` is also used in the no-std version of `Pool` to implement
+a spin lock for synchronization. (A usage sketch appears at the end of this
+section.)
+* `util::lazy::Lazy` uses `unsafe` to implement a variant of
+`once_cell::sync::Lazy` that works in no-std environments. A no-std no-alloc
+implementation is also provided that requires use of `unsafe`.
+* The `dfa` module makes extensive use of `unsafe` to support zero-copy
+deserialization of DFAs. The high level problem is that you need to get from
+`&[u8]` to the internal representation of a DFA without doing any copies.
+This is required for support in no-std no-alloc environments. It also makes
+deserialization extremely cheap. (A brief sketch of this is shown below.)
+* The `dfa` and `hybrid` modules use `unsafe` to explicitly elide bounds checks
+in the core search loops. This makes the codegen tighter and typically leads to
+consistent 5-10% performance improvements on some workloads.
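+
+As a rough sketch of the zero-copy deserialization mentioned above, the
+example below serializes a dense DFA to raw bytes and then deserializes it
+without copying its transition table. (This is only a sketch; the API
+documentation for `dense::DFA::from_bytes` is the authoritative reference.)
+
+```rust
+use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
+
+// Serialize a DFA to raw bytes in native endianness. `pad` is the number of
+// padding bytes added to the front of the buffer to guarantee alignment.
+let original = DFA::new("foo[0-9]+").unwrap();
+let (bytes, pad) = original.to_bytes_native_endian();
+// Deserialize without copying. The new DFA borrows directly from `bytes`.
+let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..]).unwrap().0;
+assert_eq!(
+    Some(HalfMatch::must(0, 8)),
+    dfa.try_search_fwd(&Input::new("foo12345")).unwrap(),
+);
+```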
+
+In general, the above reflect the only uses of `unsafe` throughout the entire
+`regex` crate. At present, there are no plans to meaningfully expand the use
+of `unsafe`. With that said, one thing folks have been asking for is cheap
+deserialization of a `regex::Regex`. My sense is that this feature will require
+a lot more `unsafe` in places to support zero-copy deserialization. It is
+unclear at this point whether this will be pursued.
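+
+As a usage sketch of the `Pool` described in the first bullet above (the
+`unsafe` is internal to `Pool`; the API shown here is safe):
+
+```rust
+use regex_automata::util::pool::Pool;
+
+// A pool of scratch values. The closure builds a fresh value whenever the
+// pool needs one, e.g., for a new thread.
+let pool = Pool::new(|| String::new());
+// The first thread to call 'get' takes the mutex-free fast path.
+let mut scratch = pool.get();
+scratch.push_str("temporary workspace");
+assert_eq!(scratch.as_str(), "temporary workspace");
+```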
+
+
+### Motivation
+
+I started out building this crate because I wanted to re-work the `regex`
+crate internals to make it more amenable to optimizations. It turns out that
+there are a lot of different ways to build regex engines and even more ways to
+compose them. Moreover, heuristic literal optimizations are often tricky to
+get correct, but the fruit they bear is attractive. All of these things were
+difficult to expand upon without risking the introduction of more bugs. So I
+decided to tear things down and start fresh.
+
+In the course of doing so, I ended up designing strong boundaries between each
+component so that each component could be reasoned about and tested
+independently.
+This also made it somewhat natural to expose the components as a library unto
+itself. Namely, folks have been asking for more capabilities in the regex
+crate for a long time, but these capabilities usually come with additional API
+complexity that I didn't want to introduce in the `regex` crate proper. But
+exposing them in an "expert" level crate like `regex-automata` seemed quite
+fine.
+
+In the end, I do still somewhat consider this crate an experiment. It is
+unclear whether the strong boundaries between components will be an impediment
+to ongoing development or not. De-coupling tends to lead to slower development
+in my experience, and when you mix in the added cost of not introducing
+breaking changes all of the time, things can get quite complicated. But, I
+don't think anyone has ever released the internals of a regex engine as a
+library before. So it will be interesting to see how it plays out!
diff --git a/vendor/regex-automata/TODO b/vendor/regex-automata/TODO
deleted file mode 100644
index 68f018799..000000000
--- a/vendor/regex-automata/TODO
+++ /dev/null
@@ -1,13 +0,0 @@
-* Consider refactoring the NFA representation such that it can be instantly
- loaded from a `&[u8]`, just like a sparse DFA. Main downside is that this
- could negatively impact using the NFA with deserialization costs. Before
- doing this, we should write PikeVM and backtracking implementations so that
- they can be benchmarked.
-* Add captures to NFA.
-* Once we're happy, re-organize the public API such that NFAs are exported
- and usable on their own.
-
-* Investigate why NFA shrinking seems to produce bigger DFAs after
- determinization, even though it makes determinization substantially
- faster. This might be because of its use of sparse NFA states, which have
- a lower constant overhead associated with them.
diff --git a/vendor/regex-automata/UNLICENSE b/vendor/regex-automata/UNLICENSE
deleted file mode 100644
index 68a49daad..000000000
--- a/vendor/regex-automata/UNLICENSE
+++ /dev/null
@@ -1,24 +0,0 @@
-This is free and unencumbered software released into the public domain.
-
-Anyone is free to copy, modify, publish, use, compile, sell, or
-distribute this software, either in source code form or as a compiled
-binary, for any purpose, commercial or non-commercial, and by any
-means.
-
-In jurisdictions that recognize copyright laws, the author or authors
-of this software dedicate any and all copyright interest in the
-software to the public domain. We make this dedication for the benefit
-of the public at large and to the detriment of our heirs and
-successors. We intend this dedication to be an overt act of
-relinquishment in perpetuity of all present and future rights to this
-software under copyright law.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
-For more information, please refer to <http://unlicense.org/>
diff --git a/vendor/regex-automata/rustfmt.toml b/vendor/regex-automata/rustfmt.toml
deleted file mode 100644
index aa37a218b..000000000
--- a/vendor/regex-automata/rustfmt.toml
+++ /dev/null
@@ -1,2 +0,0 @@
-max_width = 79
-use_small_heuristics = "max"
diff --git a/vendor/regex-automata/src/dfa/accel.rs b/vendor/regex-automata/src/dfa/accel.rs
index dbfeb7932..5ea2423dd 100644
--- a/vendor/regex-automata/src/dfa/accel.rs
+++ b/vendor/regex-automata/src/dfa/accel.rs
@@ -49,12 +49,14 @@
//
// accels.get((id - min_accel_id) / dfa_stride)
-use core::convert::{TryFrom, TryInto};
-
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
use alloc::{vec, vec::Vec};
-use crate::util::bytes::{self, DeserializeError, Endian, SerializeError};
+use crate::util::{
+ int::Pointer,
+ memchr,
+ wire::{self, DeserializeError, Endian, SerializeError},
+};
/// The base type used to represent a collection of accelerators.
///
@@ -87,7 +89,7 @@ const ACCEL_CAP: usize = 8;
/// Search for between 1 and 3 needle bytes in the given haystack, starting the
/// search at the given position. If `needles` has a length other than 1-3,
/// then this panics.
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn find_fwd(
needles: &[u8],
haystack: &[u8],
@@ -107,7 +109,7 @@ pub(crate) fn find_fwd(
/// Search for between 1 and 3 needle bytes in the given haystack in reverse,
/// starting the search at the given position. If `needles` has a length other
/// than 1-3, then this panics.
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn find_rev(
needles: &[u8],
haystack: &[u8],
@@ -138,7 +140,7 @@ pub(crate) struct Accels<A> {
accels: A,
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl Accels<Vec<AccelTy>> {
/// Create an empty sequence of accelerators for a DFA.
pub fn empty() -> Accels<Vec<AccelTy>> {
@@ -180,48 +182,48 @@ impl<'a> Accels<&'a [AccelTy]> {
///
/// Callers may check the validity of every accelerator with the `validate`
/// method.
- pub unsafe fn from_bytes_unchecked(
+ pub fn from_bytes_unchecked(
mut slice: &'a [u8],
) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
- let slice_start = slice.as_ptr() as usize;
+ let slice_start = slice.as_ptr().as_usize();
- let (count, _) =
- bytes::try_read_u32_as_usize(slice, "accelerators count")?;
- // The accelerator count is part of the accel_tys slice that
+ let (accel_len, _) =
+ wire::try_read_u32_as_usize(slice, "accelerators length")?;
+ // The accelerator length is part of the accel_tys slice that
// we deserialize. This is perhaps a bit idiosyncratic. It would
- // probably be better to split out the count into a real field.
+ // probably be better to split out the length into a real field.
- let accel_tys_count = bytes::add(
- bytes::mul(count, 2, "total number of accelerator accel_tys")?,
+ let accel_tys_len = wire::add(
+ wire::mul(accel_len, 2, "total number of accelerator accel_tys")?,
1,
"total number of accel_tys",
)?;
- let accel_tys_len = bytes::mul(
+ let accel_tys_bytes_len = wire::mul(
ACCEL_TY_SIZE,
- accel_tys_count,
+ accel_tys_len,
"total number of bytes in accelerators",
)?;
- bytes::check_slice_len(slice, accel_tys_len, "accelerators")?;
- bytes::check_alignment::<AccelTy>(slice)?;
- let accel_tys = &slice[..accel_tys_len];
- slice = &slice[accel_tys_len..];
+ wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators")?;
+ wire::check_alignment::<AccelTy>(slice)?;
+ let accel_tys = &slice[..accel_tys_bytes_len];
+ slice = &slice[accel_tys_bytes_len..];
// SAFETY: We've checked the length and alignment above, and since
- // slice is just bytes, we can safely cast to a slice of &[AccelTy].
- #[allow(unused_unsafe)]
+ // slice is just bytes and AccelTy is just a u32, we can safely cast to
+ // a slice of &[AccelTy].
let accels = unsafe {
core::slice::from_raw_parts(
- accel_tys.as_ptr() as *const AccelTy,
- accel_tys_count,
+ accel_tys.as_ptr().cast::<AccelTy>(),
+ accel_tys_len,
)
};
- Ok((Accels { accels }, slice.as_ptr() as usize - slice_start))
+ Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start))
}
}
impl<A: AsRef<[AccelTy]>> Accels<A> {
/// Return an owned version of the accelerators.
#[cfg(feature = "alloc")]
- pub fn to_owned(&self) -> Accels<Vec<AccelTy>> {
+ pub fn to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>> {
Accels { accels: self.accels.as_ref().to_vec() }
}
@@ -237,7 +239,7 @@ impl<A: AsRef<[AccelTy]>> Accels<A> {
// and u8 always has a smaller alignment.
unsafe {
core::slice::from_raw_parts(
- accels.as_ptr() as *const u8,
+ accels.as_ptr().cast::<u8>(),
accels.len() * ACCEL_TY_SIZE,
)
}
@@ -261,14 +263,14 @@ impl<A: AsRef<[AccelTy]>> Accels<A> {
/// states are stored contiguously in the DFA and have an ordering implied
/// by their respective state IDs. The state's index in that sequence
/// corresponds to the index of its corresponding accelerator.
- #[inline(always)]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
pub fn needles(&self, i: usize) -> &[u8] {
if i >= self.len() {
panic!("invalid accelerator index {}", i);
}
let bytes = self.as_bytes();
let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
- let len = bytes[offset] as usize;
+ let len = usize::from(bytes[offset]);
&bytes[offset + 1..offset + 1 + len]
}
@@ -398,7 +400,7 @@ pub(crate) struct Accel {
impl Accel {
/// Returns an empty accel, where no bytes are accelerated.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub fn new() -> Accel {
Accel { bytes: [0; ACCEL_CAP] }
}
@@ -420,7 +422,7 @@ impl Accel {
///
/// If the given bytes are invalid, then this returns an error.
fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> {
- if bytes[0] as usize >= ACCEL_LEN {
+ if usize::from(bytes[0]) >= ACCEL_LEN {
return Err(DeserializeError::generic(
"accelerator bytes cannot have length more than 3",
));
@@ -438,18 +440,25 @@ impl Accel {
}
/// Attempts to add the given byte to this accelerator. If the accelerator
- /// is already full then this returns false. Otherwise, returns true.
+ /// is already full or thinks the byte is a poor accelerator, then this
+ /// returns false. Otherwise, returns true.
///
/// If the given byte is already in this accelerator, then it panics.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub fn add(&mut self, byte: u8) -> bool {
if self.len() >= 3 {
return false;
}
+ // As a special case, we totally reject trying to accelerate a state
+ // with an ASCII space. In most cases, it occurs very frequently, and
+ // tends to result in worse overall performance.
+ if byte == b' ' {
+ return false;
+ }
assert!(
!self.contains(byte),
"accelerator already contains {:?}",
- crate::util::DebugByte(byte)
+ crate::util::escape::DebugByte(byte)
);
self.bytes[self.len() + 1] = byte;
self.bytes[0] += 1;
@@ -458,11 +467,11 @@ impl Accel {
/// Return the number of bytes in this accelerator.
pub fn len(&self) -> usize {
- self.bytes[0] as usize
+ usize::from(self.bytes[0])
}
/// Returns true if and only if there are no bytes in this accelerator.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub fn is_empty(&self) -> bool {
self.len() == 0
}
@@ -476,13 +485,13 @@ impl Accel {
/// Returns true if and only if this accelerator will accelerate the given
/// byte.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
fn contains(&self, byte: u8) -> bool {
self.needles().iter().position(|&b| b == byte).is_some()
}
/// Returns the accelerator bytes as an array of AccelTys.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
fn as_accel_tys(&self) -> [AccelTy; 2] {
assert_eq!(ACCEL_CAP, 8);
// These unwraps are OK since ACCEL_CAP is set to 8.
@@ -499,7 +508,7 @@ impl core::fmt::Debug for Accel {
write!(f, "Accel(")?;
let mut set = f.debug_set();
for &b in self.needles() {
- set.entry(&crate::util::DebugByte(b));
+ set.entry(&crate::util::escape::DebugByte(b));
}
set.finish()?;
write!(f, ")")
diff --git a/vendor/regex-automata/src/dfa/automaton.rs b/vendor/regex-automata/src/dfa/automaton.rs
index 08bd6722a..7e2be9a15 100644
--- a/vendor/regex-automata/src/dfa/automaton.rs
+++ b/vendor/regex-automata/src/dfa/automaton.rs
@@ -1,9 +1,12 @@
+#[cfg(feature = "alloc")]
+use crate::util::search::PatternSet;
use crate::{
dfa::search,
util::{
- id::{PatternID, StateID},
- matchtypes::{HalfMatch, MatchError},
- prefilter,
+ empty,
+ prefilter::Prefilter,
+ primitives::{PatternID, StateID},
+ search::{Anchored, HalfMatch, Input, MatchError},
},
};
@@ -27,8 +30,8 @@ use crate::{
/// * A DFA can search for multiple patterns simultaneously. This
/// means extra information is returned when a match occurs. Namely,
/// a match is not just an offset, but an offset plus a pattern ID.
-/// [`Automaton::pattern_count`] returns the number of patterns compiled into
-/// the DFA, [`Automaton::match_count`] returns the total number of patterns
+/// [`Automaton::pattern_len`] returns the number of patterns compiled into
+/// the DFA, [`Automaton::match_len`] returns the total number of patterns
/// that match in a particular state and [`Automaton::match_pattern`] permits
/// iterating over the patterns that match in a particular state.
/// * A DFA can have multiple start states, and the choice of which start
@@ -76,12 +79,10 @@ use crate::{
/// the state can be queried via the [`Automaton::accelerator`] method.
///
/// There are a number of provided methods on this trait that implement
-/// efficient searching (for forwards and backwards) with a DFA using all of
-/// the above features of this trait. In particular, given the complexity of
-/// all these features, implementing a search routine in this trait is not
-/// straight forward. If you need to do this for specialized reasons, then
-/// it's recommended to look at the source of this crate. It is intentionally
-/// well commented to help with this. With that said, it is possible to
+/// efficient searching (for forwards and backwards) with a DFA using
+/// all of the above features of this trait. In particular, given the
+/// complexity of all these features, implementing a search routine in
+/// this trait can be a little subtle. With that said, it is possible to
/// somewhat simplify the search routine. For example, handling accelerated
/// states is strictly optional, since it is always correct to assume that
/// `Automaton::is_accel_state` returns false. However, one complex part of
@@ -90,13 +91,19 @@ use crate::{
///
/// # Safety
///
-/// This trait is unsafe to implement because DFA searching may rely on the
-/// correctness of the implementation for memory safety. For example, DFA
-/// searching may use explicit bounds check elision, which will in turn rely
-/// on the correctness of every function that returns a state ID.
+/// This trait is not safe to implement so that code may rely on the
+/// correctness of implementations of this trait to avoid undefined behavior.
+/// The primary correctness guarantees are:
///
-/// When implementing this trait, one must uphold the documented correctness
-/// guarantees. Otherwise, undefined behavior may occur.
+/// * `Automaton::start_state` always returns a valid state ID or an error or
+/// panics.
+/// * `Automaton::next_state`, when given a valid state ID, always returns
+/// a valid state ID for all values of `anchored` and `byte`, or otherwise
+/// panics.
+///
+/// In general, the rest of the methods on `Automaton` need to uphold their
+/// contracts as well. For example, `Automaton::is_dead_state` should only
+/// return true if the given state ID is actually a dead state.
pub unsafe trait Automaton {
/// Transitions from the current state to the next state, given the next
/// byte of input.
@@ -118,16 +125,14 @@ pub unsafe trait Automaton {
/// by using the `next_state` method.
///
/// ```
- /// use regex_automata::dfa::{Automaton, dense};
+ /// use regex_automata::{dfa::{Automaton, dense}, Input};
///
/// let dfa = dense::DFA::new(r"[a-z]+r")?;
/// let haystack = "bar".as_bytes();
///
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack.
- /// let mut state = dfa.start_state_forward(
- /// None, haystack, 0, haystack.len(),
- /// );
+ /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
/// // Walk all the bytes in the haystack.
/// for &b in haystack {
/// state = dfa.next_state(state, b);
@@ -195,16 +200,17 @@ pub unsafe trait Automaton {
/// and then finishing the search with the final EOI transition.
///
/// ```
- /// use regex_automata::dfa::{Automaton, dense};
+ /// use regex_automata::{dfa::{Automaton, dense}, Input};
///
/// let dfa = dense::DFA::new(r"[a-z]+r")?;
/// let haystack = "bar".as_bytes();
///
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack.
- /// let mut state = dfa.start_state_forward(
- /// None, haystack, 0, haystack.len(),
- /// );
+ /// //
+    /// // This can't fail since we aren't requesting a start state for a
+    /// // specific pattern.
+ /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
/// // Walk all the bytes in the haystack.
/// for &b in haystack {
/// state = dfa.next_state(state, b);
@@ -220,78 +226,118 @@ pub unsafe trait Automaton {
/// ```
fn next_eoi_state(&self, current: StateID) -> StateID;
- /// Return the ID of the start state for this DFA when executing a forward
- /// search.
+    /// Return the ID of the start state for this DFA when executing a
+ /// forward search.
///
/// Unlike typical DFA implementations, the start state for DFAs in this
/// crate is dependent on a few different factors:
///
- /// * The pattern ID, if present. When the underlying DFA has been compiled
- /// with multiple patterns _and_ the DFA has been configured to compile
- /// an anchored start state for each pattern, then a pattern ID may be
- /// specified to execute an anchored search for that specific pattern.
- /// If `pattern_id` is invalid or if the DFA doesn't have start states
- /// compiled for each pattern, then implementations must panic. DFAs in
- /// this crate can be configured to compile start states for each pattern
- /// via
- /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern).
- /// * When `start > 0`, the byte at index `start - 1` may influence the
- /// start state if the regex uses `^` or `\b`.
- /// * Similarly, when `start == 0`, it may influence the start state when
- /// the regex uses `^` or `\A`.
- /// * Currently, `end` is unused.
+ /// * The [`Anchored`] mode of the search. Unanchored, anchored and
+ /// anchored searches for a specific [`PatternID`] all use different start
+ /// states.
+ /// * The position at which the search begins, via [`Input::start`]. This
+ /// and the byte immediately preceding the start of the search (if one
+ /// exists) influence which look-behind assertions are true at the start
+ /// of the search. This in turn influences which start state is selected.
/// * Whether the search is a forward or reverse search. This routine can
/// only be used for forward searches.
///
- /// # Panics
+ /// # Errors
///
- /// Implementations must panic if `start..end` is not a valid sub-slice of
- /// `bytes`. Implementations must also panic if `pattern_id` is non-None
- /// and does not refer to a valid pattern, or if the DFA was not compiled
- /// with anchored start states for each pattern.
+ /// This may return a [`MatchError`] if the search needs to give up
+ /// when determining the start state (for example, if it sees a "quit"
+ /// byte). This can also return an error if the given `Input` contains an
+ /// unsupported [`Anchored`] configuration.
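+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of computing a forward start state with a
+    /// default-configured dense DFA:
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense}, Input};
+    ///
+    /// let dfa = dense::DFA::new(r"[a-z]+")?;
+    /// let state = dfa.start_state_forward(&Input::new("bar"))?;
+    /// // A start state is never a dead state and never a match state
+    /// // (matches are delayed by one byte).
+    /// assert!(!dfa.is_dead_state(state));
+    /// assert!(!dfa.is_match_state(state));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```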
fn start_state_forward(
&self,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> StateID;
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError>;
- /// Return the ID of the start state for this DFA when executing a reverse
- /// search.
+    /// Return the ID of the start state for this DFA when executing a
+ /// reverse search.
///
/// Unlike typical DFA implementations, the start state for DFAs in this
/// crate is dependent on a few different factors:
///
- /// * The pattern ID, if present. When the underlying DFA has been compiled
- /// with multiple patterns _and_ the DFA has been configured to compile an
- /// anchored start state for each pattern, then a pattern ID may be
- /// specified to execute an anchored search for that specific pattern. If
- /// `pattern_id` is invalid or if the DFA doesn't have start states compiled
- /// for each pattern, then implementations must panic. DFAs in this crate
- /// can be configured to compile start states for each pattern via
- /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern).
- /// * When `end < bytes.len()`, the byte at index `end` may influence the
- /// start state if the regex uses `$` or `\b`.
- /// * Similarly, when `end == bytes.len()`, it may influence the start
- /// state when the regex uses `$` or `\z`.
- /// * Currently, `start` is unused.
+ /// * The [`Anchored`] mode of the search. Unanchored, anchored and
+ /// anchored searches for a specific [`PatternID`] all use different start
+ /// states.
+    /// * The position at which the search ends, via [`Input::end`]. This
+    /// and the byte immediately following the end of the search (if one
+    /// exists) influence which look-ahead assertions are true at the end
+    /// of the search. This in turn influences which start state is selected.
/// * Whether the search is a forward or reverse search. This routine can
/// only be used for reverse searches.
///
- /// # Panics
+ /// # Errors
///
- /// Implementations must panic if `start..end` is not a valid sub-slice of
- /// `bytes`. Implementations must also panic if `pattern_id` is non-None
- /// and does not refer to a valid pattern, or if the DFA was not compiled
- /// with anchored start states for each pattern.
+ /// This may return a [`MatchError`] if the search needs to give up
+ /// when determining the start state (for example, if it sees a "quit"
+ /// byte). This can also return an error if the given `Input` contains an
+ /// unsupported [`Anchored`] configuration.
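+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of computing a reverse start state. A DFA intended
+    /// for reverse searching is typically compiled from a reversed NFA, as
+    /// done here (this configuration is an assumption of the sketch, not a
+    /// requirement of this method):
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense}, nfa::thompson, Input};
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build(r"[a-z]+")?;
+    /// let state = dfa.start_state_reverse(&Input::new("bar"))?;
+    /// assert!(!dfa.is_dead_state(state));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```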
fn start_state_reverse(
&self,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> StateID;
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError>;
+
+ /// If this DFA has a universal starting state for the given anchor mode
+ /// and the DFA supports universal starting states, then this returns that
+ /// state's identifier.
+ ///
+ /// A DFA is said to have a universal starting state when the starting
+ /// state is invariant with respect to the haystack. Usually, the starting
+ /// state is chosen depending on the bytes immediately surrounding the
+ /// starting position of a search. However, the starting state only differs
+ /// when one or more of the patterns in the DFA have look-around assertions
+    /// in their prefix.
+ ///
+ /// Stated differently, if none of the patterns in a DFA have look-around
+ /// assertions in their prefix, then the DFA has a universal starting state
+ /// and _may_ be returned by this method.
+ ///
+ /// It is always correct for implementations to return `None`, and indeed,
+ /// this is what the default implementation does. When this returns `None`,
+ /// callers must use either `start_state_forward` or `start_state_reverse`
+ /// to get the starting state.
+ ///
+ /// # Use case
+ ///
+ /// There are a few reasons why one might want to use this:
+ ///
+ /// * If you know your regex patterns have no look-around assertions in
+ /// their prefix, then calling this routine is likely cheaper and perhaps
+ /// more semantically meaningful.
+ /// * When implementing prefilter support in a DFA regex implementation,
+ /// it is necessary to re-compute the start state after a candidate
+ /// is returned from the prefilter. However, this is only needed when
+ /// there isn't a universal start state. When one exists, one can avoid
+ /// re-computing the start state.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense::DFA},
+ /// Anchored,
+ /// };
+ ///
+ /// // There are no look-around assertions in the prefixes of any of the
+ /// // patterns, so we get a universal start state.
+ /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+$", "[A-Z]+"])?;
+ /// assert!(dfa.universal_start_state(Anchored::No).is_some());
+ /// assert!(dfa.universal_start_state(Anchored::Yes).is_some());
+ ///
+ /// // One of the patterns has a look-around assertion in its prefix,
+ /// // so this means there is no longer a universal start state.
+ /// let dfa = DFA::new_many(&["[0-9]+", "^[a-z]+$", "[A-Z]+"])?;
+    /// assert!(dfa.universal_start_state(Anchored::No).is_none());
+    /// assert!(dfa.universal_start_state(Anchored::Yes).is_none());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn universal_start_state(&self, _mode: Anchored) -> Option<StateID> {
+ None
+ }
/// Returns true if and only if the given identifier corresponds to a
/// "special" state. A special state is one or more of the following:
@@ -322,10 +368,10 @@ pub unsafe trait Automaton {
/// ```
/// use regex_automata::{
/// dfa::{Automaton, dense},
- /// HalfMatch, MatchError, PatternID,
+ /// HalfMatch, MatchError, Input,
/// };
///
- /// fn find_leftmost_first<A: Automaton>(
+ /// fn find<A: Automaton>(
/// dfa: &A,
/// haystack: &[u8],
/// ) -> Result<Option<HalfMatch>, MatchError> {
@@ -333,9 +379,7 @@ pub unsafe trait Automaton {
/// // initial bytes of the haystack. Note that start states can never
/// // be match states (since DFAs in this crate delay matches by 1
/// // byte), so we don't need to check if the start state is a match.
- /// let mut state = dfa.start_state_forward(
- /// None, haystack, 0, haystack.len(),
- /// );
+ /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
/// let mut last_match = None;
/// // Walk all the bytes in the haystack. We can quit early if we see
/// // a dead or a quit state. The former means the automaton will
@@ -358,7 +402,7 @@ pub unsafe trait Automaton {
/// if last_match.is_some() {
/// return Ok(last_match);
/// }
- /// return Err(MatchError::Quit { byte: b, offset: i });
+ /// return Err(MatchError::quit(b, i));
/// }
/// // Implementors may also want to check for start or accel
/// // states and handle them differently for performance
@@ -383,7 +427,7 @@ pub unsafe trait Automaton {
/// // early. Greediness is built into the automaton.
/// let dfa = dense::DFA::new(r"[a-z]+")?;
/// let haystack = "123 foobar 4567".as_bytes();
- /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap();
+ /// let mat = find(&dfa, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 10);
///
@@ -393,7 +437,7 @@ pub unsafe trait Automaton {
/// // found until the final byte in the haystack.
/// let dfa = dense::DFA::new(r"[0-9]{4}")?;
/// let haystack = "123 foobar 4567".as_bytes();
- /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap();
+ /// let mat = find(&dfa, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 15);
///
@@ -402,13 +446,13 @@ pub unsafe trait Automaton {
/// // the appropriate pattern ID for us.
/// let dfa = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
/// let haystack = "123 foobar 4567".as_bytes();
- /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap();
+ /// let mat = find(&dfa, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 1);
/// assert_eq!(mat.offset(), 3);
- /// let mat = find_leftmost_first(&dfa, &haystack[3..])?.unwrap();
+ /// let mat = find(&dfa, &haystack[3..])?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 7);
- /// let mat = find_leftmost_first(&dfa, &haystack[10..])?.unwrap();
+ /// let mat = find(&dfa, &haystack[10..])?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 1);
/// assert_eq!(mat.offset(), 5);
///
@@ -458,13 +502,6 @@ pub unsafe trait Automaton {
/// since state identifiers are pre-multiplied by the state machine's
/// alphabet stride, and the alphabet stride varies between DFAs.)
///
- /// By default, state machines created by this crate will never enter a
- /// quit state. Since entering a quit state is the only way for a DFA
- /// in this crate to fail at search time, it follows that the default
- /// configuration can never produce a match error. Nevertheless, handling
- /// quit states is necessary to correctly support all configurations in
- /// this crate.
- ///
/// The typical way in which a quit state can occur is when heuristic
/// support for Unicode word boundaries is enabled via the
/// [`dense::Config::unicode_word_boundary`](crate::dfa::dense::Config::unicode_word_boundary)
@@ -474,9 +511,8 @@ pub unsafe trait Automaton {
/// purpose of the quit state is to provide a way to execute a fast DFA
/// in common cases while delegating to slower routines when the DFA quits.
///
- /// The default search implementations provided by this crate will return
- /// a [`MatchError::Quit`](crate::MatchError::Quit) error when a quit state
- /// is entered.
+ /// The default search implementations provided by this crate will return a
+ /// [`MatchError::quit`] error when a quit state is entered.
///
/// # Example
///
@@ -513,8 +549,10 @@ pub unsafe trait Automaton {
/// method correctly.
fn is_match_state(&self, id: StateID) -> bool;
- /// Returns true if and only if the given identifier corresponds to a
- /// start state. A start state is a state in which a DFA begins a search.
+ /// Returns true only if the given identifier corresponds to a start
+    /// state.
+ ///
+ /// A start state is a state in which a DFA begins a search.
/// All searches begin in a start state. Moreover, since all matches are
/// delayed by one byte, a start state can never be a match state.
///
@@ -531,25 +569,38 @@ pub unsafe trait Automaton {
/// begin with that prefix, then skipping ahead to occurrences of that
/// prefix may be much faster than executing the DFA.
///
+ /// As mentioned in the documentation for
+ /// [`is_special_state`](Automaton::is_special_state) implementations
+ /// _may_ always return false, even if the given identifier is a start
+ /// state. This is because knowing whether a state is a start state or not
+ /// is not necessary for correctness and is only treated as a potential
+ /// performance optimization. (For example, the implementations of this
+ /// trait in this crate will only return true when the given identifier
+ /// corresponds to a start state and when [specialization of start
+ /// states](crate::dfa::dense::Config::specialize_start_states) was enabled
+ /// during DFA construction. If start state specialization is disabled
+ /// (which is the default), then this method will always return false.)
+ ///
/// # Example
///
/// This example shows how to implement your own search routine that does
/// a prefix search whenever the search enters a start state.
///
- /// Note that you do not need to implement your own search routine to
- /// make use of prefilters like this. The search routines provided
- /// by this crate already implement prefilter support via the
- /// [`Prefilter`](crate::util::prefilter::Prefilter) trait. The various
- /// `find_*_at` routines on this trait support the `Prefilter` trait
- /// through [`Scanner`](crate::util::prefilter::Scanner)s. This example is
- /// meant to show how you might deal with prefilters in a simplified case
- /// if you are implementing your own search routine.
+ /// Note that you do not need to implement your own search routine
+ /// to make use of prefilters like this. The search routines
+ /// provided by this crate already implement prefilter support via
+ /// the [`Prefilter`](crate::util::prefilter::Prefilter) trait.
+ /// A prefilter can be added to your search configuration with
+ /// [`dense::Config::prefilter`](crate::dfa::dense::Config::prefilter) for
+ /// dense and sparse DFAs in this crate.
+ ///
+ /// This example is meant to show how you might deal with prefilters in a
+ /// simplified case if you are implementing your own search routine.
///
/// ```
/// use regex_automata::{
- /// MatchError, PatternID,
/// dfa::{Automaton, dense},
- /// HalfMatch,
+ /// HalfMatch, MatchError, Input,
/// };
///
/// fn find_byte(slice: &[u8], at: usize, byte: u8) -> Option<usize> {
@@ -558,7 +609,7 @@ pub unsafe trait Automaton {
/// slice[at..].iter().position(|&b| b == byte).map(|i| at + i)
/// }
///
- /// fn find_leftmost_first<A: Automaton>(
+ /// fn find<A: Automaton>(
/// dfa: &A,
/// haystack: &[u8],
/// prefix_byte: Option<u8>,
@@ -566,9 +617,7 @@ pub unsafe trait Automaton {
/// // See the Automaton::is_special_state example for similar code
/// // with more comments.
///
- /// let mut state = dfa.start_state_forward(
- /// None, haystack, 0, haystack.len(),
- /// );
+ /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
/// let mut last_match = None;
/// let mut pos = 0;
/// while pos < haystack.len() {
@@ -590,9 +639,7 @@ pub unsafe trait Automaton {
/// if last_match.is_some() {
/// return Ok(last_match);
/// }
- /// return Err(MatchError::Quit {
- /// byte: b, offset: pos - 1,
- /// });
+ /// return Err(MatchError::quit(b, pos - 1));
/// } else if dfa.is_start_state(state) {
/// // If we're in a start state and know all matches begin
/// // with a particular byte, then we can quickly skip to
@@ -620,22 +667,27 @@ pub unsafe trait Automaton {
/// }
///
/// // In this example, it's obvious that all occurrences of our pattern
- /// // begin with 'Z', so we pass in 'Z'.
- /// let dfa = dense::DFA::new(r"Z[a-z]+")?;
+ /// // begin with 'Z', so we pass in 'Z'. Note also that we need to
+ /// // enable start state specialization, or else it won't be possible to
+ /// // detect start states during a search. ('is_start_state' would always
+ /// // return false.)
+ /// let dfa = dense::DFA::builder()
+ /// .configure(dense::DFA::config().specialize_start_states(true))
+ /// .build(r"Z[a-z]+")?;
/// let haystack = "123 foobar Zbaz quux".as_bytes();
- /// let mat = find_leftmost_first(&dfa, haystack, Some(b'Z'))?.unwrap();
+ /// let mat = find(&dfa, haystack, Some(b'Z'))?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 15);
///
/// // But note that we don't need to pass in a prefix byte. If we don't,
/// // then the search routine does no acceleration.
- /// let mat = find_leftmost_first(&dfa, haystack, None)?.unwrap();
+ /// let mat = find(&dfa, haystack, None)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 15);
///
+    /// // However, if we pass an incorrect byte, then the prefix search
+    /// // will produce incorrect results.
- /// assert_eq!(find_leftmost_first(&dfa, haystack, Some(b'X'))?, None);
+ /// assert_eq!(find(&dfa, haystack, Some(b'X'))?, None);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
@@ -695,13 +747,13 @@ pub unsafe trait Automaton {
///
/// # Example
///
- /// This example shows the pattern count for a DFA that never matches:
+ /// This example shows the pattern length for a DFA that never matches:
///
/// ```
/// use regex_automata::dfa::{Automaton, dense::DFA};
///
/// let dfa: DFA<Vec<u32>> = DFA::never_match()?;
- /// assert_eq!(dfa.pattern_count(), 0);
+ /// assert_eq!(dfa.pattern_len(), 0);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
@@ -711,7 +763,7 @@ pub unsafe trait Automaton {
/// use regex_automata::dfa::{Automaton, dense::DFA};
///
/// let dfa: DFA<Vec<u32>> = DFA::always_match()?;
- /// assert_eq!(dfa.pattern_count(), 1);
+ /// assert_eq!(dfa.pattern_len(), 1);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
@@ -721,10 +773,10 @@ pub unsafe trait Automaton {
/// use regex_automata::dfa::{Automaton, dense::DFA};
///
/// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
- /// assert_eq!(dfa.pattern_count(), 3);
+ /// assert_eq!(dfa.pattern_len(), 3);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- fn pattern_count(&self) -> usize;
+ fn pattern_len(&self) -> usize;
/// Returns the total number of patterns that match in this state.
///
@@ -734,8 +786,8 @@ pub unsafe trait Automaton {
/// If the DFA was compiled with one pattern, then this must necessarily
/// always return `1` for all match states.
///
- /// Implementations must guarantee that [`Automaton::match_pattern`] can
- /// be called with indices up to (but not including) the count returned by
+ /// Implementations must guarantee that [`Automaton::match_pattern`] can be
+ /// called with indices up to (but not including) the length returned by
/// this routine without panicking.
///
/// # Panics
@@ -750,12 +802,13 @@ pub unsafe trait Automaton {
/// patterns have matched in a particular state, but also how to access
/// which specific patterns have matched.
///
- /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All)
+ /// Notice that we must use
+ /// [`MatchKind::All`](crate::MatchKind::All)
/// when building the DFA. If we used
/// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst)
- /// instead, then the DFA would not be constructed in a way that supports
- /// overlapping matches. (It would only report a single pattern that
- /// matches at any particular point in time.)
+ /// instead, then the DFA would not be constructed in a way that
+ /// supports overlapping matches. (It would only report a single pattern
+ /// that matches at any particular point in time.)
///
/// Another thing to take note of is the patterns used and the order in
/// which the pattern IDs are reported. In the example below, pattern `3`
@@ -766,23 +819,19 @@ pub unsafe trait Automaton {
/// other.
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, dense},
- /// MatchKind,
- /// };
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::{Automaton, dense}, Input, MatchKind};
///
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new().match_kind(MatchKind::All))
/// .build_many(&[
- /// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
+ /// r"[[:word:]]+", r"[a-z]+", r"[A-Z]+", r"[[:^space:]]+",
/// ])?;
/// let haystack = "@bar".as_bytes();
///
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack.
- /// let mut state = dfa.start_state_forward(
- /// None, haystack, 0, haystack.len(),
- /// );
+ /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
/// // Walk all the bytes in the haystack.
/// for &b in haystack {
/// state = dfa.next_state(state, b);
@@ -790,8 +839,8 @@ pub unsafe trait Automaton {
/// state = dfa.next_eoi_state(state);
///
/// assert!(dfa.is_match_state(state));
- /// assert_eq!(dfa.match_count(state), 3);
- /// // The following calls are guaranteed to not panic since `match_count`
+ /// assert_eq!(dfa.match_len(state), 3);
+ /// // The following calls are guaranteed to not panic since `match_len`
/// // returned `3` above.
/// assert_eq!(dfa.match_pattern(state, 0).as_usize(), 3);
/// assert_eq!(dfa.match_pattern(state, 1).as_usize(), 0);
@@ -799,19 +848,19 @@ pub unsafe trait Automaton {
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- fn match_count(&self, id: StateID) -> usize;
+ fn match_len(&self, id: StateID) -> usize;
/// Returns the pattern ID corresponding to the given match index in the
/// given state.
///
- /// See [`Automaton::match_count`] for an example of how to use this
+ /// See [`Automaton::match_len`] for an example of how to use this
/// method correctly. Note that if you know your DFA is compiled with a
/// single pattern, then this routine is never necessary since it will
/// always return a pattern ID of `0` for an index of `0` when `id`
/// corresponds to a match state.
///
/// Typically, this routine is used when implementing an overlapping
- /// search, as the example for `Automaton::match_count` does.
+ /// search, as the example for `Automaton::match_len` does.
///
/// # Panics
///
@@ -822,12 +871,182 @@ pub unsafe trait Automaton {
/// `PatternID`.
fn match_pattern(&self, id: StateID, index: usize) -> PatternID;
+ /// Returns true if and only if this automaton can match the empty string.
+ /// When it returns false, all possible matches are guaranteed to have a
+ /// non-zero length.
+ ///
+    /// This is useful as a cheap way to know whether code needs to handle the
+ /// case of a zero length match. This is particularly important when UTF-8
+ /// modes are enabled, as when UTF-8 mode is enabled, empty matches that
+ /// split a codepoint must never be reported. This extra handling can
+ /// sometimes be costly, and since regexes matching an empty string are
+ /// somewhat rare, it can be beneficial to treat such regexes specially.
+ ///
+ /// # Example
+ ///
+ /// This example shows a few different DFAs and whether they match the
+ /// empty string or not. Notice the empty string isn't merely a matter
+ /// of a string of length literally `0`, but rather, whether a match can
+ /// occur between specific pairs of bytes.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{dense::DFA, Automaton}, util::syntax};
+ ///
+ /// // The empty regex matches the empty string.
+ /// let dfa = DFA::new("")?;
+ /// assert!(dfa.has_empty(), "empty matches empty");
+ /// // The '+' repetition operator requires at least one match, and so
+ /// // does not match the empty string.
+ /// let dfa = DFA::new("a+")?;
+ /// assert!(!dfa.has_empty(), "+ does not match empty");
+ /// // But the '*' repetition operator does.
+ /// let dfa = DFA::new("a*")?;
+ /// assert!(dfa.has_empty(), "* does match empty");
+ /// // And wrapping '+' in an operator that can match an empty string also
+ /// // causes it to match the empty string too.
+ /// let dfa = DFA::new("(a+)*")?;
+ /// assert!(dfa.has_empty(), "+ inside of * matches empty");
+ ///
+ /// // If a regex is just made of a look-around assertion, even if the
+ /// // assertion requires some kind of non-empty string around it (such as
+ /// // \b), then it is still treated as if it matches the empty string.
+ /// // Namely, if a match of just a look-around assertion occurs, then the
+ /// // match returned is empty.
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().unicode_word_boundary(true))
+ /// .syntax(syntax::Config::new().utf8(false))
+ /// .build(r"^$\A\z\b\B(?-u:\b\B)")?;
+ /// assert!(dfa.has_empty(), "assertions match empty");
+ /// // Even when an assertion is wrapped in a '+', it still matches the
+ /// // empty string.
+ /// let dfa = DFA::new(r"^+")?;
+ /// assert!(dfa.has_empty(), "+ of an assertion matches empty");
+ ///
+ /// // An alternation with even one branch that can match the empty string
+ /// // is also said to match the empty string overall.
+ /// let dfa = DFA::new("foo|(bar)?|quux")?;
+ /// assert!(dfa.has_empty(), "alternations can match empty");
+ ///
+ /// // An NFA that matches nothing does not match the empty string.
+ /// let dfa = DFA::new("[a&&b]")?;
+ /// assert!(!dfa.has_empty(), "never matching means not matching empty");
+ /// // But if it's wrapped in something that doesn't require a match at
+ /// // all, then it can match the empty string!
+ /// let dfa = DFA::new("[a&&b]*")?;
+ /// assert!(dfa.has_empty(), "* on never-match still matches empty");
+ /// // Since a '+' requires a match, using it on something that can never
+ /// // match will itself produce a regex that can never match anything,
+ /// // and thus does not match the empty string.
+ /// let dfa = DFA::new("[a&&b]+")?;
+ /// assert!(!dfa.has_empty(), "+ on never-match still matches nothing");
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn has_empty(&self) -> bool;
+
+ /// Whether UTF-8 mode is enabled for this DFA or not.
+ ///
+ /// When UTF-8 mode is enabled, all matches reported by a DFA are
+ /// guaranteed to correspond to spans of valid UTF-8. This includes
+ /// zero-width matches. For example, the DFA must guarantee that the empty
+ /// regex will not match at the positions between code units in the UTF-8
+ /// encoding of a single codepoint.
+ ///
+ /// See [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) for
+ /// more information.
+ ///
+ /// # Example
+ ///
+ /// This example shows how UTF-8 mode can impact the match spans that may
+ /// be reported in certain cases.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton},
+ /// nfa::thompson,
+ /// HalfMatch, Input,
+ /// };
+ ///
+ /// // UTF-8 mode is enabled by default.
+ /// let re = DFA::new("")?;
+ /// assert!(re.is_utf8());
+ /// let mut input = Input::new("☃");
+ /// let got = re.try_search_fwd(&input)?;
+ /// assert_eq!(Some(HalfMatch::must(0, 0)), got);
+ ///
+ /// // Even though an empty regex matches at 1..1, our next match is
+ /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
+ /// // three bytes long).
+ /// input.set_start(1);
+ /// let got = re.try_search_fwd(&input)?;
+ /// assert_eq!(Some(HalfMatch::must(0, 3)), got);
+ ///
+ /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
+ /// let re = DFA::builder()
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build("")?;
+ /// assert!(!re.is_utf8());
+ /// let got = re.try_search_fwd(&input)?;
+ /// assert_eq!(Some(HalfMatch::must(0, 1)), got);
+ ///
+ /// input.set_start(2);
+ /// let got = re.try_search_fwd(&input)?;
+ /// assert_eq!(Some(HalfMatch::must(0, 2)), got);
+ ///
+ /// input.set_start(3);
+ /// let got = re.try_search_fwd(&input)?;
+ /// assert_eq!(Some(HalfMatch::must(0, 3)), got);
+ ///
+ /// input.set_start(4);
+ /// let got = re.try_search_fwd(&input)?;
+ /// assert_eq!(None, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn is_utf8(&self) -> bool;
+
+ /// Returns true if and only if this DFA is limited to returning matches
+ /// whose start position is `0`.
+ ///
+ /// Note that if you're using DFAs provided by
+ /// this crate, then this is _orthogonal_ to
+ /// [`Config::start_kind`](crate::dfa::dense::Config::start_kind).
+ ///
+ /// This is useful in some cases because if a DFA is limited to producing
+ /// matches that start at offset `0`, then a reverse search is never
+ /// required for finding the start of a match.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::dfa::{dense::DFA, Automaton};
+ ///
+ /// // The empty regex matches anywhere.
+ /// let dfa = DFA::new("")?;
+ /// assert!(!dfa.is_always_start_anchored(), "empty matches anywhere");
+ /// // 'a' matches anywhere.
+ /// let dfa = DFA::new("a")?;
+ /// assert!(!dfa.is_always_start_anchored(), "'a' matches anywhere");
+ /// // '^' only matches at offset 0!
+ /// let dfa = DFA::new("^a")?;
+ /// assert!(dfa.is_always_start_anchored(), "'^a' matches only at 0");
+ /// // But '(?m:^)' matches at 0 and at other offsets too.
+ /// let dfa = DFA::new("(?m:^)a")?;
+ /// assert!(!dfa.is_always_start_anchored(), "'(?m:^)a' matches anywhere");
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn is_always_start_anchored(&self) -> bool;
+
/// Return a slice of bytes to accelerate for the given state, if possible.
///
/// If the given state has no accelerator, then an empty slice must be
- /// returned. If `Automaton::is_accel_state` returns true for the given
- /// ID, then this routine _must_ return a non-empty slice, but it is not
- /// required to do so.
+ /// returned. If `Automaton::is_accel_state` returns true for the given ID,
+ /// then this routine _must_ return a non-empty slice. But note that it is
+ /// not required for an implementation of this trait to ever return `true`
+ /// for `is_accel_state`, even if the state _could_ be accelerated. That
+ /// is, acceleration is an optional optimization. But the return values of
+ /// `is_accel_state` and `accelerator` must be in sync.
///
/// If the given ID is not a valid state ID for this automaton, then
/// implementations may panic or produce incorrect results.
@@ -844,22 +1063,19 @@ pub unsafe trait Automaton {
///
/// ```
/// use regex_automata::{
- /// nfa::thompson,
/// dfa::{Automaton, dense},
- /// util::id::StateID,
- /// SyntaxConfig,
+ /// util::{primitives::StateID, syntax},
/// };
///
/// let dfa = dense::Builder::new()
/// // We disable Unicode everywhere and permit the regex to match
- /// // invalid UTF-8. e.g., `[^abc]` matches `\xFF`, which is not valid
- /// // UTF-8.
- /// .syntax(SyntaxConfig::new().unicode(false).utf8(false))
- /// // This makes the implicit `(?s:.)*?` prefix added to the regex
- /// // match through arbitrary bytes instead of being UTF-8 aware. This
- /// // isn't necessary to get acceleration to work in this case, but
- /// // it does make the DFA substantially simpler.
- /// .thompson(thompson::Config::new().utf8(false))
+ /// // invalid UTF-8. e.g., [^abc] matches \xFF, which is not valid
+ /// // UTF-8. If we left Unicode enabled, [^abc] would match any UTF-8
+ /// // encoding of any Unicode scalar value except for 'a', 'b' or 'c'.
+ /// // That translates to a much more complicated DFA, and also
+ /// // inhibits the 'accelerator' optimization that we are trying to
+ /// // demonstrate in this example.
+ /// .syntax(syntax::Config::new().unicode(false).utf8(false))
/// .build("[^abc]+a")?;
///
/// // Here we just pluck out the state that we know is accelerated.
@@ -875,154 +1091,58 @@ pub unsafe trait Automaton {
/// assert_eq!(accelerator, &[b'a', b'b', b'c']);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
+ #[inline]
fn accelerator(&self, _id: StateID) -> &[u8] {
&[]
}
- /// Executes a forward search and returns the end position of the first
- /// match that is found as early as possible. If no match exists, then
- /// `None` is returned.
- ///
- /// This routine stops scanning input as soon as the search observes a
- /// match state. This is useful for implementing boolean `is_match`-like
- /// routines, where as little work is done as possible.
- ///
- /// See [`Automaton::find_earliest_fwd_at`] for additional functionality,
- /// such as providing a prefilter, a specific pattern to match and the
- /// bounds of the search within the haystack. This routine is meant as
- /// a convenience for common cases where the additional functionality is
- /// not needed.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFAs generated by this crate, this only occurs in a non-default
- /// configuration where quit bytes are used or Unicode word boundaries are
- /// heuristically enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// # Example
- ///
- /// This example shows how to use this method with a
- /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, it demonstrates
- /// how the position returned might differ from what one might expect when
- /// executing a traditional leftmost search.
- ///
- /// ```
- /// use regex_automata::{
- /// dfa::{Automaton, dense},
- /// HalfMatch,
- /// };
- ///
- /// let dfa = dense::DFA::new("foo[0-9]+")?;
- /// // Normally, the end of the leftmost first match here would be 8,
- /// // corresponding to the end of the input. But the "earliest" semantics
- /// // this routine cause it to stop as soon as a match is known, which
- /// // occurs once 'foo[0-9]' has matched.
- /// let expected = HalfMatch::must(0, 4);
- /// assert_eq!(Some(expected), dfa.find_earliest_fwd(b"foo12345")?);
- ///
- /// let dfa = dense::DFA::new("abc|a")?;
- /// // Normally, the end of the leftmost first match here would be 3,
- /// // but the shortest match semantics detect a match earlier.
- /// let expected = HalfMatch::must(0, 1);
- /// assert_eq!(Some(expected), dfa.find_earliest_fwd(b"abc")?);
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- #[inline]
- fn find_earliest_fwd(
- &self,
- bytes: &[u8],
- ) -> Result<Option<HalfMatch>, MatchError> {
- self.find_earliest_fwd_at(None, None, bytes, 0, bytes.len())
- }
-
- /// Executes a reverse search and returns the start position of the first
- /// match that is found as early as possible. If no match exists, then
- /// `None` is returned.
- ///
- /// This routine stops scanning input as soon as the search observes a
- /// match state.
- ///
- /// Note that while it is not technically necessary to build a reverse
- /// automaton to use a reverse search, it is likely that you'll want to do
- /// so. Namely, the typical use of a reverse search is to find the starting
- /// location of a match once its end is discovered from a forward search. A
- /// reverse DFA automaton can be built by configuring the intermediate NFA
- /// to be reversed via
- /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse).
+ /// Returns the prefilter associated with a DFA, if one exists.
///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFAs generated by this crate, this only occurs in a non-default
- /// configuration where quit bytes are used or Unicode word boundaries are
- /// heuristically enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// # Example
- ///
- /// This example shows how to use this method with a
- /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, it demonstrates
- /// how the position returned might differ from what one might expect when
- /// executing a traditional leftmost reverse search.
- ///
- /// ```
- /// use regex_automata::{
- /// nfa::thompson,
- /// dfa::{Automaton, dense},
- /// HalfMatch,
- /// };
+ /// The default implementation of this method always returns `None`. And
+ /// indeed, it is always correct to return `None`.
///
- /// let dfa = dense::Builder::new()
- /// .thompson(thompson::Config::new().reverse(true))
- /// .build("[a-z]+[0-9]+")?;
- /// // Normally, the end of the leftmost first match here would be 0,
- /// // corresponding to the beginning of the input. But the "earliest"
- /// // semantics of this routine cause it to stop as soon as a match is
- /// // known, which occurs once '[a-z][0-9]+' has matched.
- /// let expected = HalfMatch::must(0, 2);
- /// assert_eq!(Some(expected), dfa.find_earliest_rev(b"foo12345")?);
+ /// For DFAs in this crate, a prefilter can be attached to a DFA via
+ /// [`dense::Config::prefilter`](crate::dfa::dense::Config::prefilter).
///
- /// let dfa = dense::Builder::new()
- /// .thompson(thompson::Config::new().reverse(true))
- /// .build("abc|c")?;
- /// // Normally, the end of the leftmost first match here would be 0,
- /// // but the shortest match semantics detect a match earlier.
- /// let expected = HalfMatch::must(0, 2);
- /// assert_eq!(Some(expected), dfa.find_earliest_rev(b"abc")?);
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
+ /// Do note that prefilters are not serialized by DFAs in this crate.
+ /// So if you deserialize a DFA that had a prefilter attached to it
+ /// at serialization time, then it will not have a prefilter after
+ /// deserialization.
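+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of attaching a prefilter at build time and retrieving
+ /// it through this method. (This assumes a prefilter can be built from
+ /// the literal needle `foo` via
+ /// [`Prefilter::new`](crate::util::prefilter::Prefilter::new); when no
+ /// prefilter can be constructed, `Prefilter::new` returns `None` and the
+ /// DFA is simply built without one.)
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// util::prefilter::Prefilter,
+ /// MatchKind,
+ /// };
+ ///
+ /// // Build a prefilter from the literal prefix of our pattern.
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo"]);
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().prefilter(pre))
+ /// .build(r"foo[0-9]+")?;
+ /// assert!(dfa.get_prefilter().is_some());
+ ///
+ /// // A DFA built without a prefilter reports `None`.
+ /// let dfa = dense::DFA::new(r"foo[0-9]+")?;
+ /// assert!(dfa.get_prefilter().is_none());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```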
#[inline]
- fn find_earliest_rev(
- &self,
- bytes: &[u8],
- ) -> Result<Option<HalfMatch>, MatchError> {
- self.find_earliest_rev_at(None, bytes, 0, bytes.len())
+ fn get_prefilter(&self) -> Option<&Prefilter> {
+ None
}
/// Executes a forward search and returns the end position of the leftmost
/// match that is found. If no match exists, then `None` is returned.
///
+ /// In particular, this method continues searching even after it enters
+ /// a match state. The search only terminates once it has reached the
+ /// end of the input or when it has entered a dead or quit state. Upon
+ /// termination, the position of the last byte seen while still in a match
+ /// state is returned.
+ ///
/// # Errors
///
- /// This routine only errors if the search could not complete. For
- /// DFAs generated by this crate, this only occurs in a non-default
- /// configuration where quit bytes are used or Unicode word boundaries are
- /// heuristically enabled.
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
///
- /// When a search cannot complete, callers cannot know whether a match
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
/// exists or not.
///
/// # Notes for implementors
///
/// Implementors of this trait are not required to implement any particular
/// match semantics (such as leftmost-first), which are instead manifest in
- /// the DFA's transitions.
+ /// the DFA's transitions. But this search routine should behave as a
+ /// general "leftmost" search.
///
/// In particular, this method must continue searching even after it enters
/// a match state. The search should only terminate once it has reached
@@ -1036,47 +1156,124 @@ pub unsafe trait Automaton {
/// # Example
///
/// This example shows how to use this method with a
- /// [`dense::DFA`](crate::dfa::dense::DFA). By default, a dense DFA uses
- /// "leftmost first" match semantics.
- ///
- /// Leftmost first match semantics corresponds to the match with the
- /// smallest starting offset, but where the end offset is determined by
- /// preferring earlier branches in the original regular expression. For
- /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
- /// will match `Samwise` in `Samwise`.
- ///
- /// Generally speaking, the "leftmost first" match is how most backtracking
- /// regular expressions tend to work. This is in contrast to POSIX-style
- /// regular expressions that yield "leftmost longest" matches. Namely,
- /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
- /// leftmost longest semantics. (This crate does not currently support
- /// leftmost longest semantics.)
+ /// [`dense::DFA`](crate::dfa::dense::DFA).
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, dense},
- /// HalfMatch,
- /// };
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
///
/// let dfa = dense::DFA::new("foo[0-9]+")?;
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"foo12345"))?);
///
/// // Even though a match is found after reading the first byte (`a`),
/// // the leftmost first match semantics demand that we find the earliest
/// // match that prefers earlier parts of the pattern over later parts.
/// let dfa = dense::DFA::new("abc|a")?;
- /// let expected = HalfMatch::must(0, 3);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"abc")?);
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"abc"))?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specific pattern search
+ ///
+ /// This example shows how to build a multi-DFA that permits searching for
+ /// specific patterns.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// Anchored, HalfMatch, PatternID, Input,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().starts_for_each_pattern(true))
+ /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
+ /// let haystack = "foo123".as_bytes();
+ ///
+ /// // Since we are using the default leftmost-first match and both
+ /// // patterns match at the same starting position, only the first pattern
+ /// // will be returned in this case when doing a search for any of the
+ /// // patterns.
+ /// let expected = Some(HalfMatch::must(0, 6));
+ /// let got = dfa.try_search_fwd(&Input::new(haystack))?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // But if we want to check whether some other pattern matches, then we
+ /// // can provide its pattern ID.
+ /// let input = Input::new(haystack)
+ /// .anchored(Anchored::Pattern(PatternID::must(1)));
+ /// let expected = Some(HalfMatch::must(1, 6));
+ /// let got = dfa.try_search_fwd(&input)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specifying the bounds of a search
+ ///
+ /// This example shows how providing the bounds of a search can produce
+ /// different results than simply sub-slicing the haystack.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
+ ///
+ /// // N.B. We disable Unicode here so that we use a simple ASCII word
+ /// // boundary. Alternatively, we could enable heuristic support for
+ /// // Unicode word boundaries.
+ /// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?;
+ /// let haystack = "foo123bar".as_bytes();
+ ///
+ /// // Since we sub-slice the haystack, the search doesn't know about the
+ /// // larger context and assumes that `123` is surrounded by word
+ /// // boundaries. And of course, the match position is reported relative
+ /// // to the sub-slice as well, which means we get `3` instead of `6`.
+ /// let input = Input::new(&haystack[3..6]);
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// let got = dfa.try_search_fwd(&input)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // But if we provide the bounds of the search within the context of the
+ /// // entire haystack, then the search can take the surrounding context
+ /// // into account. (And if we did find a match, it would be reported
+ /// // as a valid offset into `haystack` instead of its sub-slice.)
+ /// let input = Input::new(haystack).range(3..6);
+ /// let expected = None;
+ /// let got = dfa.try_search_fwd(&input)?;
+ /// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
- fn find_leftmost_fwd(
+ fn try_search_fwd(
&self,
- bytes: &[u8],
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- self.find_leftmost_fwd_at(None, None, bytes, 0, bytes.len())
+ let utf8empty = self.has_empty() && self.is_utf8();
+ let hm = match search::find_fwd(&self, input)? {
+ None => return Ok(None),
+ Some(hm) if !utf8empty => return Ok(Some(hm)),
+ Some(hm) => hm,
+ };
+ // We get to this point when we know our DFA can match the empty string
+ // AND when UTF-8 mode is enabled. In this case, we skip any matches
+ // whose offset splits a codepoint. Such a match is necessarily a
+ // zero-width match, because UTF-8 mode requires the underlying NFA
+ // to be built such that all non-empty matches span valid UTF-8.
+ // Therefore, any match that ends in the middle of a codepoint cannot
+ // be part of a span of valid UTF-8 and thus must be an empty match.
+ // In such cases, we skip it, so as not to report matches that split a
+ // codepoint.
+ //
+ // Note that this is not a checked assumption. Callers *can* provide an
+ // NFA with UTF-8 mode enabled that produces non-empty matches spanning
+ // invalid UTF-8. But doing so is documented to result in unspecified
+ // behavior.
+ empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
+ let got = search::find_fwd(&self, input)?;
+ Ok(got.map(|hm| (hm, hm.offset())))
+ })
}
/// Executes a reverse search and returns the start position of the
@@ -1085,52 +1282,42 @@ pub unsafe trait Automaton {
///
/// # Errors
///
- /// This routine only errors if the search could not complete. For
- /// DFAs generated by this crate, this only occurs in a non-default
- /// configuration where quit bytes are used or Unicode word boundaries are
- /// heuristically enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// # Notes for implementors
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
///
- /// Implementors of this trait are not required to implement any particular
- /// match semantics (such as leftmost-first), which are instead manifest in
- /// the DFA's transitions.
- ///
- /// In particular, this method must continue searching even after it enters
- /// a match state. The search should only terminate once it has reached
- /// the end of the input or when it has entered a dead or quit state. Upon
- /// termination, the position of the last byte seen while still in a match
- /// state is returned.
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
///
- /// Since this trait provides an implementation for this method by default,
- /// it's unlikely that one will need to implement this.
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
///
/// # Example
///
/// This example shows how to use this method with a
- /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this routine
- /// is principally useful when used in conjunction with the
+ /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this
+ /// routine is principally useful when used in conjunction with the
/// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
- /// configuration. In general, it's unlikely to be correct to use both
- /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since any
- /// particular DFA will only support searching in one direction with
+ /// configuration. In general, it's unlikely to be correct to use
+ /// both `try_search_fwd` and `try_search_rev` with the same DFA since
+ /// any particular DFA will only support searching in one direction with
/// respect to the pattern.
///
/// ```
/// use regex_automata::{
/// nfa::thompson,
/// dfa::{Automaton, dense},
- /// HalfMatch,
+ /// HalfMatch, Input,
/// };
///
/// let dfa = dense::Builder::new()
/// .thompson(thompson::Config::new().reverse(true))
/// .build("foo[0-9]+")?;
- /// let expected = HalfMatch::must(0, 0);
- /// assert_eq!(Some(expected), dfa.find_leftmost_rev(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 0));
+ /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"foo12345"))?);
///
/// // Even though a match is found after reading the last byte (`c`),
/// // the leftmost first match semantics demand that we find the earliest
@@ -1138,21 +1325,134 @@ pub unsafe trait Automaton {
/// let dfa = dense::Builder::new()
/// .thompson(thompson::Config::new().reverse(true))
/// .build("abc|c")?;
- /// let expected = HalfMatch::must(0, 0);
- /// assert_eq!(Some(expected), dfa.find_leftmost_rev(b"abc")?);
+ /// let expected = Some(HalfMatch::must(0, 0));
+ /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"abc"))?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: UTF-8 mode
+ ///
+ /// This example demonstrates that UTF-8 mode applies to reverse
+ /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
+ /// matches reported must correspond to valid UTF-8 spans. This includes
+ /// prohibiting zero-width matches that split a codepoint.
+ ///
+ /// UTF-8 mode is enabled by default. Notice below how the only zero-width
+ /// matches reported are those at UTF-8 boundaries:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton},
+ /// nfa::thompson,
+ /// HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build(r"")?;
+ ///
+ /// // Run the reverse DFA to collect all matches.
+ /// let mut input = Input::new("☃");
+ /// let mut matches = vec![];
+ /// loop {
+ /// match dfa.try_search_rev(&input)? {
+ /// None => break,
+ /// Some(hm) => {
+ /// matches.push(hm);
+ /// if hm.offset() == 0 || input.end() == 0 {
+ /// break;
+ /// } else if hm.offset() < input.end() {
+ /// input.set_end(hm.offset());
+ /// } else {
+ /// // This is only necessary to handle zero-width
+ /// // matches, which of course occur in this example.
+ /// // Without this, the search would never advance
+ /// // backwards beyond the initial match.
+ /// input.set_end(input.end() - 1);
+ /// }
+ /// }
+ /// }
+ /// }
+ ///
+ /// // No matches split a codepoint.
+ /// let expected = vec![
+ /// HalfMatch::must(0, 3),
+ /// HalfMatch::must(0, 0),
+ /// ];
+ /// assert_eq!(expected, matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Now let's look at the same example, but with UTF-8 mode on the
+ /// original NFA disabled (which results in disabling UTF-8 mode on the
+ /// DFA):
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton},
+ /// nfa::thompson,
+ /// HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true).utf8(false))
+ /// .build(r"")?;
+ ///
+ /// // Run the reverse DFA to collect all matches.
+ /// let mut input = Input::new("☃");
+ /// let mut matches = vec![];
+ /// loop {
+ /// match dfa.try_search_rev(&input)? {
+ /// None => break,
+ /// Some(hm) => {
+ /// matches.push(hm);
+ /// if hm.offset() == 0 || input.end() == 0 {
+ /// break;
+ /// } else if hm.offset() < input.end() {
+ /// input.set_end(hm.offset());
+ /// } else {
+ /// // This is only necessary to handle zero-width
+ /// // matches, which of course occur in this example.
+ /// // Without this, the search would never advance
+ /// // backwards beyond the initial match.
+ /// input.set_end(input.end() - 1);
+ /// }
+ /// }
+ /// }
+ /// }
+ ///
+ /// // No matches split a codepoint.
+ /// let expected = vec![
+ /// HalfMatch::must(0, 3),
+ /// HalfMatch::must(0, 2),
+ /// HalfMatch::must(0, 1),
+ /// HalfMatch::must(0, 0),
+ /// ];
+ /// assert_eq!(expected, matches);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
- fn find_leftmost_rev(
+ fn try_search_rev(
&self,
- bytes: &[u8],
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- self.find_leftmost_rev_at(None, bytes, 0, bytes.len())
+ let utf8empty = self.has_empty() && self.is_utf8();
+ let hm = match search::find_rev(self, input)? {
+ None => return Ok(None),
+ Some(hm) if !utf8empty => return Ok(Some(hm)),
+ Some(hm) => hm,
+ };
+ empty::skip_splits_rev(input, hm, hm.offset(), |input| {
+ let got = search::find_rev(self, input)?;
+ Ok(got.map(|hm| (hm, hm.offset())))
+ })
}
- /// Executes an overlapping forward search and returns the end position of
- /// matches as they are found. If no match exists, then `None` is returned.
+ /// Executes an overlapping forward search. A match, if one exists, can be
+ /// obtained via the [`OverlappingState::get_match`] method.
///
/// This routine is principally only useful when searching for multiple
/// patterns on inputs where multiple patterns may match the same regions
@@ -1160,14 +1460,30 @@ pub unsafe trait Automaton {
/// state from prior calls so that the implementation knows where the last
/// match occurred.
///
+ /// When using this routine to implement an iterator of overlapping
+ /// matches, the `start` of the search should always be set to the end
+ /// of the last match. If more patterns match at the previous location,
+ /// then they will be immediately returned. (This is tracked by the given
+ /// overlapping state.) Otherwise, the search continues at the starting
+ /// position given.
+ ///
+ /// If for some reason you want the search to forget about its previous
+ /// state and restart the search at a particular position, then setting the
+ /// state to [`OverlappingState::start`] will accomplish that.
+ ///
/// # Errors
///
- /// This routine only errors if the search could not complete. For
- /// DFAs generated by this crate, this only occurs in a non-default
- /// configuration where quit bytes are used or Unicode word boundaries are
- /// heuristically enabled.
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
///
- /// When a search cannot complete, callers cannot know whether a match
+ /// When a search returns an error, callers cannot know whether a match
/// exists or not.
///
/// # Example
@@ -1187,21 +1503,21 @@ pub unsafe trait Automaton {
/// to find totally new matches (potentially of other patterns).
///
/// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
/// dfa::{Automaton, OverlappingState, dense},
- /// HalfMatch,
- /// MatchKind,
+ /// HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new().match_kind(MatchKind::All))
- /// .build_many(&[r"\w+$", r"\S+$"])?;
- /// let haystack = "@foo".as_bytes();
+ /// .build_many(&[r"[[:word:]]+$", r"[[:^space:]]+$"])?;
+ /// let haystack = "@foo";
/// let mut state = OverlappingState::start();
///
/// let expected = Some(HalfMatch::must(1, 4));
- /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
- /// assert_eq!(expected, got);
+ /// dfa.try_search_overlapping_fwd(&Input::new(haystack), &mut state)?;
+ /// assert_eq!(expected, state.get_match());
///
/// // The first pattern also matches at the same position, so re-running
/// // the search will yield another match. Notice also that the first
@@ -1209,394 +1525,260 @@ pub unsafe trait Automaton {
/// // pattern begins its match before the first, is therefore an earlier
/// // match and is thus reported first.
/// let expected = Some(HalfMatch::must(0, 4));
- /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
- /// assert_eq!(expected, got);
+ /// dfa.try_search_overlapping_fwd(&Input::new(haystack), &mut state)?;
+ /// assert_eq!(expected, state.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
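+ ///
+ /// To drain *all* overlapping matches, the two calls above generalize to
+ /// a loop: keep searching with the same `Input` and state until the state
+ /// reports no match. This sketch of that loop is essentially what
+ /// [`Automaton::try_which_overlapping_matches`] does internally:
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// dfa::{Automaton, OverlappingState, dense},
+ /// HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().match_kind(MatchKind::All))
+ /// .build_many(&[r"[[:word:]]+$", r"[[:^space:]]+$"])?;
+ /// let input = Input::new("@foo");
+ /// let mut state = OverlappingState::start();
+ ///
+ /// let mut matches = vec![];
+ /// loop {
+ /// dfa.try_search_overlapping_fwd(&input, &mut state)?;
+ /// match state.get_match() {
+ /// None => break,
+ /// Some(hm) => matches.push(hm),
+ /// }
+ /// }
+ /// // Both patterns match, ending at the same offset. The second pattern
+ /// // is reported first because its match starts earlier.
+ /// let expected = vec![HalfMatch::must(1, 4), HalfMatch::must(0, 4)];
+ /// assert_eq!(expected, matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```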
#[inline]
- fn find_overlapping_fwd(
+ fn try_search_overlapping_fwd(
&self,
- bytes: &[u8],
+ input: &Input<'_>,
state: &mut OverlappingState,
- ) -> Result<Option<HalfMatch>, MatchError> {
- self.find_overlapping_fwd_at(None, None, bytes, 0, bytes.len(), state)
+ ) -> Result<(), MatchError> {
+ let utf8empty = self.has_empty() && self.is_utf8();
+ search::find_overlapping_fwd(self, input, state)?;
+ match state.get_match() {
+ None => Ok(()),
+ Some(_) if !utf8empty => Ok(()),
+ Some(_) => skip_empty_utf8_splits_overlapping(
+ input,
+ state,
+ |input, state| {
+ search::find_overlapping_fwd(self, input, state)
+ },
+ ),
+ }
}
- /// Executes a forward search and returns the end position of the first
- /// match that is found as early as possible. If no match exists, then
- /// `None` is returned.
- ///
- /// This routine stops scanning input as soon as the search observes a
- /// match state. This is useful for implementing boolean `is_match`-like
- /// routines, where as little work is done as possible.
- ///
- /// This is like [`Automaton::find_earliest_fwd`], except it provides some
- /// additional control over how the search is executed:
- ///
- /// * `pre` is a prefilter scanner that, when given, is used whenever the
- /// DFA enters its starting state. This is meant to speed up searches where
- /// one or a small number of literal prefixes are known.
- /// * `pattern_id` specifies a specific pattern in the DFA to run an
- /// anchored search for. If not given, then a search for any pattern is
- /// performed. For DFAs built by this crate,
- /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern)
- /// must be enabled to use this functionality.
- /// * `start` and `end` permit searching a specific region of the haystack
- /// `bytes`. This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `bytes`. (Because the existence of look-around
- /// operations such as `\b`, `^` and `$` need to take the surrounding
- /// context into account. This cannot be done if the haystack doesn't
- /// contain it.)
- ///
- /// The examples below demonstrate each of these additional parameters.
+ /// Executes an overlapping reverse search. A match, if one exists, can
+ /// be obtained via the [`OverlappingState::get_match`] method.
///
- /// # Errors
+ /// When using this routine to implement an iterator of overlapping
+ /// matches, the `start` of the search should remain invariant throughout
+ /// iteration. The `OverlappingState` given to the search will keep track
+ /// of the current position of the search. (This is because multiple
+ /// matches may be reported at the same position, so only the search
+ /// implementation itself knows when to advance the position.)
///
- /// This routine only errors if the search could not complete. For
- /// DFAs generated by this crate, this only occurs in a non-default
- /// configuration where quit bytes are used or Unicode word boundaries are
- /// heuristically enabled.
+ /// If for some reason you want the search to forget about its previous
+ /// state and restart the search at a particular position, then setting the
+ /// state to [`OverlappingState::start`] will accomplish that.
///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
+ /// # Errors
///
- /// # Panics
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
///
- /// This routine must panic if a `pattern_id` is given and the underlying
- /// DFA does not support specific pattern searches.
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
///
- /// It must also panic if the given haystack range is not valid.
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example: UTF-8 mode
///
- /// # Example: prefilter
+ /// This example demonstrates that UTF-8 mode applies to reverse
+ /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
+ /// matches reported must correspond to valid UTF-8 spans. This includes
+ /// prohibiting zero-width matches that split a codepoint.
///
- /// This example shows how to provide a prefilter for a pattern where all
- /// matches start with a `z` byte.
+ /// UTF-8 mode is enabled by default. Notice below how the only zero-width
+ /// matches reported are those at UTF-8 boundaries:
///
/// ```
/// use regex_automata::{
- /// dfa::{Automaton, dense},
- /// util::prefilter::{Candidate, Prefilter, Scanner, State},
- /// HalfMatch,
+ /// dfa::{dense::DFA, Automaton, OverlappingState},
+ /// nfa::thompson,
+ /// HalfMatch, Input, MatchKind,
/// };
///
- /// #[derive(Debug)]
- /// pub struct ZPrefilter;
- ///
- /// impl Prefilter for ZPrefilter {
- /// fn next_candidate(
- /// &self,
- /// _: &mut State,
- /// haystack: &[u8],
- /// at: usize,
- /// ) -> Candidate {
- /// // Try changing b'z' to b'q' and observe this test fail since
- /// // the prefilter will skip right over the match.
- /// match haystack.iter().position(|&b| b == b'z') {
- /// None => Candidate::None,
- /// Some(i) => Candidate::PossibleStartOfMatch(at + i),
- /// }
- /// }
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build_many(&[r"", r"☃"])?;
///
- /// fn heap_bytes(&self) -> usize {
- /// 0
+ /// // Run the reverse DFA to collect all matches.
+ /// let input = Input::new("☃");
+ /// let mut state = OverlappingState::start();
+ /// let mut matches = vec![];
+ /// loop {
+ /// dfa.try_search_overlapping_rev(&input, &mut state)?;
+ /// match state.get_match() {
+ /// None => break,
+ /// Some(hm) => matches.push(hm),
/// }
/// }
///
- /// let dfa = dense::DFA::new("z[0-9]{3}")?;
- /// let haystack = "foobar z123 q123".as_bytes();
- /// // A scanner executes a prefilter while tracking some state that helps
- /// // determine whether a prefilter is still "effective" or not.
- /// let mut scanner = Scanner::new(&ZPrefilter);
- ///
- /// let expected = Some(HalfMatch::must(0, 11));
- /// let got = dfa.find_earliest_fwd_at(
- /// Some(&mut scanner),
- /// None,
- /// haystack,
- /// 0,
- /// haystack.len(),
- /// )?;
- /// assert_eq!(expected, got);
+ /// // No matches split a codepoint.
+ /// let expected = vec![
+ /// HalfMatch::must(0, 3),
+ /// HalfMatch::must(1, 0),
+ /// HalfMatch::must(0, 0),
+ /// ];
+ /// assert_eq!(expected, matches);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
- /// # Example: specific pattern search
- ///
- /// This example shows how to build a multi-DFA that permits searching for
- /// specific patterns.
+ /// Now let's look at the same example, but with UTF-8 mode on the
+ /// original NFA disabled (which results in disabling UTF-8 mode on the
+ /// DFA):
///
/// ```
/// use regex_automata::{
- /// dfa::{Automaton, dense},
- /// HalfMatch,
- /// PatternID,
- /// };
- ///
- /// let dfa = dense::Builder::new()
- /// .configure(dense::Config::new().starts_for_each_pattern(true))
- /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
- /// let haystack = "foo123".as_bytes();
- ///
- /// // Since we are using the default leftmost-first match and both
- /// // patterns match at the same starting position, only the first pattern
- /// // will be returned in this case when doing a search for any of the
- /// // patterns.
- /// let expected = Some(HalfMatch::must(0, 6));
- /// let got = dfa.find_earliest_fwd_at(
- /// None,
- /// None,
- /// haystack,
- /// 0,
- /// haystack.len(),
- /// )?;
- /// assert_eq!(expected, got);
- ///
- /// // But if we want to check whether some other pattern matches, then we
- /// // can provide its pattern ID.
- /// let expected = Some(HalfMatch::must(1, 6));
- /// let got = dfa.find_earliest_fwd_at(
- /// None,
- /// Some(PatternID::must(1)),
- /// haystack,
- /// 0,
- /// haystack.len(),
- /// )?;
- /// assert_eq!(expected, got);
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- ///
- /// # Example: specifying the bounds of a search
- ///
- /// This example shows how providing the bounds of a search can produce
- /// different results than simply sub-slicing the haystack.
- ///
- /// ```
- /// use regex_automata::{
- /// dfa::{Automaton, dense},
- /// HalfMatch,
+ /// dfa::{dense::DFA, Automaton, OverlappingState},
+ /// nfa::thompson,
+ /// HalfMatch, Input, MatchKind,
/// };
///
- /// // N.B. We disable Unicode here so that we use a simple ASCII word
- /// // boundary. Alternatively, we could enable heuristic support for
- /// // Unicode word boundaries.
- /// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?;
- /// let haystack = "foo123bar".as_bytes();
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .thompson(thompson::Config::new().reverse(true).utf8(false))
+ /// .build_many(&[r"", r"☃"])?;
///
- /// // Since we sub-slice the haystack, the search doesn't know about the
- /// // larger context and assumes that `123` is surrounded by word
- /// // boundaries. And of course, the match position is reported relative
- /// // to the sub-slice as well, which means we get `3` instead of `6`.
- /// let expected = Some(HalfMatch::must(0, 3));
- /// let got = dfa.find_earliest_fwd_at(
- /// None,
- /// None,
- /// &haystack[3..6],
- /// 0,
- /// haystack[3..6].len(),
- /// )?;
- /// assert_eq!(expected, got);
+ /// // Run the reverse DFA to collect all matches.
+ /// let input = Input::new("☃");
+ /// let mut state = OverlappingState::start();
+ /// let mut matches = vec![];
+ /// loop {
+ /// dfa.try_search_overlapping_rev(&input, &mut state)?;
+ /// match state.get_match() {
+ /// None => break,
+ /// Some(hm) => matches.push(hm),
+ /// }
+ /// }
///
- /// // But if we provide the bounds of the search within the context of the
- /// // entire haystack, then the search can take the surrounding context
- /// // into account. (And if we did find a match, it would be reported
- /// // as a valid offset into `haystack` instead of its sub-slice.)
- /// let expected = None;
- /// let got = dfa.find_earliest_fwd_at(
- /// None,
- /// None,
- /// haystack,
- /// 3,
- /// 6,
- /// )?;
- /// assert_eq!(expected, got);
+ /// // Now *all* positions match, even within a codepoint,
+ /// // because we lifted the requirement that matches
+ /// // correspond to valid UTF-8 spans.
+ /// let expected = vec![
+ /// HalfMatch::must(0, 3),
+ /// HalfMatch::must(0, 2),
+ /// HalfMatch::must(0, 1),
+ /// HalfMatch::must(1, 0),
+ /// HalfMatch::must(0, 0),
+ /// ];
+ /// assert_eq!(expected, matches);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
- fn find_earliest_fwd_at(
+ fn try_search_overlapping_rev(
&self,
- pre: Option<&mut prefilter::Scanner>,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<HalfMatch>, MatchError> {
- search::find_earliest_fwd(pre, self, pattern_id, bytes, start, end)
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+ ) -> Result<(), MatchError> {
+ let utf8empty = self.has_empty() && self.is_utf8();
+ search::find_overlapping_rev(self, input, state)?;
+ match state.get_match() {
+ None => Ok(()),
+ Some(_) if !utf8empty => Ok(()),
+ Some(_) => skip_empty_utf8_splits_overlapping(
+ input,
+ state,
+ |input, state| {
+ search::find_overlapping_rev(self, input, state)
+ },
+ ),
+ }
}
- /// Executes a reverse search and returns the start position of the first
- /// match that is found as early as possible. If no match exists, then
- /// `None` is returned.
- ///
- /// This routine stops scanning input as soon as the search observes a
- /// match state.
- ///
- /// This is like [`Automaton::find_earliest_rev`], except it provides some
- /// additional control over how the search is executed. See the
- /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
- /// on the additional parameters along with examples of their usage.
+ /// Writes the set of patterns that match anywhere in the given search
+ /// configuration to `patset`. If multiple patterns match at the same
+ /// position and the underlying DFA supports overlapping matches, then all
+ /// matching patterns are written to the given set.
///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFAs generated by this crate, this only occurs in a non-default
- /// configuration where quit bytes are used or Unicode word boundaries are
- /// heuristically enabled.
+ /// Unless all of the patterns in this DFA are anchored, this will,
+ /// generally speaking, visit every byte in the haystack.
///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
+ /// This search routine *does not* clear the pattern set. This gives some
+ /// flexibility to the caller (e.g., running multiple searches with the
+ /// same pattern set), but does make the API bug-prone if you're reusing
+ /// the same pattern set for multiple searches that are intended to be
+ /// independent. (The second example below shows one way to keep them
+ /// independent.)
///
- /// # Panics
- ///
- /// This routine must panic if a `pattern_id` is given and the underlying
- /// DFA does not support specific pattern searches.
- ///
- /// It must also panic if the given haystack range is not valid.
- #[inline]
- fn find_earliest_rev_at(
- &self,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<HalfMatch>, MatchError> {
- search::find_earliest_rev(self, pattern_id, bytes, start, end)
- }
-
- /// Executes a forward search and returns the end position of the leftmost
- /// match that is found. If no match exists, then `None` is returned.
- ///
- /// This is like [`Automaton::find_leftmost_fwd`], except it provides some
- /// additional control over how the search is executed. See the
- /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
- /// on the additional parameters along with examples of their usage.
+ /// If a pattern ID matched but the given `PatternSet` does not have
+ /// sufficient capacity to store it, then it is not inserted and silently
+ /// dropped.
///
/// # Errors
///
- /// This routine only errors if the search could not complete. For
- /// DFAs generated by this crate, this only occurs in a non-default
- /// configuration where quit bytes are used or Unicode word boundaries are
- /// heuristically enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// # Panics
- ///
- /// This routine must panic if a `pattern_id` is given and the underlying
- /// DFA does not support specific pattern searches.
- ///
- /// It must also panic if the given haystack range is not valid.
- #[inline]
- fn find_leftmost_fwd_at(
- &self,
- pre: Option<&mut prefilter::Scanner>,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<HalfMatch>, MatchError> {
- search::find_leftmost_fwd(pre, self, pattern_id, bytes, start, end)
- }
-
- /// Executes a reverse search and returns the start of the position of the
- /// leftmost match that is found. If no match exists, then `None` is
- /// returned.
- ///
- /// This is like [`Automaton::find_leftmost_rev`], except it provides some
- /// additional control over how the search is executed. See the
- /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
- /// on the additional parameters along with examples of their usage.
- ///
- /// # Errors
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
///
- /// This routine only errors if the search could not complete. For
- /// DFAs generated by this crate, this only occurs in a non-default
- /// configuration where quit bytes are used or Unicode word boundaries are
- /// heuristically enabled.
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
///
- /// When a search cannot complete, callers cannot know whether a match
+ /// When a search returns an error, callers cannot know whether a match
/// exists or not.
///
- /// # Panics
- ///
- /// This routine must panic if a `pattern_id` is given and the underlying
- /// DFA does not support specific pattern searches.
- ///
- /// It must also panic if the given haystack range is not valid.
- #[inline]
- fn find_leftmost_rev_at(
- &self,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<HalfMatch>, MatchError> {
- search::find_leftmost_rev(self, pattern_id, bytes, start, end)
- }
-
- /// Executes an overlapping forward search and returns the end position of
- /// matches as they are found. If no match exists, then `None` is returned.
- ///
- /// This routine is principally only useful when searching for multiple
- /// patterns on inputs where multiple patterns may match the same regions
- /// of text. In particular, callers must preserve the automaton's search
- /// state from prior calls so that the implementation knows where the last
- /// match occurred.
- ///
- /// This is like [`Automaton::find_overlapping_fwd`], except it provides
- /// some additional control over how the search is executed. See the
- /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
- /// on the additional parameters along with examples of their usage.
- ///
- /// When using this routine to implement an iterator of overlapping
- /// matches, the `start` of the search should always be set to the end
- /// of the last match. If more patterns match at the previous location,
- /// then they will be immediately returned. (This is tracked by the given
- /// overlapping state.) Otherwise, the search continues at the starting
- /// position given.
- ///
- /// If for some reason you want the search to forget about its previous
- /// state and restart the search at a particular position, then setting the
- /// state to [`OverlappingState::start`] will accomplish that.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFAs generated by this crate, this only occurs in a non-default
- /// configuration where quit bytes are used or Unicode word boundaries are
- /// heuristically enabled.
+ /// # Example
///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
+ /// This example shows how to find all matching patterns in a haystack,
+ /// even when some patterns match at the same position as other patterns.
///
- /// # Panics
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense::DFA},
+ /// Input, MatchKind, PatternSet,
+ /// };
///
- /// This routine must panic if a `pattern_id` is given and the underlying
- /// DFA does not support specific pattern searches.
+ /// let patterns = &[
+ /// r"[[:word:]]+",
+ /// r"[0-9]+",
+ /// r"[[:alpha:]]+",
+ /// r"foo",
+ /// r"bar",
+ /// r"barfoo",
+ /// r"foobar",
+ /// ];
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(patterns)?;
+ ///
+ /// let input = Input::new("foobar");
+ /// let mut patset = PatternSet::new(dfa.pattern_len());
+ /// dfa.try_which_overlapping_matches(&input, &mut patset)?;
+ /// let expected = vec![0, 2, 3, 4, 6];
+ /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
+ /// assert_eq!(expected, got);
///
- /// It must also panic if the given haystack range is not valid.
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
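+ ///
+ /// # Example: independent searches need fresh sets
+ ///
+ /// Since this routine does not clear the pattern set, results from
+ /// earlier searches accumulate. A minimal sketch of keeping two searches
+ /// independent is to build a fresh set for each one:
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense::DFA},
+ /// Input, MatchKind, PatternSet,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(&[r"foo", r"bar"])?;
+ ///
+ /// // One fresh set per haystack. If we reused the first set, the
+ /// // second search would still report pattern 0 from the first one.
+ /// let mut patset = PatternSet::new(dfa.pattern_len());
+ /// dfa.try_which_overlapping_matches(&Input::new("foo"), &mut patset)?;
+ /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
+ /// assert_eq!(vec![0], got);
+ ///
+ /// let mut patset = PatternSet::new(dfa.pattern_len());
+ /// dfa.try_which_overlapping_matches(&Input::new("bar"), &mut patset)?;
+ /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
+ /// assert_eq!(vec![1], got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```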
+ #[cfg(feature = "alloc")]
#[inline]
- fn find_overlapping_fwd_at(
+ fn try_which_overlapping_matches(
&self,
- pre: Option<&mut prefilter::Scanner>,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- state: &mut OverlappingState,
- ) -> Result<Option<HalfMatch>, MatchError> {
- search::find_overlapping_fwd(
- pre, self, pattern_id, bytes, start, end, state,
- )
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) -> Result<(), MatchError> {
+ let mut state = OverlappingState::start();
+ while let Some(m) = {
+ self.try_search_overlapping_fwd(input, &mut state)?;
+ state.get_match()
+ } {
+ let _ = patset.insert(m.pattern());
+ // If the set is full, there's nothing left to find, so we can
+ // stop. Likewise if the caller asked for an "earliest" search.
+ if patset.is_full() || input.get_earliest() {
+ break;
+ }
+ }
+ Ok(())
}
}
-unsafe impl<'a, T: Automaton> Automaton for &'a T {
+unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
#[inline]
fn next_state(&self, current: StateID, input: u8) -> StateID {
(**self).next_state(current, input)
@@ -1619,23 +1801,22 @@ unsafe impl<'a, T: Automaton> Automaton for &'a T {
#[inline]
fn start_state_forward(
&self,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> StateID {
- (**self).start_state_forward(pattern_id, bytes, start, end)
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError> {
+ (**self).start_state_forward(input)
}
#[inline]
fn start_state_reverse(
&self,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> StateID {
- (**self).start_state_reverse(pattern_id, bytes, start, end)
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError> {
+ (**self).start_state_reverse(input)
+ }
+
+ #[inline]
+ fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
+ (**self).universal_start_state(mode)
}
#[inline]
@@ -1669,13 +1850,13 @@ unsafe impl<'a, T: Automaton> Automaton for &'a T {
}
#[inline]
- fn pattern_count(&self) -> usize {
- (**self).pattern_count()
+ fn pattern_len(&self) -> usize {
+ (**self).pattern_len()
}
#[inline]
- fn match_count(&self, id: StateID) -> usize {
- (**self).match_count(id)
+ fn match_len(&self, id: StateID) -> usize {
+ (**self).match_len(id)
}
#[inline]
@@ -1684,109 +1865,72 @@ unsafe impl<'a, T: Automaton> Automaton for &'a T {
}
#[inline]
- fn accelerator(&self, id: StateID) -> &[u8] {
- (**self).accelerator(id)
- }
-
- #[inline]
- fn find_earliest_fwd(
- &self,
- bytes: &[u8],
- ) -> Result<Option<HalfMatch>, MatchError> {
- (**self).find_earliest_fwd(bytes)
+ fn has_empty(&self) -> bool {
+ (**self).has_empty()
}
#[inline]
- fn find_earliest_rev(
- &self,
- bytes: &[u8],
- ) -> Result<Option<HalfMatch>, MatchError> {
- (**self).find_earliest_rev(bytes)
+ fn is_utf8(&self) -> bool {
+ (**self).is_utf8()
}
#[inline]
- fn find_leftmost_fwd(
- &self,
- bytes: &[u8],
- ) -> Result<Option<HalfMatch>, MatchError> {
- (**self).find_leftmost_fwd(bytes)
+ fn is_always_start_anchored(&self) -> bool {
+ (**self).is_always_start_anchored()
}
#[inline]
- fn find_leftmost_rev(
- &self,
- bytes: &[u8],
- ) -> Result<Option<HalfMatch>, MatchError> {
- (**self).find_leftmost_rev(bytes)
+ fn accelerator(&self, id: StateID) -> &[u8] {
+ (**self).accelerator(id)
}
#[inline]
- fn find_overlapping_fwd(
- &self,
- bytes: &[u8],
- state: &mut OverlappingState,
- ) -> Result<Option<HalfMatch>, MatchError> {
- (**self).find_overlapping_fwd(bytes, state)
+ fn get_prefilter(&self) -> Option<&Prefilter> {
+ (**self).get_prefilter()
}
#[inline]
- fn find_earliest_fwd_at(
+ fn try_search_fwd(
&self,
- pre: Option<&mut prefilter::Scanner>,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- (**self).find_earliest_fwd_at(pre, pattern_id, bytes, start, end)
+ (**self).try_search_fwd(input)
}
#[inline]
- fn find_earliest_rev_at(
+ fn try_search_rev(
&self,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- (**self).find_earliest_rev_at(pattern_id, bytes, start, end)
+ (**self).try_search_rev(input)
}
#[inline]
- fn find_leftmost_fwd_at(
+ fn try_search_overlapping_fwd(
&self,
- pre: Option<&mut prefilter::Scanner>,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<HalfMatch>, MatchError> {
- (**self).find_leftmost_fwd_at(pre, pattern_id, bytes, start, end)
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+ ) -> Result<(), MatchError> {
+ (**self).try_search_overlapping_fwd(input, state)
}
#[inline]
- fn find_leftmost_rev_at(
+ fn try_search_overlapping_rev(
&self,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<HalfMatch>, MatchError> {
- (**self).find_leftmost_rev_at(pattern_id, bytes, start, end)
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+ ) -> Result<(), MatchError> {
+ (**self).try_search_overlapping_rev(input, state)
}
+ #[cfg(feature = "alloc")]
#[inline]
- fn find_overlapping_fwd_at(
+ fn try_which_overlapping_matches(
&self,
- pre: Option<&mut prefilter::Scanner>,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- state: &mut OverlappingState,
- ) -> Result<Option<HalfMatch>, MatchError> {
- (**self)
- .find_overlapping_fwd_at(pre, pattern_id, bytes, start, end, state)
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) -> Result<(), MatchError> {
+ (**self).try_which_overlapping_matches(input, patset)
}
}
@@ -1799,15 +1943,21 @@ unsafe impl<'a, T: Automaton> Automaton for &'a T {
/// the search at the next position. Additionally, it tracks which state
/// the last search call terminated in.
///
-/// This type provides no introspection capabilities. The only thing a caller
-/// can do is construct it and pass it around to permit search routines to use
-/// it to track state.
+/// This type provides few introspection capabilities. All a caller can do
+/// is construct it, pass it around to permit search routines to use it to
+/// track state, and ask whether a match has been found.
///
/// Callers should always provide a fresh state constructed via
/// [`OverlappingState::start`] when starting a new search. Reusing state from
/// a previous search may result in incorrect results.
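+///
+/// # Example
+///
+/// A minimal sketch (the pattern and haystack here are purely illustrative)
+/// of driving an overlapping search to completion with a fresh state:
+///
+/// ```
+/// use regex_automata::{
+/// dfa::{dense, Automaton, OverlappingState},
+/// Input, MatchKind,
+/// };
+///
+/// let dfa = dense::Builder::new()
+/// .configure(dense::Config::new().match_kind(MatchKind::All))
+/// .build(r"[a-z]+")?;
+/// let input = Input::new("abc");
+/// let mut state = OverlappingState::start();
+///
+/// // Each call picks up where the previous one left off.
+/// dfa.try_search_overlapping_fwd(&input, &mut state)?;
+/// let mut count = 0;
+/// while state.get_match().is_some() {
+/// count += 1;
+/// dfa.try_search_overlapping_fwd(&input, &mut state)?;
+/// }
+/// // One half match is reported per distinct end offset: 1, 2 and 3.
+/// assert_eq!(3, count);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```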
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct OverlappingState {
+ /// The match reported by the most recent overlapping search to use this
+ /// state.
+ ///
+ /// If a search does not find any matches, then it is expected to clear
+ /// this value.
+ pub(crate) mat: Option<HalfMatch>,
/// The state ID of the state at which the search was in when the call
/// terminated. When this is a match state, `last_match` must be set to a
/// non-None value.
@@ -1816,50 +1966,96 @@ pub struct OverlappingState {
/// automaton. We cannot use the actual ID, since any one automaton may
/// have many start states, and which one is in use depends on several
/// search-time factors.
- id: Option<StateID>,
- /// Information associated with a match when `id` corresponds to a match
- /// state.
- last_match: Option<StateMatch>,
-}
-
-/// Internal state about the last match that occurred. This records both the
-/// offset of the match and the match index.
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-pub(crate) struct StateMatch {
- /// The index into the matching patterns for the current match state.
- pub(crate) match_index: usize,
- /// The offset in the haystack at which the match occurred. This is used
- /// when reporting multiple matches at the same offset. That is, when
- /// an overlapping search runs, the first thing it checks is whether it's
- /// already in a match state, and if so, whether there are more patterns
- /// to report as matches in that state. If so, it increments `match_index`
- /// and returns the pattern and this offset. Once `match_index` exceeds the
- /// number of matching patterns in the current state, the search continues.
- pub(crate) offset: usize,
+ pub(crate) id: Option<StateID>,
+ /// The position of the search.
+ ///
+ /// When `id` is None (i.e., we are starting a search), this is set to
+ /// the beginning of the search as given by the caller regardless of its
+ /// current value. Subsequent calls to an overlapping search pick up at
+ /// this offset.
+ pub(crate) at: usize,
+ /// The index into the matching patterns of the next match to report if the
+ /// current state is a match state. Note that this may be 1 greater than
+ /// the total number of matches to report for the current match state. (In
+ /// which case, no more matches should be reported at the current position
+ /// and the search should advance to the next position.)
+ pub(crate) next_match_index: Option<usize>,
+ /// This is set to true when a reverse overlapping search has entered its
+ /// EOI transitions.
+ ///
+ /// This isn't used in a forward search because it knows to stop once the
+ /// position exceeds the end of the search range. In a reverse search,
+ /// since we use unsigned offsets, we don't "know" once we've gone past
+ /// `0`. So the only way to detect it is with this extra flag. The reverse
+ /// overlapping search knows to terminate specifically after it has
+ /// reported all matches after following the EOI transition.
+ pub(crate) rev_eoi: bool,
}
impl OverlappingState {
/// Create a new overlapping state that begins at the start state of any
/// automaton.
pub fn start() -> OverlappingState {
- OverlappingState { id: None, last_match: None }
+ OverlappingState {
+ mat: None,
+ id: None,
+ at: 0,
+ next_match_index: None,
+ rev_eoi: false,
+ }
}
- pub(crate) fn id(&self) -> Option<StateID> {
- self.id
+ /// Return the match result of the most recent search to execute with this
+ /// state.
+ ///
+ /// A search will clear this result automatically, such that if no
+ /// match is found, this will correctly report `None`.
+ pub fn get_match(&self) -> Option<HalfMatch> {
+ self.mat
}
+}
- pub(crate) fn set_id(&mut self, id: StateID) {
- self.id = Some(id);
- }
+/// Runs the given overlapping `search` function (forwards or backwards) until
+/// a match is found whose offset does not split a codepoint.
+///
+/// This is *not* always correct to call. It should only be called when the DFA
+/// has UTF-8 mode enabled *and* it can produce zero-width matches. Calling
+/// this when both of those things aren't true might result in legitimate
+/// matches getting skipped.
+#[cold]
+#[inline(never)]
+fn skip_empty_utf8_splits_overlapping<F>(
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+ mut search: F,
+) -> Result<(), MatchError>
+where
+ F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>,
+{
+ // Note that this routine works for both forward and reverse searches
+ // even though there's no code here to distinguish those cases. That's
+ // because overlapping searches drive themselves to completion via
+ // `OverlappingState`. So all we have to do is keep running the search
+ // until no matches are found.
- pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> {
- self.last_match.as_mut()
+ let mut hm = match state.get_match() {
+ None => return Ok(()),
+ Some(hm) => hm,
+ };
+ if input.get_anchored().is_anchored() {
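+ // For an anchored search, we simply drop a match whose end
+ // offset splits a codepoint rather than looking for another one.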
+ if !input.is_char_boundary(hm.offset()) {
+ state.mat = None;
+ }
+ return Ok(());
}
-
- pub(crate) fn set_last_match(&mut self, last_match: StateMatch) {
- self.last_match = Some(last_match);
+ while !input.is_char_boundary(hm.offset()) {
+ search(input, state)?;
+ hm = match state.get_match() {
+ None => return Ok(()),
+ Some(hm) => hm,
+ };
}
+ Ok(())
}
/// Write a prefix "state" indicator for fmt::Debug impls.
@@ -1901,3 +2097,24 @@ pub(crate) fn fmt_state_indicator<A: Automaton>(
}
Ok(())
}
+
+#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
+mod tests {
+ // A basic test ensuring that our Automaton trait is object safe. (This is
+ // the main reason why we don't define the search routines as generic over
+ // Into<Input>.)
+ #[test]
+ fn object_safe() {
+ use crate::{
+ dfa::{dense, Automaton},
+ HalfMatch, Input,
+ };
+
+ let dfa = dense::DFA::new("abc").unwrap();
+ let dfa: &dyn Automaton = &dfa;
+ assert_eq!(
+ Ok(Some(HalfMatch::must(0, 6))),
+ dfa.try_search_fwd(&Input::new(b"xyzabcxyz")),
+ );
+ }
+}
diff --git a/vendor/regex-automata/src/dfa/dense.rs b/vendor/regex-automata/src/dfa/dense.rs
index 07c135098..6da865f97 100644
--- a/vendor/regex-automata/src/dfa/dense.rs
+++ b/vendor/regex-automata/src/dfa/dense.rs
@@ -4,41 +4,45 @@ Types and routines specific to dense DFAs.
This module is the home of [`dense::DFA`](DFA).
This module also contains a [`dense::Builder`](Builder) and a
-[`dense::Config`](Config) for configuring and building a dense DFA.
+[`dense::Config`](Config) for building and configuring a dense DFA.
*/
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
use core::cmp;
use core::{convert::TryFrom, fmt, iter, mem::size_of, slice};
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
use alloc::{
collections::{BTreeMap, BTreeSet},
vec,
vec::Vec,
};
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
use crate::{
dfa::{
- accel::Accel, determinize, error::Error, minimize::Minimizer, sparse,
+ accel::Accel, determinize, minimize::Minimizer, remapper::Remapper,
+ sparse,
},
nfa::thompson,
- util::alphabet::ByteSet,
- MatchKind,
+ util::{look::LookMatcher, search::MatchKind},
};
use crate::{
dfa::{
accel::Accels,
automaton::{fmt_state_indicator, Automaton},
special::Special,
+ start::StartKind,
DEAD,
},
util::{
- alphabet::{self, ByteClasses},
- bytes::{self, DeserializeError, Endian, SerializeError},
- id::{PatternID, StateID},
- start::Start,
+ alphabet::{self, ByteClasses, ByteSet},
+ int::{Pointer, Usize},
+ prefilter::Prefilter,
+ primitives::{PatternID, StateID},
+ search::{Anchored, Input, MatchError},
+ start::{Start, StartByteMap},
+ wire::{self, DeserializeError, Endian, SerializeError},
},
};
@@ -53,17 +57,19 @@ const VERSION: u32 = 2;
/// The configuration used for compiling a dense DFA.
///
+/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The
+/// advantage of the former is that it often lets you avoid importing the
+/// `Config` type directly.
+///
/// A dense DFA configuration is a simple data object that is typically used
/// with [`dense::Builder::configure`](self::Builder::configure).
///
-/// The default configuration guarantees that a search will _never_ return a
-/// [`MatchError`](crate::MatchError) for any haystack or pattern. Setting a
-/// quit byte with [`Config::quit`] or enabling heuristic support for Unicode
-/// word boundaries with [`Config::unicode_word_boundary`] can in turn cause a
-/// search to return an error. See the corresponding configuration options for
-/// more details on when those error conditions arise.
-#[cfg(feature = "alloc")]
-#[derive(Clone, Copy, Debug, Default)]
+/// The default configuration guarantees that a search will never return
+/// a "quit" error, although it is possible for a search to fail if
+/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by
+/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`].
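+///
+/// For example, a minimal sketch of that failure mode (the pattern here is
+/// purely illustrative):
+///
+/// ```
+/// use regex_automata::{
+/// dfa::{dense, Automaton},
+/// Anchored, Input, PatternID,
+/// };
+///
+/// let dfa = dense::DFA::new(r"[a-z]+")?;
+/// // 'starts_for_each_pattern' was not enabled, so asking for a
+/// // pattern-specific anchored search fails instead of matching.
+/// let input = Input::new("abc")
+/// .anchored(Anchored::Pattern(PatternID::must(0)));
+/// assert!(dfa.try_search_fwd(&input).is_err());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```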
+#[cfg(feature = "dfa-build")]
+#[derive(Clone, Debug, Default)]
pub struct Config {
// As with other configuration types in this crate, we put all our knobs
// in options so that we can distinguish between "default" and "not set."
@@ -72,123 +78,27 @@ pub struct Config {
// 'overwrite' method.
//
// For docs on the fields below, see the corresponding method setters.
- anchored: Option<bool>,
accelerate: Option<bool>,
+ pre: Option<Option<Prefilter>>,
minimize: Option<bool>,
match_kind: Option<MatchKind>,
+ start_kind: Option<StartKind>,
starts_for_each_pattern: Option<bool>,
byte_classes: Option<bool>,
unicode_word_boundary: Option<bool>,
- quit: Option<ByteSet>,
+ quitset: Option<ByteSet>,
+ specialize_start_states: Option<bool>,
dfa_size_limit: Option<Option<usize>>,
determinize_size_limit: Option<Option<usize>>,
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl Config {
/// Return a new default dense DFA compiler configuration.
pub fn new() -> Config {
Config::default()
}
- /// Set whether matching must be anchored at the beginning of the input.
- ///
- /// When enabled, a match must begin at the start of a search. When
- /// disabled, the DFA will act as if the pattern started with a `(?s:.)*?`,
- /// which enables a match to appear anywhere.
- ///
- /// Note that if you want to run both anchored and unanchored
- /// searches without building multiple automatons, you can enable the
- /// [`Config::starts_for_each_pattern`] configuration instead. This will
- /// permit unanchored any-pattern searches and pattern-specific anchored
- /// searches. See the documentation for that configuration for an example.
- ///
- /// By default this is disabled.
- ///
- /// **WARNING:** this is subtly different than using a `^` at the start of
- /// your regex. A `^` forces a regex to match exclusively at the start of
- /// input, regardless of where you begin your search. In contrast, enabling
- /// this option will allow your regex to match anywhere in your input,
- /// but the match must start at the beginning of a search. (Most of the
- /// higher level convenience search routines make "start of input" and
- /// "start of search" equivalent, but some routines allow treating these as
- /// orthogonal.)
- ///
- /// For example, consider the haystack `aba` and the following searches:
- ///
- /// 1. The regex `^a` is compiled with `anchored=false` and searches
- /// `aba` starting at position `2`. Since `^` requires the match to
- /// start at the beginning of the input and `2 > 0`, no match is found.
- /// 2. The regex `a` is compiled with `anchored=true` and searches `aba`
- /// starting at position `2`. This reports a match at `[2, 3]` since
- /// the match starts where the search started. Since there is no `^`,
- /// there is no requirement for the match to start at the beginning of
- /// the input.
- /// 3. The regex `a` is compiled with `anchored=true` and searches `aba`
- /// starting at position `1`. Since `b` corresponds to position `1` and
- /// since the regex is anchored, it finds no match.
- /// 4. The regex `a` is compiled with `anchored=false` and searches `aba`
- /// startting at position `1`. Since the regex is neither anchored nor
- /// starts with `^`, the regex is compiled with an implicit `(?s:.)*?`
- /// prefix that permits it to match anywhere. Thus, it reports a match
- /// at `[2, 3]`.
- ///
- /// # Example
- ///
- /// This demonstrates the differences between an anchored search and
- /// a pattern that begins with `^` (as described in the above warning
- /// message).
- ///
- /// ```
- /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
- ///
- /// let haystack = "aba".as_bytes();
- ///
- /// let dfa = dense::Builder::new()
- /// .configure(dense::Config::new().anchored(false)) // default
- /// .build(r"^a")?;
- /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?;
- /// // No match is found because 2 is not the beginning of the haystack,
- /// // which is what ^ requires.
- /// let expected = None;
- /// assert_eq!(expected, got);
- ///
- /// let dfa = dense::Builder::new()
- /// .configure(dense::Config::new().anchored(true))
- /// .build(r"a")?;
- /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?;
- /// // An anchored search can still match anywhere in the haystack, it just
- /// // must begin at the start of the search which is '2' in this case.
- /// let expected = Some(HalfMatch::must(0, 3));
- /// assert_eq!(expected, got);
- ///
- /// let dfa = dense::Builder::new()
- /// .configure(dense::Config::new().anchored(true))
- /// .build(r"a")?;
- /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?;
- /// // No match is found since we start searching at offset 1 which
- /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
- /// // is found.
- /// let expected = None;
- /// assert_eq!(expected, got);
- ///
- /// let dfa = dense::Builder::new()
- /// .configure(dense::Config::new().anchored(false)) // default
- /// .build(r"a")?;
- /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?;
- /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the
- /// // pattern. Even though the search starts at 'b', the 'match anything'
- /// // prefix allows the search to match 'a'.
- /// let expected = Some(HalfMatch::must(0, 3));
- /// assert_eq!(expected, got);
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn anchored(mut self, yes: bool) -> Config {
- self.anchored = Some(yes);
- self
- }
-
/// Enable state acceleration.
///
/// When enabled, DFA construction will analyze each state to determine
@@ -212,6 +122,87 @@ impl Config {
self
}
+ /// Set a prefilter to be used whenever a start state is entered.
+ ///
+ /// A [`Prefilter`] in this context is meant to accelerate searches by
+ /// looking for literal prefixes that every match for the corresponding
+ /// pattern (or patterns) must start with. Once a prefilter produces a
+ /// match, the underlying search routine continues on to try and confirm
+ /// the match.
+ ///
+ /// Be warned that setting a prefilter does not guarantee that the search
+ /// will be faster. While it's usually a good bet, if the prefilter
+ /// produces a lot of false positive candidates (i.e., positions matched
+ /// by the prefilter but not by the regex), then the overall result can
+ /// be slower than if you had just executed the regex engine without any
+ /// prefilters.
+ ///
+ /// Note that unless [`Config::specialize_start_states`] has been
+ /// explicitly set, then setting this will also enable (when `pre` is
+ /// `Some`) or disable (when `pre` is `None`) start state specialization.
+ /// This occurs because without start state specialization, a prefilter
+ /// is likely to be less effective. And without a prefilter, start state
+ /// specialization is usually pointless.
+ ///
+ /// **WARNING:** Note that prefilters are not preserved as part of
+ /// serialization. Serializing a DFA will drop its prefilter.
+ ///
+ /// By default no prefilter is set.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton},
+ /// util::prefilter::Prefilter,
+ /// Input, HalfMatch, MatchKind,
+ /// };
+ ///
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().prefilter(pre))
+ /// .build(r"(foo|bar)[a-z]+")?;
+ /// let input = Input::new("foo1 barfox bar");
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 11)),
+ /// re.try_search_fwd(&input)?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Be warned though that an incorrect prefilter can lead to incorrect
+ /// results!
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton},
+ /// util::prefilter::Prefilter,
+ /// Input, HalfMatch, MatchKind,
+ /// };
+ ///
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().prefilter(pre))
+ /// .build(r"(foo|bar)[a-z]+")?;
+ /// let input = Input::new("foo1 barfox bar");
+ /// assert_eq!(
+ /// // No match reported even though there clearly is one!
+ /// None,
+ /// re.try_search_fwd(&input)?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
+ self.pre = Some(pre);
+ if self.specialize_start_states.is_none() {
+ self.specialize_start_states =
+ Some(self.get_prefilter().is_some());
+ }
+ self
+ }
+
/// Minimize the DFA.
///
/// When enabled, the DFA built will be minimized such that it is as small
@@ -283,20 +274,21 @@ impl Config {
/// report overlapping matches.
///
/// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
/// dfa::{Automaton, OverlappingState, dense},
- /// HalfMatch, MatchKind,
+ /// HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new().match_kind(MatchKind::All))
/// .build_many(&[r"\w+$", r"\S+$"])?;
- /// let haystack = "@foo".as_bytes();
+ /// let input = Input::new("@foo");
/// let mut state = OverlappingState::start();
///
/// let expected = Some(HalfMatch::must(1, 4));
- /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
- /// assert_eq!(expected, got);
+ /// dfa.try_search_overlapping_fwd(&input, &mut state)?;
+ /// assert_eq!(expected, state.get_match());
///
/// // The first pattern also matches at the same position, so re-running
/// // the search will yield another match. Notice also that the first
@@ -304,8 +296,8 @@ impl Config {
/// // pattern begins its match before the first, is therefore an earlier
/// // match and is thus reported first.
/// let expected = Some(HalfMatch::must(0, 4));
- /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
- /// assert_eq!(expected, got);
+ /// dfa.try_search_overlapping_fwd(&input, &mut state)?;
+ /// assert_eq!(expected, state.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
@@ -322,21 +314,31 @@ impl Config {
/// you, so it's usually not necessary to do this yourself.
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, MatchKind};
+ /// use regex_automata::{
+ /// dfa::{dense, Automaton, StartKind},
+ /// nfa::thompson::NFA,
+ /// Anchored, HalfMatch, Input, MatchKind,
+ /// };
///
/// let haystack = "123foobar456".as_bytes();
- /// let pattern = r"[a-z]+";
+ /// let pattern = r"[a-z]+r";
///
/// let dfa_fwd = dense::DFA::new(pattern)?;
/// let dfa_rev = dense::Builder::new()
+ /// .thompson(NFA::config().reverse(true))
/// .configure(dense::Config::new()
- /// .anchored(true)
+ /// // This isn't strictly necessary since both anchored and
+ /// // unanchored searches are supported by default. But since
+ /// // finding the start-of-match only requires anchored searches,
+ /// // we can get rid of the unanchored configuration and possibly
+ /// // slim down our DFA considerably.
+ /// .start_kind(StartKind::Anchored)
/// .match_kind(MatchKind::All)
/// )
/// .build(pattern)?;
/// let expected_fwd = HalfMatch::must(0, 9);
/// let expected_rev = HalfMatch::must(0, 3);
- /// let got_fwd = dfa_fwd.find_leftmost_fwd(haystack)?.unwrap();
+ /// let got_fwd = dfa_fwd.try_search_fwd(&Input::new(haystack))?.unwrap();
/// // Here we don't specify the pattern to search for since there's only
/// // one pattern and we're doing a leftmost search. But if this were an
/// // overlapping search, you'd need to specify the pattern that matched
@@ -344,9 +346,10 @@ impl Config {
/// // starting position of a match of some other pattern.) That in turn
/// // requires building the reverse automaton with starts_for_each_pattern
/// // enabled. Indeed, this is what Regex does internally.
- /// let got_rev = dfa_rev.find_leftmost_rev_at(
- /// None, haystack, 0, got_fwd.offset(),
- /// )?.unwrap();
+ /// let input = Input::new(haystack)
+ /// .range(..got_fwd.offset())
+ /// .anchored(Anchored::Yes);
+ /// let got_rev = dfa_rev.try_search_rev(&input)?.unwrap();
/// assert_eq!(expected_fwd, got_fwd);
/// assert_eq!(expected_rev, got_rev);
///
@@ -357,6 +360,45 @@ impl Config {
self
}
+ /// The type of starting state configuration to use for a DFA.
+ ///
+ /// By default, the starting state configuration is [`StartKind::Both`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton, StartKind},
+ /// Anchored, HalfMatch, Input,
+ /// };
+ ///
+ /// let haystack = "quux foo123";
+ /// let expected = HalfMatch::must(0, 11);
+ ///
+ /// // By default, DFAs support both anchored and unanchored searches.
+ /// let dfa = DFA::new(r"[0-9]+")?;
+ /// let input = Input::new(haystack);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
+ ///
+ /// // But if we only need anchored searches, then we can build a DFA
+ /// // that only supports anchored searches. This leads to a smaller DFA
+ /// // (potentially significantly smaller in some cases), but a DFA that
+ /// // will panic if you try to use it with an unanchored search.
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().start_kind(StartKind::Anchored))
+ /// .build(r"[0-9]+")?;
+ /// let input = Input::new(haystack)
+ /// .range(8..)
+ /// .anchored(Anchored::Yes);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn start_kind(mut self, kind: StartKind) -> Config {
+ self.start_kind = Some(kind);
+ self
+ }
+
/// Whether to compile a separate start state for each pattern in the
/// automaton.
///
@@ -397,36 +439,36 @@ impl Config {
///
/// ```
/// use regex_automata::{
- /// dfa::{Automaton, dense},
- /// HalfMatch, PatternID,
+ /// dfa::{dense, Automaton},
+ /// Anchored, HalfMatch, PatternID, Input,
/// };
///
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new().starts_for_each_pattern(true))
/// .build(r"foo[0-9]+")?;
- /// let haystack = b"quux foo123";
+ /// let haystack = "quux foo123";
///
/// // Here's a normal unanchored search. Notice that we use 'None' for the
/// // pattern ID. Since the DFA was built as an unanchored machine, it
/// // use its default unanchored starting state.
/// let expected = HalfMatch::must(0, 11);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
- /// None, None, haystack, 0, haystack.len(),
- /// )?);
+ /// let input = Input::new(haystack);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
/// // But now if we explicitly specify the pattern to search ('0' being
/// // the only pattern in the DFA), then it will use the starting state
/// // for that specific pattern which is always anchored. Since the
/// // pattern doesn't have a match at the beginning of the haystack, we
/// // find nothing.
- /// assert_eq!(None, dfa.find_leftmost_fwd_at(
- /// None, Some(PatternID::must(0)), haystack, 0, haystack.len(),
- /// )?);
+ /// let input = Input::new(haystack)
+ /// .anchored(Anchored::Pattern(PatternID::must(0)));
+ /// assert_eq!(None, dfa.try_search_fwd(&input)?);
/// // And finally, an anchored search is not the same as putting a '^' at
/// // beginning of the pattern. An anchored search can only match at the
/// // beginning of the *search*, which we can change:
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
- /// None, Some(PatternID::must(0)), haystack, 5, haystack.len(),
- /// )?);
+ /// let input = Input::new(haystack)
+ /// .anchored(Anchored::Pattern(PatternID::must(0)))
+ /// .range(5..);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
@@ -446,8 +488,8 @@ impl Config {
/// in the DFA. For example, the pattern `[ab]+` has at least two
/// equivalence classes: a set containing `a` and `b` and a set containing
/// every byte except for `a` and `b`. `a` and `b` are in the same
- /// equivalence classes because they never discriminate between a match
- /// and a non-match.
+ /// equivalence class because they never discriminate between a match and a
+ /// non-match.
///
/// The advantage of this map is that the size of the transition table
/// can be reduced drastically from `#states * 256 * sizeof(StateID)` to
@@ -473,7 +515,7 @@ impl Config {
/// When set, this will attempt to implement Unicode word boundaries as if
/// they were ASCII word boundaries. This only works when the search input
/// is ASCII only. If a non-ASCII byte is observed while searching, then a
- /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned.
+ /// [`MatchError::quit`](crate::MatchError::quit) error is returned.
///
/// A possible alternative to enabling this option is to simply use an
/// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
@@ -497,7 +539,7 @@ impl Config {
/// When using a [`Regex`](crate::dfa::regex::Regex), this corresponds
/// to using the `try_` suite of methods. Alternatively, if
/// callers can guarantee that their input is ASCII only, then a
- /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be
+ /// [`MatchError::quit`](crate::MatchError::quit) error will never be
/// returned while searching.
///
/// This is disabled by default.
@@ -511,7 +553,7 @@ impl Config {
/// ```
/// use regex_automata::{
/// dfa::{Automaton, dense},
- /// HalfMatch, MatchError, MatchKind,
+ /// HalfMatch, Input, MatchError,
/// };
///
/// let dfa = dense::Builder::new()
@@ -520,9 +562,9 @@ impl Config {
///
/// // The match occurs before the search ever observes the snowman
/// // character, so no error occurs.
- /// let haystack = "foo 123 ☃".as_bytes();
+ /// let haystack = "foo 123 ☃".as_bytes();
/// let expected = Some(HalfMatch::must(0, 7));
- /// let got = dfa.find_leftmost_fwd(haystack)?;
+ /// let got = dfa.try_search_fwd(&Input::new(haystack))?;
/// assert_eq!(expected, got);
///
/// // Notice that this search fails, even though the snowman character
@@ -530,9 +572,23 @@ impl Config {
/// // routines read one byte past the end of the search to account for
/// // look-around, and indeed, this is required here to determine whether
/// // the trailing \b matches.
- /// let haystack = "foo 123☃".as_bytes();
- /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 };
- /// let got = dfa.find_leftmost_fwd(haystack);
+ /// let haystack = "foo 123 ☃".as_bytes();
+ /// let expected = MatchError::quit(0xE2, 8);
+ /// let got = dfa.try_search_fwd(&Input::new(haystack));
+ /// assert_eq!(Err(expected), got);
+ ///
+ /// // Another example is executing a search where the span of the haystack
+ /// // we specify is all ASCII, but there is non-ASCII just before it. This
+ /// // correctly also reports an error.
+ /// let input = Input::new("β123").range(2..);
+ /// let expected = MatchError::quit(0xB2, 1);
+ /// let got = dfa.try_search_fwd(&input);
+ /// assert_eq!(Err(expected), got);
+ ///
+ /// // And similarly for the trailing word boundary.
+ /// let input = Input::new("123β").range(..3);
+ /// let expected = MatchError::quit(0xCE, 3);
+ /// let got = dfa.try_search_fwd(&input);
/// assert_eq!(Err(expected), got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -549,7 +605,7 @@ impl Config {
/// Add a "quit" byte to the DFA.
///
/// When a quit byte is seen during search time, then search will return
- /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the
+ /// a [`MatchError::quit`](crate::MatchError::quit) error indicating the
/// offset at which the search stopped.
///
/// A quit byte will always overrule any other aspects of a regex. For
@@ -591,10 +647,8 @@ impl Config {
/// a user supplied pattern from matching across a line boundary.
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, dense},
- /// HalfMatch, MatchError,
- /// };
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::{Automaton, dense}, Input, MatchError};
///
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new().quit(b'\n', true))
@@ -604,8 +658,8 @@ impl Config {
/// // Normally this would produce a match, since \p{any} contains '\n'.
/// // But since we instructed the automaton to enter a quit state if a
/// // '\n' is observed, this produces a match error instead.
- /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
- /// let got = dfa.find_leftmost_fwd(haystack).unwrap_err();
+ /// let expected = MatchError::quit(b'\n', 3);
+ /// let got = dfa.try_search_fwd(&Input::new(haystack)).unwrap_err();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -617,17 +671,98 @@ impl Config {
Unicode word boundaries are enabled"
);
}
- if self.quit.is_none() {
- self.quit = Some(ByteSet::empty());
+ if self.quitset.is_none() {
+ self.quitset = Some(ByteSet::empty());
}
if yes {
- self.quit.as_mut().unwrap().add(byte);
+ self.quitset.as_mut().unwrap().add(byte);
} else {
- self.quit.as_mut().unwrap().remove(byte);
+ self.quitset.as_mut().unwrap().remove(byte);
}
self
}
+ /// Enable specializing start states in the DFA.
+ ///
+ /// When start states are specialized, an implementor of a search routine
+ /// using this DFA can tell when the search has entered a starting state.
+ /// When start states aren't specialized, then it is impossible to know
+ /// whether the search has entered a start state.
+ ///
+ /// Ideally, this option wouldn't need to exist and we could always
+ /// specialize start states. The problem is that start states can be quite
+ /// active. This in turn means that an efficient search routine is likely
+ /// to ping-pong between a heavily optimized hot loop that handles most
+ /// states and a less optimized path that specially handles start states.
+ /// This causes branches to get heavily mispredicted and overall can
+ /// materially decrease throughput. Therefore, specializing start states
+ /// should only be enabled when it is needed.
+ ///
+ /// Knowing whether a search is in a start state is typically useful when a
+ /// prefilter is active for the search. A prefilter is typically only run
+ /// when in a start state and a prefilter can greatly accelerate a search.
+ /// Therefore, the possible cost of specializing start states is worth it
+ /// in this case. Otherwise, if you have no prefilter, there is likely no
+ /// reason to specialize start states.
+ ///
+ /// This is disabled by default, but note that it is automatically
+ /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless
+ /// `specialize_start_states` has already been set, [`Config::prefilter`]
+ /// will automatically enable or disable it based on whether a prefilter
+ /// is present or not, respectively. This is done because a prefilter's
+ /// effectiveness is rooted in being executed whenever the DFA is in a
+ /// start state, and that's only possible to do when they are specialized.
+ ///
+ /// Note that it is plausibly reasonable to _disable_ this option
+ /// explicitly while _enabling_ a prefilter. In that case, a prefilter
+ /// will still be run at the beginning of a search, but never again. This
+ /// in theory could strike a good balance if you're in a situation where a
+ /// prefilter is likely to produce many false positive candidates.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to enable start state specialization and then
+ /// shows how to check whether a state is a start state or not.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, Input};
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().specialize_start_states(true))
+ /// .build(r"[a-z]+")?;
+ ///
+ /// let haystack = "123 foobar 4567".as_bytes();
+ /// let sid = dfa.start_state_forward(&Input::new(haystack))?;
+ /// // The ID returned by 'start_state_forward' will always be tagged as
+ /// // a start state when start state specialization is enabled.
+ /// assert!(dfa.is_special_state(sid));
+ /// assert!(dfa.is_start_state(sid));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Compare the above with the default DFA configuration where start states
+ /// are _not_ specialized. In this case, the start state is not tagged at
+ /// all:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, Input};
+ ///
+ /// let dfa = DFA::new(r"[a-z]+")?;
+ ///
+ /// let haystack = "123 foobar 4567";
+ /// let sid = dfa.start_state_forward(&Input::new(haystack))?;
+ /// // Start states are not special in the default configuration!
+ /// assert!(!dfa.is_special_state(sid));
+ /// assert!(!dfa.is_start_state(sid));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn specialize_start_states(mut self, yes: bool) -> Config {
+ self.specialize_start_states = Some(yes);
+ self
+ }
+
/// Set a size limit on the total heap used by a DFA.
///
/// This size limit is expressed in bytes and is applied during
@@ -655,28 +790,63 @@ impl Config {
/// can get.
///
/// ```
- /// use regex_automata::dfa::{dense, Automaton};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::{dense, Automaton}, Input};
///
- /// // 3MB isn't enough!
+ /// // 6MB isn't enough!
/// dense::Builder::new()
- /// .configure(dense::Config::new().dfa_size_limit(Some(3_000_000)))
+ /// .configure(dense::Config::new().dfa_size_limit(Some(6_000_000)))
/// .build(r"\w{20}")
/// .unwrap_err();
///
- /// // ... but 4MB probably is!
+ /// // ... but 7MB probably is!
/// // (Note that DFA sizes aren't necessarily stable between releases.)
/// let dfa = dense::Builder::new()
- /// .configure(dense::Config::new().dfa_size_limit(Some(4_000_000)))
+ /// .configure(dense::Config::new().dfa_size_limit(Some(7_000_000)))
/// .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
- /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some());
+ /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
- /// While one needs a little more than 3MB to represent `\w{20}`, it
- /// turns out that you only need a little more than 4KB to represent
+ /// While one needs a little more than 6MB to represent `\w{20}`, it
+ /// turns out that you only need a little more than 6KB to represent
/// `(?-u:\w{20})`. So only use Unicode if you need it!
+ ///
+ /// As with [`Config::determinize_size_limit`], the size of a DFA is
+ /// influenced by other factors, such as what start state configurations
+ /// to support. For example, if you only need unanchored searches and not
+ /// anchored searches, then configuring the DFA to only support unanchored
+ /// searches can reduce its size. By default, DFAs support both unanchored
+ /// and anchored searches.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::{dense, Automaton, StartKind}, Input};
+ ///
+ /// // 3MB isn't enough!
+ /// dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .dfa_size_limit(Some(3_000_000))
+ /// .start_kind(StartKind::Unanchored)
+ /// )
+ /// .build(r"\w{20}")
+ /// .unwrap_err();
+ ///
+ /// // ... but 4MB probably is!
+ /// // (Note that DFA sizes aren't necessarily stable between releases.)
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .dfa_size_limit(Some(4_000_000))
+ /// .start_kind(StartKind::Unanchored)
+ /// )
+ /// .build(r"\w{20}")?;
+ /// let haystack = "A".repeat(20).into_bytes();
+ /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
pub fn dfa_size_limit(mut self, bytes: Option<usize>) -> Config {
self.dfa_size_limit = Some(bytes);
self
@@ -708,26 +878,68 @@ impl Config {
/// is still not as much as the DFA itself.)
///
/// ```
- /// use regex_automata::dfa::{dense, Automaton};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
+ /// use regex_automata::{dfa::{dense, Automaton}, Input};
///
- /// // 300KB isn't enough!
+ /// // 600KB isn't enough!
/// dense::Builder::new()
/// .configure(dense::Config::new()
- /// .determinize_size_limit(Some(300_000))
+ /// .determinize_size_limit(Some(600_000))
+ /// )
+ /// .build(r"\w{20}")
+ /// .unwrap_err();
+ ///
+ /// // ... but 700KB probably is!
+ /// // (Note that auxiliary storage sizes aren't necessarily stable between
+ /// // releases.)
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .determinize_size_limit(Some(700_000))
+ /// )
+ /// .build(r"\w{20}")?;
+ /// let haystack = "A".repeat(20).into_bytes();
+ /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Note that some parts of the configuration on a DFA can have a
+ /// big impact on how big the DFA is, and thus, how much memory is
+ /// used. For example, the default setting for [`Config::start_kind`] is
+ /// [`StartKind::Both`]. But if you only need an anchored search, for
+ /// example, then it can be much cheaper to build a DFA that only supports
+ /// anchored searches. (Running an unanchored search with it would panic.)
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
+ /// use regex_automata::{
+ /// dfa::{dense, Automaton, StartKind},
+ /// Anchored, Input,
+ /// };
+ ///
+ /// // 200KB isn't enough!
+ /// dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .determinize_size_limit(Some(200_000))
+ /// .start_kind(StartKind::Anchored)
/// )
/// .build(r"\w{20}")
/// .unwrap_err();
///
- /// // ... but 400KB probably is!
+ /// // ... but 300KB probably is!
/// // (Note that auxiliary storage sizes aren't necessarily stable between
/// // releases.)
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new()
- /// .determinize_size_limit(Some(400_000))
+ /// .determinize_size_limit(Some(300_000))
+ /// .start_kind(StartKind::Anchored)
/// )
/// .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
- /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some());
+ /// let input = Input::new(&haystack).anchored(Anchored::Yes);
+ /// assert!(dfa.try_search_fwd(&input)?.is_some());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
@@ -736,17 +948,17 @@ impl Config {
self
}
- /// Returns whether this configuration has enabled anchored searches.
- pub fn get_anchored(&self) -> bool {
- self.anchored.unwrap_or(false)
- }
-
/// Returns whether this configuration has enabled simple state
/// acceleration.
pub fn get_accelerate(&self) -> bool {
self.accelerate.unwrap_or(true)
}
+ /// Returns the prefilter attached to this configuration, if any.
+ pub fn get_prefilter(&self) -> Option<&Prefilter> {
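+ // 'pre' is an Option<Option<Prefilter>>: the outer Option records
+ // whether the knob was set at all, while the inner Option records
+ // whether a prefilter was actually provided.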
+ self.pre.as_ref().unwrap_or(&None).as_ref()
+ }
+
/// Returns whether this configuration has enabled the expensive process
/// of minimizing a DFA.
pub fn get_minimize(&self) -> bool {
@@ -758,6 +970,11 @@ impl Config {
self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
}
+ /// Returns the starting state configuration for a DFA.
+ pub fn get_starts(&self) -> StartKind {
+ self.start_kind.unwrap_or(StartKind::Both)
+ }
+
/// Returns whether this configuration has enabled anchored starting states
/// for every pattern in the DFA.
pub fn get_starts_for_each_pattern(&self) -> bool {
@@ -783,7 +1000,16 @@ impl Config {
/// least one byte has this enabled, it is possible for a search to return
/// an error.
pub fn get_quit(&self, byte: u8) -> bool {
- self.quit.map_or(false, |q| q.contains(byte))
+ self.quitset.map_or(false, |q| q.contains(byte))
+ }
+
+ /// Returns whether this configuration will instruct the DFA to
+ /// "specialize" start states. When enabled, the DFA will mark start states
+ /// as "special" so that search routines using the DFA can detect when
+ /// it's in a start state and do some kind of optimization (like run a
+ /// prefilter).
+ pub fn get_specialize_start_states(&self) -> bool {
+ self.specialize_start_states.unwrap_or(false)
}
/// Returns the DFA size limit of this configuration if one was set.
@@ -814,12 +1040,13 @@ impl Config {
/// always used. If an option in `o` is not set, then the corresponding
/// option in `self` is used. If it's not set in `self` either, then it
/// remains not set.
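+ ///
+ /// For example, if `self` has `minimize(true)` set and `o` has
+ /// `accelerate(false)` set, then the merged configuration has both
+ /// `minimize(true)` and `accelerate(false)` set.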
- pub(crate) fn overwrite(self, o: Config) -> Config {
+ pub(crate) fn overwrite(&self, o: Config) -> Config {
Config {
- anchored: o.anchored.or(self.anchored),
accelerate: o.accelerate.or(self.accelerate),
+ pre: o.pre.or_else(|| self.pre.clone()),
minimize: o.minimize.or(self.minimize),
match_kind: o.match_kind.or(self.match_kind),
+ start_kind: o.start_kind.or(self.start_kind),
starts_for_each_pattern: o
.starts_for_each_pattern
.or(self.starts_for_each_pattern),
@@ -827,7 +1054,10 @@ impl Config {
unicode_word_boundary: o
.unicode_word_boundary
.or(self.unicode_word_boundary),
- quit: o.quit.or(self.quit),
+ quitset: o.quitset.or(self.quitset),
+ specialize_start_states: o
+ .specialize_start_states
+ .or(self.specialize_start_states),
dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit),
determinize_size_limit: o
.determinize_size_limit
@@ -878,44 +1108,42 @@ impl Config {
/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
/// * The pattern itself is permitted to match invalid UTF-8. For example,
/// things like `[^a]` that match any byte except for `a` are permitted.
-/// * Unanchored patterns can search through invalid UTF-8. That is, for
-/// unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of
-/// `(?s:.)*?`.
///
/// ```
/// use regex_automata::{
/// dfa::{Automaton, dense},
-/// nfa::thompson,
-/// HalfMatch, SyntaxConfig,
+/// util::syntax,
+/// HalfMatch, Input,
/// };
///
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new().minimize(false))
-/// .syntax(SyntaxConfig::new().unicode(false).utf8(false))
-/// .thompson(thompson::Config::new().utf8(false))
+/// .syntax(syntax::Config::new().unicode(false).utf8(false))
/// .build(r"foo[^b]ar.*")?;
///
/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
/// let expected = Some(HalfMatch::must(0, 10));
-/// let got = dfa.find_leftmost_fwd(haystack)?;
+/// let got = dfa.try_search_fwd(&Input::new(haystack))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
#[derive(Clone, Debug)]
pub struct Builder {
config: Config,
- thompson: thompson::Builder,
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler,
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl Builder {
/// Create a new dense DFA builder with the default configuration.
pub fn new() -> Builder {
Builder {
config: Config::default(),
- thompson: thompson::Builder::new(),
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler::new(),
}
}
@@ -923,7 +1151,8 @@ impl Builder {
///
/// If there was a problem parsing or compiling the pattern, then an error
/// is returned.
- pub fn build(&self, pattern: &str) -> Result<OwnedDFA, Error> {
+ #[cfg(feature = "syntax")]
+ pub fn build(&self, pattern: &str) -> Result<OwnedDFA, BuildError> {
self.build_many(&[pattern])
}
@@ -931,11 +1160,22 @@ impl Builder {
///
/// When matches are returned, the pattern ID corresponds to the index of
/// the pattern in the slice given.
+ #[cfg(feature = "syntax")]
pub fn build_many<P: AsRef<str>>(
&self,
patterns: &[P],
- ) -> Result<OwnedDFA, Error> {
- let nfa = self.thompson.build_many(patterns).map_err(Error::nfa)?;
+ ) -> Result<OwnedDFA, BuildError> {
+ let nfa = self
+ .thompson
+ .clone()
+ // We can always forcefully disable captures because DFAs do not
+ // support them.
+ .configure(
+ thompson::Config::new()
+ .which_captures(thompson::WhichCaptures::None),
+ )
+ .build_many(patterns)
+ .map_err(BuildError::nfa)?;
self.build_from_nfa(&nfa)
}
@@ -949,19 +1189,19 @@ impl Builder {
/// ```
/// use regex_automata::{
/// dfa::{Automaton, dense},
- /// nfa::thompson,
- /// HalfMatch,
+ /// nfa::thompson::NFA,
+ /// HalfMatch, Input,
/// };
///
/// let haystack = "foo123bar".as_bytes();
///
/// // This shows how to set non-default options for building an NFA.
- /// let nfa = thompson::Builder::new()
- /// .configure(thompson::Config::new().shrink(false))
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().shrink(true))
/// .build(r"[0-9]+")?;
/// let dfa = dense::Builder::new().build_from_nfa(&nfa)?;
/// let expected = Some(HalfMatch::must(0, 6));
- /// let got = dfa.find_leftmost_fwd(haystack)?;
+ /// let got = dfa.try_search_fwd(&Input::new(haystack))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -969,13 +1209,13 @@ impl Builder {
pub fn build_from_nfa(
&self,
nfa: &thompson::NFA,
- ) -> Result<OwnedDFA, Error> {
- let mut quit = self.config.quit.unwrap_or(ByteSet::empty());
+ ) -> Result<OwnedDFA, BuildError> {
+ let mut quitset = self.config.quitset.unwrap_or(ByteSet::empty());
if self.config.get_unicode_word_boundary()
- && nfa.has_word_boundary_unicode()
+ && nfa.look_set_any().contains_word_unicode()
{
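+ // Heuristic Unicode word boundary support: treat every non-ASCII
+ // byte as a quit byte so that a search returns an error instead of
+ // potentially reporting an incorrect match.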
for b in 0x80..=0xFF {
- quit.add(b);
+ quitset.add(b);
}
}
let classes = if !self.config.get_byte_classes() {
@@ -990,8 +1230,13 @@ impl Builder {
// It is important to distinguish any "quit" bytes from all other
// bytes. Otherwise, a non-quit byte may end up in the same class
// as a quit byte, and thus cause the DFA stop when it shouldn't.
- if !quit.is_empty() {
- set.add_set(&quit);
+ //
+ // Test case:
+ //
+ // regex-cli find hybrid regex -w @conn.json.1000x.log \
+ // '^#' '\b10\.55\.182\.100\b'
+ if !quitset.is_empty() {
+ set.add_set(&quitset);
}
set.byte_classes()
};
@@ -999,12 +1244,16 @@ impl Builder {
let mut dfa = DFA::initial(
classes,
nfa.pattern_len(),
+ self.config.get_starts(),
+ nfa.look_matcher(),
self.config.get_starts_for_each_pattern(),
+ self.config.get_prefilter().map(|p| p.clone()),
+ quitset,
+ Flags::from_nfa(&nfa),
)?;
determinize::Config::new()
- .anchored(self.config.get_anchored())
.match_kind(self.config.get_match_kind())
- .quit(quit)
+ .quit(quitset)
.dfa_size_limit(self.config.get_dfa_size_limit())
.determinize_size_limit(self.config.get_determinize_size_limit())
.run(nfa, &mut dfa)?;
@@ -1014,6 +1263,16 @@ impl Builder {
if self.config.get_accelerate() {
dfa.accelerate();
}
+ // The state shuffling done before this point always assumes that start
+ // states should be marked as "special," even though it isn't the
+ // default configuration. State shuffling is complex enough as it is,
+ // so it's simpler to just "fix" our special state ID ranges to not
+ // include starting states after-the-fact.
+ if !self.config.get_specialize_start_states() {
+ dfa.special.set_no_special_start_states();
+ }
+ // Look for and set the universal starting states.
+ dfa.set_universal_starts();
Ok(dfa)
}
@@ -1024,16 +1283,17 @@ impl Builder {
}
/// Set the syntax configuration for this builder using
- /// [`SyntaxConfig`](crate::SyntaxConfig).
+ /// [`syntax::Config`](crate::util::syntax::Config).
///
/// This permits setting things like case insensitivity, Unicode and multi
/// line mode.
///
/// These settings only apply when constructing a DFA directly from a
/// pattern.
+ #[cfg(feature = "syntax")]
pub fn syntax(
&mut self,
- config: crate::util::syntax::SyntaxConfig,
+ config: crate::util::syntax::Config,
) -> &mut Builder {
self.thompson.syntax(config);
self
@@ -1048,13 +1308,14 @@ impl Builder {
///
/// These settings only apply when constructing a DFA directly from a
/// pattern.
+ #[cfg(feature = "syntax")]
pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
self.thompson.configure(config);
self
}
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl Default for Builder {
fn default() -> Builder {
Builder::new()
@@ -1067,7 +1328,7 @@ impl Default for Builder {
/// reason for making DFAs generic is no_std support, and more generally,
/// making it possible to load a DFA from an arbitrary slice of bytes.
#[cfg(feature = "alloc")]
-pub(crate) type OwnedDFA = DFA<Vec<u32>>;
+pub(crate) type OwnedDFA = DFA<alloc::vec::Vec<u32>>;
/// A dense table-based deterministic finite automaton (DFA).
///
@@ -1117,11 +1378,11 @@ pub(crate) type OwnedDFA = DFA<Vec<u32>>;
/// for searching. For example:
///
/// ```
-/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+")?;
/// let expected = HalfMatch::must(0, 8);
-/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+/// assert_eq!(Some(expected), dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone)]
@@ -1143,7 +1404,7 @@ pub struct DFA<T> {
/// a match exists, but _which_ patterns match. So we need to store the
/// matching pattern IDs for each match state. We do this even when there
/// is only one pattern for the sake of simplicity. In practice, this uses
- /// up very little space for the case of on pattern.
+ /// up very little space for the case of one pattern.
ms: MatchStates<T>,
/// Information about which states are "special." Special states are states
/// that are dead, quit, matching, starting or accelerated. For more info,
@@ -1160,9 +1421,25 @@ pub struct DFA<T> {
/// transition table. See dfa/special.rs for more details on how states are
/// arranged.
accels: Accels<T>,
+ /// Any prefilter attached to this DFA.
+ ///
+ /// Note that currently prefilters are not serialized. When deserializing
+ /// a DFA from bytes, this is always set to `None`.
+ pre: Option<Prefilter>,
+ /// The set of "quit" bytes for this DFA.
+ ///
+ /// This is only used when computing the start state for a particular
+ /// position in a haystack. Namely, in the case where there is a quit
+ /// byte immediately before the start of the search, this set needs to be
+ /// explicitly consulted. In all other cases, quit bytes are detected by
+ /// the DFA itself, by transitioning all quit bytes to a special "quit
+ /// state."
+ quitset: ByteSet,
+ /// Various flags describing the behavior of this DFA.
+ flags: Flags,
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl OwnedDFA {
/// Parse the given regular expression using a default configuration and
/// return the corresponding DFA.
@@ -1173,14 +1450,15 @@ impl OwnedDFA {
/// # Example
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
///
/// let dfa = dense::DFA::new("foo[0-9]+bar")?;
- /// let expected = HalfMatch::must(0, 11);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
+ /// let expected = Some(HalfMatch::must(0, 11));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn new(pattern: &str) -> Result<OwnedDFA, Error> {
+ #[cfg(feature = "syntax")]
+ pub fn new(pattern: &str) -> Result<OwnedDFA, BuildError> {
Builder::new().build(pattern)
}
@@ -1193,35 +1471,38 @@ impl OwnedDFA {
/// # Example
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
///
/// let dfa = dense::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
- /// let expected = HalfMatch::must(1, 3);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
+ /// let expected = Some(HalfMatch::must(1, 3));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<OwnedDFA, Error> {
+ #[cfg(feature = "syntax")]
+ pub fn new_many<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<OwnedDFA, BuildError> {
Builder::new().build_many(patterns)
}
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl OwnedDFA {
/// Create a new DFA that matches every input.
///
/// # Example
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
///
/// let dfa = dense::DFA::always_match()?;
///
- /// let expected = HalfMatch::must(0, 0);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?);
+ /// let expected = Some(HalfMatch::must(0, 0));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?);
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn always_match() -> Result<OwnedDFA, Error> {
+ pub fn always_match() -> Result<OwnedDFA, BuildError> {
let nfa = thompson::NFA::always_match();
Builder::new().build_from_nfa(&nfa)
}
@@ -1231,38 +1512,65 @@ impl OwnedDFA {
/// # Example
///
/// ```
- /// use regex_automata::dfa::{Automaton, dense};
+ /// use regex_automata::{dfa::{Automaton, dense}, Input};
///
/// let dfa = dense::DFA::never_match()?;
- /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?);
- /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?);
+ /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?);
+ /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn never_match() -> Result<OwnedDFA, Error> {
+ pub fn never_match() -> Result<OwnedDFA, BuildError> {
let nfa = thompson::NFA::never_match();
Builder::new().build_from_nfa(&nfa)
}
- /// Create an initial DFA with the given equivalence classes, pattern count
- /// and whether anchored starting states are enabled for each pattern. An
- /// initial DFA can be further mutated via determinization.
+ /// Create an initial DFA with the given equivalence classes, pattern
+ /// length and whether anchored starting states are enabled for each
+ /// pattern. An initial DFA can be further mutated via determinization.
fn initial(
classes: ByteClasses,
- pattern_count: usize,
+ pattern_len: usize,
+ starts: StartKind,
+ lookm: &LookMatcher,
starts_for_each_pattern: bool,
- ) -> Result<OwnedDFA, Error> {
- let start_pattern_count =
- if starts_for_each_pattern { pattern_count } else { 0 };
+ pre: Option<Prefilter>,
+ quitset: ByteSet,
+ flags: Flags,
+ ) -> Result<OwnedDFA, BuildError> {
+ let start_pattern_len =
+ if starts_for_each_pattern { Some(pattern_len) } else { None };
Ok(DFA {
tt: TransitionTable::minimal(classes),
- st: StartTable::dead(start_pattern_count)?,
- ms: MatchStates::empty(pattern_count),
+ st: StartTable::dead(starts, lookm, start_pattern_len)?,
+ ms: MatchStates::empty(pattern_len),
special: Special::new(),
accels: Accels::empty(),
+ pre,
+ quitset,
+ flags,
})
}
}
+#[cfg(feature = "dfa-build")]
+impl DFA<&[u32]> {
+ /// Return a new default dense DFA compiler configuration.
+ ///
+ /// This is a convenience routine to avoid needing to import the [`Config`]
+ /// type when customizing the construction of a dense DFA.
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Create a new dense DFA builder with the default configuration.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases.
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+}
+
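For illustration, a minimal sketch of how these two conveniences combine (assuming the `minimize` knob on `Config`, which this hunk does not show):

use regex_automata::{dfa::{dense::DFA, Automaton}, HalfMatch, Input};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Neither Config nor Builder needs to be imported by name.
    let dfa = DFA::builder()
        .configure(DFA::config().minimize(true))
        .build("foo[0-9]+")?;
    let expected = Some(HalfMatch::must(0, 8));
    assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    Ok(())
}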
impl<T: AsRef<[u32]>> DFA<T> {
/// Cheaply return a borrowed version of this dense DFA. Specifically,
/// the DFA returned always uses `&[u32]` for its transition table.
@@ -1273,6 +1581,9 @@ impl<T: AsRef<[u32]>> DFA<T> {
ms: self.ms.as_ref(),
special: self.special,
accels: self.accels(),
+ pre: self.pre.clone(),
+ quitset: self.quitset,
+ flags: self.flags,
}
}
@@ -1289,20 +1600,57 @@ impl<T: AsRef<[u32]>> DFA<T> {
ms: self.ms.to_owned(),
special: self.special,
accels: self.accels().to_owned(),
+ pre: self.pre.clone(),
+ quitset: self.quitset,
+ flags: self.flags,
}
}
+ /// Returns the starting state configuration for this DFA.
+ ///
+ /// The default is [`StartKind::Both`], which means the DFA supports both
+ /// unanchored and anchored searches. However, this can generally lead to
+ /// bigger DFAs. Therefore, a DFA might be compiled with support for just
+ /// unanchored or anchored searches. In that case, running a search with
+ /// an unsupported configuration will return an error.
+ pub fn start_kind(&self) -> StartKind {
+ self.st.kind
+ }
+
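A hedged sketch of the trade-off described above (assuming `StartKind` is exported from the `dfa` module and that `Config` has a `start_kind` option; neither appears in this hunk):

use regex_automata::{
    dfa::{dense::DFA, Automaton, StartKind},
    Anchored, Input,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Compile with support for anchored searches only. The start table
    // then only needs one group of start states instead of two.
    let dfa = DFA::builder()
        .configure(DFA::config().start_kind(StartKind::Anchored))
        .build("foo[0-9]+")?;
    assert_eq!(StartKind::Anchored, dfa.start_kind());
    // Every search must now explicitly ask for anchored semantics.
    let input = Input::new("foo12345").anchored(Anchored::Yes);
    assert!(dfa.try_search_fwd(&input)?.is_some());
    Ok(())
}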
+ /// Returns the start byte map used for computing the `Start` configuration
+ /// at the beginning of a search.
+ pub(crate) fn start_map(&self) -> &StartByteMap {
+ &self.st.start_map
+ }
+
/// Returns true only if this DFA has starting states for each pattern.
///
/// When a DFA has starting states for each pattern, then a search with the
/// DFA can be configured to only look for anchored matches of a specific
- /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`]
- /// can accept a non-None `pattern_id` if and only if this method returns
- /// true. Otherwise, calling `find_earliest_fwd_at` will panic.
+ /// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can
+ /// accept an [`Anchored::Pattern`] input configuration if and only if
+ /// this method returns true. Otherwise, such a search will return an
+ /// error.
///
/// Note that if the DFA has no patterns, this always returns false.
- pub fn has_starts_for_each_pattern(&self) -> bool {
- self.st.patterns > 0
+ pub fn starts_for_each_pattern(&self) -> bool {
+ self.st.pattern_len.is_some()
+ }
+
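To make this concrete, a sketch of an anchored single-pattern search (assuming `Config::starts_for_each_pattern` exists and that `PatternID` is exported at the crate root, per the conventions visible elsewhere in this diff):

use regex_automata::{
    dfa::{dense::DFA, Automaton},
    Anchored, Input, PatternID,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = DFA::builder()
        .configure(DFA::config().starts_for_each_pattern(true))
        .build_many(&["[0-9]+", "[a-z]+"])?;
    assert!(dfa.starts_for_each_pattern());
    // Only anchored matches of pattern 1 ([a-z]+) are eligible.
    let input = Input::new("abc123")
        .anchored(Anchored::Pattern(PatternID::must(1)));
    let m = dfa.try_search_fwd(&input)?.unwrap();
    assert_eq!(1, m.pattern().as_usize());
    Ok(())
}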
+ /// Returns the equivalence classes that make up the alphabet for this DFA.
+ ///
+ /// Unless [`Config::byte_classes`] was disabled, it is possible that
+ /// multiple distinct bytes are grouped into the same equivalence class
+ /// if it is impossible for them to discriminate between a match and a
+ /// non-match. This has the effect of reducing the overall alphabet size
+ /// and in turn potentially substantially reducing the size of the DFA's
+ /// transition table.
+ ///
+ /// The downside of using equivalence classes like this is that every state
+ /// transition will automatically use this map to convert an arbitrary
+ /// byte to its corresponding equivalence class. In practice this has a
+ /// negligible impact on performance.
+ pub fn byte_classes(&self) -> &ByteClasses {
+ &self.tt.classes
}
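A small sketch of the alphabet compression described above; the exact class count is an implementation detail, so the assertion below is deliberately loose (assuming `ByteClasses::alphabet_len`, which is not shown here):

use regex_automata::dfa::dense::DFA;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = DFA::new("[a-z]+")?;
    // All 256 bytes (plus the EOI sentinel) collapse into a handful of
    // classes, since e.g. b'a' and b'b' can never discriminate between
    // a match and a non-match for this pattern.
    assert!(dfa.byte_classes().alphabet_len() < 256);
    Ok(())
}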
/// Returns the total number of elements in the alphabet for this DFA.
@@ -1368,27 +1716,6 @@ impl<T: AsRef<[u32]>> DFA<T> {
self.tt.stride()
}
- /// Returns the "universal" start state for this DFA.
- ///
- /// A universal start state occurs only when all of the starting states
- /// for this DFA are precisely the same. This occurs when there are no
- /// look-around assertions at the beginning (or end for a reverse DFA) of
- /// the pattern.
- ///
- /// Using this as a starting state for a DFA without a universal starting
- /// state has unspecified behavior. This condition is not checked, so the
- /// caller must guarantee it themselves.
- pub(crate) fn universal_start_state(&self) -> StateID {
- // We choose 'NonWordByte' for no particular reason, other than
- // the fact that this is the 'main' starting configuration used in
- // determinization. But in essence, it doesn't really matter.
- //
- // Also, we might consider exposing this routine, but it seems
- // a little tricky to use correctly. Maybe if we also expose a
- // 'has_universal_start_state' method?
- self.st.start(Start::NonWordByte, None)
- }
-
/// Returns the memory usage, in bytes, of this DFA.
///
/// The memory usage is computed based on the number of bytes used to
@@ -1417,17 +1744,17 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// # Example
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
///
/// let dense = dense::DFA::new("foo[0-9]+")?;
/// let sparse = dense.to_sparse()?;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), sparse.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, sparse.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- #[cfg(feature = "alloc")]
- pub fn to_sparse(&self) -> Result<sparse::DFA<Vec<u8>>, Error> {
+ #[cfg(feature = "dfa-build")]
+ pub fn to_sparse(&self) -> Result<sparse::DFA<Vec<u8>>, BuildError> {
sparse::DFA::from_dense(self)
}
@@ -1453,7 +1780,7 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// This example shows how to serialize and deserialize a DFA:
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
@@ -1465,13 +1792,13 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// // ignore it.
/// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub fn to_bytes_little_endian(&self) -> (Vec<u8>, usize) {
- self.to_bytes::<bytes::LE>()
+ self.to_bytes::<wire::LE>()
}
/// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
@@ -1496,7 +1823,7 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// This example shows how to serialize and deserialize a DFA:
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
@@ -1508,13 +1835,13 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// // ignore it.
/// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub fn to_bytes_big_endian(&self) -> (Vec<u8>, usize) {
- self.to_bytes::<bytes::BE>()
+ self.to_bytes::<wire::BE>()
}
/// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
@@ -1548,7 +1875,7 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// This example shows how to serialize and deserialize a DFA:
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
@@ -1558,21 +1885,21 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// // ignore it.
/// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub fn to_bytes_native_endian(&self) -> (Vec<u8>, usize) {
- self.to_bytes::<bytes::NE>()
+ self.to_bytes::<wire::NE>()
}
/// The implementation of the public `to_bytes` serialization methods,
/// which is generic over endianness.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
fn to_bytes<E: Endian>(&self) -> (Vec<u8>, usize) {
let len = self.write_to_len();
- let (mut buf, padding) = bytes::alloc_aligned_buffer::<u32>(len);
+ let (mut buf, padding) = wire::alloc_aligned_buffer::<u32>(len);
// This should always succeed since the only possible serialization
// error is providing a buffer that's too small, but we've ensured that
// `buf` is big enough here.
@@ -1607,27 +1934,35 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// dynamic memory allocation.
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
///
- /// // Create a 4KB buffer on the stack to store our serialized DFA.
- /// let mut buf = [0u8; 4 * (1<<10)];
+ /// // Create a 4KB buffer on the stack to store our serialized DFA. We
+ /// // need to use a special type to force the alignment of our [u8; N]
+ /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing
+ /// // the DFA may fail because of an alignment mismatch.
+ /// #[repr(C)]
+ /// struct Aligned<B: ?Sized> {
+ /// _align: [u32; 0],
+ /// bytes: B,
+ /// }
+ /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] };
/// // N.B. We use native endianness here to make the example work, but
/// // using write_to_little_endian would work on a little endian target.
- /// let written = original_dfa.write_to_native_endian(&mut buf)?;
- /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
+ /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?;
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn write_to_little_endian(
&self,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
- self.as_ref().write_to::<bytes::LE>(dst)
+ self.as_ref().write_to::<wire::LE>(dst)
}
/// Serialize this DFA as raw bytes to the given slice, in big endian
@@ -1657,27 +1992,35 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// dynamic memory allocation.
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
///
- /// // Create a 4KB buffer on the stack to store our serialized DFA.
- /// let mut buf = [0u8; 4 * (1<<10)];
+ /// // Create a 4KB buffer on the stack to store our serialized DFA. We
+ /// // need to use a special type to force the alignment of our [u8; N]
+ /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing
+ /// // the DFA may fail because of an alignment mismatch.
+ /// #[repr(C)]
+ /// struct Aligned<B: ?Sized> {
+ /// _align: [u32; 0],
+ /// bytes: B,
+ /// }
+ /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] };
/// // N.B. We use native endianness here to make the example work, but
/// // using write_to_big_endian would work on a big endian target.
- /// let written = original_dfa.write_to_native_endian(&mut buf)?;
- /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
+ /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?;
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn write_to_big_endian(
&self,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
- self.as_ref().write_to::<bytes::BE>(dst)
+ self.as_ref().write_to::<wire::BE>(dst)
}
/// Serialize this DFA as raw bytes to the given slice, in native endian
@@ -1716,25 +2059,33 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// dynamic memory allocation.
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
///
- /// // Create a 4KB buffer on the stack to store our serialized DFA.
- /// let mut buf = [0u8; 4 * (1<<10)];
- /// let written = original_dfa.write_to_native_endian(&mut buf)?;
- /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
+ /// // Create a 4KB buffer on the stack to store our serialized DFA. We
+ /// // need to use a special type to force the alignment of our [u8; N]
+ /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing
+ /// // the DFA may fail because of an alignment mismatch.
+ /// #[repr(C)]
+ /// struct Aligned<B: ?Sized> {
+ /// _align: [u32; 0],
+ /// bytes: B,
+ /// }
+ /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] };
+ /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?;
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn write_to_native_endian(
&self,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
- self.as_ref().write_to::<bytes::NE>(dst)
+ self.as_ref().write_to::<wire::NE>(dst)
}
/// Return the total number of bytes required to serialize this DFA.
@@ -1756,17 +2107,33 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// a DFA.
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
- /// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
///
/// let mut buf = vec![0; original_dfa.write_to_len()];
- /// let written = original_dfa.write_to_native_endian(&mut buf)?;
- /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
+ /// // This is guaranteed to succeed, because the only serialization error
+ /// // that can occur is when the provided buffer is too small. But
+ /// // write_to_len guarantees a correct size.
+ /// let written = original_dfa.write_to_native_endian(&mut buf).unwrap();
+ /// // But this is not guaranteed to succeed! In particular,
+ /// // deserialization requires proper alignment for &[u32], but our buffer
+ /// // was allocated as a &[u8] whose required alignment is smaller than
+ /// // &[u32]. However, it's likely to work in practice because of how most
+ /// // allocators work. So if you write code like this, make sure to
+ /// // handle the error correctly and/or run it under Miri since Miri will
+ /// // likely provoke the error by returning Vec<u8> buffers with alignment
+ /// // less than &[u32].
+ /// let dfa: DFA<&[u32]> = match DFA::from_bytes(&buf[..written]) {
+ /// // As mentioned above, it is legal for an error to be returned
+ /// // here. It is quite difficult to get a Vec<u8> with a guaranteed
+ /// // alignment equivalent to Vec<u32>.
+ /// Err(_) => return Ok(()),
+ /// Ok((dfa, _)) => dfa,
+ /// };
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
@@ -1776,15 +2143,17 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// either need to deal with adding some initial padding yourself, or use
/// one of the `to_bytes` methods, which will do it for you.
pub fn write_to_len(&self) -> usize {
- bytes::write_label_len(LABEL)
- + bytes::write_endianness_check_len()
- + bytes::write_version_len()
+ wire::write_label_len(LABEL)
+ + wire::write_endianness_check_len()
+ + wire::write_version_len()
+ size_of::<u32>() // unused, intended for future flexibility
+ + self.flags.write_to_len()
+ self.tt.write_to_len()
+ self.st.write_to_len()
+ self.ms.write_to_len()
+ self.special.write_to_len()
+ self.accels.write_to_len()
+ + self.quitset.write_to_len()
}
}
@@ -1843,14 +2212,14 @@ impl<'a> DFA<&'a [u32]> {
/// and then use it for searching.
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
/// let initial = DFA::new("foo[0-9]+")?;
/// let (bytes, _) = initial.to_bytes_native_endian();
/// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes)?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
@@ -1865,7 +2234,7 @@ impl<'a> DFA<&'a [u32]> {
/// alternative way to write the above example:
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
/// let initial = DFA::new("foo[0-9]+")?;
/// // Serialization returns the number of leading padding bytes added to
@@ -1873,8 +2242,8 @@ impl<'a> DFA<&'a [u32]> {
/// let (bytes, pad) = initial.to_bytes_native_endian();
/// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..])?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
@@ -1893,7 +2262,7 @@ impl<'a> DFA<&'a [u32]> {
/// part is serializing the DFA to a file:
///
/// ```no_run
- /// use regex_automata::dfa::{Automaton, dense::DFA};
+ /// use regex_automata::dfa::dense::DFA;
///
/// let dfa = DFA::new("foo[0-9]+")?;
///
@@ -1912,30 +2281,24 @@ impl<'a> DFA<&'a [u32]> {
/// compilation to choose the correct endianness.
///
/// ```no_run
- /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
- ///
- /// type S = u32;
- /// type DFA = dense::DFA<&'static [S]>;
- ///
- /// fn get_foo() -> &'static DFA {
- /// use std::cell::Cell;
- /// use std::mem::MaybeUninit;
- /// use std::sync::Once;
- ///
- /// // This struct with a generic B is used to permit unsizing
- /// // coercions, specifically, where B winds up being a [u8]. We also
- /// // need repr(C) to guarantee that _align comes first, which forces
- /// // a correct alignment.
- /// #[repr(C)]
- /// struct Aligned<B: ?Sized> {
- /// _align: [S; 0],
- /// bytes: B,
- /// }
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense::DFA},
+ /// util::{lazy::Lazy, wire::AlignAs},
+ /// HalfMatch, Input,
+ /// };
///
+ /// // This crate provides its own "lazy" type, kind of like
+ /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc
+ /// // no-std environments and lets us write this using completely
+ /// // safe code.
+ /// static RE: Lazy<DFA<&'static [u32]>> = Lazy::new(|| {
/// # const _: &str = stringify! {
/// // This assignment is made possible (implicitly) via the
- /// // CoerceUnsized trait.
- /// static ALIGNED: &Aligned<[u8]> = &Aligned {
+ /// // CoerceUnsized trait. This is what guarantees that our
+ /// // bytes are stored in memory on a 4 byte boundary. You
+ /// // *must* do this or something equivalent for correct
+ /// // deserialization.
+ /// static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
/// _align: [],
/// #[cfg(target_endian = "big")]
/// bytes: *include_bytes!("foo.bigendian.dfa"),
@@ -1943,55 +2306,40 @@ impl<'a> DFA<&'a [u32]> {
/// bytes: *include_bytes!("foo.littleendian.dfa"),
/// };
/// # };
- /// # static ALIGNED: &Aligned<[u8]> = &Aligned {
+ /// # static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
/// # _align: [],
/// # bytes: [],
/// # };
///
- /// struct Lazy(Cell<MaybeUninit<DFA>>);
- /// // SAFETY: This is safe because DFA impls Sync.
- /// unsafe impl Sync for Lazy {}
- ///
- /// static INIT: Once = Once::new();
- /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit()));
- ///
- /// INIT.call_once(|| {
- /// let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
- /// .expect("serialized DFA should be valid");
- /// // SAFETY: This is guaranteed to only execute once, and all
- /// // we do with the pointer is write the DFA to it.
- /// unsafe {
- /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa);
- /// }
- /// });
- /// // SAFETY: DFA is guaranteed to by initialized via INIT and is
- /// // stored in static memory.
- /// unsafe {
- /// let dfa = (*DFA.0.as_ptr()).as_ptr();
- /// std::mem::transmute::<*const DFA, &'static DFA>(dfa)
- /// }
- /// }
+ /// let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
+ /// .expect("serialized DFA should be valid");
+ /// dfa
+ /// });
///
- /// let dfa = get_foo();
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345"));
+ /// let expected = Ok(Some(HalfMatch::must(0, 8)));
+ /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345")));
/// ```
///
- /// Alternatively, consider using
- /// [`lazy_static`](https://crates.io/crates/lazy_static)
- /// or
- /// [`once_cell`](https://crates.io/crates/once_cell),
- /// which will guarantee safety for you. You will still need to use the
- /// `Aligned` trick above to force correct alignment, but this is safe to
- /// do and `from_bytes` will return an error if you get it wrong.
+ /// An alternative to [`util::lazy::Lazy`](crate::util::lazy::Lazy)
+ /// is [`lazy_static`](https://crates.io/crates/lazy_static) or
+ /// [`once_cell`](https://crates.io/crates/once_cell), which provide
+ /// stronger guarantees (like the initialization function only being
+ /// executed once). And `once_cell` in particular provides a more
+ /// expressive API. But a `Lazy` value from this crate is likely just fine
+ /// in most circumstances.
+ ///
+ /// Note that regardless of which initialization method you use, you
+ /// will still need to use the [`AlignAs`](crate::util::wire::AlignAs)
+ /// trick above to force correct alignment, but this is safe to do and
+ /// `from_bytes` will return an error if you get it wrong.
pub fn from_bytes(
slice: &'a [u8],
) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
- // SAFETY: This is safe because we validate both the transition table,
- // start state ID list and the match states below. If either validation
- // fails, then we return an error.
+ // SAFETY: This is safe because we validate the transition table, start
+ // table, match states and accelerators below. If any validation fails,
+ // then we return an error.
let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
- dfa.tt.validate()?;
+ dfa.tt.validate(&dfa.special)?;
dfa.st.validate(&dfa.tt)?;
dfa.ms.validate(&dfa)?;
dfa.accels.validate()?;
@@ -2015,7 +2363,7 @@ impl<'a> DFA<&'a [u32]> {
/// # Example
///
/// ```
- /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
/// let initial = DFA::new("foo[0-9]+")?;
/// let (bytes, _) = initial.to_bytes_native_endian();
@@ -2023,8 +2371,8 @@ impl<'a> DFA<&'a [u32]> {
/// // directly from a compatible serialization routine.
/// let dfa: DFA<&[u32]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub unsafe fn from_bytes_unchecked(
@@ -2032,15 +2380,18 @@ impl<'a> DFA<&'a [u32]> {
) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
let mut nr = 0;
- nr += bytes::skip_initial_padding(slice);
- bytes::check_alignment::<StateID>(&slice[nr..])?;
- nr += bytes::read_label(&slice[nr..], LABEL)?;
- nr += bytes::read_endianness_check(&slice[nr..])?;
- nr += bytes::read_version(&slice[nr..], VERSION)?;
+ nr += wire::skip_initial_padding(slice);
+ wire::check_alignment::<StateID>(&slice[nr..])?;
+ nr += wire::read_label(&slice[nr..], LABEL)?;
+ nr += wire::read_endianness_check(&slice[nr..])?;
+ nr += wire::read_version(&slice[nr..], VERSION)?;
- let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?;
+ let _unused = wire::try_read_u32(&slice[nr..], "unused space")?;
nr += size_of::<u32>();
+ let (flags, nread) = Flags::from_bytes(&slice[nr..])?;
+ nr += nread;
+
let (tt, nread) = TransitionTable::from_bytes_unchecked(&slice[nr..])?;
nr += nread;
@@ -2052,12 +2403,17 @@ impl<'a> DFA<&'a [u32]> {
let (special, nread) = Special::from_bytes(&slice[nr..])?;
nr += nread;
- special.validate_state_count(tt.count(), tt.stride2)?;
+ special.validate_state_len(tt.len(), tt.stride2)?;
let (accels, nread) = Accels::from_bytes_unchecked(&slice[nr..])?;
nr += nread;
- Ok((DFA { tt, st, ms, special, accels }, nr))
+ let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?;
+ nr += nread;
+
+ // Prefilters don't support serialization, so they're always absent.
+ let pre = None;
+ Ok((DFA { tt, st, ms, special, accels, pre, quitset, flags }, nr))
}
/// The implementation of the public `write_to` serialization methods,
@@ -2075,39 +2431,41 @@ impl<'a> DFA<&'a [u32]> {
dst = &mut dst[..nwrite];
let mut nw = 0;
- nw += bytes::write_label(LABEL, &mut dst[nw..])?;
- nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?;
- nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?;
+ nw += wire::write_label(LABEL, &mut dst[nw..])?;
+ nw += wire::write_endianness_check::<E>(&mut dst[nw..])?;
+ nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?;
nw += {
// Currently unused, intended for future flexibility
E::write_u32(0, &mut dst[nw..]);
size_of::<u32>()
};
+ nw += self.flags.write_to::<E>(&mut dst[nw..])?;
nw += self.tt.write_to::<E>(&mut dst[nw..])?;
nw += self.st.write_to::<E>(&mut dst[nw..])?;
nw += self.ms.write_to::<E>(&mut dst[nw..])?;
nw += self.special.write_to::<E>(&mut dst[nw..])?;
nw += self.accels.write_to::<E>(&mut dst[nw..])?;
+ nw += self.quitset.write_to::<E>(&mut dst[nw..])?;
Ok(nw)
}
}
-/// The following methods implement mutable routines on the internal
-/// representation of a DFA. As such, we must fix the first type parameter to a
-/// `Vec<u32>` since a generic `T: AsRef<[u32]>` does not permit mutation. We
-/// can get away with this because these methods are internal to the crate and
-/// are exclusively used during construction of the DFA.
-#[cfg(feature = "alloc")]
+// The following methods implement mutable routines on the internal
+// representation of a DFA. As such, we must fix the first type parameter to a
+// `Vec<u32>` since a generic `T: AsRef<[u32]>` does not permit mutation. We
+// can get away with this because these methods are internal to the crate and
+// are exclusively used during construction of the DFA.
+#[cfg(feature = "dfa-build")]
impl OwnedDFA {
/// Add a start state of this DFA.
pub(crate) fn set_start_state(
&mut self,
- index: Start,
- pattern_id: Option<PatternID>,
+ anchored: Anchored,
+ start: Start,
id: StateID,
) {
assert!(self.tt.is_valid(id), "invalid start state");
- self.st.set_start(index, pattern_id, id);
+ self.st.set_start(anchored, start, id);
}
/// Set the given transition to this DFA. Both the `from` and `to` states
@@ -2127,7 +2485,7 @@ impl OwnedDFA {
///
/// If adding a state would exceed `StateID::LIMIT`, then this returns an
/// error.
- pub(crate) fn add_empty_state(&mut self) -> Result<StateID, Error> {
+ pub(crate) fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
self.tt.add_empty_state()
}
@@ -2140,20 +2498,42 @@ impl OwnedDFA {
self.tt.swap(id1, id2);
}
- /// Truncate the states in this DFA to the given count.
+ /// Remap all of the state identifiers in this DFA according to the map
+ /// function given. This includes all transitions and all starting state
+ /// identifiers.
+ pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+ // We could loop over each state ID and call 'remap_state' here, but
+ // this is more direct: just map every transition directly. This
+ // technically might do a little extra work since the alphabet length
+ // is likely less than the stride, but if that is indeed an issue we
+ // should benchmark it and fix it.
+ for sid in self.tt.table_mut().iter_mut() {
+ *sid = map(*sid);
+ }
+ for sid in self.st.table_mut().iter_mut() {
+ *sid = map(*sid);
+ }
+ }
+
+ /// Remap the transitions for the state given according to the function
+ /// given. This applies the given map function to every transition in the
+ /// given state and changes the transition in place to the result of the
+ /// map function for that transition.
+ pub(crate) fn remap_state(
+ &mut self,
+ id: StateID,
+ map: impl Fn(StateID) -> StateID,
+ ) {
+ self.tt.remap(id, map);
+ }
+
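Conceptually, `remap` above is an element-wise rewrite of the flattened transition table. A free-standing sketch of the same idea on a plain slice (a hypothetical simplified model, not the crate's internal types):

// State IDs are pre-multiplied indexes into a flat table holding
// `state_len << stride2` transitions, so remapping the whole DFA is
// just mapping every entry of that table.
fn remap_all(table: &mut [u32], map: impl Fn(u32) -> u32) {
    for sid in table.iter_mut() {
        *sid = map(*sid);
    }
}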
+ /// Truncate the states in this DFA to the given length.
///
/// This routine does not do anything to check the correctness of this
/// truncation. Callers must ensure that other states pointing to truncated
/// states are updated appropriately.
- pub(crate) fn truncate_states(&mut self, count: usize) {
- self.tt.truncate(count);
- }
-
- /// Return a mutable representation of the state corresponding to the given
- /// id. This is useful for implementing routines that manipulate DFA states
- /// (e.g., swapping states).
- pub(crate) fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
- self.tt.state_mut(id)
+ pub(crate) fn truncate_states(&mut self, len: usize) {
+ self.tt.truncate(len);
}
/// Minimize this DFA in place using Hopcroft's algorithm.
@@ -2171,7 +2551,7 @@ impl OwnedDFA {
pub(crate) fn set_pattern_map(
&mut self,
map: &BTreeMap<StateID, Vec<PatternID>>,
- ) -> Result<(), Error> {
+ ) -> Result<(), BuildError> {
self.ms = self.ms.new_with_map(map)?;
Ok(())
}
@@ -2180,7 +2560,7 @@ impl OwnedDFA {
/// them as candidates for acceleration during search.
pub(crate) fn accelerate(&mut self) {
// dead and quit states can never be accelerated.
- if self.state_count() <= 2 {
+ if self.state_len() <= 2 {
return;
}
@@ -2191,6 +2571,11 @@ impl OwnedDFA {
let (mut cmatch, mut cstart, mut cnormal) = (0, 0, 0);
for state in self.states() {
if let Some(accel) = state.accelerate(self.byte_classes()) {
+ debug!(
+ "accelerating full DFA state {}: {:?}",
+ state.id().as_usize(),
+ accel,
+ );
accels.insert(state.id(), accel);
if self.is_match_state(state.id()) {
cmatch += 1;
@@ -2212,7 +2597,7 @@ impl OwnedDFA {
// A remapper keeps track of state ID changes. Once we're done
// shuffling, the remapper is used to rewrite all transitions in the
// DFA based on the new positions of states.
- let mut remapper = Remapper::from_dfa(self);
+ let mut remapper = Remapper::new(self);
// As we swap states, if they are match states, we need to swap their
// pattern ID lists too (for multi-regexes). We do this by converting
@@ -2295,7 +2680,7 @@ impl OwnedDFA {
if cnormal > 0 {
// our next available starting and normal states for swapping.
let mut next_start_id = self.special.min_start;
- let mut cur_id = self.from_index(self.state_count() - 1);
+ let mut cur_id = self.to_state_id(self.state_len() - 1);
// This is guaranteed to exist since cnormal > 0.
let mut next_norm_id =
self.tt.next_state_id(self.special.max_start);
@@ -2361,9 +2746,9 @@ impl OwnedDFA {
self.special.set_max();
self.special.validate().expect("special state ranges should validate");
self.special
- .validate_state_count(self.state_count(), self.stride2())
+ .validate_state_len(self.state_len(), self.stride2())
.expect(
- "special state ranges should be consistent with state count",
+ "special state ranges should be consistent with state length",
);
assert_eq!(
self.special.accel_len(self.stride()),
@@ -2395,36 +2780,29 @@ impl OwnedDFA {
pub(crate) fn shuffle(
&mut self,
mut matches: BTreeMap<StateID, Vec<PatternID>>,
- ) -> Result<(), Error> {
+ ) -> Result<(), BuildError> {
// The determinizer always adds a quit state and it is always second.
- self.special.quit_id = self.from_index(1);
+ self.special.quit_id = self.to_state_id(1);
// If all we have are the dead and quit states, then we're done and
// the DFA will never produce a match.
- if self.state_count() <= 2 {
+ if self.state_len() <= 2 {
self.special.set_max();
return Ok(());
}
- // Collect all our start states into a convenient set and confirm there
- // is no overlap with match states. In the classicl DFA construction,
- // start states can be match states. But because of look-around, we
- // delay all matches by a byte, which prevents start states from being
- // match states.
+ // Collect all our non-DEAD start states into a convenient set and
+ // confirm there is no overlap with match states. In the classical DFA
+ // construction, start states can be match states. But because of
+ // look-around, we delay all matches by a byte, which prevents start
+ // states from being match states.
let mut is_start: BTreeSet<StateID> = BTreeSet::new();
for (start_id, _, _) in self.starts() {
- // While there's nothing theoretically wrong with setting a start
- // state to a dead ID (indeed, it could be an optimization!), the
- // shuffling code below assumes that start states aren't dead. If
- // this assumption is violated, the dead state could be shuffled
- // to a new location, which must never happen. So if we do want
- // to allow start states to be dead, then this assert should be
- // removed and the code below fixed.
- //
- // N.B. Minimization can cause start states to be dead, but that
- // happens after states are shuffled, so it's OK. Also, start
- // states are dead for the DFA that never matches anything, but
- // in that case, there are no states to shuffle.
- assert_ne!(start_id, DEAD, "start state cannot be dead");
+ // If a starting configuration points to a DEAD state, then we
+ // don't want to shuffle it. The DEAD state is always the first
+ // state with ID=0. So we can just leave it be.
+ if start_id == DEAD {
+ continue;
+ }
assert!(
!matches.contains_key(&start_id),
"{:?} is both a start and a match state, which is not allowed",
@@ -2438,7 +2816,7 @@ impl OwnedDFA {
// IDs and swapping them changes their IDs, we need to record every
// swap we make so that we can remap IDs. The remapper handles this
// book-keeping for us.
- let mut remapper = Remapper::from_dfa(self);
+ let mut remapper = Remapper::new(self);
// Shuffle matching states.
if matches.is_empty() {
@@ -2448,7 +2826,7 @@ impl OwnedDFA {
// The determinizer guarantees that the first two states are the
// dead and quit states, respectively. We want our match states to
// come right after quit.
- let mut next_id = self.from_index(2);
+ let mut next_id = self.to_state_id(2);
let mut new_matches = BTreeMap::new();
self.special.min_match = next_id;
for (id, pids) in matches {
@@ -2470,7 +2848,7 @@ impl OwnedDFA {
// Shuffle starting states.
{
- let mut next_id = self.from_index(2);
+ let mut next_id = self.to_state_id(2);
if self.special.matches() {
next_id = self.tt.next_state_id(self.special.max_match);
}
@@ -2491,32 +2869,77 @@ impl OwnedDFA {
self.special.set_max();
self.special.validate().expect("special state ranges should validate");
self.special
- .validate_state_count(self.state_count(), self.stride2())
+ .validate_state_len(self.state_len(), self.stride2())
.expect(
- "special state ranges should be consistent with state count",
+ "special state ranges should be consistent with state length",
);
Ok(())
}
-}
-/// A variety of generic internal methods for accessing DFA internals.
-impl<T: AsRef<[u32]>> DFA<T> {
- /// Return the byte classes used by this DFA.
- pub(crate) fn byte_classes(&self) -> &ByteClasses {
- &self.tt.classes
+ /// Checks whether there are universal start states (both anchored and
+ /// unanchored), and if so, sets the relevant fields to the start state
+ /// IDs.
+ ///
+ /// Universal start states occur precisely when all of the patterns in the
+ /// DFA have no look-around assertions in their prefix.
+ fn set_universal_starts(&mut self) {
+ assert_eq!(6, Start::len(), "expected 6 start configurations");
+
+ let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| {
+ // This OK because we only call 'start' under conditions
+ // in which we know it will succeed.
+ dfa.st.start(inp, start).expect("valid Input configuration")
+ };
+ if self.start_kind().has_unanchored() {
+ let inp = Input::new("").anchored(Anchored::No);
+ let sid = start_id(self, &inp, Start::NonWordByte);
+ if sid == start_id(self, &inp, Start::WordByte)
+ && sid == start_id(self, &inp, Start::Text)
+ && sid == start_id(self, &inp, Start::LineLF)
+ && sid == start_id(self, &inp, Start::LineCR)
+ && sid == start_id(self, &inp, Start::CustomLineTerminator)
+ {
+ self.st.universal_start_unanchored = Some(sid);
+ }
+ }
+ if self.start_kind().has_anchored() {
+ let inp = Input::new("").anchored(Anchored::Yes);
+ let sid = start_id(self, &inp, Start::NonWordByte);
+ if sid == start_id(self, &inp, Start::WordByte)
+ && sid == start_id(self, &inp, Start::Text)
+ && sid == start_id(self, &inp, Start::LineLF)
+ && sid == start_id(self, &inp, Start::LineCR)
+ && sid == start_id(self, &inp, Start::CustomLineTerminator)
+ {
+ self.st.universal_start_anchored = Some(sid);
+ }
+ }
}
+}
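The effect of `set_universal_starts` is observable through the `universal_start_state` method added to the `Automaton` trait later in this diff. A sketch with a default-configured DFA:

use regex_automata::{dfa::{dense::DFA, Automaton}, Anchored};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // No look-around in the prefix, so all six start configurations
    // collapse to a single state.
    let dfa = DFA::new("[0-9]+")?;
    assert!(dfa.universal_start_state(Anchored::No).is_some());
    // A leading word boundary forces distinct start states.
    let dfa = DFA::new(r"\b[0-9]+")?;
    assert!(dfa.universal_start_state(Anchored::No).is_none());
    Ok(())
}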
+// A variety of generic internal methods for accessing DFA internals.
+impl<T: AsRef<[u32]>> DFA<T> {
/// Return the info about special states.
pub(crate) fn special(&self) -> &Special {
&self.special
}
/// Return the info about special states as a mutable borrow.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub(crate) fn special_mut(&mut self) -> &mut Special {
&mut self.special
}
+ /// Returns the quit set (may be empty) used by this DFA.
+ pub(crate) fn quitset(&self) -> &ByteSet {
+ &self.quitset
+ }
+
+ /// Returns the flags for this DFA.
+ pub(crate) fn flags(&self) -> &Flags {
+ &self.flags
+ }
+
/// Returns an iterator over all states in this DFA.
///
/// This iterator yields a tuple for each state. The first element of the
@@ -2528,14 +2951,14 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// Return the total number of states in this DFA. Every DFA has at least
/// 1 state, even the empty DFA.
- pub(crate) fn state_count(&self) -> usize {
- self.tt.count()
+ pub(crate) fn state_len(&self) -> usize {
+ self.tt.len()
}
/// Return an iterator over all pattern IDs for the given match state.
///
/// If the given state is not a match state, then this panics.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub(crate) fn pattern_id_slice(&self, id: StateID) -> &[PatternID] {
assert!(self.is_match_state(id));
self.ms.pattern_id_slice(self.match_state_index(id))
@@ -2550,21 +2973,21 @@ impl<T: AsRef<[u32]>> DFA<T> {
}
/// Returns the total number of patterns matched by this DFA.
- pub(crate) fn pattern_count(&self) -> usize {
- self.ms.patterns
+ pub(crate) fn pattern_len(&self) -> usize {
+ self.ms.pattern_len
}
/// Returns a map from match state ID to a list of pattern IDs that match
/// in that state.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub(crate) fn pattern_map(&self) -> BTreeMap<StateID, Vec<PatternID>> {
self.ms.to_map(self)
}
/// Returns the ID of the quit state for this DFA.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub(crate) fn quit_id(&self) -> StateID {
- self.from_index(1)
+ self.to_state_id(1)
}
/// Convert the given state identifier to the state's index. The state's
@@ -2576,14 +2999,14 @@ impl<T: AsRef<[u32]>> DFA<T> {
self.tt.to_index(id)
}
- /// Convert an index to a state (in the range 0..self.state_count()) to an
+ /// Convert an index to a state (in the range 0..self.state_len()) to an
/// actual state identifier.
///
/// This is useful when using a `Vec<T>` as an efficient map keyed by state
/// to some other information (such as a remapped state ID).
- #[cfg(feature = "alloc")]
- pub(crate) fn from_index(&self, index: usize) -> StateID {
- self.tt.from_index(index)
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn to_state_id(&self, index: usize) -> StateID {
+ self.tt.to_state_id(index)
}
/// Return the table of state IDs for this DFA's start states.
@@ -2594,11 +3017,12 @@ impl<T: AsRef<[u32]>> DFA<T> {
/// Returns the index of the match state for the given ID. If the
/// given ID does not correspond to a match state, then this may
/// panic or produce an incorrect result.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn match_state_index(&self, id: StateID) -> usize {
debug_assert!(self.is_match_state(id));
// This is one of the places where we rely on the fact that match
// states are contiguous in the transition table. Namely, that the
- // first match state ID always corresponds to dfa.special.min_start.
+ // first match state ID always corresponds to dfa.special.min_match.
// From there, since we know the stride, we can compute the overall
// index of any match state given the match state's ID.
let min = self.special().min_match.as_usize();
@@ -2645,25 +3069,26 @@ impl<T: AsRef<[u32]>> fmt::Debug for DFA<T> {
write!(f, "\n")?;
}
writeln!(f, "")?;
- for (i, (start_id, sty, pid)) in self.starts().enumerate() {
+ for (i, (start_id, anchored, sty)) in self.starts().enumerate() {
let id = if f.alternate() {
start_id.as_usize()
} else {
self.to_index(start_id)
};
if i % self.st.stride == 0 {
- match pid {
- None => writeln!(f, "START-GROUP(ALL)")?,
- Some(pid) => {
+ match anchored {
+ Anchored::No => writeln!(f, "START-GROUP(unanchored)")?,
+ Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?,
+ Anchored::Pattern(pid) => {
writeln!(f, "START_GROUP(pattern: {:?})", pid)?
}
}
}
writeln!(f, " {:?} => {:06?}", sty, id)?;
}
- if self.pattern_count() > 1 {
+ if self.pattern_len() > 1 {
writeln!(f, "")?;
- for i in 0..self.ms.count() {
+ for i in 0..self.ms.len() {
let id = self.ms.match_state_id(self, i);
let id = if f.alternate() {
id.as_usize()
@@ -2681,124 +3106,168 @@ impl<T: AsRef<[u32]>> fmt::Debug for DFA<T> {
writeln!(f, "")?;
}
}
- writeln!(f, "state count: {:?}", self.state_count())?;
- writeln!(f, "pattern count: {:?}", self.pattern_count())?;
+ writeln!(f, "state length: {:?}", self.state_len())?;
+ writeln!(f, "pattern length: {:?}", self.pattern_len())?;
+ writeln!(f, "flags: {:?}", self.flags)?;
writeln!(f, ")")?;
Ok(())
}
}
+// SAFETY: We assert that our implementation of each method is correct.
unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
- #[inline]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn is_special_state(&self, id: StateID) -> bool {
self.special.is_special_state(id)
}
- #[inline]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn is_dead_state(&self, id: StateID) -> bool {
self.special.is_dead_state(id)
}
- #[inline]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn is_quit_state(&self, id: StateID) -> bool {
self.special.is_quit_state(id)
}
- #[inline]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn is_match_state(&self, id: StateID) -> bool {
self.special.is_match_state(id)
}
- #[inline]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn is_start_state(&self, id: StateID) -> bool {
self.special.is_start_state(id)
}
- #[inline]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn is_accel_state(&self, id: StateID) -> bool {
self.special.is_accel_state(id)
}
- #[inline]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn next_state(&self, current: StateID, input: u8) -> StateID {
let input = self.byte_classes().get(input);
let o = current.as_usize() + usize::from(input);
self.trans()[o]
}
- #[inline]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
unsafe fn next_state_unchecked(
&self,
current: StateID,
- input: u8,
+ byte: u8,
) -> StateID {
- let input = self.byte_classes().get_unchecked(input);
- let o = current.as_usize() + usize::from(input);
- *self.trans().get_unchecked(o)
+ // We don't (or shouldn't) need an unchecked variant for the byte
+ // class mapping, since bound checks should be omitted automatically
+ // by virtue of its representation. If this ends up not being true as
+ // confirmed by codegen, please file an issue. ---AG
+ let class = self.byte_classes().get(byte);
+ let o = current.as_usize() + usize::from(class);
+ let next = *self.trans().get_unchecked(o);
+ next
}
- #[inline]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn next_eoi_state(&self, current: StateID) -> StateID {
let eoi = self.byte_classes().eoi().as_usize();
let o = current.as_usize() + eoi;
self.trans()[o]
}
- #[inline]
- fn pattern_count(&self) -> usize {
- self.ms.patterns
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn pattern_len(&self) -> usize {
+ self.ms.pattern_len
}
- #[inline]
- fn match_count(&self, id: StateID) -> usize {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn match_len(&self, id: StateID) -> usize {
self.match_pattern_len(id)
}
- #[inline]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
// This is an optimization for the very common case of a DFA with a
// single pattern. This conditional avoids a somewhat more costly path
// that finds the pattern ID from the state machine, which requires
// a bit of slicing/pointer-chasing. This optimization tends to only
// matter when matches are frequent.
- if self.ms.patterns == 1 {
+ if self.ms.pattern_len == 1 {
return PatternID::ZERO;
}
let state_index = self.match_state_index(id);
self.ms.pattern_id(state_index, match_index)
}
- #[inline]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn has_empty(&self) -> bool {
+ self.flags.has_empty
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_utf8(&self) -> bool {
+ self.flags.is_utf8
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_always_start_anchored(&self) -> bool {
+ self.flags.is_always_start_anchored
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_forward(
&self,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> StateID {
- let index = Start::from_position_fwd(bytes, start, end);
- self.st.start(index, pattern_id)
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError> {
+ if !self.quitset.is_empty() && input.start() > 0 {
+ let offset = input.start() - 1;
+ let byte = input.haystack()[offset];
+ if self.quitset.contains(byte) {
+ return Err(MatchError::quit(byte, offset));
+ }
+ }
+ let start = self.st.start_map.fwd(&input);
+ self.st.start(input, start)
}
- #[inline]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn start_state_reverse(
&self,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> StateID {
- let index = Start::from_position_rev(bytes, start, end);
- self.st.start(index, pattern_id)
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError> {
+ if !self.quitset.is_empty() && input.end() < input.haystack().len() {
+ let offset = input.end();
+ let byte = input.haystack()[offset];
+ if self.quitset.contains(byte) {
+ return Err(MatchError::quit(byte, offset));
+ }
+ }
+ let start = self.st.start_map.rev(&input);
+ self.st.start(input, start)
}
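The quit set consulted above is populated at build time; quit bytes seen during a search surface as `MatchError::quit`. A sketch (assuming a `quit` option on `Config` to populate the quit set, which this hunk does not show):

use regex_automata::{dfa::{dense::DFA, Automaton}, Input, MatchError};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Treat b'\n' as a quit byte: the DFA gives up upon seeing it.
    let dfa = DFA::builder()
        .configure(DFA::config().quit(b'\n', true))
        .build("foo[0-9]+")?;
    let got = dfa.try_search_fwd(&Input::new("foo\nbar"));
    assert_eq!(Err(MatchError::quit(b'\n', 3)), got);
    Ok(())
}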
- #[inline(always)]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
+ match mode {
+ Anchored::No => self.st.universal_start_unanchored,
+ Anchored::Yes => self.st.universal_start_anchored,
+ Anchored::Pattern(_) => None,
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn accelerator(&self, id: StateID) -> &[u8] {
if !self.is_accel_state(id) {
return &[];
}
self.accels.needles(self.accelerator_index(id))
}
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn get_prefilter(&self) -> Option<&Prefilter> {
+ self.pre.as_ref()
+ }
}
/// The transition table portion of a dense DFA.
@@ -2873,7 +3342,7 @@ impl<'a> TransitionTable<&'a [u32]> {
///
/// # Safety
///
- /// This routine is not safe because it does not check the valdity of the
+ /// This routine is not safe because it does not check the validity of the
/// transition table itself. In particular, the transition table can be
/// quite large, so checking its validity can be somewhat expensive. An
/// invalid transition table is not safe because other code may rely on the
@@ -2886,12 +3355,13 @@ impl<'a> TransitionTable<&'a [u32]> {
unsafe fn from_bytes_unchecked(
mut slice: &'a [u8],
) -> Result<(TransitionTable<&'a [u32]>, usize), DeserializeError> {
- let slice_start = slice.as_ptr() as usize;
+ let slice_start = slice.as_ptr().as_usize();
- let (count, nr) = bytes::try_read_u32_as_usize(slice, "state count")?;
+ let (state_len, nr) =
+ wire::try_read_u32_as_usize(slice, "state length")?;
slice = &slice[nr..];
- let (stride2, nr) = bytes::try_read_u32_as_usize(slice, "stride2")?;
+ let (stride2, nr) = wire::try_read_u32_as_usize(slice, "stride2")?;
slice = &slice[nr..];
let (classes, nr) = ByteClasses::from_bytes(slice)?;
@@ -2922,37 +3392,32 @@ impl<'a> TransitionTable<&'a [u32]> {
));
}
- let trans_count =
- bytes::shl(count, stride2, "dense table transition count")?;
- let table_bytes_len = bytes::mul(
- trans_count,
+ let trans_len =
+ wire::shl(state_len, stride2, "dense table transition length")?;
+ let table_bytes_len = wire::mul(
+ trans_len,
StateID::SIZE,
- "dense table state byte count",
+ "dense table state byte length",
)?;
- bytes::check_slice_len(slice, table_bytes_len, "transition table")?;
- bytes::check_alignment::<StateID>(slice)?;
+ wire::check_slice_len(slice, table_bytes_len, "transition table")?;
+ wire::check_alignment::<StateID>(slice)?;
let table_bytes = &slice[..table_bytes_len];
slice = &slice[table_bytes_len..];
// SAFETY: Since StateID is always representable as a u32, all we need
// to do is ensure that we have the proper length and alignment. We've
// checked both above, so the cast below is safe.
//
- // N.B. This is the only not-safe code in this function, so we mark
- // it explicitly to call it out, even though it is technically
- // superfluous.
- #[allow(unused_unsafe)]
- let table = unsafe {
- core::slice::from_raw_parts(
- table_bytes.as_ptr() as *const u32,
- trans_count,
- )
- };
+ // N.B. This is the only not-safe code in this function.
+ let table = core::slice::from_raw_parts(
+ table_bytes.as_ptr().cast::<u32>(),
+ trans_len,
+ );
let tt = TransitionTable { table, classes, stride2 };
- Ok((tt, slice.as_ptr() as usize - slice_start))
+ Ok((tt, slice.as_ptr().as_usize() - slice_start))
}
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl TransitionTable<Vec<u32>> {
/// Create a minimal transition table with just two states: a dead state
/// and a quit state. The alphabet length and stride of the transition
@@ -2985,7 +3450,7 @@ impl TransitionTable<Vec<u32>> {
///
/// If adding a state would exhaust the state identifier space, then this
/// returns an error.
- fn add_empty_state(&mut self) -> Result<StateID, Error> {
+ fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
// Normally, to get a fresh state identifier, we would just
// take the index of the next state added to the transition
// table. However, we actually perform an optimization here
@@ -3026,7 +3491,8 @@ impl TransitionTable<Vec<u32>> {
// itself. e.g., If the stride is 64, then the ID of the 3rd state
// is 192, not 2.
let next = self.table.len();
- let id = StateID::new(next).map_err(|_| Error::too_many_states())?;
+ let id =
+ StateID::new(next).map_err(|_| BuildError::too_many_states())?;
self.table.extend(iter::repeat(0).take(self.stride()));
Ok(id)
}
@@ -3049,26 +3515,25 @@ impl TransitionTable<Vec<u32>> {
}
}
- /// Truncate the states in this transition table to the given count.
+ /// Remap the transitions for the state given according to the function
+ /// given. This applies the given map function to every transition in the
+ /// given state and changes the transition in place to the result of the
+ /// map function for that transition.
+ fn remap(&mut self, id: StateID, map: impl Fn(StateID) -> StateID) {
+ for byte in 0..self.alphabet_len() {
+ let i = id.as_usize() + byte;
+ let next = self.table()[i];
+ self.table_mut()[i] = map(next);
+ }
+ }
+
+ /// Truncate the states in this transition table to the given length.
///
/// This routine does not do anything to check the correctness of this
/// truncation. Callers must ensure that other states pointing to truncated
/// states are updated appropriately.
- fn truncate(&mut self, count: usize) {
- self.table.truncate(count << self.stride2);
- }
-
- /// Return a mutable representation of the state corresponding to the given
- /// id. This is useful for implementing routines that manipulate DFA states
- /// (e.g., swapping states).
- fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
- let alphabet_len = self.alphabet_len();
- let i = id.as_usize();
- StateMut {
- id,
- stride2: self.stride2,
- transitions: &mut self.table_mut()[i..i + alphabet_len],
- }
+ fn truncate(&mut self, len: usize) {
+ self.table.truncate(len << self.stride2);
}
}
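A hedged sketch of what `remap` above does, written against a flat table of plain integers (a hypothetical helper, not the crate's API):

    fn remap_row(
        table: &mut [usize],
        id: usize, // premultiplied, so it doubles as the offset of its own row
        alphabet_len: usize,
        map: impl Fn(usize) -> usize,
    ) {
        for byte in 0..alphabet_len {
            table[id + byte] = map(table[id + byte]);
        }
    }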
@@ -3086,9 +3551,9 @@ impl<T: AsRef<[u32]>> TransitionTable<T> {
}
dst = &mut dst[..nwrite];
- // write state count
+ // write state length
// Unwrap is OK since number of states is guaranteed to fit in a u32.
- E::write_u32(u32::try_from(self.count()).unwrap(), dst);
+ E::write_u32(u32::try_from(self.len()).unwrap(), dst);
dst = &mut dst[size_of::<u32>()..];
// write state stride (as power of 2)
@@ -3102,7 +3567,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> {
// write actual transitions
for &sid in self.table() {
- let n = bytes::write_state_id::<E>(sid, &mut dst);
+ let n = wire::write_state_id::<E>(sid, &mut dst);
dst = &mut dst[n..];
}
Ok(nwrite)
@@ -3111,7 +3576,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> {
/// Returns the number of bytes the serialized form of this transition
/// table will use.
fn write_to_len(&self) -> usize {
- size_of::<u32>() // state count
+ size_of::<u32>() // state length
+ size_of::<u32>() // stride2
+ self.classes.write_to_len()
+ (self.table().len() * StateID::SIZE)
@@ -3121,8 +3586,25 @@ impl<T: AsRef<[u32]>> TransitionTable<T> {
///
/// That is, every state ID can be used to correctly index a state in this
/// table.
- fn validate(&self) -> Result<(), DeserializeError> {
+ fn validate(&self, sp: &Special) -> Result<(), DeserializeError> {
for state in self.states() {
+ // We check that the ID itself is well formed. That is, if it's
+ // a special state then it must actually be a quit, dead, accel,
+ // match or start state.
+ if sp.is_special_state(state.id()) {
+ let is_actually_special = sp.is_dead_state(state.id())
+ || sp.is_quit_state(state.id())
+ || sp.is_match_state(state.id())
+ || sp.is_start_state(state.id())
+ || sp.is_accel_state(state.id());
+ if !is_actually_special {
+ // This is kind of a cryptic error message...
+ return Err(DeserializeError::generic(
+ "found dense state tagged as special but \
+ wasn't actually special",
+ ));
+ }
+ }
for (_, to) in state.transitions() {
if !self.is_valid(to) {
return Err(DeserializeError::generic(
@@ -3145,7 +3627,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> {
/// Converts this transition table to an owned value.
#[cfg(feature = "alloc")]
- fn to_owned(&self) -> TransitionTable<Vec<u32>> {
+ fn to_owned(&self) -> TransitionTable<alloc::vec::Vec<u32>> {
TransitionTable {
table: self.table.as_ref().to_vec(),
classes: self.classes.clone(),
@@ -3179,7 +3661,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> {
}
/// Convert a state identifier to an index to a state (in the range
- /// 0..self.count()).
+ /// 0..self.len()).
///
/// This is useful when using a `Vec<T>` as an efficient map keyed by state
/// to some other information (such as a remapped state ID).
@@ -3190,7 +3672,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> {
id.as_usize() >> self.stride2
}
- /// Convert an index to a state (in the range 0..self.count()) to an actual
+ /// Convert an index to a state (in the range 0..self.len()) to an actual
/// state identifier.
///
/// This is useful when using a `Vec<T>` as an efficient map keyed by state
@@ -3198,7 +3680,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> {
///
/// If the given index is not in the specified range, then this may panic
/// or produce an incorrect state ID.
- fn from_index(&self, index: usize) -> StateID {
+ fn to_state_id(&self, index: usize) -> StateID {
// CORRECTNESS: If the given index is not valid, then it is not
// required for this to panic or return a valid state ID.
StateID::new_unchecked(index << self.stride2)
@@ -3209,30 +3691,22 @@ impl<T: AsRef<[u32]>> TransitionTable<T> {
/// This does not check whether the state ID returned is invalid. In fact,
/// if the state ID given is the last state in this DFA, then the state ID
/// returned is guaranteed to be invalid.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
fn next_state_id(&self, id: StateID) -> StateID {
- self.from_index(self.to_index(id).checked_add(1).unwrap())
+ self.to_state_id(self.to_index(id).checked_add(1).unwrap())
}
/// Returns the state ID for the state immediately preceding the one given.
///
    /// If the dead state ID (which is zero) is given, then this panics.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
fn prev_state_id(&self, id: StateID) -> StateID {
- self.from_index(self.to_index(id).checked_sub(1).unwrap())
+ self.to_state_id(self.to_index(id).checked_sub(1).unwrap())
}
/// Returns the table as a slice of state IDs.
fn table(&self) -> &[StateID] {
- let integers = self.table.as_ref();
- // SAFETY: This is safe because StateID is guaranteed to be
- // representable as a u32.
- unsafe {
- core::slice::from_raw_parts(
- integers.as_ptr() as *const StateID,
- integers.len(),
- )
- }
+ wire::u32s_to_state_ids(self.table.as_ref())
}
/// Returns the total number of states in this transition table.
@@ -3241,7 +3715,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> {
/// states. In particular, the dead state always has ID 0 and is
/// correspondingly always the first state. The dead state is never a match
/// state.
- fn count(&self) -> usize {
+ fn len(&self) -> usize {
self.table().len() >> self.stride2
}
@@ -3277,19 +3751,11 @@ impl<T: AsRef<[u32]>> TransitionTable<T> {
}
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u32]>> TransitionTable<T> {
/// Returns the table as a slice of state IDs.
fn table_mut(&mut self) -> &mut [StateID] {
- let integers = self.table.as_mut();
- // SAFETY: This is safe because StateID is guaranteed to be
- // representable as a u32.
- unsafe {
- core::slice::from_raw_parts_mut(
- integers.as_mut_ptr() as *mut StateID,
- integers.len(),
- )
- }
+ wire::u32s_to_state_ids_mut(self.table.as_mut())
}
}
@@ -3330,10 +3796,10 @@ impl<T: AsMut<[u32]>> TransitionTable<T> {
///
/// 1. If the search starts at the beginning of `context`, then the `Text`
/// start state is used. (Since `^` corresponds to
-/// `hir::Anchor::StartText`.)
+/// `hir::Anchor::Start`.)
/// 2. If the search starts at a position immediately following a line
/// terminator, then the `Line` start state is used. (Since `(?m:^)`
-/// corresponds to `hir::Anchor::StartLine`.)
+/// corresponds to `hir::Anchor::StartLF`.)
/// 3. If the search starts at a position immediately following a byte
/// classified as a "word" character (`[_0-9a-zA-Z]`), then the `WordByte`
/// start state is used. (Since `(?-u:\b)` corresponds to a word boundary.)
@@ -3372,23 +3838,41 @@ pub(crate) struct StartTable<T> {
///
/// In practice, T is either `Vec<u32>` or `&[u32]`.
///
- /// The first `stride` (currently always 4) entries always correspond to
- /// the start states for the entire DFA. After that, there are
- /// `stride * patterns` state IDs, where `patterns` may be zero in the
- /// case of a DFA with no patterns or in the case where the DFA was built
- /// without enabling starting states for each pattern.
+ /// The first `2 * stride` (currently always 8) entries always correspond
+    /// to the start states for the entire DFA, with the first 4 entries being
+ /// for unanchored searches and the second 4 entries being for anchored
+ /// searches. To keep things simple, we always use 8 entries even if the
+    /// `StartKind` is not `Both`.
+ ///
+ /// After that, there are `stride * patterns` state IDs, where `patterns`
+ /// may be zero in the case of a DFA with no patterns or in the case where
+ /// the DFA was built without enabling starting states for each pattern.
table: T,
+ /// The starting state configuration supported. When 'both', both
+ /// unanchored and anchored searches work. When 'unanchored', anchored
+ /// searches panic. When 'anchored', unanchored searches panic.
+ kind: StartKind,
+ /// The start state configuration for every possible byte.
+ start_map: StartByteMap,
/// The number of starting state IDs per pattern.
stride: usize,
/// The total number of patterns for which starting states are encoded.
- /// This may be zero for non-empty DFAs when the DFA was built without
- /// start states for each pattern. Thus, one cannot use this field to
- /// say how many patterns are in the DFA in all cases. It is specific to
- /// how many patterns are represented in this start table.
- patterns: usize,
+ /// This is `None` for DFAs that were built without start states for each
+ /// pattern. Thus, one cannot use this field to say how many patterns
+ /// are in the DFA in all cases. It is specific to how many patterns are
+ /// represented in this start table.
+ pattern_len: Option<usize>,
+ /// The universal starting state for unanchored searches. This is only
+ /// present when the DFA supports unanchored searches and when all starting
+ /// state IDs for an unanchored search are equivalent.
+ universal_start_unanchored: Option<StateID>,
+ /// The universal starting state for anchored searches. This is only
+ /// present when the DFA supports anchored searches and when all starting
+ /// state IDs for an anchored search are equivalent.
+ universal_start_anchored: Option<StateID>,
}
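Given the layout described in the comments above, locating any start state is simple block arithmetic. A sketch with `stride` = 4 (the current number of `Start` configurations); compare the `start` method further below:

    // Block 0: unanchored. Block 1: anchored. Block 2 + p: pattern p.
    fn start_offset(stride: usize, block: usize, start_index: usize) -> usize {
        block * stride + start_index
    }

    assert_eq!(start_offset(4, 0, 2), 2);      // unanchored, third Start type
    assert_eq!(start_offset(4, 1, 0), 4);      // anchored block begins at stride
    assert_eq!(start_offset(4, 2 + 1, 3), 15); // pattern 1, fourth Start type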
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl StartTable<Vec<u32>> {
/// Create a valid set of start states all pointing to the dead state.
///
@@ -3400,22 +3884,40 @@ impl StartTable<Vec<u32>> {
/// returns an error. In practice, this is unlikely to be able to occur,
/// since it's likely that allocation would have failed long before it got
/// to this point.
- fn dead(patterns: usize) -> Result<StartTable<Vec<u32>>, Error> {
- assert!(patterns <= PatternID::LIMIT);
- let stride = Start::count();
- let pattern_starts_len = match stride.checked_mul(patterns) {
- Some(x) => x,
- None => return Err(Error::too_many_start_states()),
- };
- let table_len = match stride.checked_add(pattern_starts_len) {
+ fn dead(
+ kind: StartKind,
+ lookm: &LookMatcher,
+ pattern_len: Option<usize>,
+ ) -> Result<StartTable<Vec<u32>>, BuildError> {
+ if let Some(len) = pattern_len {
+ assert!(len <= PatternID::LIMIT);
+ }
+ let stride = Start::len();
+ // OK because 2*4 is never going to overflow anything.
+ let starts_len = stride.checked_mul(2).unwrap();
+ let pattern_starts_len =
+ match stride.checked_mul(pattern_len.unwrap_or(0)) {
+ Some(x) => x,
+ None => return Err(BuildError::too_many_start_states()),
+ };
+ let table_len = match starts_len.checked_add(pattern_starts_len) {
Some(x) => x,
- None => return Err(Error::too_many_start_states()),
+ None => return Err(BuildError::too_many_start_states()),
};
- if table_len > core::isize::MAX as usize {
- return Err(Error::too_many_start_states());
+ if let Err(_) = isize::try_from(table_len) {
+ return Err(BuildError::too_many_start_states());
}
let table = vec![DEAD.as_u32(); table_len];
- Ok(StartTable { table, stride, patterns })
+ let start_map = StartByteMap::new(lookm);
+ Ok(StartTable {
+ table,
+ kind,
+ start_map,
+ stride,
+ pattern_len,
+ universal_start_unanchored: None,
+ universal_start_anchored: None,
+ })
}
}
@@ -3433,7 +3935,7 @@ impl<'a> StartTable<&'a [u32]> {
///
/// # Safety
///
- /// This routine is not safe because it does not check the valdity of the
+ /// This routine is not safe because it does not check the validity of the
/// starting state IDs themselves. In particular, the number of starting
/// IDs can be of variable length, so it's possible that checking their
/// validity cannot be done in constant time. An invalid starting state
@@ -3447,61 +3949,104 @@ impl<'a> StartTable<&'a [u32]> {
unsafe fn from_bytes_unchecked(
mut slice: &'a [u8],
) -> Result<(StartTable<&'a [u32]>, usize), DeserializeError> {
- let slice_start = slice.as_ptr() as usize;
+ let slice_start = slice.as_ptr().as_usize();
- let (stride, nr) =
- bytes::try_read_u32_as_usize(slice, "start table stride")?;
+ let (kind, nr) = StartKind::from_bytes(slice)?;
slice = &slice[nr..];
- let (patterns, nr) =
- bytes::try_read_u32_as_usize(slice, "start table patterns")?;
+ let (start_map, nr) = StartByteMap::from_bytes(slice)?;
slice = &slice[nr..];
- if stride != Start::count() {
+ let (stride, nr) =
+ wire::try_read_u32_as_usize(slice, "start table stride")?;
+ slice = &slice[nr..];
+ if stride != Start::len() {
return Err(DeserializeError::generic(
"invalid starting table stride",
));
}
- if patterns > PatternID::LIMIT {
+
+ let (maybe_pattern_len, nr) =
+ wire::try_read_u32_as_usize(slice, "start table patterns")?;
+ slice = &slice[nr..];
+ let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX {
+ None
+ } else {
+ Some(maybe_pattern_len)
+ };
+ if pattern_len.map_or(false, |len| len > PatternID::LIMIT) {
return Err(DeserializeError::generic(
"invalid number of patterns",
));
}
- let pattern_table_size =
- bytes::mul(stride, patterns, "invalid pattern count")?;
- // Our start states always start with a single stride of start states
- // for the entire automaton which permit it to match any pattern. What
- // follows it are an optional set of start states for each pattern.
- let start_state_count = bytes::add(
+
+ let (universal_unanchored, nr) =
+ wire::try_read_u32(slice, "universal unanchored start")?;
+ slice = &slice[nr..];
+ let universal_start_unanchored = if universal_unanchored == u32::MAX {
+ None
+ } else {
+ Some(StateID::try_from(universal_unanchored).map_err(|e| {
+ DeserializeError::state_id_error(
+ e,
+ "universal unanchored start",
+ )
+ })?)
+ };
+
+ let (universal_anchored, nr) =
+ wire::try_read_u32(slice, "universal anchored start")?;
+ slice = &slice[nr..];
+ let universal_start_anchored = if universal_anchored == u32::MAX {
+ None
+ } else {
+ Some(StateID::try_from(universal_anchored).map_err(|e| {
+ DeserializeError::state_id_error(e, "universal anchored start")
+ })?)
+ };
+
+ let pattern_table_size = wire::mul(
stride,
+ pattern_len.unwrap_or(0),
+ "invalid pattern length",
+ )?;
+        // Our start states always begin with two strides of start states for
+        // the entire automaton. The first stride is for unanchored starting
+        // states and the second stride is for anchored starting states. What
+        // follows is an optional set of start states for each pattern.
+ let start_state_len = wire::add(
+ wire::mul(2, stride, "start state stride too big")?,
pattern_table_size,
"invalid 'any' pattern starts size",
)?;
- let table_bytes_len = bytes::mul(
- start_state_count,
+ let table_bytes_len = wire::mul(
+ start_state_len,
StateID::SIZE,
"pattern table bytes length",
)?;
- bytes::check_slice_len(slice, table_bytes_len, "start ID table")?;
- bytes::check_alignment::<StateID>(slice)?;
+ wire::check_slice_len(slice, table_bytes_len, "start ID table")?;
+ wire::check_alignment::<StateID>(slice)?;
let table_bytes = &slice[..table_bytes_len];
slice = &slice[table_bytes_len..];
// SAFETY: Since StateID is always representable as a u32, all we need
// to do is ensure that we have the proper length and alignment. We've
// checked both above, so the cast below is safe.
//
- // N.B. This is the only not-safe code in this function, so we mark
- // it explicitly to call it out, even though it is technically
- // superfluous.
- #[allow(unused_unsafe)]
- let table = unsafe {
- core::slice::from_raw_parts(
- table_bytes.as_ptr() as *const u32,
- start_state_count,
- )
+ // N.B. This is the only not-safe code in this function.
+ let table = core::slice::from_raw_parts(
+ table_bytes.as_ptr().cast::<u32>(),
+ start_state_len,
+ );
+ let st = StartTable {
+ table,
+ kind,
+ start_map,
+ stride,
+ pattern_len,
+ universal_start_unanchored,
+ universal_start_anchored,
};
- let st = StartTable { table, stride, patterns };
- Ok((st, slice.as_ptr() as usize - slice_start))
+ Ok((st, slice.as_ptr().as_usize() - slice_start))
}
}
@@ -3521,17 +4066,39 @@ impl<T: AsRef<[u32]>> StartTable<T> {
}
dst = &mut dst[..nwrite];
+ // write start kind
+ let nw = self.kind.write_to::<E>(dst)?;
+ dst = &mut dst[nw..];
+ // write start byte map
+ let nw = self.start_map.write_to(dst)?;
+ dst = &mut dst[nw..];
// write stride
// Unwrap is OK since the stride is always 4 (currently).
E::write_u32(u32::try_from(self.stride).unwrap(), dst);
dst = &mut dst[size_of::<u32>()..];
- // write pattern count
+ // write pattern length
// Unwrap is OK since number of patterns is guaranteed to fit in a u32.
- E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+ E::write_u32(
+ u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(),
+ dst,
+ );
+ dst = &mut dst[size_of::<u32>()..];
+ // write universal start unanchored state id, u32::MAX if absent
+ E::write_u32(
+ self.universal_start_unanchored
+ .map_or(u32::MAX, |sid| sid.as_u32()),
+ dst,
+ );
+ dst = &mut dst[size_of::<u32>()..];
+ // write universal start anchored state id, u32::MAX if absent
+ E::write_u32(
+ self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()),
+ dst,
+ );
dst = &mut dst[size_of::<u32>()..];
// write start IDs
for &sid in self.table() {
- let n = bytes::write_state_id::<E>(sid, &mut dst);
+ let n = wire::write_state_id::<E>(sid, &mut dst);
dst = &mut dst[n..];
}
Ok(nwrite)
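Both universal start states are written with `u32::MAX` as an in-band "absent" sentinel, matching the deserialization earlier. A round-trip sketch of that convention (state IDs are capped well below `u32::MAX`, which is what frees the sentinel value):

    fn encode(sid: Option<u32>) -> u32 {
        sid.unwrap_or(u32::MAX)
    }
    fn decode(bits: u32) -> Option<u32> {
        if bits == u32::MAX { None } else { Some(bits) }
    }

    assert_eq!(decode(encode(Some(42))), Some(42));
    assert_eq!(decode(encode(None)), None);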
@@ -3540,8 +4107,12 @@ impl<T: AsRef<[u32]>> StartTable<T> {
/// Returns the number of bytes the serialized form of this start ID table
/// will use.
fn write_to_len(&self) -> usize {
- size_of::<u32>() // stride
+ self.kind.write_to_len()
+ + self.start_map.write_to_len()
+ + size_of::<u32>() // stride
+ size_of::<u32>() // # patterns
+ + size_of::<u32>() // universal unanchored start
+ + size_of::<u32>() // universal anchored start
+ (self.table().len() * StateID::SIZE)
}
@@ -3553,6 +4124,16 @@ impl<T: AsRef<[u32]>> StartTable<T> {
&self,
tt: &TransitionTable<T>,
) -> Result<(), DeserializeError> {
+ if !self.universal_start_unanchored.map_or(true, |s| tt.is_valid(s)) {
+ return Err(DeserializeError::generic(
+ "found invalid universal unanchored starting state ID",
+ ));
+ }
+ if !self.universal_start_anchored.map_or(true, |s| tt.is_valid(s)) {
+ return Err(DeserializeError::generic(
+ "found invalid universal anchored starting state ID",
+ ));
+ }
for &id in self.table() {
if !tt.is_valid(id) {
return Err(DeserializeError::generic(
@@ -3567,38 +4148,72 @@ impl<T: AsRef<[u32]>> StartTable<T> {
fn as_ref(&self) -> StartTable<&'_ [u32]> {
StartTable {
table: self.table.as_ref(),
+ kind: self.kind,
+ start_map: self.start_map.clone(),
stride: self.stride,
- patterns: self.patterns,
+ pattern_len: self.pattern_len,
+ universal_start_unanchored: self.universal_start_unanchored,
+ universal_start_anchored: self.universal_start_anchored,
}
}
/// Converts this start list to an owned value.
#[cfg(feature = "alloc")]
- fn to_owned(&self) -> StartTable<Vec<u32>> {
+ fn to_owned(&self) -> StartTable<alloc::vec::Vec<u32>> {
StartTable {
table: self.table.as_ref().to_vec(),
+ kind: self.kind,
+ start_map: self.start_map.clone(),
stride: self.stride,
- patterns: self.patterns,
+ pattern_len: self.pattern_len,
+ universal_start_unanchored: self.universal_start_unanchored,
+ universal_start_anchored: self.universal_start_anchored,
}
}
- /// Return the start state for the given start index and pattern ID. If the
- /// pattern ID is None, then the corresponding start state for the entire
- /// DFA is returned. If the pattern ID is not None, then the corresponding
- /// starting state for the given pattern is returned. If this start table
- /// does not have individual starting states for each pattern, then this
- /// panics.
- fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID {
- let start_index = index.as_usize();
- let index = match pattern_id {
- None => start_index,
- Some(pid) => {
- let pid = pid.as_usize();
- assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
- self.stride + (self.stride * pid) + start_index
+ /// Return the start state for the given input and starting configuration.
+ /// This returns an error if the input configuration is not supported by
+ /// this DFA. For example, requesting an unanchored search when the DFA was
+ /// not built with unanchored starting states. Or asking for an anchored
+ /// pattern search with an invalid pattern ID or on a DFA that was not
+ /// built with start states for each pattern.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn start(
+ &self,
+ input: &Input<'_>,
+ start: Start,
+ ) -> Result<StateID, MatchError> {
+ let start_index = start.as_usize();
+ let mode = input.get_anchored();
+ let index = match mode {
+ Anchored::No => {
+ if !self.kind.has_unanchored() {
+ return Err(MatchError::unsupported_anchored(mode));
+ }
+ start_index
+ }
+ Anchored::Yes => {
+ if !self.kind.has_anchored() {
+ return Err(MatchError::unsupported_anchored(mode));
+ }
+ self.stride + start_index
+ }
+ Anchored::Pattern(pid) => {
+ let len = match self.pattern_len {
+ None => {
+ return Err(MatchError::unsupported_anchored(mode))
+ }
+ Some(len) => len,
+ };
+ if pid.as_usize() >= len {
+ return Ok(DEAD);
+ }
+ (2 * self.stride)
+ + (self.stride * pid.as_usize())
+ + start_index
}
};
- self.table()[index]
+ Ok(self.table()[index])
}
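The error path above is observable through the public search APIs: requesting a start configuration the DFA was not built for yields a `MatchError` rather than a panic. A hedged usage sketch, assuming the `syntax` and `dfa-build` features are enabled:

    use regex_automata::{
        dfa::{dense::DFA, Automaton},
        Anchored, Input, PatternID,
    };

    let dfa = DFA::new(r"[a-z]+").unwrap();
    // Start states for each pattern were not compiled in, so an
    // anchored-pattern search reports an error instead of panicking.
    let input = Input::new("abc").anchored(Anchored::Pattern(PatternID::ZERO));
    assert!(dfa.try_search_fwd(&input).is_err());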
/// Returns an iterator over all start state IDs in this table.
@@ -3611,15 +4226,7 @@ impl<T: AsRef<[u32]>> StartTable<T> {
/// Returns the table as a slice of state IDs.
fn table(&self) -> &[StateID] {
- let integers = self.table.as_ref();
- // SAFETY: This is safe because StateID is guaranteed to be
- // representable as a u32.
- unsafe {
- core::slice::from_raw_parts(
- integers.as_ptr() as *const StateID,
- integers.len(),
- )
- }
+ wire::u32s_to_state_ids(self.table.as_ref())
}
/// Return the memory usage, in bytes, of this start list.
@@ -3630,62 +4237,56 @@ impl<T: AsRef<[u32]>> StartTable<T> {
}
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u32]>> StartTable<T> {
/// Set the start state for the given index and pattern.
///
/// If the pattern ID or state ID are not valid, then this will panic.
- fn set_start(
- &mut self,
- index: Start,
- pattern_id: Option<PatternID>,
- id: StateID,
- ) {
- let start_index = index.as_usize();
- let index = match pattern_id {
- None => start_index,
- Some(pid) => self
- .stride
- .checked_mul(pid.as_usize())
- .unwrap()
- .checked_add(self.stride)
- .unwrap()
- .checked_add(start_index)
- .unwrap(),
+ fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) {
+ let start_index = start.as_usize();
+ let index = match anchored {
+ Anchored::No => start_index,
+ Anchored::Yes => self.stride + start_index,
+ Anchored::Pattern(pid) => {
+ let pid = pid.as_usize();
+ let len = self
+ .pattern_len
+ .expect("start states for each pattern enabled");
+ assert!(pid < len, "invalid pattern ID {:?}", pid);
+ self.stride
+ .checked_mul(pid)
+ .unwrap()
+ .checked_add(self.stride.checked_mul(2).unwrap())
+ .unwrap()
+ .checked_add(start_index)
+ .unwrap()
+ }
};
self.table_mut()[index] = id;
}
/// Returns the table as a mutable slice of state IDs.
fn table_mut(&mut self) -> &mut [StateID] {
- let integers = self.table.as_mut();
- // SAFETY: This is safe because StateID is guaranteed to be
- // representable as a u32.
- unsafe {
- core::slice::from_raw_parts_mut(
- integers.as_mut_ptr() as *mut StateID,
- integers.len(),
- )
- }
+ wire::u32s_to_state_ids_mut(self.table.as_mut())
}
}
/// An iterator over start state IDs.
///
-/// This iterator yields a triple of start state ID, the start state type
-/// and the pattern ID (if any). The pattern ID is None for start states
-/// corresponding to the entire DFA and non-None for start states corresponding
-/// to a specific pattern. The latter only occurs when the DFA is compiled with
-/// start states for each pattern.
+/// This iterator yields a triple of start state ID, the anchored mode and the
+/// start state type. If a pattern ID is relevant, then the anchored mode will
+/// contain it. Start states with an anchored mode containing a pattern ID will
+/// only occur when the DFA was compiled with start states for each pattern
+/// (which is disabled by default).
pub(crate) struct StartStateIter<'a> {
st: StartTable<&'a [u32]>,
i: usize,
}
impl<'a> Iterator for StartStateIter<'a> {
- type Item = (StateID, Start, Option<PatternID>);
+ type Item = (StateID, Anchored, Start);
- fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> {
+ fn next(&mut self) -> Option<(StateID, Anchored, Start)> {
let i = self.i;
let table = self.st.table();
if i >= table.len() {
@@ -3696,14 +4297,15 @@ impl<'a> Iterator for StartStateIter<'a> {
// This unwrap is okay since the stride of the starting state table
// must always match the number of start state types.
let start_type = Start::from_usize(i % self.st.stride).unwrap();
- let pid = if i < self.st.stride {
- None
+ let anchored = if i < self.st.stride {
+ Anchored::No
+ } else if i < (2 * self.st.stride) {
+ Anchored::Yes
} else {
- Some(
- PatternID::new((i - self.st.stride) / self.st.stride).unwrap(),
- )
+ let pid = (i - (2 * self.st.stride)) / self.st.stride;
+ Anchored::Pattern(PatternID::new(pid).unwrap())
};
- Some((table[i], start_type, pid))
+ Some((table[i], anchored, start_type))
}
}
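A compact restatement of the index decoding performed by the iterator above, sketched over plain integers (`stride` is the number of `Start` types):

    enum Mode { Unanchored, Anchored, Pattern(usize) }

    fn decode(i: usize, stride: usize) -> (Mode, usize) {
        let start_type = i % stride;
        let mode = if i < stride {
            Mode::Unanchored
        } else if i < 2 * stride {
            Mode::Anchored
        } else {
            Mode::Pattern((i - 2 * stride) / stride)
        };
        (mode, start_type)
    }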
@@ -3735,105 +4337,93 @@ struct MatchStates<T> {
/// In practice, T is either Vec<u32> or &[u32].
pattern_ids: T,
/// The total number of unique patterns represented by these match states.
- patterns: usize,
+ pattern_len: usize,
}
impl<'a> MatchStates<&'a [u32]> {
unsafe fn from_bytes_unchecked(
mut slice: &'a [u8],
) -> Result<(MatchStates<&'a [u32]>, usize), DeserializeError> {
- let slice_start = slice.as_ptr() as usize;
+ let slice_start = slice.as_ptr().as_usize();
// Read the total number of match states.
- let (count, nr) =
- bytes::try_read_u32_as_usize(slice, "match state count")?;
+ let (state_len, nr) =
+ wire::try_read_u32_as_usize(slice, "match state length")?;
slice = &slice[nr..];
// Read the slice start/length pairs.
- let pair_count = bytes::mul(2, count, "match state offset pairs")?;
- let slices_bytes_len = bytes::mul(
- pair_count,
+ let pair_len = wire::mul(2, state_len, "match state offset pairs")?;
+ let slices_bytes_len = wire::mul(
+ pair_len,
PatternID::SIZE,
"match state slice offset byte length",
)?;
- bytes::check_slice_len(slice, slices_bytes_len, "match state slices")?;
- bytes::check_alignment::<PatternID>(slice)?;
+ wire::check_slice_len(slice, slices_bytes_len, "match state slices")?;
+ wire::check_alignment::<PatternID>(slice)?;
let slices_bytes = &slice[..slices_bytes_len];
slice = &slice[slices_bytes_len..];
// SAFETY: Since PatternID is always representable as a u32, all we
// need to do is ensure that we have the proper length and alignment.
// We've checked both above, so the cast below is safe.
//
- // N.B. This is one of the few not-safe snippets in this function, so
- // we mark it explicitly to call it out, even though it is technically
- // superfluous.
- #[allow(unused_unsafe)]
- let slices = unsafe {
- core::slice::from_raw_parts(
- slices_bytes.as_ptr() as *const u32,
- pair_count,
- )
- };
+ // N.B. This is one of the few not-safe snippets in this function,
+ // so we mark it explicitly to call it out.
+ let slices = core::slice::from_raw_parts(
+ slices_bytes.as_ptr().cast::<u32>(),
+ pair_len,
+ );
// Read the total number of unique pattern IDs (which is always 1 more
// than the maximum pattern ID in this automaton, since pattern IDs are
// handed out contiguously starting at 0).
- let (patterns, nr) =
- bytes::try_read_u32_as_usize(slice, "pattern count")?;
+ let (pattern_len, nr) =
+ wire::try_read_u32_as_usize(slice, "pattern length")?;
slice = &slice[nr..];
- // Now read the pattern ID count. We don't need to store this
+ // Now read the pattern ID length. We don't need to store this
// explicitly, but we need it to know how many pattern IDs to read.
- let (idcount, nr) =
- bytes::try_read_u32_as_usize(slice, "pattern ID count")?;
+ let (idlen, nr) =
+ wire::try_read_u32_as_usize(slice, "pattern ID length")?;
slice = &slice[nr..];
// Read the actual pattern IDs.
let pattern_ids_len =
- bytes::mul(idcount, PatternID::SIZE, "pattern ID byte length")?;
- bytes::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?;
- bytes::check_alignment::<PatternID>(slice)?;
+ wire::mul(idlen, PatternID::SIZE, "pattern ID byte length")?;
+ wire::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?;
+ wire::check_alignment::<PatternID>(slice)?;
let pattern_ids_bytes = &slice[..pattern_ids_len];
slice = &slice[pattern_ids_len..];
// SAFETY: Since PatternID is always representable as a u32, all we
// need to do is ensure that we have the proper length and alignment.
// We've checked both above, so the cast below is safe.
//
- // N.B. This is one of the few not-safe snippets in this function, so
- // we mark it explicitly to call it out, even though it is technically
- // superfluous.
- #[allow(unused_unsafe)]
- let pattern_ids = unsafe {
- core::slice::from_raw_parts(
- pattern_ids_bytes.as_ptr() as *const u32,
- idcount,
- )
- };
+ // N.B. This is one of the few not-safe snippets in this function,
+ // so we mark it explicitly to call it out.
+ let pattern_ids = core::slice::from_raw_parts(
+ pattern_ids_bytes.as_ptr().cast::<u32>(),
+ idlen,
+ );
- let ms = MatchStates { slices, pattern_ids, patterns };
- Ok((ms, slice.as_ptr() as usize - slice_start))
+ let ms = MatchStates { slices, pattern_ids, pattern_len };
+ Ok((ms, slice.as_ptr().as_usize() - slice_start))
}
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl MatchStates<Vec<u32>> {
- fn empty(pattern_count: usize) -> MatchStates<Vec<u32>> {
- assert!(pattern_count <= PatternID::LIMIT);
- MatchStates {
- slices: vec![],
- pattern_ids: vec![],
- patterns: pattern_count,
- }
+ fn empty(pattern_len: usize) -> MatchStates<Vec<u32>> {
+ assert!(pattern_len <= PatternID::LIMIT);
+ MatchStates { slices: vec![], pattern_ids: vec![], pattern_len }
}
fn new(
matches: &BTreeMap<StateID, Vec<PatternID>>,
- pattern_count: usize,
- ) -> Result<MatchStates<Vec<u32>>, Error> {
- let mut m = MatchStates::empty(pattern_count);
+ pattern_len: usize,
+ ) -> Result<MatchStates<Vec<u32>>, BuildError> {
+ let mut m = MatchStates::empty(pattern_len);
for (_, pids) in matches.iter() {
let start = PatternID::new(m.pattern_ids.len())
- .map_err(|_| Error::too_many_match_pattern_ids())?;
+ .map_err(|_| BuildError::too_many_match_pattern_ids())?;
m.slices.push(start.as_u32());
// This is always correct since the number of patterns in a single
// match state can never exceed maximum number of allowable
@@ -3846,15 +4436,15 @@ impl MatchStates<Vec<u32>> {
m.pattern_ids.push(pid.as_u32());
}
}
- m.patterns = pattern_count;
+ m.pattern_len = pattern_len;
Ok(m)
}
fn new_with_map(
&self,
matches: &BTreeMap<StateID, Vec<PatternID>>,
- ) -> Result<MatchStates<Vec<u32>>, Error> {
- MatchStates::new(matches, self.patterns)
+ ) -> Result<MatchStates<Vec<u32>>, BuildError> {
+ MatchStates::new(matches, self.pattern_len)
}
}
@@ -3872,23 +4462,23 @@ impl<T: AsRef<[u32]>> MatchStates<T> {
}
dst = &mut dst[..nwrite];
- // write state ID count
+ // write state ID length
// Unwrap is OK since number of states is guaranteed to fit in a u32.
- E::write_u32(u32::try_from(self.count()).unwrap(), dst);
+ E::write_u32(u32::try_from(self.len()).unwrap(), dst);
dst = &mut dst[size_of::<u32>()..];
// write slice offset pairs
for &pid in self.slices() {
- let n = bytes::write_pattern_id::<E>(pid, &mut dst);
+ let n = wire::write_pattern_id::<E>(pid, &mut dst);
dst = &mut dst[n..];
}
- // write unique pattern ID count
+ // write unique pattern ID length
// Unwrap is OK since number of patterns is guaranteed to fit in a u32.
- E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+ E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst);
dst = &mut dst[size_of::<u32>()..];
- // write pattern ID count
+ // write pattern ID length
// Unwrap is OK since we check at construction (and deserialization)
// that the number of patterns is representable as a u32.
E::write_u32(u32::try_from(self.pattern_ids().len()).unwrap(), dst);
@@ -3896,32 +4486,32 @@ impl<T: AsRef<[u32]>> MatchStates<T> {
// write pattern IDs
for &pid in self.pattern_ids() {
- let n = bytes::write_pattern_id::<E>(pid, &mut dst);
+ let n = wire::write_pattern_id::<E>(pid, &mut dst);
dst = &mut dst[n..];
}
Ok(nwrite)
}
- /// Returns the number of bytes the serialized form of this transition
- /// table will use.
+ /// Returns the number of bytes the serialized form of these match states
+ /// will use.
fn write_to_len(&self) -> usize {
- size_of::<u32>() // match state count
+ size_of::<u32>() // match state length
+ (self.slices().len() * PatternID::SIZE)
- + size_of::<u32>() // unique pattern ID count
- + size_of::<u32>() // pattern ID count
+ + size_of::<u32>() // unique pattern ID length
+ + size_of::<u32>() // pattern ID length
+ (self.pattern_ids().len() * PatternID::SIZE)
}
    /// Validates that the match state info is itself internally consistent and
/// consistent with the recorded match state region in the given DFA.
fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> {
- if self.count() != dfa.special.match_len(dfa.stride()) {
+ if self.len() != dfa.special.match_len(dfa.stride()) {
return Err(DeserializeError::generic(
- "match state count mismatch",
+ "match state length mismatch",
));
}
- for si in 0..self.count() {
+ for si in 0..self.len() {
let start = self.slices()[si * 2].as_usize();
let len = self.slices()[si * 2 + 1].as_usize();
if start >= self.pattern_ids().len() {
@@ -3936,7 +4526,7 @@ impl<T: AsRef<[u32]>> MatchStates<T> {
}
for mi in 0..len {
let pid = self.pattern_id(si, mi);
- if pid.as_usize() >= self.patterns {
+ if pid.as_usize() >= self.pattern_len {
return Err(DeserializeError::generic(
"invalid pattern ID",
));
@@ -3956,10 +4546,10 @@ impl<T: AsRef<[u32]>> MatchStates<T> {
/// }
///
/// Once shuffling is done, use MatchStates::new to convert back.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
fn to_map(&self, dfa: &DFA<T>) -> BTreeMap<StateID, Vec<PatternID>> {
let mut map = BTreeMap::new();
- for i in 0..self.count() {
+ for i in 0..self.len() {
let mut pids = vec![];
for j in 0..self.pattern_len(i) {
pids.push(self.pattern_id(i, j));
@@ -3974,17 +4564,17 @@ impl<T: AsRef<[u32]>> MatchStates<T> {
MatchStates {
slices: self.slices.as_ref(),
pattern_ids: self.pattern_ids.as_ref(),
- patterns: self.patterns,
+ pattern_len: self.pattern_len,
}
}
/// Converts these match states to an owned value.
#[cfg(feature = "alloc")]
- fn to_owned(&self) -> MatchStates<Vec<u32>> {
+ fn to_owned(&self) -> MatchStates<alloc::vec::Vec<u32>> {
MatchStates {
slices: self.slices.as_ref().to_vec(),
pattern_ids: self.pattern_ids.as_ref().to_vec(),
- patterns: self.patterns,
+ pattern_len: self.pattern_len,
}
}
@@ -4015,6 +4605,7 @@ impl<T: AsRef<[u32]>> MatchStates<T> {
///
/// The match index is the index of the pattern ID for the given state.
/// The index must be less than `self.pattern_len(state_index)`.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn pattern_id(&self, state_index: usize, match_index: usize) -> PatternID {
self.pattern_id_slice(state_index)[match_index]
}
@@ -4023,6 +4614,7 @@ impl<T: AsRef<[u32]>> MatchStates<T> {
///
/// The match state index is the state index minus the state index of the
/// first match state in the DFA.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn pattern_len(&self, state_index: usize) -> usize {
self.slices()[state_index * 2 + 1].as_usize()
}
@@ -4031,6 +4623,7 @@ impl<T: AsRef<[u32]>> MatchStates<T> {
///
/// The match state index is the state index minus the state index of the
/// first match state in the DFA.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn pattern_id_slice(&self, state_index: usize) -> &[PatternID] {
let start = self.slices()[state_index * 2].as_usize();
let len = self.pattern_len(state_index);
@@ -4038,35 +4631,22 @@ impl<T: AsRef<[u32]>> MatchStates<T> {
}
/// Returns the pattern ID offset slice of u32 as a slice of PatternID.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn slices(&self) -> &[PatternID] {
- let integers = self.slices.as_ref();
- // SAFETY: This is safe because PatternID is guaranteed to be
- // representable as a u32.
- unsafe {
- core::slice::from_raw_parts(
- integers.as_ptr() as *const PatternID,
- integers.len(),
- )
- }
+ wire::u32s_to_pattern_ids(self.slices.as_ref())
}
/// Returns the total number of match states.
- fn count(&self) -> usize {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn len(&self) -> usize {
assert_eq!(0, self.slices().len() % 2);
self.slices().len() / 2
}
/// Returns the pattern ID slice of u32 as a slice of PatternID.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn pattern_ids(&self) -> &[PatternID] {
- let integers = self.pattern_ids.as_ref();
- // SAFETY: This is safe because PatternID is guaranteed to be
- // representable as a u32.
- unsafe {
- core::slice::from_raw_parts(
- integers.as_ptr() as *const PatternID,
- integers.len(),
- )
- }
+ wire::u32s_to_pattern_ids(self.pattern_ids.as_ref())
}
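The `slices` vector holds one `(start, len)` pair per match state, indexing into the flat `pattern_ids` storage. A hedged sketch of the lookup that `pattern_id_slice` above performs:

    // slices = [start0, len0, start1, len1, ...]
    fn pattern_id_slice<'a>(
        slices: &[u32],
        pattern_ids: &'a [u32],
        state_index: usize,
    ) -> &'a [u32] {
        let start = slices[state_index * 2] as usize;
        let len = slices[state_index * 2 + 1] as usize;
        &pattern_ids[start..start + len]
    }

    let (slices, pids) = ([0, 2, 2, 1], [0, 3, 1]);
    assert_eq!(pattern_id_slice(&slices, &pids, 1), &[1]);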
/// Return the memory usage, in bytes, of these match pairs.
@@ -4075,6 +4655,86 @@ impl<T: AsRef<[u32]>> MatchStates<T> {
}
}
+/// A common set of flags for both dense and sparse DFAs. This primarily
+/// centralizes the serialization format of these flags as a bitset.
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct Flags {
+ /// Whether the DFA can match the empty string. When this is false, all
+ /// matches returned by this DFA are guaranteed to have non-zero length.
+ pub(crate) has_empty: bool,
+ /// Whether the DFA should only produce matches with spans that correspond
+ /// to valid UTF-8. This also includes omitting any zero-width matches that
+ /// split the UTF-8 encoding of a codepoint.
+ pub(crate) is_utf8: bool,
+ /// Whether the DFA is always anchored or not, regardless of `Input`
+ /// configuration. This is useful for avoiding a reverse scan even when
+ /// executing unanchored searches.
+ pub(crate) is_always_start_anchored: bool,
+}
+
+impl Flags {
+ /// Creates a set of flags for a DFA from an NFA.
+ ///
+ /// N.B. This constructor was defined at the time of writing because all
+ /// of the flags are derived directly from the NFA. If this changes in the
+ /// future, we might be more thoughtful about how the `Flags` value is
+ /// itself built.
+ #[cfg(feature = "dfa-build")]
+ fn from_nfa(nfa: &thompson::NFA) -> Flags {
+ Flags {
+ has_empty: nfa.has_empty(),
+ is_utf8: nfa.is_utf8(),
+ is_always_start_anchored: nfa.is_always_start_anchored(),
+ }
+ }
+
+ /// Deserializes the flags from the given slice. On success, this also
+ /// returns the number of bytes read from the slice.
+ pub(crate) fn from_bytes(
+ slice: &[u8],
+ ) -> Result<(Flags, usize), DeserializeError> {
+ let (bits, nread) = wire::try_read_u32(slice, "flag bitset")?;
+ let flags = Flags {
+ has_empty: bits & (1 << 0) != 0,
+ is_utf8: bits & (1 << 1) != 0,
+ is_always_start_anchored: bits & (1 << 2) != 0,
+ };
+ Ok((flags, nread))
+ }
+
+ /// Writes these flags to the given byte slice. If the buffer is too small,
+ /// then an error is returned. To determine how big the buffer must be,
+ /// use `write_to_len`.
+ pub(crate) fn write_to<E: Endian>(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ fn bool_to_int(b: bool) -> u32 {
+ if b {
+ 1
+ } else {
+ 0
+ }
+ }
+
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("flag bitset"));
+ }
+ let bits = (bool_to_int(self.has_empty) << 0)
+ | (bool_to_int(self.is_utf8) << 1)
+ | (bool_to_int(self.is_always_start_anchored) << 2);
+ E::write_u32(bits, dst);
+ Ok(nwrite)
+ }
+
+ /// Returns the number of bytes the serialized form of these flags
+ /// will use.
+ pub(crate) fn write_to_len(&self) -> usize {
+ size_of::<u32>()
+ }
+}
+
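The bitset above packs three booleans into the low bits of a `u32`. A standalone round-trip sketch of the same encoding, without the crate's `Endian` machinery:

    fn encode(has_empty: bool, is_utf8: bool, always_anchored: bool) -> u32 {
        (has_empty as u32) | (is_utf8 as u32) << 1 | (always_anchored as u32) << 2
    }
    fn decode(bits: u32) -> (bool, bool, bool) {
        (bits & 1 != 0, bits & 2 != 0, bits & 4 != 0)
    }

    assert_eq!(decode(encode(true, false, true)), (true, false, true));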
/// An iterator over all states in a DFA.
///
/// This iterator yields a tuple for each state. The first element of the
@@ -4093,7 +4753,7 @@ impl<'a, T: AsRef<[u32]>> Iterator for StateIter<'a, T> {
fn next(&mut self) -> Option<State<'a>> {
self.it.next().map(|(index, _)| {
- let id = self.tt.from_index(index);
+ let id = self.tt.to_state_id(index);
self.tt.state(id)
})
}
@@ -4146,7 +4806,7 @@ impl<'a> State<'a> {
/// Analyzes this state to determine whether it can be accelerated. If so,
/// it returns an accelerator that contains at least one byte.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
fn accelerate(&self, classes: &ByteClasses) -> Option<Accel> {
// We just try to add bytes to our accelerator. Once adding fails
// (because we've added too many bytes), then give up.
@@ -4173,66 +4833,25 @@ impl<'a> State<'a> {
impl<'a> fmt::Debug for State<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- for (i, (start, end, id)) in self.sparse_transitions().enumerate() {
- let index = if f.alternate() {
- id.as_usize()
+ for (i, (start, end, sid)) in self.sparse_transitions().enumerate() {
+ let id = if f.alternate() {
+ sid.as_usize()
} else {
- id.as_usize() >> self.stride2
+ sid.as_usize() >> self.stride2
};
if i > 0 {
write!(f, ", ")?;
}
if start == end {
- write!(f, "{:?} => {:?}", start, index)?;
+ write!(f, "{:?} => {:?}", start, id)?;
} else {
- write!(f, "{:?}-{:?} => {:?}", start, end, index)?;
+ write!(f, "{:?}-{:?} => {:?}", start, end, id)?;
}
}
Ok(())
}
}
-/// A mutable representation of a single DFA state.
-///
-/// `'a` correspondings to the lifetime of a DFA's transition table.
-#[cfg(feature = "alloc")]
-pub(crate) struct StateMut<'a> {
- id: StateID,
- stride2: usize,
- transitions: &'a mut [StateID],
-}
-
-#[cfg(feature = "alloc")]
-impl<'a> StateMut<'a> {
- /// Return an iterator over all transitions in this state. This yields
- /// a number of transitions equivalent to the alphabet length of the
- /// corresponding DFA.
- ///
- /// Each transition is represented by a tuple. The first element is the
- /// input byte for that transition and the second element is a mutable
- /// reference to the transition itself.
- pub(crate) fn iter_mut(&mut self) -> StateTransitionIterMut<'_> {
- StateTransitionIterMut {
- len: self.transitions.len(),
- it: self.transitions.iter_mut().enumerate(),
- }
- }
-}
-
-#[cfg(feature = "alloc")]
-impl<'a> fmt::Debug for StateMut<'a> {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- fmt::Debug::fmt(
- &State {
- id: self.id,
- stride2: self.stride2,
- transitions: self.transitions,
- },
- f,
- )
- }
-}
-
/// An iterator over all transitions in a single DFA state. This yields
/// a number of transitions equivalent to the alphabet length of the
/// corresponding DFA.
@@ -4262,36 +4881,6 @@ impl<'a> Iterator for StateTransitionIter<'a> {
}
}
-/// A mutable iterator over all transitions in a DFA state.
-///
-/// Each transition is represented by a tuple. The first element is the
-/// input byte for that transition and the second element is a mutable
-/// reference to the transition itself.
-#[cfg(feature = "alloc")]
-#[derive(Debug)]
-pub(crate) struct StateTransitionIterMut<'a> {
- len: usize,
- it: iter::Enumerate<slice::IterMut<'a, StateID>>,
-}
-
-#[cfg(feature = "alloc")]
-impl<'a> Iterator for StateTransitionIterMut<'a> {
- type Item = (alphabet::Unit, &'a mut StateID);
-
- fn next(&mut self) -> Option<(alphabet::Unit, &'a mut StateID)> {
- self.it.next().map(|(i, id)| {
- let unit = if i + 1 == self.len {
- alphabet::Unit::eoi(i)
- } else {
- let b = u8::try_from(i)
- .expect("raw byte alphabet is never exceeded");
- alphabet::Unit::u8(b)
- };
- (unit, id)
- })
- }
-}
-
/// An iterator over all non-DEAD transitions in a single DFA state using a
/// sparse representation.
///
@@ -4338,104 +4927,164 @@ impl<'a> Iterator for StateSparseTransitionIter<'a> {
}
}
-/// An iterator over pattern IDs for a single match state.
-#[derive(Debug)]
-pub(crate) struct PatternIDIter<'a>(slice::Iter<'a, PatternID>);
-
-impl<'a> Iterator for PatternIDIter<'a> {
- type Item = PatternID;
-
- fn next(&mut self) -> Option<PatternID> {
- self.0.next().copied()
- }
+/// An error that occurred during the construction of a DFA.
+///
+/// This error does not provide many introspection capabilities. There are
+/// generally only two things you can do with it:
+///
+/// * Obtain a human readable message via its `std::fmt::Display` impl.
+/// * Access an underlying [`nfa::thompson::BuildError`](thompson::BuildError)
+/// type from its `source` method via the `std::error::Error` trait. This error
+/// only occurs when using convenience routines for building a DFA directly
+/// from a pattern string.
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[cfg(feature = "dfa-build")]
+#[derive(Clone, Debug)]
+pub struct BuildError {
+ kind: BuildErrorKind,
}
-/// Remapper is an abstraction the manages the remapping of state IDs in a
-/// dense DFA. This is useful when one wants to shuffle states into different
-/// positions in the DFA.
-///
-/// One of the key complexities this manages is the ability to correctly move
-/// one state multiple times.
+/// The kind of error that occurred during the construction of a DFA.
///
-/// Once shuffling is complete, `remap` should be called, which will rewrite
-/// all pertinent transitions to updated state IDs.
-#[cfg(feature = "alloc")]
-#[derive(Debug)]
-struct Remapper {
- /// A map from the index of a state to its pre-multiplied identifier.
- ///
- /// When a state is swapped with another, then their corresponding
- /// locations in this map are also swapped. Thus, its new position will
- /// still point to its old pre-multiplied StateID.
- ///
- /// While there is a bit more to it, this then allows us to rewrite the
- /// state IDs in a DFA's transition table in a single pass. This is done
- /// by iterating over every ID in this map, then iterating over each
- /// transition for the state at that ID and re-mapping the transition from
- /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
- /// in this map where `old_id` *started*, and set it to where it ended up
- /// after all swaps have been completed.
- map: Vec<StateID>,
+/// Note that this error is non-exhaustive. Adding new variants is not
+/// considered a breaking change.
+#[cfg(feature = "dfa-build")]
+#[derive(Clone, Debug)]
+enum BuildErrorKind {
+ /// An error that occurred while constructing an NFA as a precursor step
+ /// before a DFA is compiled.
+ NFA(thompson::BuildError),
+ /// An error that occurred because an unsupported regex feature was used.
+ /// The message string describes which unsupported feature was used.
+ ///
+ /// The primary regex feature that is unsupported by DFAs is the Unicode
+ /// word boundary look-around assertion (`\b`). This can be worked around
+ /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling
+ /// Unicode word boundaries when building a DFA.
+ Unsupported(&'static str),
+ /// An error that occurs if too many states are produced while building a
+ /// DFA.
+ TooManyStates,
+ /// An error that occurs if too many start states are needed while building
+ /// a DFA.
+ ///
+ /// This is a kind of oddball error that occurs when building a DFA with
+ /// start states enabled for each pattern and enough patterns to cause
+ /// the table of start states to overflow `usize`.
+ TooManyStartStates,
+ /// This is another oddball error that can occur if there are too many
+ /// patterns spread out across too many match states.
+ TooManyMatchPatternIDs,
+ /// An error that occurs if the DFA got too big during determinization.
+ DFAExceededSizeLimit { limit: usize },
+ /// An error that occurs if auxiliary storage (not the DFA) used during
+ /// determinization got too big.
+ DeterminizeExceededSizeLimit { limit: usize },
}
-#[cfg(feature = "alloc")]
-impl Remapper {
- fn from_dfa(dfa: &OwnedDFA) -> Remapper {
- Remapper {
- map: (0..dfa.state_count()).map(|i| dfa.from_index(i)).collect(),
+#[cfg(feature = "dfa-build")]
+impl BuildError {
+ /// Return the kind of this error.
+ fn kind(&self) -> &BuildErrorKind {
+ &self.kind
+ }
+
+ pub(crate) fn nfa(err: thompson::BuildError) -> BuildError {
+ BuildError { kind: BuildErrorKind::NFA(err) }
+ }
+
+ pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError {
+ let msg = "cannot build DFAs for regexes with Unicode word \
+ boundaries; switch to ASCII word boundaries, or \
+ heuristically enable Unicode word boundaries or use a \
+ different regex engine";
+ BuildError { kind: BuildErrorKind::Unsupported(msg) }
+ }
+
+ pub(crate) fn too_many_states() -> BuildError {
+ BuildError { kind: BuildErrorKind::TooManyStates }
+ }
+
+ pub(crate) fn too_many_start_states() -> BuildError {
+ BuildError { kind: BuildErrorKind::TooManyStartStates }
+ }
+
+ pub(crate) fn too_many_match_pattern_ids() -> BuildError {
+ BuildError { kind: BuildErrorKind::TooManyMatchPatternIDs }
+ }
+
+ pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> BuildError {
+ BuildError { kind: BuildErrorKind::DFAExceededSizeLimit { limit } }
+ }
+
+ pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> BuildError {
+ BuildError {
+ kind: BuildErrorKind::DeterminizeExceededSizeLimit { limit },
}
}
+}
- fn swap(&mut self, dfa: &mut OwnedDFA, id1: StateID, id2: StateID) {
- dfa.swap_states(id1, id2);
- self.map.swap(dfa.to_index(id1), dfa.to_index(id2));
+#[cfg(all(feature = "std", feature = "dfa-build"))]
+impl std::error::Error for BuildError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ match self.kind() {
+ BuildErrorKind::NFA(ref err) => Some(err),
+ _ => None,
+ }
}
+}
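As the documentation above notes, there is little to do with a `BuildError` beyond displaying it and walking its `source` chain. A hedged usage sketch, assuming the `std`, `syntax` and `dfa-build` features:

    use std::error::Error as _;

    // Unicode word boundaries are unsupported by DFAs by default, so this
    // build is expected to fail with an Unsupported error.
    let err = regex_automata::dfa::dense::Builder::new()
        .build(r"\w+\b")
        .unwrap_err();
    eprintln!("build failed: {}", err);
    if let Some(src) = err.source() {
        eprintln!("caused by: {}", src);
    }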
- fn remap(mut self, dfa: &mut OwnedDFA) {
- // Update the map to account for states that have been swapped
- // multiple times. For example, if (A, C) and (C, G) are swapped, then
- // transitions previously pointing to A should now point to G. But if
- // we don't update our map, they will erroneously be set to C. All we
- // do is follow the swaps in our map until we see our original state
- // ID.
- let oldmap = self.map.clone();
- for i in 0..dfa.state_count() {
- let cur_id = dfa.from_index(i);
- let mut new = oldmap[i];
- if cur_id == new {
- continue;
+#[cfg(feature = "dfa-build")]
+impl core::fmt::Display for BuildError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match self.kind() {
+ BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
+ BuildErrorKind::Unsupported(ref msg) => {
+ write!(f, "unsupported regex feature for DFAs: {}", msg)
}
- loop {
- let id = oldmap[dfa.to_index(new)];
- if cur_id == id {
- self.map[i] = new;
- break;
- }
- new = id;
+ BuildErrorKind::TooManyStates => write!(
+ f,
+ "number of DFA states exceeds limit of {}",
+ StateID::LIMIT,
+ ),
+ BuildErrorKind::TooManyStartStates => {
+ let stride = Start::len();
+ // The start table has `stride` entries for starting states for
+ // the entire DFA, and then `stride` entries for each pattern
+ // if start states for each pattern are enabled (which is the
+ // only way this error can occur). Thus, the total number of
+ // patterns that can fit in the table is `stride` less than
+ // what we can allocate.
+ let max = usize::try_from(core::isize::MAX).unwrap();
+ let limit = (max - stride) / stride;
+ write!(
+ f,
+ "compiling DFA with start states exceeds pattern \
+ pattern limit of {}",
+ limit,
+ )
}
- }
-
- // To work around the borrow checker for converting state IDs to
- // indices. We cannot borrow self while mutably iterating over a
- // state's transitions. Otherwise, we'd just use dfa.to_index(..).
- let stride2 = dfa.stride2();
- let to_index = |id: StateID| -> usize { id.as_usize() >> stride2 };
-
- // Now that we've finished shuffling, we need to remap all of our
- // transitions. We don't need to handle re-mapping accelerated states
- // since `accels` is only populated after shuffling.
- for &id in self.map.iter() {
- for (_, next_id) in dfa.state_mut(id).iter_mut() {
- *next_id = self.map[to_index(*next_id)];
+ BuildErrorKind::TooManyMatchPatternIDs => write!(
+ f,
+ "compiling DFA with total patterns in all match states \
+ exceeds limit of {}",
+ PatternID::LIMIT,
+ ),
+ BuildErrorKind::DFAExceededSizeLimit { limit } => write!(
+ f,
+ "DFA exceeded size limit of {:?} during determinization",
+ limit,
+ ),
+ BuildErrorKind::DeterminizeExceededSizeLimit { limit } => {
+ write!(f, "determinization exceeded size limit of {:?}", limit)
}
}
- for start_id in dfa.st.table_mut().iter_mut() {
- *start_id = self.map[to_index(*start_id)];
- }
}
}
-#[cfg(all(test, feature = "alloc"))]
+#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
mod tests {
use super::*;
@@ -4451,7 +5100,7 @@ mod tests {
let (buf, _) = dfa.to_bytes_native_endian();
let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0;
- assert_eq!(None, dfa.find_leftmost_fwd(b"foo12345").unwrap());
+ assert_eq!(None, dfa.try_search_fwd(&Input::new("foo12345")).unwrap());
}
#[test]
@@ -4464,7 +5113,27 @@ mod tests {
assert_eq!(
Some(HalfMatch::must(0, 0)),
- dfa.find_leftmost_fwd(b"foo12345").unwrap()
+ dfa.try_search_fwd(&Input::new("foo12345")).unwrap()
);
}
+
+ // See the analogous test in src/hybrid/dfa.rs.
+ #[test]
+ fn heuristic_unicode_reverse() {
+ let dfa = DFA::builder()
+ .configure(DFA::config().unicode_word_boundary(true))
+ .thompson(thompson::Config::new().reverse(true))
+ .build(r"\b[0-9]+\b")
+ .unwrap();
+
+ let input = Input::new("β123").range(2..);
+ let expected = MatchError::quit(0xB2, 1);
+ let got = dfa.try_search_rev(&input);
+ assert_eq!(Err(expected), got);
+
+ let input = Input::new("123β").range(..3);
+ let expected = MatchError::quit(0xCE, 3);
+ let got = dfa.try_search_rev(&input);
+ assert_eq!(Err(expected), got);
+ }
}
diff --git a/vendor/regex-automata/src/dfa/determinize.rs b/vendor/regex-automata/src/dfa/determinize.rs
index 61603481b..19f99f5d6 100644
--- a/vendor/regex-automata/src/dfa/determinize.rs
+++ b/vendor/regex-automata/src/dfa/determinize.rs
@@ -1,18 +1,18 @@
-use alloc::{
- collections::BTreeMap,
- vec::{self, Vec},
-};
+use alloc::{collections::BTreeMap, vec::Vec};
use crate::{
- dfa::{dense, Error, DEAD},
+ dfa::{
+ dense::{self, BuildError},
+ DEAD,
+ },
nfa::thompson,
util::{
self,
alphabet::{self, ByteSet},
determinize::{State, StateBuilderEmpty, StateBuilderNFA},
- id::{PatternID, StateID},
- matchtypes::MatchKind,
- sparse_set::{SparseSet, SparseSets},
+ primitives::{PatternID, StateID},
+ search::{Anchored, MatchKind},
+ sparse_set::SparseSets,
start::Start,
},
};
@@ -20,7 +20,6 @@ use crate::{
/// A builder for configuring and running a DFA determinizer.
#[derive(Clone, Debug)]
pub(crate) struct Config {
- anchored: bool,
match_kind: MatchKind,
quit: ByteSet,
dfa_size_limit: Option<usize>,
@@ -32,7 +31,6 @@ impl Config {
/// configured before calling `run`.
pub fn new() -> Config {
Config {
- anchored: false,
match_kind: MatchKind::LeftmostFirst,
quit: ByteSet::empty(),
dfa_size_limit: None,
@@ -48,7 +46,7 @@ impl Config {
&self,
nfa: &thompson::NFA,
dfa: &mut dense::OwnedDFA,
- ) -> Result<(), Error> {
+ ) -> Result<(), BuildError> {
let dead = State::dead();
let quit = State::dead();
let mut cache = StateMap::default();
@@ -71,21 +69,13 @@ impl Config {
builder_states: alloc::vec![dead, quit],
cache,
memory_usage_state: 0,
- sparses: SparseSets::new(nfa.len()),
+ sparses: SparseSets::new(nfa.states().len()),
stack: alloc::vec![],
scratch_state_builder: StateBuilderEmpty::new(),
};
runner.run()
}
- /// Whether to build an anchored DFA or not. When disabled (the default),
- /// the unanchored prefix from the NFA is used to start the DFA. Otherwise,
- /// the anchored start state of the NFA is used to start the DFA.
- pub fn anchored(&mut self, yes: bool) -> &mut Config {
- self.anchored = yes;
- self
- }
-
/// The match semantics to use for determinization.
///
/// MatchKind::All corresponds to the standard textbook construction.
@@ -222,20 +212,21 @@ impl<'a> Runner<'a> {
/// Build the DFA. If there was a problem constructing the DFA (e.g., if
/// the chosen state identifier representation is too small), then an error
/// is returned.
- fn run(mut self) -> Result<(), Error> {
- if self.nfa.has_word_boundary_unicode()
+ fn run(mut self) -> Result<(), BuildError> {
+ if self.nfa.look_set_any().contains_word_unicode()
&& !self.config.quit.contains_range(0x80, 0xFF)
{
- return Err(Error::unsupported_dfa_word_boundary_unicode());
+ return Err(BuildError::unsupported_dfa_word_boundary_unicode());
}
// A sequence of "representative" bytes drawn from each equivalence
// class. These representative bytes are fed to the NFA to compute
// state transitions. This allows us to avoid re-computing state
// transitions for bytes that are guaranteed to produce identical
- // results.
+ // results. Since computing the representatives needs to do a little
+ // work, we do it once here because we'll be iterating over them a lot.
let representatives: Vec<alphabet::Unit> =
- self.dfa.byte_classes().representatives().collect();
+ self.dfa.byte_classes().representatives(..).collect();
// The set of all DFA state IDs that still need to have their
// transitions set. We start by seeding this with all starting states.
let mut uncompiled = alloc::vec![];
@@ -259,10 +250,13 @@ impl<'a> Runner<'a> {
}
}
}
- trace!(
- "determinization complete, memory usage: {}, dense DFA size: {}",
+ debug!(
+ "determinization complete, memory usage: {}, \
+ dense DFA size: {}, \
+ is reverse? {}",
self.memory_usage(),
self.dfa.memory_usage(),
+ self.nfa.is_reverse(),
);
// A map from DFA state ID to one or more NFA match IDs. Each NFA match
@@ -270,21 +264,23 @@ impl<'a> Runner<'a> {
// corresponding to the key.
let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
self.cache.clear();
- #[allow(unused_variables)]
- let mut total_pat_count = 0;
+ #[cfg(feature = "logging")]
+ let mut total_pat_len = 0;
for (i, state) in self.builder_states.into_iter().enumerate() {
if let Some(pat_ids) = state.match_pattern_ids() {
- let id = self.dfa.from_index(i);
- total_pat_count += pat_ids.len();
+ let id = self.dfa.to_state_id(i);
+ log! {
+ total_pat_len += pat_ids.len();
+ }
matches.insert(id, pat_ids);
}
}
log! {
use core::mem::size_of;
let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
- let pats = total_pat_count * size_of::<PatternID>();
+ let pats = total_pat_len * size_of::<PatternID>();
let mem = (matches.len() * per_elem) + pats;
- log::trace!("matches map built, memory usage: {}", mem);
+ log::debug!("matches map built, memory usage: {}", mem);
}
// At this point, we shuffle the "special" states in the final DFA.
// This permits a DFA's match loop to detect a match condition (among
@@ -306,7 +302,7 @@ impl<'a> Runner<'a> {
&mut self,
dfa_id: StateID,
unit: alphabet::Unit,
- ) -> Result<(StateID, bool), Error> {
+ ) -> Result<(StateID, bool), BuildError> {
// Compute the set of all reachable NFA states, including epsilons.
let empty_builder = self.get_state_builder();
let builder = util::determinize::next(
@@ -326,15 +322,32 @@ impl<'a> Runner<'a> {
fn add_all_starts(
&mut self,
dfa_state_ids: &mut Vec<StateID>,
- ) -> Result<(), Error> {
- // Always add the (possibly unanchored) start states for matching any
- // of the patterns in this DFA.
- self.add_start_group(None, dfa_state_ids)?;
+ ) -> Result<(), BuildError> {
+ // These should be the first states added.
+ assert!(dfa_state_ids.is_empty());
+ // We only want to add (un)anchored starting states that are consistent
+ // with our DFA's configuration. Unconditionally adding both (although
+ // it is the default) can make DFAs quite a bit bigger.
+ if self.dfa.start_kind().has_unanchored() {
+ self.add_start_group(Anchored::No, dfa_state_ids)?;
+ }
+ if self.dfa.start_kind().has_anchored() {
+ self.add_start_group(Anchored::Yes, dfa_state_ids)?;
+ }
+ // I previously had an 'assert' here checking that either
+ // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
+ // turns out this isn't always true. For example, the NFA might have
+ // one or more patterns but where all such patterns are just 'fail'
+ // states. These will ultimately just compile down to DFA dead states,
+ // and since the dead state was added earlier, no new DFA states are
+ // added. And thus, it is valid and okay for 'dfa_state_ids' to be
+ // empty even if there are a non-zero number of patterns in the NFA.
+
// We only need to compute anchored start states for each pattern if it
// was requested to do so.
- if self.dfa.has_starts_for_each_pattern() {
- for pid in PatternID::iter(self.dfa.pattern_count()) {
- self.add_start_group(Some(pid), dfa_state_ids)?;
+ if self.dfa.starts_for_each_pattern() {
+ for pid in self.nfa.patterns() {
+ self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
}
}
Ok(())
@@ -348,15 +361,19 @@ impl<'a> Runner<'a> {
/// start states (if the DFA is unanchored). When the pattern_id is
/// present, then this will compile a group of anchored start states that
/// only match the given pattern.
+ ///
+ /// This panics if `anchored` corresponds to an invalid pattern ID.
fn add_start_group(
&mut self,
- pattern_id: Option<PatternID>,
+ anchored: Anchored,
dfa_state_ids: &mut Vec<StateID>,
- ) -> Result<(), Error> {
- let nfa_start = match pattern_id {
- Some(pid) => self.nfa.start_pattern(pid),
- None if self.config.anchored => self.nfa.start_anchored(),
- None => self.nfa.start_unanchored(),
+ ) -> Result<(), BuildError> {
+ let nfa_start = match anchored {
+ Anchored::No => self.nfa.start_unanchored(),
+ Anchored::Yes => self.nfa.start_anchored(),
+ Anchored::Pattern(pid) => {
+ self.nfa.start_pattern(pid).expect("valid pattern ID")
+ }
};
// When compiling start states, we're careful not to build additional
@@ -365,36 +382,68 @@ impl<'a> Runner<'a> {
// states for 'NonWordByte' and 'WordByte' starting configurations.
// Instead, the 'WordByte' starting configuration can just point
// directly to the start state for the 'NonWordByte' config.
+ //
+ // Note though that we only need to care about assertions in the prefix
+ // of an NFA since this only concerns the starting states. (Actually,
+ // the most precise thing we could do is look at the prefix
+ // assertions of each pattern when 'anchored == Anchored::Pattern',
+ // and then only compile extra states if the prefix is non-empty.) But
+ // we settle for simplicity here instead of absolute minimalism. It is
+ // somewhat rare, after all, for multiple patterns in the same regex to
+ // have different prefix look-arounds.
let (id, is_new) =
self.add_one_start(nfa_start, Start::NonWordByte)?;
- self.dfa.set_start_state(Start::NonWordByte, pattern_id, id);
+ self.dfa.set_start_state(anchored, Start::NonWordByte, id);
if is_new {
dfa_state_ids.push(id);
}
- if !self.nfa.has_word_boundary() {
- self.dfa.set_start_state(Start::WordByte, pattern_id, id);
+ if !self.nfa.look_set_prefix_any().contains_word() {
+ self.dfa.set_start_state(anchored, Start::WordByte, id);
} else {
let (id, is_new) =
self.add_one_start(nfa_start, Start::WordByte)?;
- self.dfa.set_start_state(Start::WordByte, pattern_id, id);
+ self.dfa.set_start_state(anchored, Start::WordByte, id);
if is_new {
dfa_state_ids.push(id);
}
}
- if !self.nfa.has_any_anchor() {
- self.dfa.set_start_state(Start::Text, pattern_id, id);
- self.dfa.set_start_state(Start::Line, pattern_id, id);
+ if !self.nfa.look_set_prefix_any().contains_anchor() {
+ self.dfa.set_start_state(anchored, Start::Text, id);
+ self.dfa.set_start_state(anchored, Start::LineLF, id);
+ self.dfa.set_start_state(anchored, Start::LineCR, id);
+ self.dfa.set_start_state(
+ anchored,
+ Start::CustomLineTerminator,
+ id,
+ );
} else {
let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
- self.dfa.set_start_state(Start::Text, pattern_id, id);
+ self.dfa.set_start_state(anchored, Start::Text, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
+ self.dfa.set_start_state(anchored, Start::LineLF, id);
if is_new {
dfa_state_ids.push(id);
}
- let (id, is_new) = self.add_one_start(nfa_start, Start::Line)?;
- self.dfa.set_start_state(Start::Line, pattern_id, id);
+ let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
+ self.dfa.set_start_state(anchored, Start::LineCR, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ let (id, is_new) =
+ self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
+ self.dfa.set_start_state(
+ anchored,
+ Start::CustomLineTerminator,
+ id,
+ );
if is_new {
dfa_state_ids.push(id);
}
@@ -414,13 +463,14 @@ impl<'a> Runner<'a> {
&mut self,
nfa_start: StateID,
start: Start,
- ) -> Result<(StateID, bool), Error> {
+ ) -> Result<(StateID, bool), BuildError> {
// Compute the look-behind assertions that are true in this starting
// configuration, and then determine the epsilon closure. While
// computing the epsilon closure, we only follow conditional epsilon
- // transitions that satisfy the look-behind assertions in 'facts'.
+ // transitions that satisfy the look-behind assertions in 'look_have'.
let mut builder_matches = self.get_state_builder().into_matches();
util::determinize::set_lookbehind_from_start(
+ self.nfa,
&start,
&mut builder_matches,
);
@@ -428,7 +478,7 @@ impl<'a> Runner<'a> {
util::determinize::epsilon_closure(
self.nfa,
nfa_start,
- *builder_matches.look_have(),
+ builder_matches.look_have(),
&mut self.stack,
&mut self.sparses.set1,
);
@@ -455,7 +505,7 @@ impl<'a> Runner<'a> {
fn maybe_add_state(
&mut self,
builder: StateBuilderNFA,
- ) -> Result<(StateID, bool), Error> {
+ ) -> Result<(StateID, bool), BuildError> {
if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
// Since we have a cached state, put the constructed state's
// memory back into our scratch space, so that it can be reused.
@@ -476,7 +526,7 @@ impl<'a> Runner<'a> {
fn add_state(
&mut self,
builder: StateBuilderNFA,
- ) -> Result<StateID, Error> {
+ ) -> Result<StateID, BuildError> {
let id = self.dfa.add_empty_state()?;
if !self.config.quit.is_empty() {
for b in self.config.quit.iter() {
@@ -489,19 +539,21 @@ impl<'a> Runner<'a> {
}
let state = builder.to_state();
// States use reference counting internally, so we only need to count
- // their memroy usage once.
+ // their memory usage once.
self.memory_usage_state += state.memory_usage();
self.builder_states.push(state.clone());
self.cache.insert(state, id);
self.put_state_builder(builder);
if let Some(limit) = self.config.dfa_size_limit {
if self.dfa.memory_usage() > limit {
- return Err(Error::dfa_exceeded_size_limit(limit));
+ return Err(BuildError::dfa_exceeded_size_limit(limit));
}
}
if let Some(limit) = self.config.determinize_size_limit {
if self.memory_usage() > limit {
- return Err(Error::determinize_exceeded_size_limit(limit));
+ return Err(BuildError::determinize_exceeded_size_limit(
+ limit,
+ ));
}
}
Ok(id)
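The hunks above replace the determinizer's single build-time `anchored` flag with per-mode start states driven by `Anchored` and the DFA's `StartKind`. A hedged sketch of how this surfaces in the public API (the names come from this diff; the offsets are illustrative):

```
use regex_automata::{
    dfa::{dense, Automaton, StartKind},
    Anchored, HalfMatch, Input,
};

// Compile only anchored start states. Skipping the unanchored group can
// make the DFA noticeably smaller when unanchored searches are unneeded.
let dfa = dense::DFA::builder()
    .configure(dense::DFA::config().start_kind(StartKind::Anchored))
    .build(r"[0-9]+")?;

// Anchored at offset 0: the haystack doesn't begin with a digit.
let input = Input::new("abc123").anchored(Anchored::Yes);
assert_eq!(None, dfa.try_search_fwd(&input)?);

// Anchored at offset 3: the match must start exactly there.
let input = Input::new("abc123").range(3..).anchored(Anchored::Yes);
assert_eq!(Some(HalfMatch::must(0, 6)), dfa.try_search_fwd(&input)?);
# Ok::<(), Box<dyn std::error::Error>>(())
```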
diff --git a/vendor/regex-automata/src/dfa/error.rs b/vendor/regex-automata/src/dfa/error.rs
deleted file mode 100644
index 6497a4cff..000000000
--- a/vendor/regex-automata/src/dfa/error.rs
+++ /dev/null
@@ -1,162 +0,0 @@
-use crate::{
- nfa,
- util::{
- id::{PatternID, StateID},
- start::Start,
- },
-};
-
-/// An error that occurred during the construction of a DFA.
-///
-/// This error does not provide many introspection capabilities. There are
-/// generally only two things you can do with it:
-///
-/// * Obtain a human readable message via its `std::fmt::Display` impl.
-/// * Access an underlying [`nfa::thompson::Error`] type from its `source`
-/// method via the `std::error::Error` trait. This error only occurs when using
-/// convenience routines for building a DFA directly from a pattern string.
-///
-/// When the `std` feature is enabled, this implements the `std::error::Error`
-/// trait.
-#[derive(Clone, Debug)]
-pub struct Error {
- kind: ErrorKind,
-}
-
-/// The kind of error that occurred during the construction of a DFA.
-///
-/// Note that this error is non-exhaustive. Adding new variants is not
-/// considered a breaking change.
-#[derive(Clone, Debug)]
-enum ErrorKind {
- /// An error that occurred while constructing an NFA as a precursor step
- /// before a DFA is compiled.
- NFA(nfa::thompson::Error),
- /// An error that occurred because an unsupported regex feature was used.
- /// The message string describes which unsupported feature was used.
- ///
- /// The primary regex feature that is unsupported by DFAs is the Unicode
- /// word boundary look-around assertion (`\b`). This can be worked around
- /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling the
- /// [`dense::Builder::allow_unicode_word_boundary`](dense/struct.Builder.html#method.allow_unicode_word_boundary)
- /// option when building a DFA.
- Unsupported(&'static str),
- /// An error that occurs if too many states are produced while building a
- /// DFA.
- TooManyStates,
- /// An error that occurs if too many start states are needed while building
- /// a DFA.
- ///
- /// This is a kind of oddball error that occurs when building a DFA with
- /// start states enabled for each pattern and enough patterns to cause
- /// the table of start states to overflow `usize`.
- TooManyStartStates,
- /// This is another oddball error that can occur if there are too many
- /// patterns spread out across too many match states.
- TooManyMatchPatternIDs,
- /// An error that occurs if the DFA got too big during determinization.
- DFAExceededSizeLimit { limit: usize },
- /// An error that occurs if auxiliary storage (not the DFA) used during
- /// determinization got too big.
- DeterminizeExceededSizeLimit { limit: usize },
-}
-
-impl Error {
- /// Return the kind of this error.
- fn kind(&self) -> &ErrorKind {
- &self.kind
- }
-
- pub(crate) fn nfa(err: nfa::thompson::Error) -> Error {
- Error { kind: ErrorKind::NFA(err) }
- }
-
- pub(crate) fn unsupported_dfa_word_boundary_unicode() -> Error {
- let msg = "cannot build DFAs for regexes with Unicode word \
- boundaries; switch to ASCII word boundaries, or \
- heuristically enable Unicode word boundaries or use a \
- different regex engine";
- Error { kind: ErrorKind::Unsupported(msg) }
- }
-
- pub(crate) fn too_many_states() -> Error {
- Error { kind: ErrorKind::TooManyStates }
- }
-
- pub(crate) fn too_many_start_states() -> Error {
- Error { kind: ErrorKind::TooManyStartStates }
- }
-
- pub(crate) fn too_many_match_pattern_ids() -> Error {
- Error { kind: ErrorKind::TooManyMatchPatternIDs }
- }
-
- pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> Error {
- Error { kind: ErrorKind::DFAExceededSizeLimit { limit } }
- }
-
- pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> Error {
- Error { kind: ErrorKind::DeterminizeExceededSizeLimit { limit } }
- }
-}
-
-#[cfg(feature = "std")]
-impl std::error::Error for Error {
- fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
- match self.kind() {
- ErrorKind::NFA(ref err) => Some(err),
- ErrorKind::Unsupported(_) => None,
- ErrorKind::TooManyStates => None,
- ErrorKind::TooManyStartStates => None,
- ErrorKind::TooManyMatchPatternIDs => None,
- ErrorKind::DFAExceededSizeLimit { .. } => None,
- ErrorKind::DeterminizeExceededSizeLimit { .. } => None,
- }
- }
-}
-
-impl core::fmt::Display for Error {
- fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
- match self.kind() {
- ErrorKind::NFA(_) => write!(f, "error building NFA"),
- ErrorKind::Unsupported(ref msg) => {
- write!(f, "unsupported regex feature for DFAs: {}", msg)
- }
- ErrorKind::TooManyStates => write!(
- f,
- "number of DFA states exceeds limit of {}",
- StateID::LIMIT,
- ),
- ErrorKind::TooManyStartStates => {
- let stride = Start::count();
- // The start table has `stride` entries for starting states for
- // the entire DFA, and then `stride` entries for each pattern
- // if start states for each pattern are enabled (which is the
- // only way this error can occur). Thus, the total number of
- // patterns that can fit in the table is `stride` less than
- // what we can allocate.
- let limit = ((core::isize::MAX as usize) - stride) / stride;
- write!(
- f,
- "compiling DFA with start states exceeds pattern \
- pattern limit of {}",
- limit,
- )
- }
- ErrorKind::TooManyMatchPatternIDs => write!(
- f,
- "compiling DFA with total patterns in all match states \
- exceeds limit of {}",
- PatternID::LIMIT,
- ),
- ErrorKind::DFAExceededSizeLimit { limit } => write!(
- f,
- "DFA exceeded size limit of {:?} during determinization",
- limit,
- ),
- ErrorKind::DeterminizeExceededSizeLimit { limit } => {
- write!(f, "determinization exceeded size limit of {:?}", limit)
- }
- }
- }
-}
diff --git a/vendor/regex-automata/src/dfa/minimize.rs b/vendor/regex-automata/src/dfa/minimize.rs
index 80e2f4e73..fea925bdc 100644
--- a/vendor/regex-automata/src/dfa/minimize.rs
+++ b/vendor/regex-automata/src/dfa/minimize.rs
@@ -6,7 +6,7 @@ use crate::{
dfa::{automaton::Automaton, dense, DEAD},
util::{
alphabet,
- id::{PatternID, StateID},
+ primitives::{PatternID, StateID},
},
};
@@ -152,13 +152,13 @@ impl<'a> Minimizer<'a> {
// At this point, we now have a minimal partitioning of states, where
// each partition is an equivalence class of DFA states. Now we need to
- // use this partioning to update the DFA to only contain one state for
+ // use this partitioning to update the DFA to only contain one state for
// each partition.
// Create a map from DFA state ID to the representative ID of the
// equivalence class to which it belongs. The representative ID of an
// equivalence class of states is the minimum ID in that class.
- let mut state_to_part = vec![DEAD; self.dfa.state_count()];
+ let mut state_to_part = vec![DEAD; self.dfa.state_len()];
for p in &self.partitions {
p.iter(|id| state_to_part[as_index(id)] = p.min());
}
@@ -167,7 +167,7 @@ impl<'a> Minimizer<'a> {
// create a map from equivalence IDs to the new IDs. Thus, the new
// minimal ID of *any* state in the unminimized DFA can be obtained
// with minimal_ids[state_to_part[old_id]].
- let mut minimal_ids = vec![DEAD; self.dfa.state_count()];
+ let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
let mut new_index = 0;
for state in self.dfa.states() {
if state_to_part[as_index(state.id())] == state.id() {
@@ -184,15 +184,13 @@ impl<'a> Minimizer<'a> {
// Re-map this DFA in place such that the only states remaining
// correspond to the representative states of every equivalence class.
- for id in (0..self.dfa.state_count()).map(as_state_id) {
+ for id in (0..self.dfa.state_len()).map(as_state_id) {
// If this state isn't a representative for an equivalence class,
// then we skip it since it won't appear in the minimal DFA.
if state_to_part[as_index(id)] != id {
continue;
}
- for (_, next) in self.dfa.state_mut(id).iter_mut() {
- *next = remap(*next);
- }
+ self.dfa.remap_state(id, remap);
self.dfa.swap_states(id, minimal_ids[as_index(id)]);
}
// Trim off all unused states from the pre-minimized DFA. This
@@ -208,8 +206,12 @@ impl<'a> Minimizer<'a> {
// We're already allocating so much that this is probably fine. If this
// turns out to be costly, then I guess add a `starts_mut` iterator.
let starts: Vec<_> = self.dfa.starts().collect();
- for (old_start_id, start_type, pid) in starts {
- self.dfa.set_start_state(start_type, pid, remap(old_start_id));
+ for (old_start_id, anchored, start_type) in starts {
+ self.dfa.set_start_state(
+ anchored,
+ start_type,
+ remap(old_start_id),
+ );
}
// Update the match state pattern ID list for multi-regexes. All we
@@ -305,7 +307,7 @@ impl<'a> Minimizer<'a> {
for state in dfa.states() {
if dfa.is_match_state(state.id()) {
let mut pids = vec![];
- for i in 0..dfa.match_count(state.id()) {
+ for i in 0..dfa.match_len(state.id()) {
pids.push(dfa.match_pattern(state.id(), i));
}
matching
diff --git a/vendor/regex-automata/src/dfa/mod.rs b/vendor/regex-automata/src/dfa/mod.rs
index 6f9fe605e..4bb870435 100644
--- a/vendor/regex-automata/src/dfa/mod.rs
+++ b/vendor/regex-automata/src/dfa/mod.rs
@@ -1,5 +1,5 @@
/*!
-A module for building and searching with determinstic finite automata (DFAs).
+A module for building and searching with deterministic finite automata (DFAs).
Like other modules in this crate, DFAs support a rich regex syntax with Unicode
features. DFAs also have extensive options for configuring the best space vs
@@ -26,20 +26,25 @@ DFAs implement. (A `regex::Regex` is generic over this trait.)
[`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g.,
[`dense::DFA::from_bytes`]).
+There is also a [`onepass`] module that provides a [one-pass
+DFA](onepass::DFA). The unique advantage of this DFA is that, for the class
+of regexes it can be built with, it supports reporting the spans of matching
+capturing groups. It is the only DFA in this crate capable of such a thing.
+
# Example: basic regex searching
This example shows how to compile a regex using the default configuration
and then use it to find matches in a byte string:
```
-use regex_automata::{MultiMatch, dfa::regex::Regex};
+use regex_automata::{Match, dfa::regex::Regex};
let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
let text = b"2018-12-24 2016-10-08";
-let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
+let matches: Vec<Match> = re.find_iter(text).collect();
assert_eq!(matches, vec![
- MultiMatch::must(0, 0, 10),
- MultiMatch::must(0, 11, 21),
+ Match::must(0, 0..10),
+ Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
@@ -51,36 +56,15 @@ simultaneously. You can use this support with standard leftmost-first style
searching to find non-overlapping matches:
```
-use regex_automata::{MultiMatch, dfa::regex::Regex};
+# if cfg!(miri) { return Ok(()); } // miri takes too long
+use regex_automata::{Match, dfa::regex::Regex};
let re = Regex::new_many(&[r"\w+", r"\S+"])?;
let text = b"@foo bar";
-let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
+let matches: Vec<Match> = re.find_iter(text).collect();
assert_eq!(matches, vec![
- MultiMatch::must(1, 0, 4),
- MultiMatch::must(0, 5, 8),
-]);
-# Ok::<(), Box<dyn std::error::Error>>(())
-```
-
-Or use overlapping style searches to find all possible occurrences:
-
-```
-use regex_automata::{MatchKind, MultiMatch, dfa::{dense, regex::Regex}};
-
-// N.B. For overlapping searches, we need the underlying DFA to report all
-// possible matches.
-let re = Regex::builder()
- .dense(dense::Config::new().match_kind(MatchKind::All))
- .build_many(&[r"\w{3}", r"\S{3}"])?;
-let text = b"@foo bar";
-let matches: Vec<MultiMatch> = re.find_overlapping_iter(text).collect();
-assert_eq!(matches, vec![
- MultiMatch::must(1, 0, 3),
- MultiMatch::must(0, 1, 4),
- MultiMatch::must(1, 1, 4),
- MultiMatch::must(0, 5, 8),
- MultiMatch::must(1, 5, 8),
+ Match::must(1, 0..4),
+ Match::must(0, 5..8),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
@@ -96,14 +80,14 @@ Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
`Regex::new`:
```
-use regex_automata::{MultiMatch, dfa::regex::Regex};
+use regex_automata::{Match, dfa::regex::Regex};
let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let text = b"2018-12-24 2016-10-08";
-let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
+let matches: Vec<Match> = re.find_iter(text).collect();
assert_eq!(matches, vec![
- MultiMatch::must(0, 0, 10),
- MultiMatch::must(0, 11, 21),
+ Match::must(0, 0..10),
+ Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
@@ -112,7 +96,7 @@ If you already have dense DFAs for some reason, they can be converted to sparse
DFAs and used to build a new `Regex`. For example:
```
-use regex_automata::{MultiMatch, dfa::regex::Regex};
+use regex_automata::{Match, dfa::regex::Regex};
let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let sparse_re = Regex::builder().build_from_dfas(
@@ -120,10 +104,10 @@ let sparse_re = Regex::builder().build_from_dfas(
dense_re.reverse().to_sparse()?,
);
let text = b"2018-12-24 2016-10-08";
-let matches: Vec<MultiMatch> = sparse_re.find_leftmost_iter(text).collect();
+let matches: Vec<Match> = sparse_re.find_iter(text).collect();
assert_eq!(matches, vec![
- MultiMatch::must(0, 0, 10),
- MultiMatch::must(0, 11, 21),
+ Match::must(0, 0..10),
+ Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
@@ -136,7 +120,7 @@ bit contrived, this same technique can be used in your program to
deserialize a DFA at start up time or by memory mapping a file.
```
-use regex_automata::{MultiMatch, dfa::{dense, regex::Regex}};
+use regex_automata::{Match, dfa::{dense, regex::Regex}};
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
// serialize both the forward and reverse DFAs, see note below
@@ -150,10 +134,10 @@ let re2 = Regex::builder().build_from_dfas(fwd, rev);
// we can use it like normal
let text = b"2018-12-24 2016-10-08";
-let matches: Vec<MultiMatch> = re2.find_leftmost_iter(text).collect();
+let matches: Vec<Match> = re2.find_iter(text).collect();
assert_eq!(matches, vec![
- MultiMatch::must(0, 0, 10),
- MultiMatch::must(0, 11, 21),
+ Match::must(0, 0..10),
+ Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
@@ -183,7 +167,7 @@ valid DFA.
The same process can be achieved with sparse DFAs as well:
```
-use regex_automata::{MultiMatch, dfa::{sparse, regex::Regex}};
+use regex_automata::{Match, dfa::{sparse, regex::Regex}};
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
// serialize both
@@ -197,17 +181,17 @@ let re2 = Regex::builder().build_from_dfas(fwd, rev);
// we can use it like normal
let text = b"2018-12-24 2016-10-08";
-let matches: Vec<MultiMatch> = re2.find_leftmost_iter(text).collect();
+let matches: Vec<Match> = re2.find_iter(text).collect();
assert_eq!(matches, vec![
- MultiMatch::must(0, 0, 10),
- MultiMatch::must(0, 11, 21),
+ Match::must(0, 0..10),
+ Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
Conversely, dense DFAs must be aligned to the same alignment as a
-[`StateID`](crate::util::id::StateID).
+[`StateID`](crate::util::primitives::StateID).
# Support for `no_std` and `alloc`-only
@@ -232,8 +216,8 @@ you would any regex.
Deserialization can happen anywhere. For example, with bytes embedded into a
binary or with a file memory mapped at runtime.
-TODO: Include link to `regex-cli` here pointing out how to generate Rust code
-for deserializing DFAs.
+The `regex-cli` command (found in the same repository as this crate) can be
+used to serialize DFAs to files and generate Rust code to read them.
# Syntax
@@ -283,7 +267,7 @@ the regexes in this module are almost universally slow to compile, especially
when they contain large Unicode character classes. For example, on my system,
compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling
a sparse regex takes about the same time but only uses about 1.2MB of
-memory.) Conversly, compiling the same regex without Unicode support, e.g.,
+memory.) Conversely, compiling the same regex without Unicode support, e.g.,
`(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this
reason, you should only use Unicode character classes if you absolutely need
them! (They are enabled by default though.)
@@ -299,10 +283,10 @@ optimizations means that searches may run much slower than what you're
accustomed to, although it does provide more predictable and consistent
performance.
* There is no `&str` API like in the regex crate. In this module, all APIs
-operate on `&[u8]`. By default, match indices are guaranteed to fall on UTF-8
-boundaries, unless any of [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8),
-[`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) or
-[`regex::Config::utf8`] are disabled.
+operate on `&[u8]`. By default, match indices are
+guaranteed to fall on UTF-8 boundaries, unless either of
+[`syntax::Config::utf8`](crate::util::syntax::Config::utf8) or
+[`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) are disabled.
With some of the downsides out of the way, here are some positive differences:
@@ -334,9 +318,11 @@ via [`dense::Config::minimize`], but it can increase compilation times
dramatically.
*/
-pub use crate::dfa::automaton::{Automaton, OverlappingState};
-#[cfg(feature = "alloc")]
-pub use crate::dfa::error::Error;
+#[cfg(feature = "dfa-search")]
+pub use crate::dfa::{
+ automaton::{Automaton, OverlappingState},
+ start::StartKind,
+};
/// This is an alias for a state ID of zero. It has special significance
/// because it always corresponds to the first state in a DFA, and the first
@@ -344,20 +330,31 @@ pub use crate::dfa::error::Error;
/// of its transitions set to itself. Moreover, the dead state is used as a
/// sentinel for various things. e.g., In search, reaching a dead state means
/// that the search must stop.
-const DEAD: crate::util::id::StateID = crate::util::id::StateID::ZERO;
+const DEAD: crate::util::primitives::StateID =
+ crate::util::primitives::StateID::ZERO;
-mod accel;
-mod automaton;
+#[cfg(feature = "dfa-search")]
pub mod dense;
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-onepass")]
+pub mod onepass;
+#[cfg(feature = "dfa-search")]
+pub mod regex;
+#[cfg(feature = "dfa-search")]
+pub mod sparse;
+
+#[cfg(feature = "dfa-search")]
+pub(crate) mod accel;
+#[cfg(feature = "dfa-search")]
+mod automaton;
+#[cfg(feature = "dfa-build")]
mod determinize;
-#[cfg(feature = "alloc")]
-pub(crate) mod error;
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
mod minimize;
-pub mod regex;
+#[cfg(any(feature = "dfa-build", feature = "dfa-onepass"))]
+mod remapper;
+#[cfg(feature = "dfa-search")]
mod search;
-pub mod sparse;
+#[cfg(feature = "dfa-search")]
mod special;
-#[cfg(feature = "transducer")]
-mod transducer;
+#[cfg(feature = "dfa-search")]
+mod start;
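The module docs above point at `regex-cli` for ahead-of-time serialization. A small in-process round trip shows the same mechanism (in a real deployment the bytes would come from a generated file or a memory map, as the docs suggest):

```
use regex_automata::{
    dfa::{dense, Automaton},
    HalfMatch, Input,
};

// Serialize a dense DFA and deserialize it again. `from_bytes` only
// validates the buffer; it does not copy or allocate.
let original = dense::DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
let (bytes, pad) = original.to_bytes_native_endian();
// Skip the leading padding used to keep the serialized DFA aligned.
let (dfa, _bytes_read) = dense::DFA::from_bytes(&bytes[pad..])?;

let expected = Some(HalfMatch::must(0, 10));
assert_eq!(expected, dfa.try_search_fwd(&Input::new("2018-12-24"))?);
# Ok::<(), Box<dyn std::error::Error>>(())
```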
diff --git a/vendor/regex-automata/src/dfa/onepass.rs b/vendor/regex-automata/src/dfa/onepass.rs
new file mode 100644
index 000000000..44691d0c8
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/onepass.rs
@@ -0,0 +1,3188 @@
+/*!
+A DFA that can return spans for matching capturing groups.
+
+This module is the home of a [one-pass DFA](DFA).
+
+This module also contains a [`Builder`] and a [`Config`] for building and
+configuring a one-pass DFA.
+*/
+
+// A note on naming and credit:
+//
+// As far as I know, Russ Cox came up with the practical vision and
+// implementation of a "one-pass regex engine." He mentions and describes it
+// briefly in the third article of his regexp article series:
+// https://swtch.com/~rsc/regexp/regexp3.html
+//
+// Cox's implementation is in RE2, and the implementation below is most
+// heavily inspired by RE2's. The key thing they have in common is that
+// their transitions are defined over an alphabet of bytes. In contrast,
+// Go's regex engine also has a one-pass engine, but its transitions are
+// more firmly rooted on Unicode codepoints. The ideas are the same, but the
+// implementations are different.
+//
+// RE2 tends to call this a "one-pass NFA." Here, we call it a "one-pass DFA."
+// They're both true in their own ways:
+//
+// * The "one-pass" criterion is generally a property of the NFA itself. In
+// particular, it is said that an NFA is one-pass if, after each byte of input
+// during a search, there is at most one "VM thread" remaining to take for the
+// next byte of input. That is, there is never any ambiguity as to the path to
+// take through the NFA during a search.
+//
+// * On the other hand, once a one-pass NFA has its representation converted
+// to something where a constant number of instructions is used for each byte
+// of input, the implementation looks a lot more like a DFA. It's technically
+// more powerful than a DFA since it has side effects (storing offsets inside
+// of slots activated by a transition), but it is far closer to a DFA than an
+// NFA simulation.
+//
+// Thus, in this crate, we call it a one-pass DFA.
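To make the one-pass criterion above concrete, a hedged sketch of construction succeeding and failing (the patterns are my own illustrations, not from this diff):

```
use regex_automata::dfa::onepass::DFA;

// `a+a` is ambiguous: after reading an 'a', the VM could either stay in
// the `a+` loop or move on to the final `a`. Two viable threads for the
// same byte means the regex is not one-pass, so construction fails.
assert!(DFA::new(r"a+a").is_err());

// `a+b` never has more than one viable thread per byte, so it builds.
assert!(DFA::new(r"a+b").is_ok());
```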
+
+use alloc::{vec, vec::Vec};
+
+use crate::{
+ dfa::{remapper::Remapper, DEAD},
+ nfa::thompson::{self, NFA},
+ util::{
+ alphabet::ByteClasses,
+ captures::Captures,
+ escape::DebugByte,
+ int::{Usize, U32, U64, U8},
+ look::{Look, LookSet, UnicodeWordBoundaryError},
+ primitives::{NonMaxUsize, PatternID, StateID},
+ search::{Anchored, Input, Match, MatchError, MatchKind, Span},
+ sparse_set::SparseSet,
+ },
+};
+
+/// The configuration used for building a [one-pass DFA](DFA).
+///
+/// A one-pass DFA configuration is a simple data object that is typically used
+/// with [`Builder::configure`]. It can be cheaply cloned.
+///
+/// A default configuration can be created either with `Config::new`, or
+/// perhaps more conveniently, with [`DFA::config`].
+#[derive(Clone, Debug, Default)]
+pub struct Config {
+ match_kind: Option<MatchKind>,
+ starts_for_each_pattern: Option<bool>,
+ byte_classes: Option<bool>,
+ size_limit: Option<Option<usize>>,
+}
+
+impl Config {
+ /// Return a new default one-pass DFA configuration.
+ pub fn new() -> Config {
+ Config::default()
+ }
+
+ /// Set the desired match semantics.
+ ///
+ /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
+ /// match semantics of Perl-like regex engines. That is, when multiple
+ /// patterns would match at the same leftmost position, the pattern that
+ /// appears first in the concrete syntax is chosen.
+ ///
+ /// Currently, the only other kind of match semantics supported is
+ /// [`MatchKind::All`]. This corresponds to "classical DFA" construction
+ /// where all possible matches are visited.
+ ///
+ /// When it comes to the one-pass DFA, it is rarer for preference order and
+ /// "longest match" to actually disagree. Since if they did disagree, then
+ /// the regex typically isn't one-pass. For example, searching `Samwise`
+ /// for `Sam|Samwise` will report `Sam` for leftmost-first matching and
+ /// `Samwise` for "longest match" or "all" matching. However, this regex is
+ /// not one-pass if taken literally. The equivalent regex, `Sam(?:|wise)`,
+ /// is one-pass, and `Sam|Samwise` may be optimized to it.
+ ///
+ /// The other main difference is that "all" match semantics don't support
+ /// non-greedy matches. "All" match semantics always try to match as much
+ /// as possible.
+ pub fn match_kind(mut self, kind: MatchKind) -> Config {
+ self.match_kind = Some(kind);
+ self
+ }
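A sketch of the `Sam(?:|wise)` example from the docs above under leftmost-first semantics (the offsets follow from the pattern; treat the snippet as illustrative):

```
use regex_automata::{dfa::onepass::DFA, Match, MatchKind};

// Under leftmost-first semantics, the empty alternative in
// `Sam(?:|wise)` is preferred, so the match stops after "Sam".
let re = DFA::builder()
    .configure(DFA::config().match_kind(MatchKind::LeftmostFirst))
    .build(r"Sam(?:|wise)")?;
let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
re.captures(&mut cache, "Samwise", &mut caps);
assert_eq!(Some(Match::must(0, 0..3)), caps.get_match());
# Ok::<(), Box<dyn std::error::Error>>(())
```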
+
+ /// Whether to compile a separate start state for each pattern in the
+ /// one-pass DFA.
+ ///
+ /// When enabled, a separate **anchored** start state is added for each
+ /// pattern in the DFA. When this start state is used, then the DFA will
+ /// only search for matches for the pattern specified, even if there are
+ /// other patterns in the DFA.
+ ///
+ /// The main downside of this option is that it can potentially increase
+ /// the size of the DFA and/or increase the time it takes to build the DFA.
+ ///
+ /// You might want to enable this option when you want to search both for
+ /// anchored matches of any pattern and for anchored matches of one
+ /// particular pattern while using the same DFA. (Otherwise, you would
+ /// need to compile a new DFA for each pattern.)
+ ///
+ /// By default this is disabled.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a multi-regex and then search for
+ /// matches for any of the patterns or matches for a specific pattern.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::onepass::DFA, Anchored, Input, Match, PatternID,
+ /// };
+ ///
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().starts_for_each_pattern(true))
+ /// .build_many(&["[a-z]+", "[0-9]+"])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "123abc";
+ /// let input = Input::new(haystack).anchored(Anchored::Yes);
+ ///
+ /// // A normal multi-pattern search will show pattern 1 matches.
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
+ ///
+ /// // If we only want to report pattern 0 matches, then we'll get no
+ /// // match here.
+ /// let input = input.anchored(Anchored::Pattern(PatternID::must(0)));
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
+ self.starts_for_each_pattern = Some(yes);
+ self
+ }
+
+ /// Whether to attempt to shrink the size of the DFA's alphabet or not.
+ ///
+ /// This option is enabled by default and should never be disabled unless
+ /// one is debugging a one-pass DFA.
+ ///
+ /// When enabled, the DFA will use a map from all possible bytes to their
+ /// corresponding equivalence class. Each equivalence class represents a
+ /// set of bytes that does not discriminate between a match and a non-match
+ /// in the DFA. For example, the pattern `[ab]+` has at least two
+ /// equivalence classes: a set containing `a` and `b` and a set containing
+ /// every byte except for `a` and `b`. `a` and `b` are in the same
+ /// equivalence class because they never discriminate between a match and a
+ /// non-match.
+ ///
+ /// The advantage of this map is that the size of the transition table
+ /// can be reduced drastically from (approximately) `#states * 256 *
+ /// sizeof(StateID)` to `#states * k * sizeof(StateID)` where `k` is the
+ /// number of equivalence classes (rounded up to the nearest power of 2).
+ /// As a result, total space usage can decrease substantially. Moreover,
+ /// since a smaller alphabet is used, DFA compilation becomes faster as
+ /// well.
+ ///
+ /// **WARNING:** This is only useful for debugging DFAs. Disabling this
+ /// does not yield any speed advantages. Namely, even when this is
+ /// disabled, a byte class map is still used while searching. The only
+ /// difference is that every byte will be forced into its own distinct
+ /// equivalence class. This is useful for debugging the actual generated
+ /// transitions because it lets one see the transitions defined on actual
+ /// bytes instead of the equivalence classes.
+ pub fn byte_classes(mut self, yes: bool) -> Config {
+ self.byte_classes = Some(yes);
+ self
+ }
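A quick way to observe the effect described above is to compare heap usage with byte classes on and off (a sketch; only the direction of the inequality matters, not the exact sizes):

```
use regex_automata::dfa::onepass::DFA;

// With byte classes enabled (the default), `[ab]+` needs only a few
// equivalence classes. Disabling them forces a 256-element alphabet,
// so every state's transition row grows accordingly.
let small = DFA::new(r"[ab]+")?;
let big = DFA::builder()
    .configure(DFA::config().byte_classes(false))
    .build(r"[ab]+")?;
assert!(small.memory_usage() < big.memory_usage());
# Ok::<(), Box<dyn std::error::Error>>(())
```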
+
+ /// Set a size limit on the total heap used by a one-pass DFA.
+ ///
+ /// This size limit is expressed in bytes and is applied during
+ /// construction of a one-pass DFA. If the DFA's heap usage exceeds
+ /// this configured limit, then construction is stopped and an error is
+ /// returned.
+ ///
+ /// The default is no limit.
+ ///
+ /// # Example
+ ///
+ /// This example shows a one-pass DFA that fails to build because of
+ /// a configured size limit. This particular example also serves as a
+ /// cautionary tale demonstrating just how big DFAs with large Unicode
+ /// character classes can get.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// // 6MB isn't enough!
+ /// DFA::builder()
+ /// .configure(DFA::config().size_limit(Some(6_000_000)))
+ /// .build(r"\w{20}")
+ /// .unwrap_err();
+ ///
+ /// // ... but 7MB probably is!
+ /// // (Note that DFA sizes aren't necessarily stable between releases.)
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().size_limit(Some(7_000_000)))
+ /// .build(r"\w{20}")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "A".repeat(20);
+ /// re.captures(&mut cache, &haystack, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..20)), caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// While one needs a little more than 3MB to represent `\w{20}`, it
+ /// turns out that you only need a little more than 4KB to represent
+ /// `(?-u:\w{20})`. So only use Unicode if you need it!
+ pub fn size_limit(mut self, limit: Option<usize>) -> Config {
+ self.size_limit = Some(limit);
+ self
+ }
+
+ /// Returns the match semantics set in this configuration.
+ pub fn get_match_kind(&self) -> MatchKind {
+ self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
+ }
+
+ /// Returns whether this configuration has enabled anchored starting states
+ /// for every pattern in the DFA.
+ pub fn get_starts_for_each_pattern(&self) -> bool {
+ self.starts_for_each_pattern.unwrap_or(false)
+ }
+
+ /// Returns whether this configuration has enabled byte classes or not.
+ /// This is typically a debugging oriented option, as disabling it confers
+ /// no speed benefit.
+ pub fn get_byte_classes(&self) -> bool {
+ self.byte_classes.unwrap_or(true)
+ }
+
+ /// Returns the DFA size limit of this configuration if one was set.
+ /// The size limit is total number of bytes on the heap that a DFA is
+ /// permitted to use. If the DFA exceeds this limit during construction,
+ /// then construction is stopped and an error is returned.
+ pub fn get_size_limit(&self) -> Option<usize> {
+ self.size_limit.unwrap_or(None)
+ }
+
+ /// Overwrite the default configuration such that the options in `o` are
+ /// always used. If an option in `o` is not set, then the corresponding
+ /// option in `self` is used. If it's not set in `self` either, then it
+ /// remains not set.
+ pub(crate) fn overwrite(&self, o: Config) -> Config {
+ Config {
+ match_kind: o.match_kind.or(self.match_kind),
+ starts_for_each_pattern: o
+ .starts_for_each_pattern
+ .or(self.starts_for_each_pattern),
+ byte_classes: o.byte_classes.or(self.byte_classes),
+ size_limit: o.size_limit.or(self.size_limit),
+ }
+ }
+}
+
+/// A builder for a [one-pass DFA](DFA).
+///
+/// This builder permits configuring options for the syntax of a pattern, the
+/// NFA construction and the DFA construction. This builder is different from a
+/// general purpose regex builder in that it permits fine grain configuration
+/// of the construction process. The trade off for this is complexity, and
+/// the possibility of setting a configuration that might not make sense. For
+/// example, there are two different UTF-8 modes:
+///
+/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
+/// whether the pattern itself can contain sub-expressions that match invalid
+/// UTF-8.
+/// * [`thompson::Config::utf8`] controls whether empty matches that split a
+/// Unicode codepoint are reported or not.
+///
+/// Generally speaking, callers will want to either enable all of these or
+/// disable all of these.
+///
+/// # Example
+///
+/// This example shows how to disable UTF-8 mode in the syntax and the NFA.
+/// This is generally what you want for matching on arbitrary bytes.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{
+/// dfa::onepass::DFA,
+/// nfa::thompson,
+/// util::syntax,
+/// Match,
+/// };
+///
+/// let re = DFA::builder()
+/// .syntax(syntax::Config::new().utf8(false))
+/// .thompson(thompson::Config::new().utf8(false))
+/// .build(r"foo(?-u:[^b])ar.*")?;
+/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+///
+/// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n";
+/// re.captures(&mut cache, haystack, &mut caps);
+/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
+/// // but the subsequent `.*` does not! Disabling UTF-8
+/// // on the syntax permits this.
+/// //
+/// // N.B. This example does not show the impact of
+/// // disabling UTF-8 mode on a one-pass DFA Config,
+/// // since that only impacts regexes that can
+/// // produce matches of length 0.
+/// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ config: Config,
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler,
+}
+
+impl Builder {
+ /// Create a new one-pass DFA builder with the default configuration.
+ pub fn new() -> Builder {
+ Builder {
+ config: Config::default(),
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler::new(),
+ }
+ }
+
+ /// Build a one-pass DFA from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ #[cfg(feature = "syntax")]
+ pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
+ self.build_many(&[pattern])
+ }
+
+ /// Build a one-pass DFA from the given patterns.
+ ///
+ /// When matches are returned, the pattern ID corresponds to the index of
+ /// the pattern in the slice given.
+ #[cfg(feature = "syntax")]
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<DFA, BuildError> {
+ let nfa =
+ self.thompson.build_many(patterns).map_err(BuildError::nfa)?;
+ self.build_from_nfa(nfa)
+ }
+
+ /// Build a DFA from the given NFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a DFA if you already have an NFA in
+ /// hand.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Match};
+ ///
+ /// // This shows how to set non-default options for building an NFA.
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().shrink(true))
+ /// .build(r"[a-z0-9]+")?;
+ /// let re = DFA::builder().build_from_nfa(nfa)?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// re.captures(&mut cache, "foo123bar", &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..9)), caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_nfa(&self, nfa: NFA) -> Result<DFA, BuildError> {
+ // Why take ownership if we're just going to pass a reference to the
+ // NFA to our internal builder? Well, the first thing to note is that
+ // an NFA uses reference counting internally, so either choice is going
+ // to be cheap. So there isn't much cost either way.
+ //
+ // The real reason is that a one-pass DFA, semantically, shares
+ // ownership of an NFA. This is unlike other DFAs that don't share
+ // ownership of an NFA at all, primarily because they want to be
+ // self-contained in order to support cheap (de)serialization.
+ //
+ // But then why pass a '&nfa' below if we want to share ownership?
+ // Well, it turns out that using a '&NFA' in our internal builder
+ // separates its lifetime from the DFA we're building, and this turns
+ // out to make code a bit more composable. e.g., We can iterate over
+ // things inside the NFA while borrowing the builder as mutable because
+ // we know the NFA cannot be mutated. So TL;DR --- this weirdness is
+ // "because borrow checker."
+ InternalBuilder::new(self.config.clone(), &nfa).build()
+ }
+
+ /// Apply the given one-pass DFA configuration options to this builder.
+ pub fn configure(&mut self, config: Config) -> &mut Builder {
+ self.config = self.config.overwrite(config);
+ self
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`syntax::Config`](crate::util::syntax::Config).
+ ///
+ /// This permits setting things like case insensitivity, Unicode and multi
+ /// line mode.
+ ///
+ /// These settings only apply when constructing a one-pass DFA directly
+ /// from a pattern.
+ #[cfg(feature = "syntax")]
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::Config,
+ ) -> &mut Builder {
+ self.thompson.syntax(config);
+ self
+ }
+
+ /// Set the Thompson NFA configuration for this builder using
+ /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+ ///
+ /// This permits setting things like whether additional time should be
+ /// spent shrinking the size of the NFA.
+ ///
+ /// These settings only apply when constructing a DFA directly from a
+ /// pattern.
+ #[cfg(feature = "syntax")]
+ pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+ self.thompson.configure(config);
+ self
+ }
+}
+
+/// An internal builder for encapsulating the state necessary to build a
+/// one-pass DFA. Typical use is just `InternalBuilder::new(..).build()`.
+///
+/// There is no separate pass for determining whether the NFA is one-pass or
+/// not. We just try to build the DFA. If during construction we discover that
+/// it is not one-pass, we bail out. This is likely to lead to some undesirable
+/// expense in some cases, so it might make sense to try and identify common
+/// patterns in the NFA that make it definitively not one-pass. That way, we
+/// can avoid ever trying to build a one-pass DFA in the first place. For
+/// example, '\w*\s' is not one-pass, and since '\w' is Unicode-aware by
+/// default, it's probably not a trivial cost to try and build a one-pass DFA
+/// for it and then fail.
+///
+/// Note that some (immutable) fields are duplicated here. For example, the
+/// 'nfa' and 'classes' fields are both in the 'DFA'. They are the same thing,
+/// but we duplicate them because it makes composition easier below. Otherwise,
+/// since the borrow checker can't see through method calls, the mutable borrow
+/// we use to mutate the DFA winds up preventing borrowing from any other part
+/// of the DFA, even though we aren't mutating those parts. We only do this
+/// because the duplication is cheap.
+#[derive(Debug)]
+struct InternalBuilder<'a> {
+ /// The DFA we're building.
+ dfa: DFA,
+ /// An unordered collection of NFA state IDs that we haven't yet tried to
+ /// build into a DFA state.
+ ///
+ /// This collection does not ultimately wind up including every NFA state
+ /// ID. Instead, each ID represents a "start" state for a sub-graph of the
+ /// NFA. The set of NFA states we then use to build a DFA state consists
+ /// of that "start" state and all states reachable from it via epsilon
+ /// transitions.
+ uncompiled_nfa_ids: Vec<StateID>,
+ /// A map from NFA state ID to DFA state ID. This is useful for easily
+ /// determining whether an NFA state has been used as a "starting" point
+ /// to build a DFA state yet. If it hasn't, then it is mapped to DEAD,
+ /// and since DEAD is specially added and never corresponds to any NFA
+ /// state, it follows that a mapping to DEAD implies the NFA state has
+ /// no corresponding DFA state yet.
+ nfa_to_dfa_id: Vec<StateID>,
+ /// A stack used to traverse the NFA states that make up a single DFA
+ /// state. Traversal occurs until the stack is empty, and we only push to
+ /// the stack when the state ID isn't in 'seen'. Actually, even more than
+ /// that, if we try to push something on to this stack that is already in
+ /// 'seen', then we bail out on construction completely, since it implies
+ /// that the NFA is not one-pass.
+ stack: Vec<(StateID, Epsilons)>,
+ /// The set of NFA states that we've visited via 'stack'.
+ seen: SparseSet,
+ /// Whether a match NFA state has been observed while constructing a
+ /// one-pass DFA state. Once a match state is seen, assuming we are using
+ /// leftmost-first match semantics, then we don't add any more transitions
+ /// to the DFA state we're building.
+ matched: bool,
+ /// The config passed to the builder.
+ ///
+ /// This is duplicated in dfa.config.
+ config: Config,
+ /// The NFA we're building a one-pass DFA from.
+ ///
+ /// This is duplicated in dfa.nfa.
+ nfa: &'a NFA,
+ /// The equivalence classes that make up the alphabet for this DFA.
+ ///
+ /// This is duplicated in dfa.classes.
+ classes: ByteClasses,
+}
+
+impl<'a> InternalBuilder<'a> {
+ /// Create a new builder with an initial empty DFA.
+ fn new(config: Config, nfa: &'a NFA) -> InternalBuilder {
+ let classes = if !config.get_byte_classes() {
+ // A one-pass DFA will always use the equivalence class map, but
+ // enabling this option is useful for debugging. Namely, this will
+ // cause all transitions to be defined over their actual bytes
+ // instead of an opaque equivalence class identifier. The former is
+ // much easier to grok as a human.
+ ByteClasses::singletons()
+ } else {
+ nfa.byte_classes().clone()
+ };
+ // Normally a DFA alphabet includes the EOI symbol, but we don't need
+ // that in the one-pass DFA since we handle look-around explicitly
+ // without encoding it into the DFA. Thus, we don't need to delay
+ // matches by 1 byte. However, we reuse the space that *would* be used
+ // by the EOI transition by putting match information there (like which
+ // pattern matches and which look-around assertions need to hold). So
+ // this means our real alphabet length is 1 fewer than what the byte
+ // classes report, since we don't use EOI.
+ let alphabet_len = classes.alphabet_len().checked_sub(1).unwrap();
+ let stride2 = classes.stride2();
+ let dfa = DFA {
+ config: config.clone(),
+ nfa: nfa.clone(),
+ table: vec![],
+ starts: vec![],
+ // Since one-pass DFAs have a smaller state ID max than
+ // StateID::MAX, it follows that StateID::MAX is a valid initial
+ // value for min_match_id since no state ID can ever be greater
+ // than it. In the case of a one-pass DFA with no match states, the
+ // min_match_id will keep this sentinel value.
+ min_match_id: StateID::MAX,
+ classes: classes.clone(),
+ alphabet_len,
+ stride2,
+ pateps_offset: alphabet_len,
+ // OK because PatternID::MAX*2 is guaranteed not to overflow.
+ explicit_slot_start: nfa.pattern_len().checked_mul(2).unwrap(),
+ };
+ InternalBuilder {
+ dfa,
+ uncompiled_nfa_ids: vec![],
+ nfa_to_dfa_id: vec![DEAD; nfa.states().len()],
+ stack: vec![],
+ seen: SparseSet::new(nfa.states().len()),
+ matched: false,
+ config,
+ nfa,
+ classes,
+ }
+ }
+
+ /// Build the DFA from the NFA given to this builder. If the NFA is not
+ /// one-pass, then return an error. An error may also be returned if a
+ /// particular limit is exceeded. (Some limits, like the total heap memory
+ /// used, are configurable. Others, like the total patterns or slots, are
+ /// hard-coded based on representational limitations.)
+ fn build(mut self) -> Result<DFA, BuildError> {
+ self.nfa.look_set_any().available().map_err(BuildError::word)?;
+ for look in self.nfa.look_set_any().iter() {
+ // This is a future incompatibility check where if we add any
+ // more look-around assertions, then the one-pass DFA either
+ // needs to reject them (what we do here) or it needs to have its
+ // Transition representation modified to be capable of storing the
+ // new assertions.
+ if look.as_repr() > Look::WordUnicodeNegate.as_repr() {
+ return Err(BuildError::unsupported_look(look));
+ }
+ }
+ if self.nfa.pattern_len().as_u64() > PatternEpsilons::PATTERN_ID_LIMIT
+ {
+ return Err(BuildError::too_many_patterns(
+ PatternEpsilons::PATTERN_ID_LIMIT,
+ ));
+ }
+ if self.nfa.group_info().explicit_slot_len() > Slots::LIMIT {
+ return Err(BuildError::not_one_pass(
+ "too many explicit capturing groups (max is 16)",
+ ));
+ }
+ assert_eq!(DEAD, self.add_empty_state()?);
+
+ // This is where the explicit slots start. We care about this because
+ // we only need to track explicit slots. The implicit slots---two for
+ // each pattern---are tracked as part of the search routine itself.
+ let explicit_slot_start = self.nfa.pattern_len() * 2;
+ self.add_start_state(None, self.nfa.start_anchored())?;
+ if self.config.get_starts_for_each_pattern() {
+ for pid in self.nfa.patterns() {
+ self.add_start_state(
+ Some(pid),
+ self.nfa.start_pattern(pid).unwrap(),
+ )?;
+ }
+ }
+ // NOTE: One wonders what the effects of treating 'uncompiled_nfa_ids'
+ // as a stack are. It is really an unordered *set* of NFA state IDs.
+ // If it, for example, in practice led to discovering whether a regex
+ // was or wasn't one-pass later than if we processed NFA state IDs in
+ // ascending order, then that would make this routine more costly in
+ // the somewhat common case of a regex that isn't one-pass.
+ while let Some(nfa_id) = self.uncompiled_nfa_ids.pop() {
+ let dfa_id = self.nfa_to_dfa_id[nfa_id];
+ // Once we see a match, we keep going, but don't add any new
+ // transitions. Normally we'd just stop, but we have to keep
+ // going in order to verify that our regex is actually one-pass.
+ self.matched = false;
+ // The NFA states we've already explored for this DFA state.
+ self.seen.clear();
+ // The NFA states to explore via epsilon transitions. If we ever
+ // try to push an NFA state that we've already seen, then the NFA
+ // is not one-pass because it implies there are multiple epsilon
+ // transition paths that lead to the same NFA state. In other
+ // words, there is ambiguity.
+ self.stack_push(nfa_id, Epsilons::empty())?;
+ while let Some((id, epsilons)) = self.stack.pop() {
+ match *self.nfa.state(id) {
+ thompson::State::ByteRange { ref trans } => {
+ self.compile_transition(dfa_id, trans, epsilons)?;
+ }
+ thompson::State::Sparse(ref sparse) => {
+ for trans in sparse.transitions.iter() {
+ self.compile_transition(dfa_id, trans, epsilons)?;
+ }
+ }
+ thompson::State::Dense(ref dense) => {
+ for trans in dense.iter() {
+ self.compile_transition(dfa_id, &trans, epsilons)?;
+ }
+ }
+ thompson::State::Look { look, next } => {
+ let looks = epsilons.looks().insert(look);
+ self.stack_push(next, epsilons.set_looks(looks))?;
+ }
+ thompson::State::Union { ref alternates } => {
+ for &sid in alternates.iter().rev() {
+ self.stack_push(sid, epsilons)?;
+ }
+ }
+ thompson::State::BinaryUnion { alt1, alt2 } => {
+ self.stack_push(alt2, epsilons)?;
+ self.stack_push(alt1, epsilons)?;
+ }
+ thompson::State::Capture { next, slot, .. } => {
+ let slot = slot.as_usize();
+ let epsilons = if slot < explicit_slot_start {
+ // If this is an implicit slot, we don't care
+ // about it, since we handle implicit slots in
+ // the search routine. We can get away with that
+ // because there are 2 implicit slots for every
+ // pattern.
+ epsilons
+ } else {
+ // Offset our explicit slots so that they start
+ // at index 0.
+ let offset = slot - explicit_slot_start;
+ epsilons.set_slots(epsilons.slots().insert(offset))
+ };
+ self.stack_push(next, epsilons)?;
+ }
+ thompson::State::Fail => {
+ continue;
+ }
+ thompson::State::Match { pattern_id } => {
+ // If we found two different paths to a match state
+ // for the same DFA state, then we have ambiguity.
+ // Thus, it's not one-pass.
+ if self.matched {
+ return Err(BuildError::not_one_pass(
+ "multiple epsilon transitions to match state",
+ ));
+ }
+ self.matched = true;
+ // Shove the matching pattern ID and the 'epsilons'
+ // into the current DFA state's pattern epsilons. The
+ // 'epsilons' includes the slots we need to capture
+ // before reporting the match and also the conditional
+ // epsilon transitions we need to check before we can
+ // report a match.
+ self.dfa.set_pattern_epsilons(
+ dfa_id,
+ PatternEpsilons::empty()
+ .set_pattern_id(pattern_id)
+ .set_epsilons(epsilons),
+ );
+ // N.B. It is tempting to just bail out here when
+ // compiling a leftmost-first DFA, since we will never
+ // compile any more transitions in that case. But we
+ // actually need to keep going in order to verify that
+                        // we actually have a one-pass regex. For example, we
+                        // might see more Match states (e.g., for other
+                        // patterns) that imply that we don't have a one-pass
+                        // regex. So instead, we mark that we've found a match
+                        // and continue on. When we go to compile a new DFA
+                        // state, we just skip that part, but otherwise check
+                        // that the one-pass property is upheld.
+ }
+ }
+ }
+ }
+ self.shuffle_states();
+ Ok(self.dfa)
+ }
+
+ /// Shuffle all match states to the end of the transition table and set
+ /// 'min_match_id' to the ID of the first such match state.
+ ///
+ /// The point of this is to make it extremely cheap to determine whether
+ /// a state is a match state or not. We need to check on this on every
+ /// transition during a search, so it being cheap is important. This
+ /// permits us to check it by simply comparing two state identifiers, as
+ /// opposed to looking for the pattern ID in the state's `PatternEpsilons`.
+ /// (Which requires a memory load and some light arithmetic.)
+ fn shuffle_states(&mut self) {
+ let mut remapper = Remapper::new(&self.dfa);
+ let mut next_dest = self.dfa.last_state_id();
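+        // Walk the states in reverse, swapping each match state we find
+        // with the current tail position. This leaves all match states
+        // contiguous at the end of the table, with 'min_match_id' pointing
+        // at the first one.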
+ for i in (0..self.dfa.state_len()).rev() {
+ let id = StateID::must(i);
+ let is_match =
+ self.dfa.pattern_epsilons(id).pattern_id().is_some();
+ if !is_match {
+ continue;
+ }
+ remapper.swap(&mut self.dfa, next_dest, id);
+ self.dfa.min_match_id = next_dest;
+ next_dest = self.dfa.prev_state_id(next_dest).expect(
+ "match states should be a proper subset of all states",
+ );
+ }
+ remapper.remap(&mut self.dfa);
+ }
+
+ /// Compile the given NFA transition into the DFA state given.
+ ///
+ /// 'Epsilons' corresponds to any conditional epsilon transitions that need
+ /// to be satisfied to follow this transition, and any slots that need to
+ /// be saved if the transition is followed.
+ ///
+ /// If this transition indicates that the NFA is not one-pass, then
+ /// this returns an error. (This occurs, for example, if the DFA state
+ /// already has a transition defined for the same input symbols as the
+ /// given transition, *and* the result of the old and new transitions is
+ /// different.)
+ fn compile_transition(
+ &mut self,
+ dfa_id: StateID,
+ trans: &thompson::Transition,
+ epsilons: Epsilons,
+ ) -> Result<(), BuildError> {
+ let next_dfa_id = self.add_dfa_state_for_nfa_state(trans.next)?;
+ for byte in self
+ .classes
+ .representatives(trans.start..=trans.end)
+ .filter_map(|r| r.as_u8())
+ {
+ let oldtrans = self.dfa.transition(dfa_id, byte);
+ let newtrans =
+ Transition::new(self.matched, next_dfa_id, epsilons);
+            // If the old transition points to the DEAD state, then we know
+            // 'byte' has not been mapped to any transition for this DFA
+            // state yet, so we set it unconditionally. Otherwise, we require
+            // that the old and new transitions are equivalent. If they
+            // aren't, there is ambiguity and thus the regex is not one-pass.
+ if oldtrans.state_id() == DEAD {
+ self.dfa.set_transition(dfa_id, byte, newtrans);
+ } else if oldtrans != newtrans {
+ return Err(BuildError::not_one_pass(
+ "conflicting transition",
+ ));
+ }
+ }
+ Ok(())
+ }
+
+ /// Add a start state to the DFA corresponding to the given NFA starting
+ /// state ID.
+ ///
+ /// If adding a state would blow any limits (configured or hard-coded),
+ /// then an error is returned.
+ ///
+ /// If the starting state is an anchored state for a particular pattern,
+ /// then callers must provide the pattern ID for that starting state.
+ /// Callers must also ensure that the first starting state added is the
+ /// start state for all patterns, and then each anchored starting state for
+ /// each pattern (if necessary) added in order. Otherwise, this panics.
+ fn add_start_state(
+ &mut self,
+ pid: Option<PatternID>,
+ nfa_id: StateID,
+ ) -> Result<StateID, BuildError> {
+ match pid {
+ // With no pid, this should be the start state for all patterns
+ // and thus be the first one.
+ None => assert!(self.dfa.starts.is_empty()),
+ // With a pid, we want it to be at self.dfa.starts[pid+1].
+ Some(pid) => assert!(self.dfa.starts.len() == pid.one_more()),
+ }
+ let dfa_id = self.add_dfa_state_for_nfa_state(nfa_id)?;
+ self.dfa.starts.push(dfa_id);
+ Ok(dfa_id)
+ }
+
+ /// Add a new DFA state corresponding to the given NFA state. If adding a
+ /// state would blow any limits (configured or hard-coded), then an error
+ /// is returned. If a DFA state already exists for the given NFA state,
+ /// then that DFA state's ID is returned and no new states are added.
+ ///
+ /// It is not expected that this routine is called for every NFA state.
+ /// Instead, an NFA state ID will usually correspond to the "start" state
+ /// for a sub-graph of the NFA, where all states in the sub-graph are
+ /// reachable via epsilon transitions (conditional or unconditional). That
+ /// sub-graph of NFA states is ultimately what produces a single DFA state.
+ fn add_dfa_state_for_nfa_state(
+ &mut self,
+ nfa_id: StateID,
+ ) -> Result<StateID, BuildError> {
+ // If we've already built a DFA state for the given NFA state, then
+ // just return that. We definitely do not want to have more than one
+ // DFA state in existence for the same NFA state, since all but one of
+ // them will likely become unreachable. And at least some of them are
+ // likely to wind up being incomplete.
+ let existing_dfa_id = self.nfa_to_dfa_id[nfa_id];
+ if existing_dfa_id != DEAD {
+ return Ok(existing_dfa_id);
+ }
+ // If we don't have any DFA state yet, add it and then add the given
+ // NFA state to the list of states to explore.
+ let dfa_id = self.add_empty_state()?;
+ self.nfa_to_dfa_id[nfa_id] = dfa_id;
+ self.uncompiled_nfa_ids.push(nfa_id);
+ Ok(dfa_id)
+ }
+
+ /// Unconditionally add a new empty DFA state. If adding it would exceed
+ /// any limits (configured or hard-coded), then an error is returned. The
+ /// ID of the new state is returned on success.
+ ///
+ /// The added state is *not* a match state.
+ fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
+ let state_limit = Transition::STATE_ID_LIMIT;
+ // Note that unlike dense and lazy DFAs, we specifically do NOT
+ // premultiply our state IDs here. The reason is that we want to pack
+ // our state IDs into 64-bit transitions with other info, so the fewer
+ // the bits we use for state IDs the better. If we premultiply, then
+ // our state ID space shrinks. We justify this by the assumption that
+ // a one-pass DFA is just already doing a fair bit more work than a
+ // normal DFA anyway, so an extra multiplication to compute a state
+ // transition doesn't seem like a huge deal.
+ let next_id = self.dfa.table.len() >> self.dfa.stride2();
+ let id = StateID::new(next_id)
+ .map_err(|_| BuildError::too_many_states(state_limit))?;
+ if id.as_u64() > Transition::STATE_ID_LIMIT {
+ return Err(BuildError::too_many_states(state_limit));
+ }
+ self.dfa
+ .table
+ .extend(core::iter::repeat(Transition(0)).take(self.dfa.stride()));
+ // The default empty value for 'PatternEpsilons' is sadly not all
+ // zeroes. Instead, a special sentinel is used to indicate that there
+ // is no pattern. So we need to explicitly set the pattern epsilons to
+ // the correct "empty" PatternEpsilons.
+ self.dfa.set_pattern_epsilons(id, PatternEpsilons::empty());
+ if let Some(size_limit) = self.config.get_size_limit() {
+ if self.dfa.memory_usage() > size_limit {
+ return Err(BuildError::exceeded_size_limit(size_limit));
+ }
+ }
+ Ok(id)
+ }
+
+ /// Push the given NFA state ID and its corresponding epsilons (slots and
+ /// conditional epsilon transitions) on to a stack for use in a depth first
+ /// traversal of a sub-graph of the NFA.
+ ///
+ /// If the given NFA state ID has already been pushed on to the stack, then
+ /// it indicates the regex is not one-pass and this correspondingly returns
+ /// an error.
+ fn stack_push(
+ &mut self,
+ nfa_id: StateID,
+ epsilons: Epsilons,
+ ) -> Result<(), BuildError> {
+        // If we have already seen a match and we are compiling a leftmost
+        // first DFA, then we shouldn't add any more states to look at. This
+        // is effectively how preference order and non-greediness are
+        // implemented. (Note that the check below is currently disabled.)
+ // if !self.config.get_match_kind().continue_past_first_match()
+ // && self.matched
+ // {
+ // return Ok(());
+ // }
+ if !self.seen.insert(nfa_id) {
+ return Err(BuildError::not_one_pass(
+ "multiple epsilon transitions to same state",
+ ));
+ }
+ self.stack.push((nfa_id, epsilons));
+ Ok(())
+ }
+}
+
+/// A one-pass DFA for executing a subset of anchored regex searches while
+/// resolving capturing groups.
+///
+/// A one-pass DFA can be built from an NFA that is one-pass. An NFA is
+/// one-pass when there is never any ambiguity about how to continue a search.
+/// For example, `a*a` is not one-pass because during a search, it's not
+/// possible to know whether to continue matching the `a*` or to move on to
+/// the single `a`. However, `a*b` is one-pass, because for every byte in the
+/// input, it's always clear when to move on from `a*` to `b`.
+///
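+/// To see this in code (a minimal sketch; the patterns are the ones
+/// discussed above, and the outcome follows from the definition):
+///
+/// ```
+/// use regex_automata::dfa::onepass::DFA;
+///
+/// // `a*a` is ambiguous, so construction is expected to fail...
+/// assert!(DFA::new(r"a*a").is_err());
+/// // ...while `a*b` is one-pass and should build just fine.
+/// assert!(DFA::new(r"a*b").is_ok());
+/// ```
+///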
+/// # Only anchored searches are supported
+///
+/// In this crate, especially for DFAs, unanchored searches are implemented by
+/// treating the pattern as if it had a `(?s-u:.)*?` prefix. While the prefix
+/// is one-pass on its own, adding anything after it, e.g., `(?s-u:.)*?a` will
+/// make the overall pattern not one-pass. Why? Because the `(?s-u:.)` matches
+/// any byte, and there is therefore ambiguity as to when the prefix should
+/// stop matching and something else should start matching.
+///
+/// Therefore, one-pass DFAs do not support unanchored searches. This, in
+/// addition to the fact that many regexes are simply not one-pass, means that
+/// one-pass DFAs have limited utility. With that said, when a one-pass DFA
+/// can be used, it
+/// can potentially provide a dramatic speed up over alternatives like the
+/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)
+/// and the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). In particular,
+/// a one-pass DFA is the only DFA capable of reporting the spans of matching
+/// capturing groups.
+///
+/// To clarify, when we say that unanchored searches are not supported, what
+/// that actually means is:
+///
+/// * The high level routines, [`DFA::is_match`] and [`DFA::captures`], always
+/// do anchored searches.
+/// * Since iterators are most useful in the context of unanchored searches,
+/// there is no `DFA::captures_iter` method.
+/// * For lower level routines like [`DFA::try_search`], an error will be
+/// returned if the given [`Input`] is configured to do an unanchored search
+/// or to search for an invalid pattern ID. (Note that an [`Input`] is
+/// configured to do an unanchored search by default, so just passing the
+/// result of `Input::new` is guaranteed to return an error, as the sketch
+/// below shows.)
+///
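+/// A minimal sketch of that last point:
+///
+/// ```
+/// use regex_automata::{dfa::onepass::DFA, Input};
+///
+/// let re = DFA::new("abc")?;
+/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+/// // `Input::new` is unanchored by default, so the lower level search
+/// // routine refuses to run and returns an error instead.
+/// assert!(re.try_search(&mut cache, &Input::new("abc"), &mut caps).is_err());
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///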
+/// # Other limitations
+///
+/// In addition to the [configurable heap limit](Config::size_limit) and
+/// the requirement that a regex pattern be one-pass, there are some other
+/// limitations:
+///
+/// * There is an internal limit on the total number of explicit capturing
+/// groups that appear across all patterns. It is somewhat small and there is
+/// no way to configure it. If your pattern(s) exceed this limit, then building
+/// a one-pass DFA will fail.
+/// * If the number of patterns exceeds an internal unconfigurable limit, then
+/// building a one-pass DFA will fail. This limit is quite large and you're
+/// unlikely to hit it.
+/// * If the total number of states exceeds an internal unconfigurable limit,
+/// then building a one-pass DFA will fail. This limit is quite large and
+/// you're unlikely to hit it.
+///
+/// # Other examples of regexes that aren't one-pass
+///
+/// One particularly unfortunate example is that enabling Unicode can cause
+/// regexes that were one-pass to no longer be one-pass. Consider the regex
+/// `(?-u)\w*\s` for example. It is one-pass because there is no overlap
+/// between the ASCII definitions of `\w` and `\s`. But `\w*\s` (i.e., with
+/// Unicode enabled) is *not* one-pass because `\w` and `\s` get translated
+/// to UTF-8 automata. And while the *codepoints* in `\w` and `\s`
+/// do not overlap, the underlying UTF-8 encodings do. Indeed, because of the
+/// overlap between UTF-8 automata, the use of Unicode character classes will
+/// tend to vastly increase the likelihood of a regex not being one-pass.
+///
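+/// A quick way to observe the difference (a small sketch; the patterns are
+/// the ones discussed above):
+///
+/// ```
+/// use regex_automata::dfa::onepass::DFA;
+///
+/// // The ASCII version builds fine...
+/// assert!(DFA::new(r"(?-u)\w*\s").is_ok());
+/// // ...but the Unicode version is rejected as not one-pass.
+/// assert!(DFA::new(r"\w*\s").is_err());
+/// ```
+///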
+/// # How does one know if a regex is one-pass or not?
+///
+/// At the time of writing, the only way to know is to try and build a one-pass
+/// DFA. The one-pass property is checked while constructing the DFA.
+///
+/// This does mean that you might potentially waste some CPU cycles and memory
+/// by optimistically trying to build a one-pass DFA. But this is currently the
+/// only way. In the future, building a one-pass DFA might be able to use some
+/// heuristics to detect common violations of the one-pass property and bail
+/// more quickly.
+///
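+/// In code, "try and build" usually looks something like this (a sketch;
+/// `(a|b)*c` is a pattern assumed to be one-pass, used only for
+/// illustration):
+///
+/// ```
+/// use regex_automata::dfa::onepass::DFA;
+///
+/// match DFA::new(r"(a|b)*c") {
+///     // Use the one-pass DFA when construction succeeds...
+///     Ok(re) => assert_eq!(1, re.pattern_len()),
+///     // ...and fall back to another engine (e.g., the PikeVM) otherwise.
+///     Err(_) => { /* fall back */ }
+/// }
+/// ```
+///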
+/// # Resource usage
+///
+/// Unlike a general DFA, a one-pass DFA has stricter bounds on its resource
+/// usage. Namely, construction of a one-pass DFA has a time and space
+/// complexity of `O(n)`, where `n ~ nfa.states().len()`. (A general DFA's time
+/// and space complexity is `O(2^n)`.) This smaller time bound is achieved
+/// because there is at most one DFA state created for each NFA state. If
+/// additional DFA states would be required, then the pattern is not one-pass
+/// and construction will fail.
+///
+/// Note though that currently, this DFA uses a fully dense representation.
+/// This means that while its space complexity is no worse than an NFA, it may
+/// in practice use more memory because of higher constant factors. The
+/// reason for this trade-off is two-fold. Firstly, a dense representation
+/// makes the search faster. Secondly, the bigger an NFA, the less likely it
+/// is to be one-pass. Therefore, one-pass DFAs tend to be pretty small.
+///
+/// # Example
+///
+/// This example shows that the one-pass DFA implements Unicode word boundaries
+/// correctly while simultaneously reporting spans for capturing groups that
+/// participate in a match. (This is the only DFA that implements full support
+/// for Unicode word boundaries.)
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{dfa::onepass::DFA, Match, Span};
+///
+/// let re = DFA::new(r"\b(?P<first>\w+)[[:space:]]+(?P<last>\w+)\b")?;
+/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+///
+/// re.captures(&mut cache, "Шерлок Холмс", &mut caps);
+/// assert_eq!(Some(Match::must(0, 0..23)), caps.get_match());
+/// assert_eq!(Some(Span::from(0..12)), caps.get_group_by_name("first"));
+/// assert_eq!(Some(Span::from(13..23)), caps.get_group_by_name("last"));
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: iteration
+///
+/// Unlike other regex engines in this crate, this one does not provide
+/// iterator search functions. This is because a one-pass DFA only supports
+/// anchored searches, and so iterator functions are generally not applicable.
+///
+/// However, if you know that all of your matches are
+/// directly adjacent, then an iterator can be used. The
+/// [`util::iter::Searcher`](crate::util::iter::Searcher) type can be used for
+/// this purpose:
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{
+/// dfa::onepass::DFA,
+/// util::iter::Searcher,
+/// Anchored, Input, Span,
+/// };
+///
+/// let re = DFA::new(r"\w(\d)\w")?;
+/// let (mut cache, caps) = (re.create_cache(), re.create_captures());
+/// let input = Input::new("a1zb2yc3x").anchored(Anchored::Yes);
+///
+/// let mut it = Searcher::new(input).into_captures_iter(caps, |input, caps| {
+/// Ok(re.try_search(&mut cache, input, caps)?)
+/// }).infallible();
+/// let caps0 = it.next().unwrap();
+/// assert_eq!(Some(Span::from(1..2)), caps0.get_group(1));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone)]
+pub struct DFA {
+ /// The configuration provided by the caller.
+ config: Config,
+ /// The NFA used to build this DFA.
+ ///
+ /// NOTE: We probably don't need to store the NFA here, but we use enough
+ /// bits from it that it's convenient to do so. And there really isn't much
+ /// cost to doing so either, since an NFA is reference counted internally.
+ nfa: NFA,
+    /// The transition table. Given a state ID 'sid' and a haystack byte
+    /// 'byte', the next state is `table[sid + classes[byte]]`.
+ ///
+ /// The stride of this table (i.e., the number of columns) is always
+ /// a power of 2, even if the alphabet length is smaller. This makes
+ /// converting between state IDs and state indices very cheap.
+ ///
+ /// Note that the stride always includes room for one extra "transition"
+ /// that isn't actually a transition. It is a 'PatternEpsilons' that is
+ /// used for match states only. Because of this, the maximum number of
+ /// active columns in the transition table is 257, which means the maximum
+ /// stride is 512 (the next power of 2 greater than or equal to 257).
+ table: Vec<Transition>,
+ /// The DFA state IDs of the starting states.
+ ///
+ /// `starts[0]` is always present and corresponds to the starting state
+ /// when searching for matches of any pattern in the DFA.
+ ///
+ /// `starts[i]` where i>0 corresponds to the starting state for the pattern
+ /// ID 'i-1'. These starting states are optional.
+ starts: Vec<StateID>,
+ /// Every state ID >= this value corresponds to a match state.
+ ///
+ /// This is what a search uses to detect whether a state is a match state
+ /// or not. It requires only a simple comparison instead of bit-unpacking
+ /// the PatternEpsilons from every state.
+ min_match_id: StateID,
+ /// The alphabet of this DFA, split into equivalence classes. Bytes in the
+ /// same equivalence class can never discriminate between a match and a
+ /// non-match.
+ classes: ByteClasses,
+ /// The number of elements in each state in the transition table. This may
+ /// be less than the stride, since the stride is always a power of 2 and
+ /// the alphabet length can be anything up to and including 256.
+ alphabet_len: usize,
+ /// The number of columns in the transition table, expressed as a power of
+ /// 2.
+ stride2: usize,
+ /// The offset at which the PatternEpsilons for a match state is stored in
+ /// the transition table.
+ ///
+ /// PERF: One wonders whether it would be better to put this in a separate
+ /// allocation, since only match states have a non-empty PatternEpsilons
+ /// and the number of match states tends be dwarfed by the number of
+ /// non-match states. So this would save '8*len(non_match_states)' for each
+ /// DFA. The question is whether moving this to a different allocation will
+ /// lead to a perf hit during searches. You might think dealing with match
+ /// states is rare, but some regexes spend a lot of time in match states
+ /// gobbling up input. But... match state handling is already somewhat
+ /// expensive, so maybe this wouldn't do much? Either way, it's worth
+ /// experimenting.
+ pateps_offset: usize,
+ /// The first explicit slot index. This refers to the first slot appearing
+ /// immediately after the last implicit slot. It is always 'patterns.len()
+ /// * 2'.
+ ///
+ /// We record this because we only store the explicit slots in our DFA
+ /// transition table that need to be saved. Implicit slots are handled
+ /// automatically as part of the search.
+ explicit_slot_start: usize,
+}
+
+impl DFA {
+ /// Parse the given regular expression using the default configuration and
+ /// return the corresponding one-pass DFA.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// let re = DFA::new("foo[0-9]+bar")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "foo12345barzzz", &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..11)), caps.get_match());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ #[inline]
+ pub fn new(pattern: &str) -> Result<DFA, BuildError> {
+ DFA::builder().build(pattern)
+ }
+
+ /// Like `new`, but parses multiple patterns into a single "multi regex."
+ /// This similarly uses the default regex configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// let re = DFA::new_many(&["[a-z]+", "[0-9]+"])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "abc123", &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..3)), caps.get_match());
+ ///
+ /// re.captures(&mut cache, "123abc", &mut caps);
+ /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ #[inline]
+ pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
+ DFA::builder().build_many(patterns)
+ }
+
+ /// Like `new`, but builds a one-pass DFA directly from an NFA. This is
+ /// useful if you already have an NFA, or even if you hand-assembled the
+ /// NFA.
+ ///
+ /// # Example
+ ///
+ /// This shows how to hand assemble a regular expression via its HIR,
+ /// compile an NFA from it and build a one-pass DFA from the NFA.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::onepass::DFA,
+ /// nfa::thompson::NFA,
+ /// Match,
+ /// };
+ /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
+ ///
+ /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
+ /// ClassBytesRange::new(b'0', b'9'),
+ /// ClassBytesRange::new(b'A', b'Z'),
+ /// ClassBytesRange::new(b'_', b'_'),
+ /// ClassBytesRange::new(b'a', b'z'),
+ /// ])));
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
+ ///
+ /// let re = DFA::new_from_nfa(nfa)?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let expected = Some(Match::must(0, 0..1));
+ /// re.captures(&mut cache, "A", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_from_nfa(nfa: NFA) -> Result<DFA, BuildError> {
+ DFA::builder().build_from_nfa(nfa)
+ }
+
+ /// Create a new one-pass DFA that matches every input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// let dfa = DFA::always_match()?;
+ /// let mut cache = dfa.create_cache();
+ /// let mut caps = dfa.create_captures();
+ ///
+ /// let expected = Match::must(0, 0..0);
+ /// dfa.captures(&mut cache, "", &mut caps);
+ /// assert_eq!(Some(expected), caps.get_match());
+ /// dfa.captures(&mut cache, "foo", &mut caps);
+ /// assert_eq!(Some(expected), caps.get_match());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn always_match() -> Result<DFA, BuildError> {
+ let nfa = thompson::NFA::always_match();
+ Builder::new().build_from_nfa(nfa)
+ }
+
+ /// Create a new one-pass DFA that never matches any input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::dfa::onepass::DFA;
+ ///
+ /// let dfa = DFA::never_match()?;
+ /// let mut cache = dfa.create_cache();
+ /// let mut caps = dfa.create_captures();
+ ///
+ /// dfa.captures(&mut cache, "", &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ /// dfa.captures(&mut cache, "foo", &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn never_match() -> Result<DFA, BuildError> {
+ let nfa = thompson::NFA::never_match();
+ Builder::new().build_from_nfa(nfa)
+ }
+
+ /// Return a default configuration for a DFA.
+ ///
+ /// This is a convenience routine to avoid needing to import the `Config`
+ /// type when customizing the construction of a DFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to change the match semantics of this DFA from
+ /// its default "leftmost first" to "all." When using "all," non-greediness
+ /// doesn't apply and neither does preference order matching. Instead, the
+ /// longest match possible is always returned. (Although, by construction,
+ /// it's impossible for a one-pass DFA to have a different answer for
+ /// "preference order" vs "longest match.")
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Match, MatchKind};
+ ///
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build(r"(abc)+?")?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ ///
+ /// re.captures(&mut cache, "abcabc", &mut caps);
+ /// // Normally, the non-greedy repetition would give us a 0..3 match.
+ /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Return a builder for configuring the construction of a DFA.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use the builder to disable UTF-8 mode.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// dfa::onepass::DFA,
+ /// nfa::thompson,
+ /// util::syntax,
+ /// Match,
+ /// };
+ ///
+ /// let re = DFA::builder()
+ /// .syntax(syntax::Config::new().utf8(false))
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"foo(?-u:[^b])ar.*")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n";
+ /// let expected = Some(Match::must(0, 0..8));
+ /// re.captures(&mut cache, haystack, &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+
+ /// Create a new empty set of capturing groups that is guaranteed to be
+ /// valid for the search APIs on this DFA.
+ ///
+ /// A `Captures` value created for a specific DFA cannot be used with any
+ /// other DFA.
+ ///
+ /// This is a convenience function for [`Captures::all`]. See the
+ /// [`Captures`] documentation for an explanation of its alternative
+ /// constructors that permit the DFA to do less work during a search, and
+ /// thus might make it faster.
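+    ///
+    /// # Example
+    ///
+    /// A short sketch of the usual setup:
+    ///
+    /// ```
+    /// use regex_automata::{dfa::onepass::DFA, Match};
+    ///
+    /// let re = DFA::new(r"[0-9]+")?;
+    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+    /// re.captures(&mut cache, "12345", &mut caps);
+    /// assert_eq!(Some(Match::must(0, 0..5)), caps.get_match());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```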
+ #[inline]
+ pub fn create_captures(&self) -> Captures {
+ Captures::all(self.nfa.group_info().clone())
+ }
+
+ /// Create a new cache for this DFA.
+ ///
+ /// The cache returned should only be used for searches for this
+ /// DFA. If you want to reuse the cache for another DFA, then you
+ /// must call [`Cache::reset`] with that DFA (or, equivalently,
+ /// [`DFA::reset_cache`]).
+ #[inline]
+ pub fn create_cache(&self) -> Cache {
+ Cache::new(self)
+ }
+
+ /// Reset the given cache such that it can be used for searching with the
+ /// this DFA (and only this DFA).
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different DFA.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different DFA.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// let re1 = DFA::new(r"\w")?;
+ /// let re2 = DFA::new(r"\W")?;
+ /// let mut caps1 = re1.create_captures();
+ /// let mut caps2 = re2.create_captures();
+ ///
+ /// let mut cache = re1.create_cache();
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..2)),
+ /// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() },
+ /// );
+ ///
+ /// // Using 'cache' with re2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the one-pass DFA we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 're1' is also not
+ /// // allowed.
+ /// re2.reset_cache(&mut cache);
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..3)),
+ /// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() },
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn reset_cache(&self, cache: &mut Cache) {
+ cache.reset(self);
+ }
+
+ /// Return the config for this one-pass DFA.
+ #[inline]
+ pub fn get_config(&self) -> &Config {
+ &self.config
+ }
+
+ /// Returns a reference to the underlying NFA.
+ #[inline]
+ pub fn get_nfa(&self) -> &NFA {
+ &self.nfa
+ }
+
+ /// Returns the total number of patterns compiled into this DFA.
+ ///
+ /// In the case of a DFA that contains no patterns, this returns `0`.
+ #[inline]
+ pub fn pattern_len(&self) -> usize {
+ self.get_nfa().pattern_len()
+ }
+
+ /// Returns the total number of states in this one-pass DFA.
+ ///
+ /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
+ /// a low level DFA API. Therefore, this routine has little use other than
+ /// being informational.
+ #[inline]
+ pub fn state_len(&self) -> usize {
+ self.table.len() >> self.stride2()
+ }
+
+ /// Returns the total number of elements in the alphabet for this DFA.
+ ///
+ /// That is, this returns the total number of transitions that each
+ /// state in this DFA must have. The maximum alphabet size is 256, which
+ /// corresponds to each possible byte value.
+ ///
+ /// The alphabet size may be less than 256 though, and unless
+    /// [`Config::byte_classes`] is disabled, it is typically much less than
+ /// 256. Namely, bytes are grouped into equivalence classes such that no
+ /// two bytes in the same class can distinguish a match from a non-match.
+ /// For example, in the regex `^[a-z]+$`, the ASCII bytes `a-z` could
+ /// all be in the same equivalence class. This leads to a massive space
+ /// savings.
+ ///
+ /// Note though that the alphabet length does _not_ necessarily equal the
+ /// total stride space taken up by a single DFA state in the transition
+ /// table. Namely, for performance reasons, the stride is always the
+ /// smallest power of two that is greater than or equal to the alphabet
+ /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are
+ /// often more useful. The alphabet length is typically useful only for
+ /// informational purposes.
+ ///
+ /// Note also that unlike dense or sparse DFAs, a one-pass DFA does
+ /// not have a special end-of-input (EOI) transition. This is because
+ /// a one-pass DFA handles look-around assertions explicitly (like the
+ /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)) and does not build
+ /// them into the transitions of the DFA.
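+    ///
+    /// # Example
+    ///
+    /// A rough illustration (a sketch; the exact alphabet length depends on
+    /// how construction groups bytes, so we only check a bound):
+    ///
+    /// ```
+    /// use regex_automata::dfa::onepass::DFA;
+    ///
+    /// // With byte classes enabled (the default), the bytes `a-z` and many
+    /// // others collapse into a small number of equivalence classes.
+    /// let re = DFA::new(r"[a-z]+")?;
+    /// assert!(re.alphabet_len() <= 256);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```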
+ #[inline]
+ pub fn alphabet_len(&self) -> usize {
+ self.alphabet_len
+ }
+
+ /// Returns the total stride for every state in this DFA, expressed as the
+ /// exponent of a power of 2. The stride is the amount of space each state
+ /// takes up in the transition table, expressed as a number of transitions.
+ /// (Unused transitions map to dead states.)
+ ///
+ /// The stride of a DFA is always equivalent to the smallest power of
+ /// 2 that is greater than or equal to the DFA's alphabet length. This
+ /// definition uses extra space, but possibly permits faster translation
+ /// between state identifiers and their corresponding offsets in this DFA's
+ /// transition table.
+ ///
+ /// For example, if the DFA's stride is 16 transitions, then its `stride2`
+ /// is `4` since `2^4 = 16`.
+ ///
+ /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
+ /// while the maximum `stride2` value is `9` (corresponding to a stride
+ /// of `512`). The maximum in theory should be `8`, but because of some
+ /// implementation quirks that may be relaxed in the future, it is one more
+ /// than `8`. (Do note that a maximal stride is incredibly rare, as it
+    /// would imply that there is almost no redundancy in the regex pattern.)
+ ///
+ /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
+ /// a low level DFA API. Therefore, this routine has little use other than
+ /// being informational.
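+    ///
+    /// # Example
+    ///
+    /// A small sanity check of the relationship described above:
+    ///
+    /// ```
+    /// use regex_automata::dfa::onepass::DFA;
+    ///
+    /// let re = DFA::new("a")?;
+    /// // The stride is always `2^stride2` and is at least as big as the
+    /// // alphabet length.
+    /// assert_eq!(re.stride(), 1 << re.stride2());
+    /// assert!(re.stride() >= re.alphabet_len());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```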
+ #[inline]
+ pub fn stride2(&self) -> usize {
+ self.stride2
+ }
+
+ /// Returns the total stride for every state in this DFA. This corresponds
+ /// to the total number of transitions used by each state in this DFA's
+ /// transition table.
+ ///
+ /// Please see [`DFA::stride2`] for more information. In particular, this
+    /// returns the stride as the number of transitions, whereas `stride2`
+ /// returns it as the exponent of a power of 2.
+ ///
+ /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
+ /// a low level DFA API. Therefore, this routine has little use other than
+ /// being informational.
+ #[inline]
+ pub fn stride(&self) -> usize {
+ 1 << self.stride2()
+ }
+
+ /// Returns the memory usage, in bytes, of this DFA.
+ ///
+ /// The memory usage is computed based on the number of bytes used to
+ /// represent this DFA.
+ ///
+ /// This does **not** include the stack size used up by this DFA. To
+ /// compute that, use `std::mem::size_of::<onepass::DFA>()`.
+ #[inline]
+ pub fn memory_usage(&self) -> usize {
+ use core::mem::size_of;
+
+ self.table.len() * size_of::<Transition>()
+ + self.starts.len() * size_of::<StateID>()
+ }
+}
+
+impl DFA {
+ /// Executes an anchored leftmost forward search, and returns true if and
+ /// only if this one-pass DFA matches the given haystack.
+ ///
+ /// This routine may short circuit if it knows that scanning future
+ /// input will never lead to a different result. In particular, if the
+ /// underlying DFA enters a match state, then this routine will return
+ /// `true` immediately without inspecting any future input. (Consider how
+ /// this might make a difference given the regex `a+` on the haystack
+ /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`,
+ /// but routines like `find` need to continue searching because `+` is
+ /// greedy by default.)
+ ///
+ /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
+ /// given configuration was [`Anchored::No`] (which is the default).
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if the search could not complete. This can occur
+ /// in the following circumstances:
+ ///
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode. Concretely,
+ /// this occurs when using [`Anchored::Pattern`] without enabling
+ /// [`Config::starts_for_each_pattern`].
+ ///
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
+ ///
+ /// Use [`DFA::try_search`] if you want to handle these panics as error
+ /// values instead.
+ ///
+ /// # Example
+ ///
+ /// This shows basic usage:
+ ///
+ /// ```
+ /// use regex_automata::dfa::onepass::DFA;
+ ///
+ /// let re = DFA::new("foo[0-9]+bar")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.is_match(&mut cache, "foo12345bar"));
+ /// assert!(!re.is_match(&mut cache, "foobar"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: consistency with search APIs
+ ///
+ /// `is_match` is guaranteed to return `true` whenever `captures` returns
+ /// a match. This includes searches that are executed entirely within a
+ /// codepoint:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Input};
+ ///
+ /// let re = DFA::new("a*")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+    /// Notice that when UTF-8 mode is disabled, the above reports a match
+    /// because the restriction against zero-width matches that split a
+ /// codepoint has been lifted:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Input};
+ ///
+ /// let re = DFA::builder()
+ /// .thompson(NFA::config().utf8(false))
+ /// .build("a*")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_match<'h, I: Into<Input<'h>>>(
+ &self,
+ cache: &mut Cache,
+ input: I,
+ ) -> bool {
+ let mut input = input.into().earliest(true);
+ if matches!(input.get_anchored(), Anchored::No) {
+ input.set_anchored(Anchored::Yes);
+ }
+ self.try_search_slots(cache, &input, &mut []).unwrap().is_some()
+ }
+
+ /// Executes an anchored leftmost forward search, and returns a `Match` if
+ /// and only if this one-pass DFA matches the given haystack.
+ ///
+ /// This routine only includes the overall match span. To get access to the
+ /// individual spans of each capturing group, use [`DFA::captures`].
+ ///
+ /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
+ /// given configuration was [`Anchored::No`] (which is the default).
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if the search could not complete. This can occur
+ /// in the following circumstances:
+ ///
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode. Concretely,
+ /// this occurs when using [`Anchored::Pattern`] without enabling
+ /// [`Config::starts_for_each_pattern`].
+ ///
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
+ ///
+ /// Use [`DFA::try_search`] if you want to handle these panics as error
+ /// values instead.
+ ///
+ /// # Example
+ ///
+ /// Leftmost first match semantics corresponds to the match with the
+ /// smallest starting offset, but where the end offset is determined by
+ /// preferring earlier branches in the original regular expression. For
+ /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
+ /// will match `Samwise` in `Samwise`.
+ ///
+ /// Generally speaking, the "leftmost first" match is how most backtracking
+ /// regular expressions tend to work. This is in contrast to POSIX-style
+ /// regular expressions that yield "leftmost longest" matches. Namely,
+ /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
+ /// leftmost longest semantics. (This crate does not currently support
+ /// leftmost longest semantics.)
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// let re = DFA::new("foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ /// let expected = Match::must(0, 0..8);
+ /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345"));
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over later parts.
+ /// let re = DFA::new("abc|a")?;
+ /// let mut cache = re.create_cache();
+ /// let expected = Match::must(0, 0..3);
+ /// assert_eq!(Some(expected), re.find(&mut cache, "abc"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find<'h, I: Into<Input<'h>>>(
+ &self,
+ cache: &mut Cache,
+ input: I,
+ ) -> Option<Match> {
+ let mut input = input.into();
+ if matches!(input.get_anchored(), Anchored::No) {
+ input.set_anchored(Anchored::Yes);
+ }
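+        // Fast path for the common single-pattern case: the overall match
+        // span always lives in slots 0 and 1, so a fixed size array on the
+        // stack suffices and we avoid allocating.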
+ if self.get_nfa().pattern_len() == 1 {
+ let mut slots = [None, None];
+ let pid =
+ self.try_search_slots(cache, &input, &mut slots).unwrap()?;
+ let start = slots[0].unwrap().get();
+ let end = slots[1].unwrap().get();
+ return Some(Match::new(pid, Span { start, end }));
+ }
+ let ginfo = self.get_nfa().group_info();
+ let slots_len = ginfo.implicit_slot_len();
+ let mut slots = vec![None; slots_len];
+ let pid = self.try_search_slots(cache, &input, &mut slots).unwrap()?;
+ let start = slots[pid.as_usize() * 2].unwrap().get();
+ let end = slots[pid.as_usize() * 2 + 1].unwrap().get();
+ Some(Match::new(pid, Span { start, end }))
+ }
+
+ /// Executes an anchored leftmost forward search and writes the spans
+ /// of capturing groups that participated in a match into the provided
+ /// [`Captures`] value. If no match was found, then [`Captures::is_match`]
+ /// is guaranteed to return `false`.
+ ///
+ /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
+ /// given configuration was [`Anchored::No`] (which is the default).
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if the search could not complete. This can occur
+ /// in the following circumstances:
+ ///
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode. Concretely,
+ /// this occurs when using [`Anchored::Pattern`] without enabling
+ /// [`Config::starts_for_each_pattern`].
+ ///
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
+ ///
+ /// Use [`DFA::try_search`] if you want to handle these panics as error
+ /// values instead.
+ ///
+ /// # Example
+ ///
+ /// This shows a simple example of a one-pass regex that extracts
+ /// capturing group spans.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Match, Span};
+ ///
+ /// let re = DFA::new(
+ /// // Notice that we use ASCII here. The corresponding Unicode regex
+ /// // is sadly not one-pass.
+ /// "(?P<first>[[:alpha:]]+)[[:space:]]+(?P<last>[[:alpha:]]+)",
+ /// )?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match());
+ /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1));
+ /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn captures<'h, I: Into<Input<'h>>>(
+ &self,
+ cache: &mut Cache,
+ input: I,
+ caps: &mut Captures,
+ ) {
+ let mut input = input.into();
+ if matches!(input.get_anchored(), Anchored::No) {
+ input.set_anchored(Anchored::Yes);
+ }
+ self.try_search(cache, &input, caps).unwrap();
+ }
+
+ /// Executes an anchored leftmost forward search and writes the spans
+ /// of capturing groups that participated in a match into the provided
+ /// [`Captures`] value. If no match was found, then [`Captures::is_match`]
+ /// is guaranteed to return `false`.
+ ///
+ /// The differences with [`DFA::captures`] are:
+ ///
+ /// 1. This returns an error instead of panicking if the search fails.
+    /// 2. Accepts an `&Input` instead of an `Into<Input>`. This permits
+    /// reusing the same input for multiple searches, which _may_ be
+    /// important for latency.
+ /// 3. This does not automatically change the [`Anchored`] mode from `No`
+ /// to `Yes`. Instead, if [`Input::anchored`] is `Anchored::No`, then an
+ /// error is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine errors if the search could not complete. This can occur
+ /// in the following circumstances:
+ ///
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode. Concretely,
+ /// this occurs when using [`Anchored::Pattern`] without enabling
+ /// [`Config::starts_for_each_pattern`].
+ ///
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example: specific pattern search
+ ///
+ /// This example shows how to build a multi-regex that permits searching
+ /// for specific patterns. Note that this is somewhat less useful than
+ /// in other regex engines, since a one-pass DFA by definition has no
+ /// ambiguity about which pattern can match at a position. That is, if it
+ /// were possible for two different patterns to match at the same starting
+ /// position, then the multi-regex would not be one-pass and construction
+ /// would have failed.
+ ///
+ /// Nevertheless, this can still be useful if you only care about matches
+ /// for a specific pattern, and want the DFA to report "no match" even if
+ /// some other pattern would have matched.
+ ///
+ /// Note that in order to make use of this functionality,
+ /// [`Config::starts_for_each_pattern`] must be enabled. It is disabled
+ /// by default since it may result in higher memory usage.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::onepass::DFA, Anchored, Input, Match, PatternID,
+ /// };
+ ///
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().starts_for_each_pattern(true))
+ /// .build_many(&["[a-z]+", "[0-9]+"])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "123abc";
+ /// let input = Input::new(haystack).anchored(Anchored::Yes);
+ ///
+ /// // A normal multi-pattern search will show pattern 1 matches.
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
+ ///
+ /// // If we only want to report pattern 0 matches, then we'll get no
+ /// // match here.
+ /// let input = input.anchored(Anchored::Pattern(PatternID::must(0)));
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specifying the bounds of a search
+ ///
+ /// This example shows how providing the bounds of a search can produce
+ /// different results than simply sub-slicing the haystack.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, Match};
+ ///
+ /// // one-pass DFAs fully support Unicode word boundaries!
+ /// // A sad joke is that a Unicode aware regex like \w+\s is not one-pass.
+ /// // :-(
+ /// let re = DFA::new(r"\b[0-9]{3}\b")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "foo123bar";
+ ///
+ /// // Since we sub-slice the haystack, the search doesn't know about
+ /// // the larger context and assumes that `123` is surrounded by word
+ /// // boundaries. And of course, the match position is reported relative
+ /// // to the sub-slice as well, which means we get `0..3` instead of
+ /// // `3..6`.
+ /// let expected = Some(Match::must(0, 0..3));
+ /// let input = Input::new(&haystack[3..6]).anchored(Anchored::Yes);
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// // But if we provide the bounds of the search within the context of the
+ /// // entire haystack, then the search can take the surrounding context
+ /// // into account. (And if we did find a match, it would be reported
+ /// // as a valid offset into `haystack` instead of its sub-slice.)
+ /// let expected = None;
+ /// let input = Input::new(haystack).range(3..6).anchored(Anchored::Yes);
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn try_search(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ caps: &mut Captures,
+ ) -> Result<(), MatchError> {
+ let pid = self.try_search_slots(cache, input, caps.slots_mut())?;
+ caps.set_pattern(pid);
+ Ok(())
+ }
+
+ /// Executes an anchored leftmost forward search and writes the spans
+ /// of capturing groups that participated in a match into the provided
+ /// `slots`, and returns the matching pattern ID. The contents of the
+ /// slots for patterns other than the matching pattern are unspecified. If
+ /// no match was found, then `None` is returned and the contents of all
+ /// `slots` is unspecified.
+ ///
+ /// This is like [`DFA::try_search`], but it accepts a raw slots slice
+ /// instead of a `Captures` value. This is useful in contexts where you
+ /// don't want or need to allocate a `Captures`.
+ ///
+ /// It is legal to pass _any_ number of slots to this routine. If the regex
+ /// engine would otherwise write a slot offset that doesn't fit in the
+ /// provided slice, then it is simply skipped. In general though, there are
+ /// usually three slice lengths you might want to use:
+ ///
+ /// * An empty slice, if you only care about which pattern matched.
+ /// * A slice with
+ /// [`pattern_len() * 2`](crate::dfa::onepass::DFA::pattern_len)
+ /// slots, if you only care about the overall match spans for each matching
+ /// pattern.
+ /// * A slice with
+ /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
+ /// permits recording match offsets for every capturing group in every
+ /// pattern.
+ ///
+ /// # Errors
+ ///
+ /// This routine errors if the search could not complete. This can occur
+ /// in the following circumstances:
+ ///
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode. Concretely,
+ /// this occurs when using [`Anchored::Pattern`] without enabling
+ /// [`Config::starts_for_each_pattern`].
+ ///
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find the overall match offsets in a
+ /// multi-pattern search without allocating a `Captures` value. Indeed, we
+ /// can put our slots right on the stack.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, PatternID};
+ ///
+ /// let re = DFA::new_many(&[
+ /// r"[a-zA-Z]+",
+ /// r"[0-9]+",
+ /// ])?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("123").anchored(Anchored::Yes);
+ ///
+ /// // We only care about the overall match offsets here, so we just
+ /// // allocate two slots for each pattern. Each slot records the start
+ /// // and end of the match.
+ /// let mut slots = [None; 4];
+ /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?;
+ /// assert_eq!(Some(PatternID::must(1)), pid);
+ ///
+ /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
+ /// // See 'GroupInfo' for more details on the mapping between groups and
+ /// // slot indices.
+ /// let slot_start = pid.unwrap().as_usize() * 2;
+ /// let slot_end = slot_start + 1;
+ /// assert_eq!(Some(0), slots[slot_start].map(|s| s.get()));
+ /// assert_eq!(Some(3), slots[slot_end].map(|s| s.get()));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn try_search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Result<Option<PatternID>, MatchError> {
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ if !utf8empty {
+ return self.try_search_slots_imp(cache, input, slots);
+ }
+ // See PikeVM::try_search_slots for why we do this.
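+        // In brief: when the regex can match the empty string and UTF-8 mode
+        // is enabled, we need the overall match offsets to check whether an
+        // empty match splits a codepoint. So we make sure there are enough
+        // slots for every pattern's implicit group, even if the caller
+        // provided fewer.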
+ let min = self.get_nfa().group_info().implicit_slot_len();
+ if slots.len() >= min {
+ return self.try_search_slots_imp(cache, input, slots);
+ }
+ if self.get_nfa().pattern_len() == 1 {
+ let mut enough = [None, None];
+ let got = self.try_search_slots_imp(cache, input, &mut enough)?;
+            // This is OK because we know `enough` is strictly bigger than
+            // `slots`, otherwise this special case isn't reached.
+ slots.copy_from_slice(&enough[..slots.len()]);
+ return Ok(got);
+ }
+ let mut enough = vec![None; min];
+ let got = self.try_search_slots_imp(cache, input, &mut enough)?;
+        // This is OK because we know `enough` is strictly bigger than
+        // `slots`, otherwise this special case isn't reached.
+ slots.copy_from_slice(&enough[..slots.len()]);
+ Ok(got)
+ }
+
+ #[inline(never)]
+ fn try_search_slots_imp(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Result<Option<PatternID>, MatchError> {
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ match self.search_imp(cache, input, slots)? {
+ None => return Ok(None),
+ Some(pid) if !utf8empty => return Ok(Some(pid)),
+ Some(pid) => {
+ // These slot indices are always correct because we know our
+ // 'pid' is valid and thus we know that the slot indices for it
+ // are valid.
+ let slot_start = pid.as_usize().wrapping_mul(2);
+ let slot_end = slot_start.wrapping_add(1);
+ // OK because we know we have a match and we know our caller
+ // provided slots are big enough (which we make true above if
+ // the caller didn't). Namely, we're only here when 'utf8empty'
+ // is true, and when that's true, we require slots for every
+ // pattern.
+ let start = slots[slot_start].unwrap().get();
+ let end = slots[slot_end].unwrap().get();
+                // If our match splits a codepoint, then we cannot report it
+ // as a match. And since one-pass DFAs only support anchored
+ // searches, we don't try to skip ahead to find the next match.
+ // We can just quit with nothing.
+ if start == end && !input.is_char_boundary(start) {
+ return Ok(None);
+ }
+ Ok(Some(pid))
+ }
+ }
+ }
+}
+
+impl DFA {
+ fn search_imp(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Result<Option<PatternID>, MatchError> {
+ // PERF: Some ideas. I ran out of steam after my initial impl to try
+ // many of these.
+ //
+ // 1) Try doing more state shuffling. Right now, all we do is push
+ // match states to the end of the transition table so that we can do
+ // 'if sid >= self.min_match_id' to know whether we're in a match
+ // state or not. But what about doing something like dense DFAs and
+ // pushing dead, match and states with captures/looks all toward the
+ // beginning of the transition table. Then we could do 'if sid <=
+ // self.max_special_id', in which case, we need to do some special
+ // handling of some sort. Otherwise, we get the happy path, just
+ // like in a DFA search. The main argument against this is that the
+ // one-pass DFA is likely to be used most often with capturing groups
+ // and if capturing groups are common, then this might wind up being a
+ // pessimization.
+ //
+ // 2) Consider moving 'PatternEpsilons' out of the transition table.
+ // It is only needed for match states and usually a small minority of
+ // states are match states. Therefore, we're using an extra 'u64' for
+ // most states.
+ //
+ // 3) I played around with the match state handling and it seems like
+ // there is probably a lot left on the table for improvement. The
+ // key tension is that the 'find_match' routine is a giant mess, but
+ // splitting it out into a non-inlineable function is a non-starter
+ // because the match state might consume input, so 'find_match' COULD
+ // be called quite a lot, and a function call at that point would trash
+ // perf. In theory, we could detect whether a match state consumes
+ // input and then specialize our search routine based on that. In that
+ // case, maybe an extra function call is OK, but even then, it might be
+ // too much of a latency hit. Another idea is to just try and figure
+ // out how to reduce the code size of 'find_match'. RE2 has a trick
+ // here where the match handling isn't done if we know the next byte of
+ // input yields a match too. Maybe we adopt that?
+ //
+ // This just might be a tricky DFA to optimize.
+
+ if input.is_done() {
+ return Ok(None);
+ }
+ // We unfortunately have a bit of book-keeping to do to set things
+        // up: we have to set up our cache and clear all of our slots. In
+ // particular, clearing the slots is necessary for the case where we
+ // report a match, but one of the capturing groups didn't participate
+ // in the match but had a span set from a previous search. That would
+ // be bad. In theory, we could avoid all this slot clearing if we knew
+ // that every slot was always activated for every match. Then we would
+ // know they would always be overwritten when a match is found.
+ let explicit_slots_len = core::cmp::min(
+ Slots::LIMIT,
+ slots.len().saturating_sub(self.explicit_slot_start),
+ );
+ cache.setup_search(explicit_slots_len);
+ for slot in cache.explicit_slots() {
+ *slot = None;
+ }
+ for slot in slots.iter_mut() {
+ *slot = None;
+ }
+ // We set the starting slots for every pattern up front. This does
+ // increase our latency somewhat, but it avoids having to do it every
+ // time we see a match state (which could be many times in a single
+ // search if the match state consumes input).
+ for pid in self.nfa.patterns() {
+ let i = pid.as_usize() * 2;
+ if i >= slots.len() {
+ break;
+ }
+ slots[i] = NonMaxUsize::new(input.start());
+ }
+ let mut pid = None;
+ let mut next_sid = match input.get_anchored() {
+ Anchored::Yes => self.start(),
+ Anchored::Pattern(pid) => self.start_pattern(pid)?,
+ Anchored::No => {
+ // If the regex is itself always anchored, then we're fine,
+ // even if the search is configured to be unanchored.
+ if !self.nfa.is_always_start_anchored() {
+ return Err(MatchError::unsupported_anchored(
+ Anchored::No,
+ ));
+ }
+ self.start()
+ }
+ };
+ let leftmost_first =
+ matches!(self.config.get_match_kind(), MatchKind::LeftmostFirst);
+ for at in input.start()..input.end() {
+ let sid = next_sid;
+ let trans = self.transition(sid, input.haystack()[at]);
+ next_sid = trans.state_id();
+ let epsilons = trans.epsilons();
+ if sid >= self.min_match_id {
+ if self.find_match(cache, input, at, sid, slots, &mut pid) {
+ if input.get_earliest()
+ || (leftmost_first && trans.match_wins())
+ {
+ return Ok(pid);
+ }
+ }
+ }
+ if sid == DEAD
+ || (!epsilons.looks().is_empty()
+ && !self.nfa.look_matcher().matches_set_inline(
+ epsilons.looks(),
+ input.haystack(),
+ at,
+ ))
+ {
+ return Ok(pid);
+ }
+ epsilons.slots().apply(at, cache.explicit_slots());
+ }
+ if next_sid >= self.min_match_id {
+ self.find_match(
+ cache,
+ input,
+ input.end(),
+ next_sid,
+ slots,
+ &mut pid,
+ );
+ }
+ Ok(pid)
+ }
+
+ /// Assumes 'sid' is a match state and looks for whether a match can
+ /// be reported. If so, appropriate offsets are written to 'slots' and
+ /// 'matched_pid' is set to the matching pattern ID.
+ ///
+ /// Even when 'sid' is a match state, it's possible that a match won't
+ /// be reported. For example, when the conditional epsilon transitions
+ /// leading to the match state aren't satisfied at the given position in
+ /// the haystack.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_match(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ at: usize,
+ sid: StateID,
+ slots: &mut [Option<NonMaxUsize>],
+ matched_pid: &mut Option<PatternID>,
+ ) -> bool {
+ debug_assert!(sid >= self.min_match_id);
+ let pateps = self.pattern_epsilons(sid);
+ let epsilons = pateps.epsilons();
+ if !epsilons.looks().is_empty()
+ && !self.nfa.look_matcher().matches_set_inline(
+ epsilons.looks(),
+ input.haystack(),
+ at,
+ )
+ {
+ return false;
+ }
+ let pid = pateps.pattern_id_unchecked();
+ // This calculation is always correct because we know our 'pid' is
+ // valid and thus we know that the slot indices for it are valid.
+ let slot_end = pid.as_usize().wrapping_mul(2).wrapping_add(1);
+ // Set the implicit 'end' slot for the matching pattern. (The 'start'
+ // slot was set at the beginning of the search.)
+ if slot_end < slots.len() {
+ slots[slot_end] = NonMaxUsize::new(at);
+ }
+ // If the caller provided enough room, copy the previously recorded
+ // explicit slots from our scratch space to the caller provided slots.
+ // We *also* need to set any explicit slots that are active as part of
+ // the path to the match state.
+ if self.explicit_slot_start < slots.len() {
+            // NOTE: The 'cache.explicit_slots()' slice is set up at the
+ // beginning of every search such that it is guaranteed to return a
+ // slice of length equivalent to 'slots[explicit_slot_start..]'.
+ slots[self.explicit_slot_start..]
+ .copy_from_slice(cache.explicit_slots());
+ epsilons.slots().apply(at, &mut slots[self.explicit_slot_start..]);
+ }
+ *matched_pid = Some(pid);
+ true
+ }
+}
+
+impl DFA {
+ /// Returns the anchored start state for matching any pattern in this DFA.
+ fn start(&self) -> StateID {
+ self.starts[0]
+ }
+
+    /// Returns the anchored start state for matching the given pattern. If
+    /// 'starts_for_each_pattern' was not enabled, then this returns an
+    /// error. If the given pattern is not in this DFA, then the DEAD state
+    /// is returned.
+ fn start_pattern(&self, pid: PatternID) -> Result<StateID, MatchError> {
+ if !self.config.get_starts_for_each_pattern() {
+ return Err(MatchError::unsupported_anchored(Anchored::Pattern(
+ pid,
+ )));
+ }
+ // 'starts' always has non-zero length. The first entry is always the
+ // anchored starting state for all patterns, and the following entries
+        // are optional and correspond to the anchored starting states for
+        // each pattern, with the state for pattern 'pid' at index 'pid+1'.
+        // Thus, starts.len()-1 corresponds to the total
+ // number of patterns that one can explicitly search for. (And it may
+ // be zero.)
+ Ok(self.starts.get(pid.one_more()).copied().unwrap_or(DEAD))
+ }
+
+ /// Returns the transition from the given state ID and byte of input. The
+ /// transition includes the next state ID, the slots that should be saved
+ /// and any conditional epsilon transitions that must be satisfied in order
+ /// to take this transition.
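+    ///
+    /// As a rough illustration (not from the upstream docs) of the indexing
+    /// arithmetic, assuming a hypothetical stride of 2^9 = 512 table entries
+    /// per state:
+    ///
+    /// ```
+    /// let (sid, stride2, class) = (3usize, 9u32, 17usize);
+    /// let offset = (sid << stride2) + class;
+    /// assert_eq!(offset, 3 * 512 + 17); // entry for state 3, byte class 17
+    /// ```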
+ fn transition(&self, sid: StateID, byte: u8) -> Transition {
+ let offset = sid.as_usize() << self.stride2();
+ let class = self.classes.get(byte).as_usize();
+ self.table[offset + class]
+ }
+
+ /// Set the transition from the given state ID and byte of input to the
+ /// transition given.
+ fn set_transition(&mut self, sid: StateID, byte: u8, to: Transition) {
+ let offset = sid.as_usize() << self.stride2();
+ let class = self.classes.get(byte).as_usize();
+ self.table[offset + class] = to;
+ }
+
+ /// Return an iterator of "sparse" transitions for the given state ID.
+ /// "sparse" in this context means that consecutive transitions that are
+ /// equivalent are returned as one group, and transitions to the DEAD state
+ /// are ignored.
+ ///
+ /// This winds up being useful for debug printing, since it's much terser
+ /// to display runs of equivalent transitions than the transition for every
+ /// possible byte value. Indeed, in practice, it's very common for runs
+ /// of equivalent transitions to appear.
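+    ///
+    /// A standalone sketch (illustrative only) of the grouping, with plain
+    /// `u8` values standing in for transitions and 0 playing the role of
+    /// DEAD:
+    ///
+    /// ```
+    /// let table = [5u8, 5, 5, 0, 7, 7];
+    /// let mut runs = vec![];
+    /// let mut cur = None;
+    /// for (b, &t) in table.iter().enumerate() {
+    ///     cur = match cur {
+    ///         Some((s, _, pt)) if pt == t => Some((s, b, t)),
+    ///         Some(run) => {
+    ///             if run.2 != 0 {
+    ///                 runs.push(run);
+    ///             }
+    ///             Some((b, b, t))
+    ///         }
+    ///         None => Some((b, b, t)),
+    ///     };
+    /// }
+    /// if let Some(run) = cur.filter(|r| r.2 != 0) {
+    ///     runs.push(run);
+    /// }
+    /// assert_eq!(runs, vec![(0, 2, 5), (4, 5, 7)]);
+    /// ```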
+ fn sparse_transitions(&self, sid: StateID) -> SparseTransitionIter<'_> {
+ let start = sid.as_usize() << self.stride2();
+ let end = start + self.alphabet_len();
+ SparseTransitionIter {
+ it: self.table[start..end].iter().enumerate(),
+ cur: None,
+ }
+ }
+
+ /// Return the pattern epsilons for the given state ID.
+ ///
+ /// If the given state ID does not correspond to a match state ID, then the
+ /// pattern epsilons returned is empty.
+ fn pattern_epsilons(&self, sid: StateID) -> PatternEpsilons {
+ let offset = sid.as_usize() << self.stride2();
+ PatternEpsilons(self.table[offset + self.pateps_offset].0)
+ }
+
+ /// Set the pattern epsilons for the given state ID.
+ fn set_pattern_epsilons(&mut self, sid: StateID, pateps: PatternEpsilons) {
+ let offset = sid.as_usize() << self.stride2();
+ self.table[offset + self.pateps_offset] = Transition(pateps.0);
+ }
+
+ /// Returns the state ID prior to the one given. This returns None if the
+ /// given ID is the first DFA state.
+ fn prev_state_id(&self, id: StateID) -> Option<StateID> {
+ if id == DEAD {
+ None
+ } else {
+ // CORRECTNESS: Since 'id' is not the first state, subtracting 1
+ // is always valid.
+ Some(StateID::new_unchecked(id.as_usize().checked_sub(1).unwrap()))
+ }
+ }
+
+ /// Returns the state ID of the last state in this DFA's transition table.
+ /// "last" in this context means the last state to appear in memory, i.e.,
+ /// the one with the greatest ID.
+ fn last_state_id(&self) -> StateID {
+ // CORRECTNESS: A DFA table is always non-empty since it always at
+ // least contains a DEAD state. Since every state has the same stride,
+ // we can just compute what the "next" state ID would have been and
+ // then subtract 1 from it.
+ StateID::new_unchecked(
+ (self.table.len() >> self.stride2()).checked_sub(1).unwrap(),
+ )
+ }
+
+ /// Move the transitions from 'id1' to 'id2' and vice versa.
+ ///
+ /// WARNING: This does not update the rest of the transition table to have
+ /// transitions to 'id1' changed to 'id2' and vice versa. This merely moves
+ /// the states in memory.
+ pub(super) fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ let o1 = id1.as_usize() << self.stride2();
+ let o2 = id2.as_usize() << self.stride2();
+ for b in 0..self.stride() {
+ self.table.swap(o1 + b, o2 + b);
+ }
+ }
+
+ /// Map all state IDs in this DFA (transition table + start states)
+ /// according to the closure given.
+ pub(super) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+ for i in 0..self.state_len() {
+ let offset = i << self.stride2();
+ for b in 0..self.alphabet_len() {
+ let next = self.table[offset + b].state_id();
+ self.table[offset + b].set_state_id(map(next));
+ }
+ }
+ for i in 0..self.starts.len() {
+ self.starts[i] = map(self.starts[i]);
+ }
+ }
+}
+
+impl core::fmt::Debug for DFA {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ fn debug_state_transitions(
+ f: &mut core::fmt::Formatter,
+ dfa: &DFA,
+ sid: StateID,
+ ) -> core::fmt::Result {
+ for (i, (start, end, trans)) in
+ dfa.sparse_transitions(sid).enumerate()
+ {
+ let next = trans.state_id();
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ if start == end {
+ write!(
+ f,
+ "{:?} => {:?}",
+ DebugByte(start),
+ next.as_usize(),
+ )?;
+ } else {
+ write!(
+ f,
+ "{:?}-{:?} => {:?}",
+ DebugByte(start),
+ DebugByte(end),
+ next.as_usize(),
+ )?;
+ }
+ if trans.match_wins() {
+ write!(f, " (MW)")?;
+ }
+ if !trans.epsilons().is_empty() {
+ write!(f, " ({:?})", trans.epsilons())?;
+ }
+ }
+ Ok(())
+ }
+
+ writeln!(f, "onepass::DFA(")?;
+ for index in 0..self.state_len() {
+ let sid = StateID::must(index);
+ let pateps = self.pattern_epsilons(sid);
+ if sid == DEAD {
+ write!(f, "D ")?;
+ } else if pateps.pattern_id().is_some() {
+ write!(f, "* ")?;
+ } else {
+ write!(f, " ")?;
+ }
+ write!(f, "{:06?}", sid.as_usize())?;
+ if !pateps.is_empty() {
+ write!(f, " ({:?})", pateps)?;
+ }
+ write!(f, ": ")?;
+ debug_state_transitions(f, self, sid)?;
+ write!(f, "\n")?;
+ }
+ writeln!(f, "")?;
+ for (i, &sid) in self.starts.iter().enumerate() {
+ if i == 0 {
+ writeln!(f, "START(ALL): {:?}", sid.as_usize())?;
+ } else {
+ writeln!(
+ f,
+ "START(pattern: {:?}): {:?}",
+ i - 1,
+ sid.as_usize(),
+ )?;
+ }
+ }
+ writeln!(f, "state length: {:?}", self.state_len())?;
+ writeln!(f, "pattern length: {:?}", self.pattern_len())?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// An iterator over groups of consecutive equivalent transitions in a single
+/// state.
+#[derive(Debug)]
+struct SparseTransitionIter<'a> {
+ it: core::iter::Enumerate<core::slice::Iter<'a, Transition>>,
+ cur: Option<(u8, u8, Transition)>,
+}
+
+impl<'a> Iterator for SparseTransitionIter<'a> {
+ type Item = (u8, u8, Transition);
+
+ fn next(&mut self) -> Option<(u8, u8, Transition)> {
+ while let Some((b, &trans)) = self.it.next() {
+ // Fine because we'll never have more than u8::MAX transitions in
+ // one state.
+ let b = b.as_u8();
+ let (prev_start, prev_end, prev_trans) = match self.cur {
+ Some(t) => t,
+ None => {
+ self.cur = Some((b, b, trans));
+ continue;
+ }
+ };
+ if prev_trans == trans {
+ self.cur = Some((prev_start, b, prev_trans));
+ } else {
+ self.cur = Some((b, b, trans));
+ if prev_trans.state_id() != DEAD {
+ return Some((prev_start, prev_end, prev_trans));
+ }
+ }
+ }
+ if let Some((start, end, trans)) = self.cur.take() {
+ if trans.state_id() != DEAD {
+ return Some((start, end, trans));
+ }
+ }
+ None
+ }
+}
+
+/// A cache represents mutable state that a one-pass [`DFA`] requires during a
+/// search.
+///
+/// For a given one-pass DFA, its corresponding cache may be created either via
+/// [`DFA::create_cache`], or via [`Cache::new`]. They are equivalent in every
+/// way, except the former does not require explicitly importing `Cache`.
+///
+/// A particular `Cache` is coupled with the one-pass DFA from which it was
+/// created. It may only be used with that one-pass DFA. A cache and its
+/// allocations may be re-purposed via [`Cache::reset`], in which case, it can
+/// only be used with the new one-pass DFA (and not the old one).
+#[derive(Clone, Debug)]
+pub struct Cache {
+ /// Scratch space used to store slots during a search. Basically, we use
+    /// the caller-provided slots to store the slot offsets known when a
+    /// match occurs. But after a match occurs, we might continue a search
+    /// but ultimately fail to extend the match. When continuing the search,
+    /// we need some place to store candidate capture offsets without
+    /// overwriting the slot offsets recorded for the most recently seen
+    /// match.
+ explicit_slots: Vec<Option<NonMaxUsize>>,
+ /// The number of slots in the caller-provided 'Captures' value for the
+ /// current search. This is always at most 'explicit_slots.len()', but
+ /// might be less than it, if the caller provided fewer slots to fill.
+ explicit_slot_len: usize,
+}
+
+impl Cache {
+ /// Create a new [`onepass::DFA`](DFA) cache.
+ ///
+ /// A potentially more convenient routine to create a cache is
+ /// [`DFA::create_cache`], as it does not require also importing the
+ /// `Cache` type.
+ ///
+ /// If you want to reuse the returned `Cache` with some other one-pass DFA,
+ /// then you must call [`Cache::reset`] with the desired one-pass DFA.
+ pub fn new(re: &DFA) -> Cache {
+ let mut cache = Cache { explicit_slots: vec![], explicit_slot_len: 0 };
+ cache.reset(re);
+ cache
+ }
+
+ /// Reset this cache such that it can be used for searching with a
+ /// different [`onepass::DFA`](DFA).
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different one-pass DFA.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different one-pass
+ /// DFA.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// let re1 = DFA::new(r"\w")?;
+ /// let re2 = DFA::new(r"\W")?;
+ /// let mut caps1 = re1.create_captures();
+ /// let mut caps2 = re2.create_captures();
+ ///
+ /// let mut cache = re1.create_cache();
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..2)),
+ /// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() },
+ /// );
+ ///
+ /// // Using 'cache' with re2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the one-pass DFA we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 're1' is also not
+ /// // allowed.
+ /// re2.reset_cache(&mut cache);
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..3)),
+ /// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() },
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset(&mut self, re: &DFA) {
+ let explicit_slot_len = re.get_nfa().group_info().explicit_slot_len();
+ self.explicit_slots.resize(explicit_slot_len, None);
+ self.explicit_slot_len = explicit_slot_len;
+ }
+
+ /// Returns the heap memory usage, in bytes, of this cache.
+ ///
+ /// This does **not** include the stack size used up by this cache. To
+ /// compute that, use `std::mem::size_of::<Cache>()`.
+ pub fn memory_usage(&self) -> usize {
+ self.explicit_slots.len() * core::mem::size_of::<Option<NonMaxUsize>>()
+ }
+
+ fn explicit_slots(&mut self) -> &mut [Option<NonMaxUsize>] {
+ &mut self.explicit_slots[..self.explicit_slot_len]
+ }
+
+ fn setup_search(&mut self, explicit_slot_len: usize) {
+ self.explicit_slot_len = explicit_slot_len;
+ }
+}
+
+/// Represents a single transition in a one-pass DFA.
+///
+/// The high 21 bits correspond to the state ID. The bit that follows is the
+/// special "match wins" flag. The remaining low 42 bits correspond to the
+/// transition epsilons, which contain the slots that should be saved when
+/// this transition is followed and the conditional epsilon transitions that
+/// must be satisfied in order to follow this transition.
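+///
+/// A minimal standalone sketch of the packing arithmetic (illustrative
+/// only; plain integers stand in for the private types, and the shifts of
+/// 43 and 42 match the constants defined below):
+///
+/// ```
+/// let (sid, match_wins, epsilons) = (5u64, 1u64, 0x3FFu64);
+/// let trans = (sid << 43) | (match_wins << 42) | epsilons;
+/// assert_eq!(trans >> 43, sid);
+/// assert_eq!((trans >> 42) & 1, match_wins);
+/// assert_eq!(trans & ((1u64 << 42) - 1), epsilons);
+/// ```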
+#[derive(Clone, Copy, Eq, PartialEq)]
+struct Transition(u64);
+
+impl Transition {
+ const STATE_ID_BITS: u64 = 21;
+ const STATE_ID_SHIFT: u64 = 64 - Transition::STATE_ID_BITS;
+ const STATE_ID_LIMIT: u64 = 1 << Transition::STATE_ID_BITS;
+ const MATCH_WINS_SHIFT: u64 = 64 - (Transition::STATE_ID_BITS + 1);
+ const INFO_MASK: u64 = 0x000003FF_FFFFFFFF;
+
+ /// Return a new transition to the given state ID with the given epsilons.
+ fn new(match_wins: bool, sid: StateID, epsilons: Epsilons) -> Transition {
+ let match_wins =
+ if match_wins { 1 << Transition::MATCH_WINS_SHIFT } else { 0 };
+ let sid = sid.as_u64() << Transition::STATE_ID_SHIFT;
+ Transition(sid | match_wins | epsilons.0)
+ }
+
+ /// Returns true if and only if this transition points to the DEAD state.
+ fn is_dead(self) -> bool {
+ self.state_id() == DEAD
+ }
+
+ /// Return whether this transition has a "match wins" property.
+ ///
+ /// When a transition has this property, it means that if a match has been
+ /// found and the search uses leftmost-first semantics, then that match
+ /// should be returned immediately instead of continuing on.
+ ///
+ /// The "match wins" name comes from RE2, which uses a pretty much
+ /// identical mechanism for implementing leftmost-first semantics.
+ fn match_wins(&self) -> bool {
+ (self.0 >> Transition::MATCH_WINS_SHIFT & 1) == 1
+ }
+
+ /// Return the "next" state ID that this transition points to.
+ fn state_id(&self) -> StateID {
+ // OK because a Transition has a valid StateID in its upper bits by
+ // construction. The cast to usize is also correct, even on 16-bit
+        // targets because, again, we know the upper bits hold a valid StateID,
+ // which can never overflow usize on any supported target.
+ StateID::new_unchecked(
+ (self.0 >> Transition::STATE_ID_SHIFT).as_usize(),
+ )
+ }
+
+ /// Set the "next" state ID in this transition.
+ fn set_state_id(&mut self, sid: StateID) {
+ *self = Transition::new(self.match_wins(), sid, self.epsilons());
+ }
+
+ /// Return the epsilons embedded in this transition.
+ fn epsilons(&self) -> Epsilons {
+ Epsilons(self.0 & Transition::INFO_MASK)
+ }
+}
+
+impl core::fmt::Debug for Transition {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ if self.is_dead() {
+ return write!(f, "0");
+ }
+ write!(f, "{}", self.state_id().as_usize())?;
+ if self.match_wins() {
+ write!(f, "-MW")?;
+ }
+ if !self.epsilons().is_empty() {
+ write!(f, "-{:?}", self.epsilons())?;
+ }
+ Ok(())
+ }
+}
+
+/// A representation of a match state's pattern ID along with the epsilons for
+/// when a match occurs.
+///
+/// A match state in a one-pass DFA, unlike in a more general DFA, has exactly
+/// one pattern ID. If it had more, then the original NFA would not have been
+/// one-pass.
+///
+/// The "epsilons" part of this corresponds to what was found in the epsilon
+/// transitions between the transition taken in the last byte of input and the
+/// ultimate match state. This might include saving slots and/or conditional
+/// epsilon transitions that must be satisfied before one can report the match.
+///
+/// Technically, every state has room for a 'PatternEpsilons', but it is only
+/// ever non-empty for match states.
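+///
+/// A hedged sketch of the encoding (plain integers rather than the private
+/// types; the 22-bit ID field and the shift of 42 match the constants
+/// defined below):
+///
+/// ```
+/// const NONE: u64 = 0x3FFFFF; // all 22 bits set: "no pattern ID"
+/// let empty = NONE << 42;
+/// assert_eq!(empty >> 42, NONE); // an empty, non-match entry
+/// let with_pid = (7u64 << 42) | (empty & ((1u64 << 42) - 1));
+/// assert_eq!(with_pid >> 42, 7); // now carries pattern ID 7
+/// ```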
+#[derive(Clone, Copy)]
+struct PatternEpsilons(u64);
+
+impl PatternEpsilons {
+ const PATTERN_ID_BITS: u64 = 22;
+ const PATTERN_ID_SHIFT: u64 = 64 - PatternEpsilons::PATTERN_ID_BITS;
+ // A sentinel value indicating that this is not a match state. We don't
+ // use 0 since 0 is a valid pattern ID.
+ const PATTERN_ID_NONE: u64 = 0x00000000_003FFFFF;
+ const PATTERN_ID_LIMIT: u64 = PatternEpsilons::PATTERN_ID_NONE;
+ const PATTERN_ID_MASK: u64 = 0xFFFFFC00_00000000;
+ const EPSILONS_MASK: u64 = 0x000003FF_FFFFFFFF;
+
+ /// Return a new empty pattern epsilons that has no pattern ID and has no
+ /// epsilons. This is suitable for non-match states.
+ fn empty() -> PatternEpsilons {
+ PatternEpsilons(
+ PatternEpsilons::PATTERN_ID_NONE
+ << PatternEpsilons::PATTERN_ID_SHIFT,
+ )
+ }
+
+ /// Whether this pattern epsilons is empty or not. It's empty when it has
+ /// no pattern ID and an empty epsilons.
+ fn is_empty(self) -> bool {
+ self.pattern_id().is_none() && self.epsilons().is_empty()
+ }
+
+ /// Return the pattern ID in this pattern epsilons if one exists.
+ fn pattern_id(self) -> Option<PatternID> {
+ let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT;
+ if pid == PatternEpsilons::PATTERN_ID_LIMIT {
+ None
+ } else {
+ Some(PatternID::new_unchecked(pid.as_usize()))
+ }
+ }
+
+ /// Returns the pattern ID without checking whether it's valid. If this is
+ /// called and there is no pattern ID in this `PatternEpsilons`, then this
+ /// will likely produce an incorrect result or possibly even a panic or
+ /// an overflow. But safety will not be violated.
+ ///
+ /// This is useful when you know a particular state is a match state. If
+ /// it's a match state, then it must have a pattern ID.
+ fn pattern_id_unchecked(self) -> PatternID {
+ let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT;
+ PatternID::new_unchecked(pid.as_usize())
+ }
+
+ /// Return a new pattern epsilons with the given pattern ID, but the same
+ /// epsilons.
+ fn set_pattern_id(self, pid: PatternID) -> PatternEpsilons {
+ PatternEpsilons(
+ (pid.as_u64() << PatternEpsilons::PATTERN_ID_SHIFT)
+ | (self.0 & PatternEpsilons::EPSILONS_MASK),
+ )
+ }
+
+ /// Return the epsilons part of this pattern epsilons.
+ fn epsilons(self) -> Epsilons {
+ Epsilons(self.0 & PatternEpsilons::EPSILONS_MASK)
+ }
+
+ /// Return a new pattern epsilons with the given epsilons, but the same
+ /// pattern ID.
+ fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons {
+ PatternEpsilons(
+ (self.0 & PatternEpsilons::PATTERN_ID_MASK)
+ | u64::from(epsilons.0),
+ )
+ }
+}
+
+impl core::fmt::Debug for PatternEpsilons {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ if self.is_empty() {
+ return write!(f, "N/A");
+ }
+ if let Some(pid) = self.pattern_id() {
+ write!(f, "{}", pid.as_usize())?;
+ }
+ if !self.epsilons().is_empty() {
+ if self.pattern_id().is_some() {
+ write!(f, "/")?;
+ }
+ write!(f, "{:?}", self.epsilons())?;
+ }
+ Ok(())
+ }
+}
+
+/// Epsilons represents all of the NFA epsilons transitions that went into a
+/// single transition in a single DFA state. In this case, it only represents
+/// the epsilon transitions that have some kind of non-consuming side effect:
+/// either the transition requires storing the current position of the search
+/// into a slot, or the transition is conditional and requires the current
+/// position in the input to satisfy an assertion before the transition may be
+/// taken.
+///
+/// This folds the cumulative effect of a group of NFA states (all connected
+/// by epsilon transitions) down into a single set of bits. While these bits
+/// can represent all possible conditional epsilon transitions, it only permits
+/// storing up to a somewhat small number of slots.
+///
+/// Epsilons is represented as a 42-bit integer. In practice, it is packed
+/// into the lower 42 bits of a `Transition`, where the high 22 bits contain
+/// a `StateID` and the special "match wins" flag.
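+///
+/// A small standalone sketch of the split (illustrative only; the shift of
+/// 10 and the 10-bit look mask match the constants defined below):
+///
+/// ```
+/// let (slot_bits, look_bits) = (0b1010u64, 0b11u64);
+/// let eps = (slot_bits << 10) | look_bits;
+/// assert_eq!(eps >> 10, slot_bits); // the slot set
+/// assert_eq!(eps & 0x3FF, look_bits); // the look-around set
+/// ```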
+#[derive(Clone, Copy)]
+struct Epsilons(u64);
+
+impl Epsilons {
+ const SLOT_MASK: u64 = 0x000003FF_FFFFFC00;
+ const SLOT_SHIFT: u64 = 10;
+ const LOOK_MASK: u64 = 0x00000000_000003FF;
+
+ /// Create a new empty epsilons. It has no slots and no assertions that
+ /// need to be satisfied.
+ fn empty() -> Epsilons {
+ Epsilons(0)
+ }
+
+ /// Returns true if this epsilons contains no slots and no assertions.
+ fn is_empty(self) -> bool {
+ self.0 == 0
+ }
+
+ /// Returns the slot epsilon transitions.
+ fn slots(self) -> Slots {
+ Slots((self.0 >> Epsilons::SLOT_SHIFT).low_u32())
+ }
+
+ /// Set the slot epsilon transitions.
+ fn set_slots(self, slots: Slots) -> Epsilons {
+ Epsilons(
+ (u64::from(slots.0) << Epsilons::SLOT_SHIFT)
+ | (self.0 & Epsilons::LOOK_MASK),
+ )
+ }
+
+ /// Return the set of look-around assertions in these epsilon transitions.
+ fn looks(self) -> LookSet {
+ LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u16() }
+ }
+
+ /// Set the look-around assertions on these epsilon transitions.
+ fn set_looks(self, look_set: LookSet) -> Epsilons {
+ Epsilons((self.0 & Epsilons::SLOT_MASK) | u64::from(look_set.bits))
+ }
+}
+
+impl core::fmt::Debug for Epsilons {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut wrote = false;
+ if !self.slots().is_empty() {
+ write!(f, "{:?}", self.slots())?;
+ wrote = true;
+ }
+ if !self.looks().is_empty() {
+ if wrote {
+ write!(f, "/")?;
+ }
+ write!(f, "{:?}", self.looks())?;
+ wrote = true;
+ }
+ if !wrote {
+ write!(f, "N/A")?;
+ }
+ Ok(())
+ }
+}
+
+/// The set of epsilon transitions indicating that the current position in a
+/// search should be saved to a slot.
+///
+/// This *only* represents explicit slots. So for example, the pattern
+/// `[a-z]+([0-9]+)([a-z]+)` has:
+///
+/// * 3 capturing groups, thus 6 slots.
+/// * 1 implicit capturing group, thus 2 implicit slots.
+/// * 2 explicit capturing groups, thus 4 explicit slots.
+///
+/// While implicit slots are represented by epsilon transitions in an NFA, we
+/// do not explicitly represent them here. Instead, implicit slots are assumed
+/// to be present and handled automatically in the search code. Therefore,
+/// that means we only need to represent explicit slots in our epsilon
+/// transitions.
+///
+/// Its representation is a bit set. The bit 'i' is set if and only if there
+/// exists an explicit slot at index 'c', where 'c = (#patterns * 2) + i'. That
+/// is, the bit '0' corresponds to the first explicit slot, and the first
+/// explicit slot appears immediately following the last implicit slot. (If
+/// this is confusing, see `GroupInfo` for more details on how slots work.)
+///
+/// A single `Slots` represents all the active slots in a sub-graph of an NFA,
+/// where all the states are connected by epsilon transitions. In effect, when
+/// traversing the one-pass DFA during a search, all slots set in a particular
+/// transition must be captured by recording the current search position.
+///
+/// The API of `Slots` requires the caller to handle the explicit slot offset.
+/// That is, a `Slots` doesn't know where the explicit slots start for a
+/// particular NFA. Thus, if a caller sees that the bit 'i' is set, then it
+/// needs to do the arithmetic above to find 'c', which is the actual slot
+/// index in the corresponding NFA.
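+///
+/// As a worked example of the arithmetic above (illustrative only): the
+/// pattern shown earlier has one pattern and thus 2 implicit slots, so bit
+/// 'i' maps to NFA slot 'c = 2 + i':
+///
+/// ```
+/// let pattern_len = 1; // one pattern => 2 implicit slots
+/// let bit = 3; // the highest bit set for the pattern shown earlier
+/// let c = pattern_len * 2 + bit;
+/// assert_eq!(c, 5); // the last explicit slot in the NFA
+/// ```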
+#[derive(Clone, Copy)]
+struct Slots(u32);
+
+impl Slots {
+ const LIMIT: usize = 32;
+
+ /// Insert the slot at the given bit index.
+ fn insert(self, slot: usize) -> Slots {
+ debug_assert!(slot < Slots::LIMIT);
+ Slots(self.0 | (1 << slot.as_u32()))
+ }
+
+ /// Remove the slot at the given bit index.
+ fn remove(self, slot: usize) -> Slots {
+ debug_assert!(slot < Slots::LIMIT);
+ Slots(self.0 & !(1 << slot.as_u32()))
+ }
+
+ /// Returns true if and only if this set contains no slots.
+ fn is_empty(self) -> bool {
+ self.0 == 0
+ }
+
+ /// Returns an iterator over all of the set bits in this set.
+ fn iter(self) -> SlotsIter {
+ SlotsIter { slots: self }
+ }
+
+ /// For the position `at` in the current haystack, copy it to
+ /// `caller_explicit_slots` for all slots that are in this set.
+ ///
+ /// Callers may pass a slice of any length. Slots in this set bigger than
+ /// the length of the given explicit slots are simply skipped.
+ ///
+ /// The slice *must* correspond only to the explicit slots and the first
+ /// element of the slice must always correspond to the first explicit slot
+ /// in the corresponding NFA.
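+    ///
+    /// A standalone sketch of this behavior (illustrative only; it mirrors
+    /// the loop below with a plain bit set):
+    ///
+    /// ```
+    /// let bits: u32 = 0b101; // explicit slots 0 and 2 are active
+    /// let at = 5;
+    /// let mut slots = vec![None; 2]; // the caller provided only 2 slots
+    /// for i in 0..32 {
+    ///     if bits & (1 << i) != 0 {
+    ///         if i >= slots.len() {
+    ///             break;
+    ///         }
+    ///         slots[i] = Some(at);
+    ///     }
+    /// }
+    /// assert_eq!(slots, vec![Some(5), None]); // slot 2 was skipped
+    /// ```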
+ fn apply(
+ self,
+ at: usize,
+ caller_explicit_slots: &mut [Option<NonMaxUsize>],
+ ) {
+ if self.is_empty() {
+ return;
+ }
+ let at = NonMaxUsize::new(at);
+ for slot in self.iter() {
+ if slot >= caller_explicit_slots.len() {
+ break;
+ }
+ caller_explicit_slots[slot] = at;
+ }
+ }
+}
+
+impl core::fmt::Debug for Slots {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "S")?;
+ for slot in self.iter() {
+ write!(f, "-{:?}", slot)?;
+ }
+ Ok(())
+ }
+}
+
+/// An iterator over all of the bits set in a slot set.
+///
+/// This returns the bit index that is set, so callers may need to offset it
+/// to get the actual NFA slot index.
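+///
+/// A standalone sketch (illustrative only) of the trailing-zeros trick the
+/// iterator below uses:
+///
+/// ```
+/// let mut bits: u32 = 0b10100;
+/// let mut seen = vec![];
+/// while bits != 0 {
+///     let slot = bits.trailing_zeros() as usize;
+///     seen.push(slot);
+///     bits &= !(1 << slot); // clear the bit just yielded
+/// }
+/// assert_eq!(seen, vec![2, 4]);
+/// ```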
+#[derive(Debug)]
+struct SlotsIter {
+ slots: Slots,
+}
+
+impl Iterator for SlotsIter {
+ type Item = usize;
+
+ fn next(&mut self) -> Option<usize> {
+ // Number of zeroes here is always <= u8::MAX, and so fits in a usize.
+ let slot = self.slots.0.trailing_zeros().as_usize();
+ if slot >= Slots::LIMIT {
+ return None;
+ }
+ self.slots = self.slots.remove(slot);
+ Some(slot)
+ }
+}
+
+/// An error that occurred during the construction of a one-pass DFA.
+///
+/// This error does not provide many introspection capabilities. There are
+/// generally only two things you can do with it:
+///
+/// * Obtain a human readable message via its `std::fmt::Display` impl.
+/// * Access an underlying [`thompson::BuildError`] type from its `source`
+/// method via the `std::error::Error` trait. This error only occurs when using
+/// convenience routines for building a one-pass DFA directly from a pattern
+/// string.
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug)]
+pub struct BuildError {
+ kind: BuildErrorKind,
+}
+
+/// The kind of error that occurred during the construction of a one-pass DFA.
+#[derive(Clone, Debug)]
+enum BuildErrorKind {
+ NFA(crate::nfa::thompson::BuildError),
+ Word(UnicodeWordBoundaryError),
+ TooManyStates { limit: u64 },
+ TooManyPatterns { limit: u64 },
+ UnsupportedLook { look: Look },
+ ExceededSizeLimit { limit: usize },
+ NotOnePass { msg: &'static str },
+}
+
+impl BuildError {
+ fn nfa(err: crate::nfa::thompson::BuildError) -> BuildError {
+ BuildError { kind: BuildErrorKind::NFA(err) }
+ }
+
+ fn word(err: UnicodeWordBoundaryError) -> BuildError {
+ BuildError { kind: BuildErrorKind::Word(err) }
+ }
+
+ fn too_many_states(limit: u64) -> BuildError {
+ BuildError { kind: BuildErrorKind::TooManyStates { limit } }
+ }
+
+ fn too_many_patterns(limit: u64) -> BuildError {
+ BuildError { kind: BuildErrorKind::TooManyPatterns { limit } }
+ }
+
+ fn unsupported_look(look: Look) -> BuildError {
+ BuildError { kind: BuildErrorKind::UnsupportedLook { look } }
+ }
+
+ fn exceeded_size_limit(limit: usize) -> BuildError {
+ BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } }
+ }
+
+ fn not_one_pass(msg: &'static str) -> BuildError {
+ BuildError { kind: BuildErrorKind::NotOnePass { msg } }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for BuildError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ use self::BuildErrorKind::*;
+
+ match self.kind {
+ NFA(ref err) => Some(err),
+ Word(ref err) => Some(err),
+ _ => None,
+ }
+ }
+}
+
+impl core::fmt::Display for BuildError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ use self::BuildErrorKind::*;
+
+ match self.kind {
+ NFA(_) => write!(f, "error building NFA"),
+ Word(_) => write!(f, "NFA contains Unicode word boundary"),
+ TooManyStates { limit } => write!(
+ f,
+ "one-pass DFA exceeded a limit of {:?} for number of states",
+ limit,
+ ),
+ TooManyPatterns { limit } => write!(
+ f,
+ "one-pass DFA exceeded a limit of {:?} for number of patterns",
+ limit,
+ ),
+ UnsupportedLook { look } => write!(
+ f,
+ "one-pass DFA does not support the {:?} assertion",
+ look,
+ ),
+ ExceededSizeLimit { limit } => write!(
+ f,
+ "one-pass DFA exceeded size limit of {:?} during building",
+ limit,
+ ),
+ NotOnePass { msg } => write!(
+ f,
+ "one-pass DFA could not be built because \
+ pattern is not one-pass: {}",
+ msg,
+ ),
+ }
+ }
+}
+
+#[cfg(all(test, feature = "syntax"))]
+mod tests {
+ use alloc::string::ToString;
+
+ use super::*;
+
+ #[test]
+ fn fail_conflicting_transition() {
+ let predicate = |err: &str| err.contains("conflicting transition");
+
+ let err = DFA::new(r"a*[ab]").unwrap_err().to_string();
+ assert!(predicate(&err), "{}", err);
+ }
+
+ #[test]
+ fn fail_multiple_epsilon() {
+ let predicate = |err: &str| {
+ err.contains("multiple epsilon transitions to same state")
+ };
+
+ let err = DFA::new(r"(^|$)a").unwrap_err().to_string();
+ assert!(predicate(&err), "{}", err);
+ }
+
+ #[test]
+ fn fail_multiple_match() {
+ let predicate = |err: &str| {
+ err.contains("multiple epsilon transitions to match state")
+ };
+
+ let err = DFA::new_many(&[r"^", r"$"]).unwrap_err().to_string();
+ assert!(predicate(&err), "{}", err);
+ }
+
+ // This test is meant to build a one-pass regex with the maximum number of
+ // possible slots.
+ //
+ // NOTE: Remember that the slot limit only applies to explicit capturing
+ // groups. Any number of implicit capturing groups is supported (up to the
+ // maximum number of supported patterns), since implicit groups are handled
+ // by the search loop itself.
+ #[test]
+ fn max_slots() {
+ // One too many...
+ let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)(q)";
+ assert!(DFA::new(pat).is_err());
+ // Just right.
+ let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)";
+ assert!(DFA::new(pat).is_ok());
+ }
+
+ // This test ensures that the one-pass DFA works with all look-around
+ // assertions that we expect it to work with.
+ //
+ // The utility of this test is that each one-pass transition has a small
+ // amount of space to store look-around assertions. Currently, there is
+ // logic in the one-pass constructor to ensure there aren't more than ten
+ // possible assertions. And indeed, there are only ten possible assertions
+ // (at time of writing), so this is okay. But conceivably, more assertions
+ // could be added. So we check that things at least work with what we
+ // expect them to work with.
+ #[test]
+ fn assertions() {
+ // haystack anchors
+ assert!(DFA::new(r"^").is_ok());
+ assert!(DFA::new(r"$").is_ok());
+
+ // line anchors
+ assert!(DFA::new(r"(?m)^").is_ok());
+ assert!(DFA::new(r"(?m)$").is_ok());
+ assert!(DFA::new(r"(?Rm)^").is_ok());
+ assert!(DFA::new(r"(?Rm)$").is_ok());
+
+ // word boundaries
+ if cfg!(feature = "unicode-word-boundary") {
+ assert!(DFA::new(r"\b").is_ok());
+ assert!(DFA::new(r"\B").is_ok());
+ }
+ assert!(DFA::new(r"(?-u)\b").is_ok());
+ assert!(DFA::new(r"(?-u)\B").is_ok());
+ }
+
+ #[cfg(not(miri))] // takes too long on miri
+ #[test]
+ fn is_one_pass() {
+ use crate::util::syntax;
+
+ assert!(DFA::new(r"a*b").is_ok());
+ if cfg!(feature = "unicode-perl") {
+ assert!(DFA::new(r"\w").is_ok());
+ }
+ assert!(DFA::new(r"(?-u)\w*\s").is_ok());
+ assert!(DFA::new(r"(?s:.)*?").is_ok());
+ assert!(DFA::builder()
+ .syntax(syntax::Config::new().utf8(false))
+ .build(r"(?s-u:.)*?")
+ .is_ok());
+ }
+
+ #[test]
+ fn is_not_one_pass() {
+ assert!(DFA::new(r"a*a").is_err());
+ assert!(DFA::new(r"(?s-u:.)*?").is_err());
+ assert!(DFA::new(r"(?s:.)*?a").is_err());
+ }
+
+ #[cfg(not(miri))]
+ #[test]
+ fn is_not_one_pass_bigger() {
+ assert!(DFA::new(r"\w*\s").is_err());
+ }
+}
diff --git a/vendor/regex-automata/src/dfa/regex.rs b/vendor/regex-automata/src/dfa/regex.rs
index d0917e17d..f39c1c055 100644
--- a/vendor/regex-automata/src/dfa/regex.rs
+++ b/vendor/regex-automata/src/dfa/regex.rs
@@ -18,16 +18,17 @@ See the [parent module](crate::dfa) for examples.
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
+#[cfg(feature = "dfa-build")]
+use crate::dfa::dense::BuildError;
use crate::{
- dfa::automaton::{Automaton, OverlappingState},
- util::prefilter::{self, Prefilter},
- MatchError, MultiMatch,
+ dfa::{automaton::Automaton, dense},
+ util::{iter, search::Input},
+ Anchored, Match, MatchError,
};
#[cfg(feature = "alloc")]
use crate::{
- dfa::{dense, error::Error, sparse},
- nfa::thompson,
- util::matchtypes::MatchKind,
+ dfa::{sparse, StartKind},
+ util::search::MatchKind,
};
// When the alloc feature is enabled, the regex type sets its A type parameter
@@ -42,20 +43,16 @@ macro_rules! define_regex_type {
($(#[$doc:meta])*) => {
#[cfg(feature = "alloc")]
$(#[$doc])*
- pub struct Regex<A = dense::OwnedDFA, P = prefilter::None> {
- prefilter: Option<P>,
+ pub struct Regex<A = dense::OwnedDFA> {
forward: A,
reverse: A,
- utf8: bool,
}
#[cfg(not(feature = "alloc"))]
$(#[$doc])*
- pub struct Regex<A, P = prefilter::None> {
- prefilter: Option<P>,
+ pub struct Regex<A> {
forward: A,
reverse: A,
- utf8: bool,
}
};
}
@@ -79,86 +76,26 @@ define_regex_type!(
/// memory but search faster, while sparse DFAs use less memory but search
/// more slowly.
///
+ /// # Crate features
+ ///
+ /// Note that despite what the documentation auto-generates, the _only_
+ /// crate feature needed to use this type is `dfa-search`. You do _not_
+ /// need to enable the `alloc` feature.
+ ///
/// By default, a regex's automaton type parameter is set to
/// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
/// in-memory work loads, this is the most convenient type that gives the
/// best search performance. When the `alloc` feature is disabled, no
/// default type is used.
///
- /// A `Regex` also has a `P` type parameter, which is used to select the
- /// prefilter used during search. By default, no prefilter is enabled by
- /// setting the type to default to [`prefilter::None`]. A prefilter can be
- /// enabled by using the [`Regex::prefilter`] method.
- ///
/// # When should I use this?
///
/// Generally speaking, if you can afford the overhead of building a full
/// DFA for your regex, and you don't need things like capturing groups,
/// then this is a good choice if you're looking to optimize for matching
/// speed. Note however that its speed may be worse than a general purpose
- /// regex engine if you don't select a good [prefilter].
- ///
- /// # Earliest vs Leftmost vs Overlapping
- ///
- /// The search routines exposed on a `Regex` reflect three different ways
- /// of searching:
- ///
- /// * "earliest" means to stop as soon as a match has been detected.
- /// * "leftmost" means to continue matching until the underlying
- /// automaton cannot advance. This reflects "standard" searching you
- /// might be used to in other regex engines. e.g., This permits
- /// non-greedy and greedy searching to work as you would expect.
- /// * "overlapping" means to find all possible matches, even if they
- /// overlap.
- ///
- /// Generally speaking, when doing an overlapping search, you'll want to
- /// build your regex DFAs with [`MatchKind::All`] semantics. Using
- /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is
- /// likely to lead to odd behavior since `LeftmostFirst` specifically omits
- /// some matches that can never be reported due to its semantics.
- ///
- /// The following example shows the differences between how these different
- /// types of searches impact looking for matches of `[a-z]+` in the
- /// haystack `abc`.
- ///
- /// ```
- /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch};
- ///
- /// let pattern = r"[a-z]+";
- /// let haystack = "abc".as_bytes();
- ///
- /// // With leftmost-first semantics, we test "earliest" and "leftmost".
- /// let re = dfa::regex::Builder::new()
- /// .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst))
- /// .build(pattern)?;
- ///
- /// // "earliest" searching isn't impacted by greediness
- /// let mut it = re.find_earliest_iter(haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// // "leftmost" searching supports greediness (and non-greediness)
- /// let mut it = re.find_leftmost_iter(haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// // For overlapping, we want "all" match kind semantics.
- /// let re = dfa::regex::Builder::new()
- /// .dense(dense::Config::new().match_kind(MatchKind::All))
- /// .build(pattern)?;
- ///
- /// // In the overlapping search, we find all three possible matches
- /// // starting at the beginning of the haystack.
- /// let mut it = re.find_overlapping_iter(haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
+ /// regex engine if you don't provide a [`dense::Config::prefilter`] to the
+ /// underlying DFA.
///
/// # Sparse DFAs
///
@@ -203,18 +140,16 @@ define_regex_type!(
///
/// # Fallibility
///
- /// In non-default configurations, the DFAs generated in this module may
- /// return an error during a search. (Currently, the only way this happens
- /// is if quit bytes are added or Unicode word boundaries are heuristically
- /// enabled, both of which are turned off by default.) For convenience, the
- /// main search routines, like [`find_leftmost`](Regex::find_leftmost),
- /// will panic if an error occurs. However, if you need to use DFAs
- /// which may produce an error at search time, then there are fallible
- /// equivalents of all search routines. For example, for `find_leftmost`,
- /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost).
- /// The routines prefixed with `try_` return `Result<Option<MultiMatch>,
- /// MatchError>`, where as the infallible routines simply return
- /// `Option<MultiMatch>`.
+ /// Most of the search routines defined on this type will _panic_ when the
+ /// underlying search fails. This might be because the DFA gave up because
+ /// it saw a quit byte, whether configured explicitly or via heuristic
+    /// Unicode word boundary support, although neither is enabled by default.
+ /// Or it might fail because an invalid `Input` configuration is given,
+ /// for example, with an unsupported [`Anchored`] mode.
+ ///
+ /// If you need to handle these error cases instead of allowing them to
+ /// trigger a panic, then the lower level [`Regex::try_search`] provides
+ /// a fallible API that never panics.
///
/// # Example
///
@@ -224,18 +159,19 @@ define_regex_type!(
/// across a line boundary.
///
/// ```
- /// use regex_automata::{dfa::{self, regex::Regex}, MatchError};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
///
/// let re = Regex::builder()
/// .dense(dfa::dense::Config::new().quit(b'\n', true))
/// .build(r"foo\p{any}+bar")?;
///
- /// let haystack = "foo\nbar".as_bytes();
+ /// let input = Input::new("foo\nbar");
/// // Normally this would produce a match, since \p{any} contains '\n'.
/// // But since we instructed the automaton to enter a quit state if a
/// // '\n' is observed, this produces a match error instead.
- /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
- /// let got = re.try_find_leftmost(haystack).unwrap_err();
+ /// let expected = MatchError::quit(b'\n', 3);
+ /// let got = re.try_search(&input).unwrap_err();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -243,7 +179,7 @@ define_regex_type!(
#[derive(Clone, Debug)]
);
-#[cfg(feature = "alloc")]
+#[cfg(all(feature = "syntax", feature = "dfa-build"))]
impl Regex {
/// Parse the given regular expression using the default configuration and
/// return the corresponding regex.
@@ -254,16 +190,16 @@ impl Regex {
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ /// use regex_automata::{Match, dfa::regex::Regex};
///
/// let re = Regex::new("foo[0-9]+bar")?;
/// assert_eq!(
- /// Some(MultiMatch::must(0, 3, 14)),
- /// re.find_leftmost(b"zzzfoo12345barzzz"),
+ /// Some(Match::must(0, 3..14)),
+ /// re.find(b"zzzfoo12345barzzz"),
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn new(pattern: &str) -> Result<Regex, Error> {
+ pub fn new(pattern: &str) -> Result<Regex, BuildError> {
Builder::new().build(pattern)
}
@@ -273,26 +209,28 @@ impl Regex {
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ /// use regex_automata::{Match, dfa::regex::Regex};
///
/// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
///
- /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux");
- /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
- /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
- /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
- /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
+ /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
+ /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
+ /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
+ /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
+ /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
+ /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
/// assert_eq!(None, it.next());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<Regex, Error> {
+ pub fn new_many<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<Regex, BuildError> {
Builder::new().build_many(patterns)
}
}
-#[cfg(feature = "alloc")]
+#[cfg(all(feature = "syntax", feature = "dfa-build"))]
impl Regex<sparse::DFA<Vec<u8>>> {
/// Parse the given regular expression using the default configuration,
/// except using sparse DFAs, and return the corresponding regex.
@@ -303,18 +241,18 @@ impl Regex<sparse::DFA<Vec<u8>>> {
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ /// use regex_automata::{Match, dfa::regex::Regex};
///
/// let re = Regex::new_sparse("foo[0-9]+bar")?;
/// assert_eq!(
- /// Some(MultiMatch::must(0, 3, 14)),
- /// re.find_leftmost(b"zzzfoo12345barzzz"),
+ /// Some(Match::must(0, 3..14)),
+ /// re.find(b"zzzfoo12345barzzz"),
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn new_sparse(
pattern: &str,
- ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
Builder::new().build_sparse(pattern)
}
@@ -325,64 +263,29 @@ impl Regex<sparse::DFA<Vec<u8>>> {
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ /// use regex_automata::{Match, dfa::regex::Regex};
///
/// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
///
- /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux");
- /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
- /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
- /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
- /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
+ /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
+ /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
+ /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
+ /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
+ /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
+ /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
/// assert_eq!(None, it.next());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn new_many_sparse<P: AsRef<str>>(
patterns: &[P],
- ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
Builder::new().build_many_sparse(patterns)
}
}
/// Convenience routines for regex construction.
-#[cfg(feature = "alloc")]
-impl Regex {
- /// Return a default configuration for a `Regex`.
- ///
- /// This is a convenience routine to avoid needing to import the `Config`
- /// type when customizing the construction of a regex.
- ///
- /// # Example
- ///
- /// This example shows how to disable UTF-8 mode for `Regex` iteration.
- /// When UTF-8 mode is disabled, the position immediately following an
- /// empty match is where the next search begins, instead of the next
- /// position of a UTF-8 encoded codepoint.
- ///
- /// ```
- /// use regex_automata::{dfa::regex::Regex, MultiMatch};
- ///
- /// let re = Regex::builder()
- /// .configure(Regex::config().utf8(false))
- /// .build(r"")?;
- /// let haystack = "a☃z".as_bytes();
- /// let mut it = re.find_leftmost_iter(haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn config() -> Config {
- Config::new()
- }
-
+impl Regex<dense::DFA<&'static [u32]>> {
/// Return a builder for configuring the construction of a `Regex`.
///
/// This is a convenience routine to avoid needing to import the
@@ -394,20 +297,18 @@ impl Regex {
/// everywhere.
///
/// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
- /// dfa::regex::Regex,
- /// nfa::thompson,
- /// MultiMatch, SyntaxConfig,
+ /// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
/// };
///
/// let re = Regex::builder()
- /// .configure(Regex::config().utf8(false))
- /// .syntax(SyntaxConfig::new().utf8(false))
+ /// .syntax(syntax::Config::new().utf8(false))
/// .thompson(thompson::Config::new().utf8(false))
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
- /// let expected = Some(MultiMatch::must(0, 1, 9));
- /// let got = re.find_leftmost(haystack);
+ /// let expected = Some(Match::must(0, 1..9));
+ /// let got = re.find(haystack);
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -418,7 +319,7 @@ impl Regex {
}
/// Standard search routines for finding and iterating over matches.
-impl<A: Automaton, P: Prefilter> Regex<A, P> {
+impl<A: Automaton> Regex<A> {
/// Returns true if and only if this regex matches the given haystack.
///
/// This routine may short circuit if it knows that scanning future input
@@ -428,65 +329,37 @@ impl<A: Automaton, P: Prefilter> Regex<A, P> {
///
/// # Panics
///
- /// If the underlying DFAs return an error, then this routine panics. This
- /// only occurs in non-default configurations where quit bytes are used or
- /// Unicode word boundaries are heuristically enabled.
- ///
- /// The fallible version of this routine is
- /// [`try_is_match`](Regex::try_is_match).
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::dfa::regex::Regex;
+ /// This routine panics if the search could not complete. This can occur
+ /// in a number of circumstances:
///
- /// let re = Regex::new("foo[0-9]+bar")?;
- /// assert_eq!(true, re.is_match(b"foo12345bar"));
- /// assert_eq!(false, re.is_match(b"foobar"));
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn is_match(&self, haystack: &[u8]) -> bool {
- self.is_match_at(haystack, 0, haystack.len())
- }
-
- /// Returns the first position at which a match is found.
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
///
- /// # Panics
- ///
- /// If the underlying DFAs return an error, then this routine panics. This
- /// only occurs in non-default configurations where quit bytes are used or
- /// Unicode word boundaries are heuristically enabled.
- ///
- /// The fallible version of this routine is
- /// [`try_find_earliest`](Regex::try_find_earliest).
+ /// Use [`Regex::try_search`] if you want to handle these error conditions.
///
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, dfa::regex::Regex};
- ///
- /// // Normally, the leftmost first match would greedily consume as many
- /// // decimal digits as it could. But a match is detected as soon as one
- /// // digit is seen.
- /// let re = Regex::new("foo[0-9]+")?;
- /// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 4)),
- /// re.find_earliest(b"foo12345"),
- /// );
+ /// use regex_automata::dfa::regex::Regex;
///
- /// // Normally, the end of the leftmost first match here would be 3,
- /// // but the "earliest" match semantics detect a match earlier.
- /// let re = Regex::new("abc|a")?;
- /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc"));
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ /// assert_eq!(true, re.is_match("foo12345bar"));
+ /// assert_eq!(false, re.is_match("foobar"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn find_earliest(&self, haystack: &[u8]) -> Option<MultiMatch> {
- self.find_earliest_at(haystack, 0, haystack.len())
+ #[inline]
+ pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
+ // Not only can we do an "earliest" search, but we can avoid doing a
+ // reverse scan too.
+ let input = input.into().earliest(true);
+ self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap()
}
/// Returns the start and end offset of the leftmost match. If no match
@@ -494,131 +367,41 @@ impl<A: Automaton, P: Prefilter> Regex<A, P> {
///
/// # Panics
///
- /// If the underlying DFAs return an error, then this routine panics. This
- /// only occurs in non-default configurations where quit bytes are used or
- /// Unicode word boundaries are heuristically enabled.
+ /// This routine panics if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
///
- /// The fallible version of this routine is
- /// [`try_find_leftmost`](Regex::try_find_leftmost).
+ /// Use [`Regex::try_search`] if you want to handle these error conditions.
///
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ /// use regex_automata::{Match, dfa::regex::Regex};
///
- /// // Greediness is applied appropriately when compared to find_earliest.
+ /// // Greediness is applied appropriately.
/// let re = Regex::new("foo[0-9]+")?;
- /// assert_eq!(
- /// Some(MultiMatch::must(0, 3, 11)),
- /// re.find_leftmost(b"zzzfoo12345zzz"),
- /// );
+ /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));
///
/// // Even though a match is found after reading the first byte (`a`),
/// // the default leftmost-first match semantics demand that we find the
    /// // earliest match that prefers earlier parts of the pattern over later
/// // parts.
/// let re = Regex::new("abc|a")?;
- /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc"));
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn find_leftmost(&self, haystack: &[u8]) -> Option<MultiMatch> {
- self.find_leftmost_at(haystack, 0, haystack.len())
- }
-
- /// Search for the first overlapping match in `haystack`.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// In particular, callers must preserve the automaton's search state from
- /// prior calls so that the implementation knows where the last match
- /// occurred and which pattern was reported.
- ///
- /// # Panics
- ///
- /// If the underlying DFAs return an error, then this routine panics. This
- /// only occurs in non-default configurations where quit bytes are used or
- /// Unicode word boundaries are heuristically enabled.
- ///
- /// The fallible version of this routine is
- /// [`try_find_overlapping`](Regex::try_find_overlapping).
- ///
- /// # Example
- ///
- /// This example shows how to run an overlapping search with multiple
- /// regexes.
- ///
- /// ```
- /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};
- ///
- /// let re = Regex::builder()
- /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
- /// .build_many(&[r"\w+$", r"\S+$"])?;
- /// let haystack = "@foo".as_bytes();
- /// let mut state = dfa::OverlappingState::start();
- ///
- /// let expected = Some(MultiMatch::must(1, 0, 4));
- /// let got = re.find_overlapping(haystack, &mut state);
- /// assert_eq!(expected, got);
- ///
- /// // The first pattern also matches at the same position, so re-running
- /// // the search will yield another match. Notice also that the first
- /// // pattern is returned after the second. This is because the second
- /// // pattern begins its match before the first, is therefore an earlier
- /// // match and is thus reported first.
- /// let expected = Some(MultiMatch::must(0, 1, 4));
- /// let got = re.find_overlapping(haystack, &mut state);
- /// assert_eq!(expected, got);
- ///
+ /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn find_overlapping(
- &self,
- haystack: &[u8],
- state: &mut OverlappingState,
- ) -> Option<MultiMatch> {
- self.find_overlapping_at(haystack, 0, haystack.len(), state)
- }
-
- /// Returns an iterator over all non-overlapping "earliest" matches.
- ///
- /// Match positions are reported as soon as a match is known to occur, even
- /// if the standard leftmost match would be longer.
- ///
- /// # Panics
- ///
- /// If the underlying DFAs return an error during iteration, then iteration
- /// panics. This only occurs in non-default configurations where quit bytes
- /// are used or Unicode word boundaries are heuristically enabled.
- ///
- /// The fallible version of this routine is
- /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter).
- ///
- /// # Example
- ///
- /// This example shows how to run an "earliest" iterator.
- ///
- /// ```
- /// use regex_automata::{dfa::regex::Regex, MultiMatch};
- ///
- /// let re = Regex::new("[0-9]+")?;
- /// let haystack = "123".as_bytes();
- ///
- /// // Normally, a standard leftmost iterator would return a single
- /// // match, but since "earliest" detects matches earlier, we get
- /// // three matches.
- /// let mut it = re.find_earliest_iter(haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn find_earliest_iter<'r, 't>(
- &'r self,
- haystack: &'t [u8],
- ) -> FindEarliestMatches<'r, 't, A, P> {
- FindEarliestMatches::new(self, haystack)
+ #[inline]
+ pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
+ self.try_search(&input.into()).unwrap()
}
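
The quit-byte failure mode the panic docs above describe is easiest to see end to end. A hedged sketch, assuming `dense::Config::quit` and the regex builder's `dense` method from the vendored API; the `\n` quit byte and the pattern are illustrative choices only.

```
use regex_automata::{
    dfa::{dense, regex::Regex},
    Input,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A quit byte makes the otherwise infallible DFA search fallible.
    let re = Regex::builder()
        .dense(dense::Config::new().quit(b'\n', true))
        .build(r"foo\p{any}+bar")?;
    // The forward scan hits the quit byte before the search can conclude,
    // so `try_search` reports an error where `find` would panic.
    assert!(re.try_search(&Input::new("foo\nbar")).is_err());
    Ok(())
}
```
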
/// Returns an iterator over all non-overlapping leftmost matches in the
@@ -628,621 +411,119 @@ impl<A: Automaton, P: Prefilter> Regex<A, P> {
///
/// # Panics
///
- /// If the underlying DFAs return an error during iteration, then iteration
- /// panics. This only occurs in non-default configurations where quit bytes
- /// are used or Unicode word boundaries are heuristically enabled.
+ /// If the search returns an error during iteration, then iteration
+ /// panics. See [`Regex::find`] for the panic conditions.
///
- /// The fallible version of this routine is
- /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter).
+ /// Use [`Regex::try_search`] with
+ /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
+ /// handle these error conditions.
///
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ /// use regex_automata::{Match, dfa::regex::Regex};
///
/// let re = Regex::new("foo[0-9]+")?;
- /// let text = b"foo1 foo12 foo123";
- /// let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
+ /// let text = "foo1 foo12 foo123";
+ /// let matches: Vec<Match> = re.find_iter(text).collect();
/// assert_eq!(matches, vec![
- /// MultiMatch::must(0, 0, 4),
- /// MultiMatch::must(0, 5, 10),
- /// MultiMatch::must(0, 11, 17),
+ /// Match::must(0, 0..4),
+ /// Match::must(0, 5..10),
+ /// Match::must(0, 11..17),
/// ]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn find_leftmost_iter<'r, 't>(
- &'r self,
- haystack: &'t [u8],
- ) -> FindLeftmostMatches<'r, 't, A, P> {
- FindLeftmostMatches::new(self, haystack)
- }
-
- /// Returns an iterator over all overlapping matches in the given haystack.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// The iterator takes care of handling the overlapping state that must be
- /// threaded through every search.
- ///
- /// # Panics
- ///
- /// If the underlying DFAs return an error during iteration, then iteration
- /// panics. This only occurs in non-default configurations where quit bytes
- /// are used or Unicode word boundaries are heuristically enabled.
- ///
- /// The fallible version of this routine is
- /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter).
- ///
- /// # Example
- ///
- /// This example shows how to run an overlapping search with multiple
- /// regexes.
- ///
- /// ```
- /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};
- ///
- /// let re = Regex::builder()
- /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
- /// .build_many(&[r"\w+$", r"\S+$"])?;
- /// let haystack = "@foo".as_bytes();
- ///
- /// let mut it = re.find_overlapping_iter(haystack);
- /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn find_overlapping_iter<'r, 't>(
- &'r self,
- haystack: &'t [u8],
- ) -> FindOverlappingMatches<'r, 't, A, P> {
- FindOverlappingMatches::new(self, haystack)
- }
-}
-
-/// Lower level infallible search routines that permit controlling where
-/// the search starts and ends in a particular sequence. This is useful for
-/// executing searches that need to take surrounding context into account. This
-/// is required for correctly implementing iteration because of look-around
-/// operators (`^`, `$`, `\b`).
-impl<A: Automaton, P: Prefilter> Regex<A, P> {
- /// Returns true if and only if this regex matches the given haystack.
- ///
- /// This routine may short circuit if it knows that scanning future input
- /// will never lead to a different result. In particular, if the underlying
- /// DFA enters a match state or a dead state, then this routine will return
- /// `true` or `false`, respectively, without inspecting any future input.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// # Panics
- ///
- /// If the underlying DFAs return an error, then this routine panics. This
- /// only occurs in non-default configurations where quit bytes are used or
- /// Unicode word boundaries are heuristically enabled.
- ///
- /// The fallible version of this routine is
- /// [`try_is_match_at`](Regex::try_is_match_at).
- pub fn is_match_at(
- &self,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> bool {
- self.try_is_match_at(haystack, start, end).unwrap()
- }
-
- /// Returns the first position at which a match is found.
- ///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
- ///
- /// # Panics
- ///
- /// If the underlying DFAs return an error, then this routine panics. This
- /// only occurs in non-default configurations where quit bytes are used or
- /// Unicode word boundaries are heuristically enabled.
- ///
- /// The fallible version of this routine is
- /// [`try_find_earliest_at`](Regex::try_find_earliest_at).
- pub fn find_earliest_at(
- &self,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Option<MultiMatch> {
- self.try_find_earliest_at(haystack, start, end).unwrap()
- }
-
- /// Returns the same as `find_leftmost`, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, if the DFA is anchored, then
- /// a match can only occur when `start == 0`.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches within the
- /// same haystack, which cannot be done correctly by simply providing a
- /// subslice of `haystack`.
- ///
- /// # Panics
- ///
- /// If the underlying DFAs return an error, then this routine panics. This
- /// only occurs in non-default configurations where quit bytes are used or
- /// Unicode word boundaries are heuristically enabled.
- ///
- /// The fallible version of this routine is
- /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at).
- pub fn find_leftmost_at(
- &self,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Option<MultiMatch> {
- self.try_find_leftmost_at(haystack, start, end).unwrap()
- }
-
- /// Search for the first overlapping match within a given range of
- /// `haystack`.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// In particular, callers must preserve the automaton's search state from
- /// prior calls so that the implementation knows where the last match
- /// occurred and which pattern was reported.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
- ///
- /// # Panics
- ///
- /// If the underlying DFAs return an error, then this routine panics. This
- /// only occurs in non-default configurations where quit bytes are used or
- /// Unicode word boundaries are heuristically enabled.
- ///
- /// The fallible version of this routine is
- /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at).
- pub fn find_overlapping_at(
- &self,
- haystack: &[u8],
- start: usize,
- end: usize,
- state: &mut OverlappingState,
- ) -> Option<MultiMatch> {
- self.try_find_overlapping_at(haystack, start, end, state).unwrap()
- }
-}
-
-/// Fallible search routines. These may return an error when the underlying
-/// DFAs have been configured in a way that permits them to fail during a
-/// search.
-///
-/// Errors during search only occur when the DFA has been explicitly
-/// configured to do so, usually by specifying one or more "quit" bytes or by
-/// heuristically enabling Unicode word boundaries.
-///
-/// Errors will never be returned using the default configuration. So these
-/// fallible routines are only needed for particular configurations.
-impl<A: Automaton, P: Prefilter> Regex<A, P> {
- /// Returns true if and only if this regex matches the given haystack.
- ///
- /// This routine may short circuit if it knows that scanning future input
- /// will never lead to a different result. In particular, if the underlying
- /// DFA enters a match state or a dead state, then this routine will return
- /// `true` or `false`, respectively, without inspecting any future input.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used or Unicode word boundaries are heuristically
- /// enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`is_match`](Regex::is_match).
- pub fn try_is_match(&self, haystack: &[u8]) -> Result<bool, MatchError> {
- self.try_is_match_at(haystack, 0, haystack.len())
- }
-
- /// Returns the first position at which a match is found.
- ///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used or Unicode word boundaries are heuristically
- /// enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_earliest`](Regex::find_earliest).
- pub fn try_find_earliest(
- &self,
- haystack: &[u8],
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_earliest_at(haystack, 0, haystack.len())
- }
-
- /// Returns the start and end offset of the leftmost match. If no match
- /// exists, then `None` is returned.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used or Unicode word boundaries are heuristically
- /// enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_leftmost`](Regex::find_leftmost).
- pub fn try_find_leftmost(
- &self,
- haystack: &[u8],
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_leftmost_at(haystack, 0, haystack.len())
- }
-
- /// Search for the first overlapping match in `haystack`.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// In particular, callers must preserve the automaton's search state from
- /// prior calls so that the implementation knows where the last match
- /// occurred and which pattern was reported.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used or Unicode word boundaries are heuristically
- /// enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_overlapping`](Regex::find_overlapping).
- pub fn try_find_overlapping(
- &self,
- haystack: &[u8],
- state: &mut OverlappingState,
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_overlapping_at(haystack, 0, haystack.len(), state)
- }
-
- /// Returns an iterator over all non-overlapping "earliest" matches.
- ///
- /// Match positions are reported as soon as a match is known to occur, even
- /// if the standard leftmost match would be longer.
- ///
- /// # Errors
- ///
- /// This iterator only yields errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used or Unicode word boundaries are heuristically
- /// enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_earliest_iter`](Regex::find_earliest_iter).
- pub fn try_find_earliest_iter<'r, 't>(
- &'r self,
- haystack: &'t [u8],
- ) -> TryFindEarliestMatches<'r, 't, A, P> {
- TryFindEarliestMatches::new(self, haystack)
- }
-
- /// Returns an iterator over all non-overlapping leftmost matches in the
- /// given bytes. If no match exists, then the iterator yields no elements.
- ///
- /// This corresponds to the "standard" regex search iterator.
- ///
- /// # Errors
- ///
- /// This iterator only yields errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used or Unicode word boundaries are heuristically
- /// enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_leftmost_iter`](Regex::find_leftmost_iter).
- pub fn try_find_leftmost_iter<'r, 't>(
- &'r self,
- haystack: &'t [u8],
- ) -> TryFindLeftmostMatches<'r, 't, A, P> {
- TryFindLeftmostMatches::new(self, haystack)
- }
-
- /// Returns an iterator over all overlapping matches in the given haystack.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// The iterator takes care of handling the overlapping state that must be
- /// threaded through every search.
- ///
- /// # Errors
- ///
- /// This iterator only yields errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used or Unicode word boundaries are heuristically
- /// enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_overlapping_iter`](Regex::find_overlapping_iter).
- pub fn try_find_overlapping_iter<'r, 't>(
+ #[inline]
+ pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
&'r self,
- haystack: &'t [u8],
- ) -> TryFindOverlappingMatches<'r, 't, A, P> {
- TryFindOverlappingMatches::new(self, haystack)
+ input: I,
+ ) -> FindMatches<'r, 'h, A> {
+ let it = iter::Searcher::new(input.into());
+ FindMatches { re: self, it }
}
}
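
The `util::iter::Searcher` route that the `find_iter` docs point to can be sketched as follows. This mirrors what `find_iter` does internally (see the `Iterator` impl below) but surfaces errors as `Result`s instead of panicking; `Searcher::try_advance` is assumed from the vendored `util::iter` module.

```
use regex_automata::{dfa::regex::Regex, util::iter::Searcher, Input, Match};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = Regex::new("foo[0-9]+")?;
    let mut searcher = Searcher::new(Input::new("foo1 foo12"));
    let mut matches: Vec<Match> = Vec::new();
    // Each call hands the iteration bookkeeping (empty-match handling,
    // advancing the span) to the searcher and the search to the regex.
    while let Some(m) = searcher.try_advance(|input| re.try_search(input))? {
        matches.push(m);
    }
    assert_eq!(matches, vec![Match::must(0, 0..4), Match::must(0, 5..10)]);
    Ok(())
}
```
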
/// Lower level fallible search routines that permit controlling where the
/// search starts and ends in a particular sequence.
-impl<A: Automaton, P: Prefilter> Regex<A, P> {
- /// Returns true if and only if this regex matches the given haystack.
- ///
- /// This routine may short circuit if it knows that scanning future input
- /// will never lead to a different result. In particular, if the underlying
- /// DFA enters a match state or a dead state, then this routine will return
- /// `true` or `false`, respectively, without inspecting any future input.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`is_match_at`](Regex::is_match_at).
- pub fn try_is_match_at(
- &self,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<bool, MatchError> {
- self.forward()
- .find_earliest_fwd_at(
- self.scanner().as_mut(),
- None,
- haystack,
- start,
- end,
- )
- .map(|x| x.is_some())
- }
-
- /// Returns the first position at which a match is found.
- ///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used or Unicode word boundaries are heuristically
- /// enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_earliest_at`](Regex::find_earliest_at).
- pub fn try_find_earliest_at(
- &self,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_earliest_at_imp(
- self.scanner().as_mut(),
- haystack,
- start,
- end,
- )
- }
-
- /// The implementation of "earliest" searching, where a prefilter scanner
- /// may be given.
- fn try_find_earliest_at_imp(
- &self,
- pre: Option<&mut prefilter::Scanner>,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<MultiMatch>, MatchError> {
- // N.B. We use `&&A` here to call `Automaton` methods, which ensures
- // that we always use the `impl Automaton for &A` for calling methods.
- // Since this is the usual way that automata are used, this helps
- // reduce the number of monomorphized copies of the search code.
- let (fwd, rev) = (self.forward(), self.reverse());
- let end = match (&fwd)
- .find_earliest_fwd_at(pre, None, haystack, start, end)?
- {
- None => return Ok(None),
- Some(end) => end,
- };
- // N.B. The only time we need to tell the reverse searcher the pattern
- // to match is in the overlapping case, since it's ambiguous. In the
- // leftmost case, I have tentatively convinced myself that it isn't
- // necessary and the reverse search will always find the same pattern
- // to match as the forward search. But I lack a rigorous proof.
- let start = (&rev)
- .find_earliest_rev_at(None, haystack, start, end.offset())?
- .expect("reverse search must match if forward search does");
- assert_eq!(
- start.pattern(),
- end.pattern(),
- "forward and reverse search must match same pattern"
- );
- assert!(start.offset() <= end.offset());
- Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
- }
-
+impl<A: Automaton> Regex<A> {
/// Returns the start and end offset of the leftmost match. If no match
/// exists, then `None` is returned.
///
- /// # Searching a substring of the haystack
+ /// This is like [`Regex::find`] but with two differences:
///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
+ /// 1. It is not generic over `Into<Input>` and instead accepts a
+ /// `&Input`. This permits reusing the same `Input` for multiple searches
+ /// without needing to create a new one. This _may_ help with latency.
+ /// 2. It returns an error if the search could not complete, whereas
+ /// [`Regex::find`] will panic.
///
/// # Errors
///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used or Unicode word boundaries are heuristically
- /// enabled.
+ /// This routine errors if the search could not complete. This can occur
+ /// in the following circumstances:
///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
///
- /// The infallible (panics on error) version of this routine is
- /// [`find_leftmost_at`](Regex::find_leftmost_at).
- pub fn try_find_leftmost_at(
- &self,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_leftmost_at_imp(
- self.scanner().as_mut(),
- haystack,
- start,
- end,
- )
- }
-
- /// The implementation of leftmost searching, where a prefilter scanner
- /// may be given.
- fn try_find_leftmost_at_imp(
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ #[inline]
+ pub fn try_search(
&self,
- scanner: Option<&mut prefilter::Scanner>,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<MultiMatch>, MatchError> {
- // N.B. We use `&&A` here to call `Automaton` methods, which ensures
- // that we always use the `impl Automaton for &A` for calling methods.
- // Since this is the usual way that automata are used, this helps
- // reduce the number of monomorphized copies of the search code.
+ input: &Input<'_>,
+ ) -> Result<Option<Match>, MatchError> {
let (fwd, rev) = (self.forward(), self.reverse());
- let end = match (&fwd)
- .find_leftmost_fwd_at(scanner, None, haystack, start, end)?
- {
+ let end = match fwd.try_search_fwd(input)? {
None => return Ok(None),
Some(end) => end,
};
- // N.B. The only time we need to tell the reverse searcher the pattern
- // to match is in the overlapping case, since it's ambiguous. In the
- // leftmost case, I have tentatively convinced myself that it isn't
- // necessary and the reverse search will always find the same pattern
- // to match as the forward search. But I lack a rigorous proof. Why not
- // just provide the pattern anyway? Well, if it is needed, then leaving
- // it out gives us a chance to find a witness.
- let start = (&rev)
- .find_leftmost_rev_at(None, haystack, start, end.offset())?
+ // This special cases an empty match at the beginning of the search. If
+ // our end matches our start, then since a reverse DFA can't match past
+ // the start, it must follow that our starting position is also our end
+ // position. So short circuit and skip the reverse search.
+ if input.start() == end.offset() {
+ return Ok(Some(Match::new(
+ end.pattern(),
+ end.offset()..end.offset(),
+ )));
+ }
+ // We can also skip the reverse search if we know our search was
+ // anchored. This occurs either when the input config is anchored or
+ // when we know the regex itself is anchored. In this case, we know the
+ // start of the match, if one is found, must be the start of the
+ // search.
+ if self.is_anchored(input) {
+ return Ok(Some(Match::new(
+ end.pattern(),
+ input.start()..end.offset(),
+ )));
+ }
+ // N.B. I have tentatively convinced myself that it isn't necessary
+ // to specify the specific pattern for the reverse search since the
+ // reverse search will always find the same pattern to match as the
+ // forward search. But I lack a rigorous proof. Why not just provide
+ // the pattern anyway? Well, if it is needed, then leaving it out
+ // gives us a chance to find a witness. (Also, if we don't need to
+ // specify the pattern, then we don't need to build the reverse DFA
+ // with 'starts_for_each_pattern' enabled.)
+ //
+ // We also need to be careful to disable 'earliest' for the reverse
+ // search, since it could be enabled for the forward search. In the
+ // reverse case, to satisfy "leftmost" criteria, we need to match
+ // as much as we can. We also need to be careful to make the search
+ // anchored. We don't want the reverse search to report any matches
+ // other than the one beginning at the end of our forward search.
+ let revsearch = input
+ .clone()
+ .span(input.start()..end.offset())
+ .anchored(Anchored::Yes)
+ .earliest(false);
+ let start = rev
+ .try_search_rev(&revsearch)?
.expect("reverse search must match if forward search does");
assert_eq!(
start.pattern(),
@@ -1250,132 +531,22 @@ impl<A: Automaton, P: Prefilter> Regex<A, P> {
"forward and reverse search must match same pattern",
);
assert!(start.offset() <= end.offset());
- Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
- }
-
- /// Search for the first overlapping match within a given range of
- /// `haystack`.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// In particular, callers must preserve the automaton's search state from
- /// prior calls so that the implementation knows where the last match
- /// occurred and which pattern was reported.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used or Unicode word boundaries are heuristically
- /// enabled.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_overlapping_at`](Regex::find_overlapping_at).
- pub fn try_find_overlapping_at(
- &self,
- haystack: &[u8],
- start: usize,
- end: usize,
- state: &mut OverlappingState,
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_overlapping_at_imp(
- self.scanner().as_mut(),
- haystack,
- start,
- end,
- state,
- )
+ Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
}
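
Point 1 in the docs above (reusing one `Input` across several searches) looks roughly like this in practice; the sketch assumes `Input::set_start` from the vendored API.

```
use regex_automata::{dfa::regex::Regex, Input, Match};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = Regex::new("[0-9]{4}")?;
    // One `Input` serves both searches; only its start offset changes
    // between calls, so no new conversion is needed.
    let mut input = Input::new("id 1234, id 5678");
    assert_eq!(Some(Match::must(0, 3..7)), re.try_search(&input)?);
    input.set_start(7);
    assert_eq!(Some(Match::must(0, 12..16)), re.try_search(&input)?);
    Ok(())
}
```
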
- /// The implementation of overlapping search at a given range in
- /// `haystack`, where `scanner` is a prefilter (if active) and `state` is
- /// the current state of the search.
- fn try_find_overlapping_at_imp(
- &self,
- scanner: Option<&mut prefilter::Scanner>,
- haystack: &[u8],
- start: usize,
- end: usize,
- state: &mut OverlappingState,
- ) -> Result<Option<MultiMatch>, MatchError> {
- // N.B. We use `&&A` here to call `Automaton` methods, which ensures
- // that we always use the `impl Automaton for &A` for calling methods.
- // Since this is the usual way that automata are used, this helps
- // reduce the number of monomorphized copies of the search code.
- let (fwd, rev) = (self.forward(), self.reverse());
- // TODO: Decide whether it's worth making this assert work. It doesn't
- // work currently because 'has_starts_for_each_pattern' isn't on the
- // Automaton trait. Without this assert, we still get a panic, but it's
- // a bit more inscrutable.
- // assert!(
- // rev.has_starts_for_each_pattern(),
- // "overlapping searches require that the reverse DFA is \
- // compiled with the 'starts_for_each_pattern' option",
- // );
- let end = match (&fwd).find_overlapping_fwd_at(
- scanner, None, haystack, start, end, state,
- )? {
- None => return Ok(None),
- Some(end) => end,
- };
- // Unlike the leftmost cases, the reverse overlapping search may match
- // a different pattern than the forward search. See test failures when
- // using `None` instead of `Some(end.pattern())` below. Thus, we must
- // run our reverse search using the pattern that matched in the forward
- // direction.
- let start = (&rev)
- .find_leftmost_rev_at(
- Some(end.pattern()),
- haystack,
- 0,
- end.offset(),
- )?
- .expect("reverse search must match if forward search does");
- assert!(start.offset() <= end.offset());
- assert_eq!(start.pattern(), end.pattern());
- Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+ /// Returns true if either the given input specifies an anchored search
+ /// or if the underlying DFA is always anchored.
+ fn is_anchored(&self, input: &Input<'_>) -> bool {
+ match input.get_anchored() {
+ Anchored::No => self.forward().is_always_start_anchored(),
+ Anchored::Yes | Anchored::Pattern(_) => true,
+ }
}
}
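
A small illustration of the two cases `is_anchored` distinguishes, assuming the default dense DFA is compiled with both anchored and unanchored start states (the vendored default):

```
use regex_automata::{dfa::regex::Regex, Anchored, Input, Match};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = Regex::new("[0-9]+")?;
    // Unanchored (the default): the match may begin anywhere.
    assert_eq!(
        Some(Match::must(0, 3..6)),
        re.try_search(&Input::new("abc123"))?,
    );
    // Anchored: the match must begin where the search begins, so nothing
    // matches here and `try_search` can skip the reverse scan entirely.
    let anchored = Input::new("abc123").anchored(Anchored::Yes);
    assert_eq!(None, re.try_search(&anchored)?);
    Ok(())
}
```
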
/// Non-search APIs for querying information about the regex and setting a
/// prefilter.
-impl<A: Automaton, P: Prefilter> Regex<A, P> {
- /// Attach the given prefilter to this regex.
- pub fn with_prefilter<Q: Prefilter>(self, prefilter: Q) -> Regex<A, Q> {
- Regex {
- prefilter: Some(prefilter),
- forward: self.forward,
- reverse: self.reverse,
- utf8: self.utf8,
- }
- }
-
- /// Remove any prefilter from this regex.
- pub fn without_prefilter(self) -> Regex<A> {
- Regex {
- prefilter: None,
- forward: self.forward,
- reverse: self.reverse,
- utf8: self.utf8,
- }
- }
-
+impl<A: Automaton> Regex<A> {
/// Return the underlying DFA responsible for forward matching.
///
/// This is useful for accessing the underlying DFA and converting it to
@@ -1399,471 +570,48 @@ impl<A: Automaton, P: Prefilter> Regex<A, P> {
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::dfa::regex::Regex;
///
/// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
- /// assert_eq!(3, re.pattern_count());
+ /// assert_eq!(3, re.pattern_len());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn pattern_count(&self) -> usize {
- assert_eq!(
- self.forward().pattern_count(),
- self.reverse().pattern_count()
- );
- self.forward().pattern_count()
- }
-
- /// Convenience function for returning this regex's prefilter as a trait
- /// object.
- ///
- /// If this regex doesn't have a prefilter, then `None` is returned.
- pub fn prefilter(&self) -> Option<&dyn Prefilter> {
- match self.prefilter {
- None => None,
- Some(ref x) => Some(&*x),
- }
- }
-
- /// Convenience function for returning a prefilter scanner.
- fn scanner(&self) -> Option<prefilter::Scanner> {
- self.prefilter().map(prefilter::Scanner::new)
+ pub fn pattern_len(&self) -> usize {
+ assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
+ self.forward().pattern_len()
}
}
-/// An iterator over all non-overlapping earliest matches for a particular
-/// infallible search.
+/// An iterator over all non-overlapping matches for an infallible search.
///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found. If the underlying search returns an error, then this panics.
+/// The iterator yields a [`Match`] value until no more matches could be found.
+/// If the underlying regex engine returns an error, then a panic occurs.
///
-/// `A` is the type used to represent the underlying DFAs used by the regex,
-/// while `P` is the type of prefilter used, if any. The lifetime variables are
-/// as follows:
+/// The type parameters are as follows:
///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Clone, Debug)]
-pub struct FindEarliestMatches<'r, 't, A, P>(
- TryFindEarliestMatches<'r, 't, A, P>,
-);
-
-impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> {
- fn new(
- re: &'r Regex<A, P>,
- text: &'t [u8],
- ) -> FindEarliestMatches<'r, 't, A, P> {
- FindEarliestMatches(TryFindEarliestMatches::new(re, text))
- }
-}
-
-impl<'r, 't, A: Automaton, P: Prefilter> Iterator
- for FindEarliestMatches<'r, 't, A, P>
-{
- type Item = MultiMatch;
-
- fn next(&mut self) -> Option<MultiMatch> {
- next_unwrap(self.0.next())
- }
-}
-
-/// An iterator over all non-overlapping leftmost matches for a particular
-/// infallible search.
+/// * `A` represents the type of the underlying DFA that implements the
+/// [`Automaton`] trait.
///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found. If the underlying search returns an error, then this panics.
+/// The lifetime parameters are as follows:
///
-/// `A` is the type used to represent the underlying DFAs used by the regex,
-/// while `P` is the type of prefilter used, if any. The lifetime variables are
-/// as follows:
+/// * `'r` represents the lifetime of the regex object itself.
+/// * `'h` represents the lifetime of the haystack being searched.
///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Clone, Debug)]
-pub struct FindLeftmostMatches<'r, 't, A, P>(
- TryFindLeftmostMatches<'r, 't, A, P>,
-);
-
-impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> {
- fn new(
- re: &'r Regex<A, P>,
- text: &'t [u8],
- ) -> FindLeftmostMatches<'r, 't, A, P> {
- FindLeftmostMatches(TryFindLeftmostMatches::new(re, text))
- }
-}
-
-impl<'r, 't, A: Automaton, P: Prefilter> Iterator
- for FindLeftmostMatches<'r, 't, A, P>
-{
- type Item = MultiMatch;
-
- fn next(&mut self) -> Option<MultiMatch> {
- next_unwrap(self.0.next())
- }
-}
-
-/// An iterator over all overlapping matches for a particular infallible
-/// search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found. If the underlying search returns an error, then this panics.
-///
-/// `A` is the type used to represent the underlying DFAs used by the regex,
-/// while `P` is the type of prefilter used, if any. The lifetime variables are
-/// as follows:
-///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Clone, Debug)]
-pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>(
- TryFindOverlappingMatches<'r, 't, A, P>,
-);
-
-impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> {
- fn new(
- re: &'r Regex<A, P>,
- text: &'t [u8],
- ) -> FindOverlappingMatches<'r, 't, A, P> {
- FindOverlappingMatches(TryFindOverlappingMatches::new(re, text))
- }
-}
-
-impl<'r, 't, A: Automaton, P: Prefilter> Iterator
- for FindOverlappingMatches<'r, 't, A, P>
-{
- type Item = MultiMatch;
-
- fn next(&mut self) -> Option<MultiMatch> {
- next_unwrap(self.0.next())
- }
-}
-
-/// An iterator over all non-overlapping earliest matches for a particular
-/// fallible search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found.
-///
-/// `A` is the type used to represent the underlying DFAs used by the regex,
-/// while `P` is the type of prefilter used, if any. The lifetime variables are
-/// as follows:
-///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Clone, Debug)]
-pub struct TryFindEarliestMatches<'r, 't, A, P> {
- re: &'r Regex<A, P>,
- scanner: Option<prefilter::Scanner<'r>>,
- text: &'t [u8],
- last_end: usize,
- last_match: Option<usize>,
-}
-
-impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> {
- fn new(
- re: &'r Regex<A, P>,
- text: &'t [u8],
- ) -> TryFindEarliestMatches<'r, 't, A, P> {
- let scanner = re.scanner();
- TryFindEarliestMatches {
- re,
- scanner,
- text,
- last_end: 0,
- last_match: None,
- }
- }
-}
-
-impl<'r, 't, A: Automaton, P: Prefilter> Iterator
- for TryFindEarliestMatches<'r, 't, A, P>
-{
- type Item = Result<MultiMatch, MatchError>;
-
- fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
- if self.last_end > self.text.len() {
- return None;
- }
- let result = self.re.try_find_earliest_at_imp(
- self.scanner.as_mut(),
- self.text,
- self.last_end,
- self.text.len(),
- );
- let m = match result {
- Err(err) => return Some(Err(err)),
- Ok(None) => return None,
- Ok(Some(m)) => m,
- };
- if m.is_empty() {
- // This is an empty match. To ensure we make progress, start
- // the next search at the smallest possible starting position
- // of the next match following this one.
- self.last_end = if self.re.utf8 {
- crate::util::next_utf8(self.text, m.end())
- } else {
- m.end() + 1
- };
- // Don't accept empty matches immediately following a match.
- // Just move on to the next match.
- if Some(m.end()) == self.last_match {
- return self.next();
- }
- } else {
- self.last_end = m.end();
- }
- self.last_match = Some(m.end());
- Some(Ok(m))
- }
-}
-
-/// An iterator over all non-overlapping leftmost matches for a particular
-/// fallible search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found.
-///
-/// `A` is the type used to represent the underlying DFAs used by the regex,
-/// while `P` is the type of prefilter used, if any. The lifetime variables are
-/// as follows:
-///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Clone, Debug)]
-pub struct TryFindLeftmostMatches<'r, 't, A, P> {
- re: &'r Regex<A, P>,
- scanner: Option<prefilter::Scanner<'r>>,
- text: &'t [u8],
- last_end: usize,
- last_match: Option<usize>,
-}
-
-impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> {
- fn new(
- re: &'r Regex<A, P>,
- text: &'t [u8],
- ) -> TryFindLeftmostMatches<'r, 't, A, P> {
- let scanner = re.scanner();
- TryFindLeftmostMatches {
- re,
- scanner,
- text,
- last_end: 0,
- last_match: None,
- }
- }
-}
-
-impl<'r, 't, A: Automaton, P: Prefilter> Iterator
- for TryFindLeftmostMatches<'r, 't, A, P>
-{
- type Item = Result<MultiMatch, MatchError>;
-
- fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
- if self.last_end > self.text.len() {
- return None;
- }
- let result = self.re.try_find_leftmost_at_imp(
- self.scanner.as_mut(),
- self.text,
- self.last_end,
- self.text.len(),
- );
- let m = match result {
- Err(err) => return Some(Err(err)),
- Ok(None) => return None,
- Ok(Some(m)) => m,
- };
- if m.is_empty() {
- // This is an empty match. To ensure we make progress, start
- // the next search at the smallest possible starting position
- // of the next match following this one.
- self.last_end = if self.re.utf8 {
- crate::util::next_utf8(self.text, m.end())
- } else {
- m.end() + 1
- };
- // Don't accept empty matches immediately following a match.
- // Just move on to the next match.
- if Some(m.end()) == self.last_match {
- return self.next();
- }
- } else {
- self.last_end = m.end();
- }
- self.last_match = Some(m.end());
- Some(Ok(m))
- }
-}
-
-/// An iterator over all overlapping matches for a particular fallible search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found.
-///
-/// `A` is the type used to represent the underlying DFAs used by the regex,
-/// while `P` is the type of prefilter used, if any. The lifetime variables are
-/// as follows:
-///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Clone, Debug)]
-pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> {
- re: &'r Regex<A, P>,
- scanner: Option<prefilter::Scanner<'r>>,
- text: &'t [u8],
- last_end: usize,
- state: OverlappingState,
-}
-
-impl<'r, 't, A: Automaton, P: Prefilter>
- TryFindOverlappingMatches<'r, 't, A, P>
-{
- fn new(
- re: &'r Regex<A, P>,
- text: &'t [u8],
- ) -> TryFindOverlappingMatches<'r, 't, A, P> {
- let scanner = re.scanner();
- TryFindOverlappingMatches {
- re,
- scanner,
- text,
- last_end: 0,
- state: OverlappingState::start(),
- }
- }
+/// This iterator can be created with the [`Regex::find_iter`] method.
+#[derive(Debug)]
+pub struct FindMatches<'r, 'h, A> {
+ re: &'r Regex<A>,
+ it: iter::Searcher<'h>,
}
-impl<'r, 't, A: Automaton, P: Prefilter> Iterator
- for TryFindOverlappingMatches<'r, 't, A, P>
-{
- type Item = Result<MultiMatch, MatchError>;
+impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
+ type Item = Match;
- fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
- if self.last_end > self.text.len() {
- return None;
- }
- let result = self.re.try_find_overlapping_at_imp(
- self.scanner.as_mut(),
- self.text,
- self.last_end,
- self.text.len(),
- &mut self.state,
- );
- let m = match result {
- Err(err) => return Some(Err(err)),
- Ok(None) => return None,
- Ok(Some(m)) => m,
- };
- // Unlike the non-overlapping case, we're OK with empty matches at this
- // level. In particular, the overlapping search algorithm is itself
- // responsible for ensuring that progress is always made.
- self.last_end = m.end();
- Some(Ok(m))
- }
-}
-
-/// The configuration used for compiling a DFA-backed regex.
-///
-/// A regex configuration is a simple data object that is typically used with
-/// [`Builder::configure`].
-#[cfg(feature = "alloc")]
-#[derive(Clone, Copy, Debug, Default)]
-pub struct Config {
- utf8: Option<bool>,
-}
-
-#[cfg(feature = "alloc")]
-impl Config {
- /// Return a new default regex compiler configuration.
- pub fn new() -> Config {
- Config::default()
- }
-
- /// Whether to enable UTF-8 mode or not.
- ///
- /// When UTF-8 mode is enabled (the default) and an empty match is seen,
- /// the iterators on [`Regex`] will always start the next search at the
- /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8
- /// mode is disabled, such searches are begun at the next byte offset.
- ///
- /// If this mode is enabled and invalid UTF-8 is given to search, then
- /// behavior is unspecified.
- ///
- /// Generally speaking, one should enable this when
- /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8)
- /// and
- /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
- /// are enabled, and disable it otherwise.
- ///
- /// # Example
- ///
- /// This example demonstrates the differences between when this option is
- /// enabled and disabled. The differences only arise when the regex can
- /// return matches of length zero.
- ///
- /// In this first snippet, we show the results when UTF-8 mode is disabled.
- ///
- /// ```
- /// use regex_automata::{dfa::regex::Regex, MultiMatch};
- ///
- /// let re = Regex::builder()
- /// .configure(Regex::config().utf8(false))
- /// .build(r"")?;
- /// let haystack = "a☃z".as_bytes();
- /// let mut it = re.find_leftmost_iter(haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- ///
- /// And in this snippet, we execute the same search on the same haystack,
- /// but with UTF-8 mode enabled. Notice that byte offsets that would
- /// otherwise split the encoding of `☃` are not returned.
- ///
- /// ```
- /// use regex_automata::{dfa::regex::Regex, MultiMatch};
- ///
- /// let re = Regex::builder()
- /// .configure(Regex::config().utf8(true))
- /// .build(r"")?;
- /// let haystack = "a☃z".as_bytes();
- /// let mut it = re.find_leftmost_iter(haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn utf8(mut self, yes: bool) -> Config {
- self.utf8 = Some(yes);
- self
- }
-
- /// Returns true if and only if this configuration has UTF-8 mode enabled.
- ///
- /// When UTF-8 mode is enabled and an empty match is seen, the iterators on
- /// [`Regex`] will always start the next search at the next UTF-8 encoded
- /// codepoint. When UTF-8 mode is disabled, such searches are begun at the
- /// next byte offset.
- pub fn get_utf8(&self) -> bool {
- self.utf8.unwrap_or(true)
- }
-
- /// Overwrite the default configuration such that the options in `o` are
- /// always used. If an option in `o` is not set, then the corresponding
- /// option in `self` is used. If it's not set in `self` either, then it
- /// remains not set.
- pub(crate) fn overwrite(self, o: Config) -> Config {
- Config { utf8: o.utf8.or(self.utf8) }
+ #[inline]
+ fn next(&mut self) -> Option<Match> {
+ let FindMatches { re, ref mut it } = *self;
+ it.advance(|input| re.try_search(input))
}
}
@@ -1874,17 +622,15 @@ impl Config {
/// itself. This builder is different from a general purpose regex builder in
 /// that it permits fine-grained configuration of the construction process. The
 /// trade-off for this is complexity, and the possibility of setting a
-/// configuration that might not make sense. For example, there are three
+/// configuration that might not make sense. For example, there are two
/// different UTF-8 modes:
///
-/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the
-/// pattern itself can contain sub-expressions that match invalid UTF-8.
-/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
-/// controls whether the implicit unanchored prefix added to the NFA can
-/// match through invalid UTF-8 or not.
-/// * [`Config::utf8`] controls how the regex iterators themselves advance
-/// the starting position of the next search when a match with zero length is
-/// found.
+/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
+/// whether the pattern itself can contain sub-expressions that match invalid
+/// UTF-8.
+/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
+/// how the regex iterators themselves advance the starting position of the
+/// next search when a match with zero length is found.
///
/// Generally speaking, callers will want to either enable all of these or
/// disable all of these.
@@ -1919,57 +665,51 @@ impl Config {
///
/// # Example
///
-/// This example shows how to disable UTF-8 mode in the syntax, the NFA and
-/// the regex itself. This is generally what you want for matching on
-/// arbitrary bytes.
+/// This example shows how to disable UTF-8 mode in the syntax and the regex
+/// itself. This is generally what you want for matching on arbitrary bytes.
///
/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
-/// dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig
+/// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
/// };
///
/// let re = Regex::builder()
-/// .configure(Regex::config().utf8(false))
-/// .syntax(SyntaxConfig::new().utf8(false))
+/// .syntax(syntax::Config::new().utf8(false))
/// .thompson(thompson::Config::new().utf8(false))
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
-/// let expected = Some(MultiMatch::must(0, 1, 9));
-/// let got = re.find_leftmost(haystack);
+/// let expected = Some(Match::must(0, 1..9));
+/// let got = re.find(haystack);
/// assert_eq!(expected, got);
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
/// // but the subsequent `.*` does not! Disabling UTF-8
-/// // on the syntax permits this. Notice also that the
-/// // search was unanchored and skipped over invalid UTF-8.
-/// // Disabling UTF-8 on the Thompson NFA permits this.
-/// //
-/// // N.B. This example does not show the impact of
-/// // disabling UTF-8 mode on Config, since that
-/// // only impacts regexes that can produce matches of
-/// // length 0.
+/// // on the syntax permits this.
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
-#[cfg(feature = "alloc")]
#[derive(Clone, Debug)]
pub struct Builder {
- config: Config,
+ #[cfg(feature = "dfa-build")]
dfa: dense::Builder,
}
-#[cfg(feature = "alloc")]
impl Builder {
/// Create a new regex builder with the default configuration.
pub fn new() -> Builder {
- Builder { config: Config::default(), dfa: dense::Builder::new() }
+ Builder {
+ #[cfg(feature = "dfa-build")]
+ dfa: dense::Builder::new(),
+ }
}
/// Build a regex from the given pattern.
///
/// If there was a problem parsing or compiling the pattern, then an error
/// is returned.
- pub fn build(&self, pattern: &str) -> Result<Regex, Error> {
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
self.build_many(&[pattern])
}
@@ -1977,38 +717,42 @@ impl Builder {
///
/// If there was a problem parsing or compiling the pattern, then an error
/// is returned.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn build_sparse(
&self,
pattern: &str,
- ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
self.build_many_sparse(&[pattern])
}
/// Build a regex from the given patterns.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn build_many<P: AsRef<str>>(
&self,
patterns: &[P],
- ) -> Result<Regex, Error> {
+ ) -> Result<Regex, BuildError> {
let forward = self.dfa.build_many(patterns)?;
let reverse = self
.dfa
.clone()
.configure(
dense::Config::new()
- .anchored(true)
- .match_kind(MatchKind::All)
- .starts_for_each_pattern(true),
+ .prefilter(None)
+ .specialize_start_states(false)
+ .start_kind(StartKind::Anchored)
+ .match_kind(MatchKind::All),
)
- .thompson(thompson::Config::new().reverse(true))
+ .thompson(crate::nfa::thompson::Config::new().reverse(true))
.build_many(patterns)?;
Ok(self.build_from_dfas(forward, reverse))
}
/// Build a sparse regex from the given patterns.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn build_many_sparse<P: AsRef<str>>(
&self,
patterns: &[P],
- ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
let re = self.build_many(patterns)?;
let forward = re.forward().to_sparse()?;
let reverse = re.reverse().to_sparse()?;
@@ -2028,16 +772,14 @@ impl Builder {
/// * It should be anchored.
/// * It should use [`MatchKind::All`] semantics.
/// * It should match in reverse.
- /// * It should have anchored start states compiled for each pattern.
/// * Otherwise, its configuration should match the forward DFA.
///
- /// If these conditions are satisfied, then behavior of searches is
+ /// If these conditions aren't satisfied, then the behavior of searches is
/// unspecified.
///
- /// Note that when using this constructor, only the configuration from
- /// [`Config`] is applied. The only configuration settings on this builder
- /// only apply when the builder owns the construction of the DFAs
- /// themselves.
+ /// Note that when using this constructor, no configuration is applied.
+ /// Since this routine provides the DFAs to the builder, there is no
+ /// opportunity to apply other configuration options.
///
/// # Example
///
@@ -2079,35 +821,33 @@ impl Builder {
forward: A,
reverse: A,
) -> Regex<A> {
- let utf8 = self.config.get_utf8();
- Regex { prefilter: None, forward, reverse, utf8 }
- }
-
- /// Apply the given regex configuration options to this builder.
- pub fn configure(&mut self, config: Config) -> &mut Builder {
- self.config = self.config.overwrite(config);
- self
+ Regex { forward, reverse }
}
/// Set the syntax configuration for this builder using
- /// [`SyntaxConfig`](crate::SyntaxConfig).
+ /// [`syntax::Config`](crate::util::syntax::Config).
///
/// This permits setting things like case insensitivity, Unicode and multi
/// line mode.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn syntax(
&mut self,
- config: crate::util::syntax::SyntaxConfig,
+ config: crate::util::syntax::Config,
) -> &mut Builder {
self.dfa.syntax(config);
self
}
/// Set the Thompson NFA configuration for this builder using
- /// [`nfa::thompson::Config`](thompson::Config).
+ /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
///
/// This permits setting things like whether additional time should be
/// spent shrinking the size of the NFA.
- pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn thompson(
+ &mut self,
+ config: crate::nfa::thompson::Config,
+ ) -> &mut Builder {
self.dfa.thompson(config);
self
}
@@ -2117,30 +857,15 @@ impl Builder {
///
/// This permits setting things like whether the underlying DFAs should
/// be minimized.
+ #[cfg(feature = "dfa-build")]
pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
self.dfa.configure(config);
self
}
}
-#[cfg(feature = "alloc")]
impl Default for Builder {
fn default() -> Builder {
Builder::new()
}
}
-
-#[inline(always)]
-fn next_unwrap(
- item: Option<Result<MultiMatch, MatchError>>,
-) -> Option<MultiMatch> {
- match item {
- None => None,
- Some(Ok(m)) => Some(m),
- Some(Err(err)) => panic!(
- "unexpected regex search error: {}\n\
- to handle search errors, use try_ methods",
- err,
- ),
- }
-}
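
A minimal sketch (not part of the patch) of what the `build_many` changes above
imply for callers of `build_from_dfas`: the reverse DFA mirrors the settings
applied internally (anchored starts, `MatchKind::All`, a reversed NFA, no
prefilter). The pattern and offsets are illustrative only:

```
use regex_automata::{
    dfa::{dense, regex::Regex, StartKind},
    nfa::thompson,
    Match, MatchKind,
};

// Forward DFA: default configuration.
let fwd = dense::Builder::new().build(r"foo[0-9]+")?;
// Reverse DFA: anchored, all-match semantics, reversed NFA and no
// prefilter -- the same settings 'build_many' applies internally.
let rev = dense::Builder::new()
    .configure(
        dense::Config::new()
            .prefilter(None)
            .specialize_start_states(false)
            .start_kind(StartKind::Anchored)
            .match_kind(MatchKind::All),
    )
    .thompson(thompson::Config::new().reverse(true))
    .build(r"foo[0-9]+")?;
let re = Regex::builder().build_from_dfas(fwd, rev);
assert_eq!(Some(Match::must(0, 0..6)), re.find("foo123"));
# Ok::<(), Box<dyn std::error::Error>>(())
```
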
diff --git a/vendor/regex-automata/src/dfa/remapper.rs b/vendor/regex-automata/src/dfa/remapper.rs
new file mode 100644
index 000000000..6e4964672
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/remapper.rs
@@ -0,0 +1,242 @@
+use alloc::vec::Vec;
+
+use crate::util::primitives::StateID;
+
+/// Remappable is a tightly coupled abstraction that facilitates remapping
+/// state identifiers in DFAs.
+///
+/// The main idea behind remapping state IDs is that DFAs often need to check
+/// if a certain state is a "special" state of some kind (like a match state)
+/// during a search. Since this is extremely perf critical code, we want this
+/// check to be as fast as possible. Partitioning state IDs into, for example,
+/// into "non-match" and "match" states means one can tell if a state is a
+/// match state via a simple comparison of the state ID.
+///
+/// The issue is that during the DFA construction process, it's not
+/// particularly easy to partition the states. Instead, the simplest thing is
+/// to often just do a pass over all of the states and shuffle them into their
+/// desired partitionings. To do that, we need a mechanism for swapping states.
+/// Hence, this abstraction.
+///
+/// Normally, for such little code, I would just duplicate it. But this is a
+/// key optimization and the implementation is a bit subtle. So the abstraction
+/// is basically a ham-fisted attempt at DRY. The only place we use this is in
+/// the dense and one-pass DFAs.
+///
+/// See also src/dfa/special.rs for a more detailed explanation of how dense
+/// DFAs are partitioned.
+pub(super) trait Remappable: core::fmt::Debug {
+ /// Return the total number of states.
+ fn state_len(&self) -> usize;
+ /// Return the power-of-2 exponent that yields the stride. The pertinent
+ /// laws here are, where N=stride2: 2^N=stride and len(alphabet) <= stride.
+ fn stride2(&self) -> usize;
+ /// Swap the states pointed to by the given IDs. The underlying finite
+ /// state machine should be mutated such that all of the transitions in
+ /// `id1` are now in the memory region where the transitions for `id2`
+ /// were, and all of the transitions in `id2` are now in the memory region
+ /// where the transitions for `id1` were.
+ ///
+ /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
+ ///
+ /// It is expected that, after calling this, the underlying value will be
+ /// left in an inconsistent state, since any other transitions pointing to,
+ /// e.g., `id1` need to be updated to point to `id2`, since that's where
+ /// `id1` moved to.
+ ///
+ /// In order to "fix" the underlying inconsistent state, a `Remapper`
+ /// should be used to guarantee that `remap` is called at the appropriate
+ /// time.
+ fn swap_states(&mut self, id1: StateID, id2: StateID);
+ /// This must remap every single state ID in the underlying value according
+ /// to the function given. For example, in a DFA, this should remap every
+ /// transition and every starting state ID.
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID);
+}
+
+/// Remapper is an abstraction that manages the remapping of state IDs in a
+/// finite state machine. This is useful when one wants to shuffle states into
+/// different positions in the machine.
+///
+/// One of the key complexities this manages is the ability to correctly move
+/// one state multiple times.
+///
+/// Once shuffling is complete, `remap` must be called, which will rewrite
+/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
+/// will almost certainly result in a corrupt machine.
+#[derive(Debug)]
+pub(super) struct Remapper {
+ /// A map from the index of a state to its pre-multiplied identifier.
+ ///
+ /// When a state is swapped with another, then their corresponding
+ /// locations in this map are also swapped. Thus, its new position will
+ /// still point to its old pre-multiplied StateID.
+ ///
+ /// While there is a bit more to it, this then allows us to rewrite the
+ /// state IDs in a DFA's transition table in a single pass. This is done
+ /// by iterating over every ID in this map, then iterating over each
+ /// transition for the state at that ID and re-mapping the transition from
+ /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
+ /// in this map where `old_id` *started*, and set it to where it ended up
+ /// after all swaps have been completed.
+ map: Vec<StateID>,
+ /// A mapper from state index to state ID (and back).
+ idxmap: IndexMapper,
+}
+
+impl Remapper {
+ /// Create a new remapper from the given remappable implementation. The
+ /// remapper can then be used to swap states. The remappable value given
+    /// here must be the same one given to `swap` and `remap`.
+ pub(super) fn new(r: &impl Remappable) -> Remapper {
+ let idxmap = IndexMapper { stride2: r.stride2() };
+ let map = (0..r.state_len()).map(|i| idxmap.to_state_id(i)).collect();
+ Remapper { map, idxmap }
+ }
+
+ /// Swap two states. Once this is called, callers must follow through to
+ /// call `remap`, or else it's possible for the underlying remappable
+ /// value to be in a corrupt state.
+ pub(super) fn swap(
+ &mut self,
+ r: &mut impl Remappable,
+ id1: StateID,
+ id2: StateID,
+ ) {
+ if id1 == id2 {
+ return;
+ }
+ r.swap_states(id1, id2);
+ self.map.swap(self.idxmap.to_index(id1), self.idxmap.to_index(id2));
+ }
+
+ /// Complete the remapping process by rewriting all state IDs in the
+ /// remappable value according to the swaps performed.
+ pub(super) fn remap(mut self, r: &mut impl Remappable) {
+ // Update the map to account for states that have been swapped
+ // multiple times. For example, if (A, C) and (C, G) are swapped, then
+ // transitions previously pointing to A should now point to G. But if
+ // we don't update our map, they will erroneously be set to C. All we
+ // do is follow the swaps in our map until we see our original state
+ // ID.
+ //
+ // The intuition here is to think about how changes are made to the
+ // map: only through pairwise swaps. That means that starting at any
+ // given state, it is always possible to find the loop back to that
+ // state by following the swaps represented in the map (which might be
+ // 0 swaps).
+ //
+ // We are also careful to clone the map before starting in order to
+ // freeze it. We use the frozen map to find our loops, since we need to
+ // update our map as well. Without freezing it, our updates could break
+ // the loops referenced above and produce incorrect results.
+ let oldmap = self.map.clone();
+ for i in 0..r.state_len() {
+ let cur_id = self.idxmap.to_state_id(i);
+ let mut new_id = oldmap[i];
+ if cur_id == new_id {
+ continue;
+ }
+ loop {
+ let id = oldmap[self.idxmap.to_index(new_id)];
+ if cur_id == id {
+ self.map[i] = new_id;
+ break;
+ }
+ new_id = id;
+ }
+ }
+ r.remap(|next| self.map[self.idxmap.to_index(next)]);
+ }
+}
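
The fix-up pass in `remap` is subtle, so here is a standalone toy sketch (not
part of the patch) of the same cycle-following idea on plain indices, with no
premultiplication involved:

```
// Toy model of the fix-up pass in 'remap': after some pairwise swaps,
// 'map[i]' holds the original ID of whatever now lives at position 'i'.
// The fix-up inverts this: for each original ID, where did it end up?
fn fixup(map: &mut [usize]) {
    let frozen = map.to_vec();
    for i in 0..frozen.len() {
        if frozen[i] == i {
            continue;
        }
        // Follow the swap cycle until it loops back to 'i'.
        let mut new_id = frozen[i];
        loop {
            let id = frozen[new_id];
            if id == i {
                map[i] = new_id;
                break;
            }
            new_id = id;
        }
    }
}

fn main() {
    let mut map: Vec<usize> = (0..5).collect();
    // Swap (0, 2) and then (2, 4): state 0 ends up at position 4.
    map.swap(0, 2);
    map.swap(2, 4);
    fixup(&mut map);
    // Transitions that pointed at state 0 must now point at 4.
    assert_eq!(map[0], 4);
}
```
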
+
+/// A simple type for mapping between state indices and state IDs.
+///
+/// The reason why this exists is because state IDs are "premultiplied." That
+/// is, in order to get to the transitions for a particular state, one need
+/// only use the state ID as-is, instead of having to multiply it by the
+/// transition table's stride.
+///
+/// The downside of this is that it's inconvenient to map between state IDs
+/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
+/// `0`, `0+stride`, `0+2*stride`, `0+3*stride`, etc., instead of `0`, `1`,
+/// `2`, `3`, etc.
+///
+/// Since our state IDs are premultiplied, we can convert back-and-forth
+/// between IDs and indices by simply unmultiplying the IDs and multiplying the
+/// indices.
+#[derive(Debug)]
+struct IndexMapper {
+ /// The power of 2 corresponding to the stride of the corresponding
+ /// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
+ /// stride2' pre-multiplies an index to an ID.
+ stride2: usize,
+}
+
+impl IndexMapper {
+ /// Convert a state ID to a state index.
+ fn to_index(&self, id: StateID) -> usize {
+ id.as_usize() >> self.stride2
+ }
+
+ /// Convert a state index to a state ID.
+ fn to_state_id(&self, index: usize) -> StateID {
+ // CORRECTNESS: If the given index is not valid, then it is not
+ // required for this to panic or return a valid state ID. We'll "just"
+ // wind up with panics or silent logic errors at some other point.
+ StateID::new_unchecked(index << self.stride2)
+ }
+}
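
For intuition, a short worked sketch of the premultiplication arithmetic,
assuming a hypothetical stride2 of 5 (i.e., a stride of 32):

```
// With stride2 = 5, the stride is 2^5 = 32 entries per state, so the
// premultiplied IDs of states 0, 1, 2, 3, ... are 0, 32, 64, 96, ...
let stride2 = 5;
let index = 3usize;
let state_id = index << stride2; // premultiply: 3 * 32 = 96
assert_eq!(96, state_id);
assert_eq!(index, state_id >> stride2); // de-multiply recovers the index
```
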
+
+#[cfg(feature = "dfa-build")]
+mod dense {
+ use crate::{dfa::dense::OwnedDFA, util::primitives::StateID};
+
+ use super::Remappable;
+
+ impl Remappable for OwnedDFA {
+ fn state_len(&self) -> usize {
+ OwnedDFA::state_len(self)
+ }
+
+ fn stride2(&self) -> usize {
+ OwnedDFA::stride2(self)
+ }
+
+ fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ OwnedDFA::swap_states(self, id1, id2)
+ }
+
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+ OwnedDFA::remap(self, map)
+ }
+ }
+}
+
+#[cfg(feature = "dfa-onepass")]
+mod onepass {
+ use crate::{dfa::onepass::DFA, util::primitives::StateID};
+
+ use super::Remappable;
+
+ impl Remappable for DFA {
+ fn state_len(&self) -> usize {
+ DFA::state_len(self)
+ }
+
+ fn stride2(&self) -> usize {
+ // We don't do pre-multiplication for the one-pass DFA, so
+ // returning 0 has the effect of making state IDs and state indices
+ // equivalent.
+ 0
+ }
+
+ fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ DFA::swap_states(self, id1, id2)
+ }
+
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+ DFA::remap(self, map)
+ }
+ }
+}
diff --git a/vendor/regex-automata/src/dfa/search.rs b/vendor/regex-automata/src/dfa/search.rs
index 492414981..8c012a594 100644
--- a/vendor/regex-automata/src/dfa/search.rs
+++ b/vendor/regex-automata/src/dfa/search.rs
@@ -1,493 +1,654 @@
use crate::{
dfa::{
accel,
- automaton::{Automaton, OverlappingState, StateMatch},
+ automaton::{Automaton, OverlappingState},
},
util::{
- id::{PatternID, StateID},
- matchtypes::HalfMatch,
- prefilter, MATCH_OFFSET,
+ prefilter::Prefilter,
+ primitives::StateID,
+ search::{Anchored, HalfMatch, Input, Span},
},
MatchError,
};
#[inline(never)]
-pub fn find_earliest_fwd<A: Automaton + ?Sized>(
- pre: Option<&mut prefilter::Scanner>,
+pub fn find_fwd<A: Automaton + ?Sized>(
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- // Searching with a pattern ID is always anchored, so we should never use
- // a prefilter.
- if pre.is_some() && pattern_id.is_none() {
- find_fwd(pre, true, dfa, pattern_id, bytes, start, end)
- } else {
- find_fwd(None, true, dfa, pattern_id, bytes, start, end)
+ if input.is_done() {
+ return Ok(None);
}
-}
-
-#[inline(never)]
-pub fn find_leftmost_fwd<A: Automaton + ?Sized>(
- pre: Option<&mut prefilter::Scanner>,
- dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
-) -> Result<Option<HalfMatch>, MatchError> {
+ let pre = if input.get_anchored().is_anchored() {
+ None
+ } else {
+ dfa.get_prefilter()
+ };
// Searching with a pattern ID is always anchored, so we should never use
// a prefilter.
- if pre.is_some() && pattern_id.is_none() {
- find_fwd(pre, false, dfa, pattern_id, bytes, start, end)
+ if pre.is_some() {
+ if input.get_earliest() {
+ find_fwd_imp(dfa, input, pre, true)
+ } else {
+ find_fwd_imp(dfa, input, pre, false)
+ }
} else {
- find_fwd(None, false, dfa, pattern_id, bytes, start, end)
+ if input.get_earliest() {
+ find_fwd_imp(dfa, input, None, true)
+ } else {
+ find_fwd_imp(dfa, input, None, false)
+ }
}
}
-/// This is marked as `inline(always)` specifically because it supports
-/// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters
-/// getting inlined eliminate some critical branches. To avoid bloating binary
-/// size, we only call this function in a fixed number of places.
-#[inline(always)]
-fn find_fwd<A: Automaton + ?Sized>(
- mut pre: Option<&mut prefilter::Scanner>,
- earliest: bool,
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn find_fwd_imp<A: Automaton + ?Sized>(
dfa: &A,
- pattern_id: Option<PatternID>,
- haystack: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
+ pre: Option<&'_ Prefilter>,
+ earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
- assert!(start <= end);
- assert!(start <= haystack.len());
- assert!(end <= haystack.len());
-
- // Why do this? This lets 'bytes[at]' work without bounds checks below.
- // It seems the assert on 'end <= haystack.len()' above is otherwise
- // not enough. Why not just make 'bytes' scoped this way anyway? Well,
- // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
- // for resolving look-ahead.
- let bytes = &haystack[..end];
+ // See 'prefilter_restart' docs for explanation.
+ let universal_start = dfa.universal_start_state(Anchored::No).is_some();
+ let mut mat = None;
+ let mut sid = init_fwd(dfa, input)?;
+ let mut at = input.start();
+    // This could just be a closure, but closures cannot be marked 'unsafe'
+    // to invoke, so the unchecked haystack access would hide behind a safe
+    // call. A macro keeps the lack of safety visible in the code below.
+ macro_rules! next_unchecked {
+ ($sid:expr, $at:expr) => {{
+ let byte = *input.haystack().get_unchecked($at);
+ dfa.next_state_unchecked($sid, byte)
+ }};
+ }
- let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?;
- let mut last_match = None;
- let mut at = start;
- if let Some(ref mut pre) = pre {
+ if let Some(ref pre) = pre {
+ let span = Span::from(at..input.end());
// If a prefilter doesn't report false positives, then we don't need to
// touch the DFA at all. However, since all matches include the pattern
// ID, and the prefilter infrastructure doesn't report pattern IDs, we
// limit this optimization to cases where there is exactly one pattern.
// In that case, any match must be the 0th pattern.
- if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
- return Ok(pre.next_candidate(bytes, at).into_option().map(
- |offset| HalfMatch { pattern: PatternID::ZERO, offset },
- ));
- } else if pre.is_effective(at) {
- match pre.next_candidate(bytes, at).into_option() {
- None => return Ok(None),
- Some(i) => {
- at = i;
+ match pre.find(input.haystack(), span) {
+ None => return Ok(mat),
+ Some(ref span) => {
+ at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(dfa, &input, at)?;
}
}
}
}
- while at < end {
- let byte = bytes[at];
- state = dfa.next_state(state, byte);
- at += 1;
- if dfa.is_special_state(state) {
- if dfa.is_start_state(state) {
- if let Some(ref mut pre) = pre {
- if pre.is_effective(at) {
- match pre.next_candidate(bytes, at).into_option() {
- None => return Ok(None),
- Some(i) => {
- at = i;
+ while at < input.end() {
+ // SAFETY: There are two safety invariants we need to uphold here in
+ // the loops below: that 'sid' and 'prev_sid' are valid state IDs
+ // for this DFA, and that 'at' is a valid index into 'haystack'.
+ // For the former, we rely on the invariant that next_state* and
+        // start_state_forward always return a valid state ID (given a valid
+ // state ID in the former case). For the latter safety invariant, we
+ // always guard unchecked access with a check that 'at' is less than
+ // 'end', where 'end <= haystack.len()'. In the unrolled loop below, we
+ // ensure that 'at' is always in bounds.
+ //
+ // PERF: See a similar comment in src/hybrid/search.rs that justifies
+ // this extra work to make the search loop fast. The same reasoning and
+ // benchmarks apply here.
+ let mut prev_sid;
+ while at < input.end() {
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if dfa.is_special_state(prev_sid) || at + 3 >= input.end() {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at += 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if dfa.is_special_state(sid) {
+ break;
+ }
+ at += 1;
+
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if dfa.is_special_state(prev_sid) {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at += 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if dfa.is_special_state(sid) {
+ break;
+ }
+ at += 1;
+ }
+ if dfa.is_special_state(sid) {
+ if dfa.is_start_state(sid) {
+ if let Some(ref pre) = pre {
+ let span = Span::from(at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => return Ok(mat),
+ Some(ref span) => {
+ // We want to skip any update to 'at' below
+ // at the end of this iteration and just
+ // jump immediately back to the next state
+ // transition at the leading position of the
+ // candidate match.
+ //
+ // ... but only if we actually made progress
+ // with our prefilter, otherwise if the start
+ // state has a self-loop, we can get stuck.
+ if span.start > at {
+ at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(dfa, &input, at)?;
+ }
+ continue;
}
}
}
- } else if dfa.is_accel_state(state) {
- let needles = dfa.accelerator(state);
- at = accel::find_fwd(needles, bytes, at)
- .unwrap_or(bytes.len());
+ } else if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ at = accel::find_fwd(needles, input.haystack(), at + 1)
+ .unwrap_or(input.end());
+ continue;
}
- } else if dfa.is_match_state(state) {
- last_match = Some(HalfMatch {
- pattern: dfa.match_pattern(state, 0),
- offset: at - MATCH_OFFSET,
- });
+ } else if dfa.is_match_state(sid) {
+ let pattern = dfa.match_pattern(sid, 0);
+ mat = Some(HalfMatch::new(pattern, at));
if earliest {
- return Ok(last_match);
+ return Ok(mat);
}
- if dfa.is_accel_state(state) {
- let needles = dfa.accelerator(state);
- at = accel::find_fwd(needles, bytes, at)
- .unwrap_or(bytes.len());
+ if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ at = accel::find_fwd(needles, input.haystack(), at + 1)
+ .unwrap_or(input.end());
+ continue;
}
- } else if dfa.is_accel_state(state) {
- let needs = dfa.accelerator(state);
- at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len());
- } else if dfa.is_dead_state(state) {
- return Ok(last_match);
+ } else if dfa.is_accel_state(sid) {
+ let needs = dfa.accelerator(sid);
+ at = accel::find_fwd(needs, input.haystack(), at + 1)
+ .unwrap_or(input.end());
+ continue;
+ } else if dfa.is_dead_state(sid) {
+ return Ok(mat);
} else {
- debug_assert!(dfa.is_quit_state(state));
- if last_match.is_some() {
- return Ok(last_match);
- }
- return Err(MatchError::Quit { byte, offset: at - 1 });
+ // It's important that this is a debug_assert, since this can
+ // actually be tripped even if DFA::from_bytes succeeds and
+ // returns a supposedly valid DFA.
+ debug_assert!(dfa.is_quit_state(sid));
+ return Err(MatchError::quit(input.haystack()[at], at));
}
}
- while at < end && dfa.next_state(state, bytes[at]) == state {
- at += 1;
- }
+ at += 1;
}
- Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match))
+ eoi_fwd(dfa, input, &mut sid, &mut mat)?;
+ Ok(mat)
}
#[inline(never)]
-pub fn find_earliest_rev<A: Automaton + ?Sized>(
+pub fn find_rev<A: Automaton + ?Sized>(
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- find_rev(true, dfa, pattern_id, bytes, start, end)
+ if input.is_done() {
+ return Ok(None);
+ }
+ if input.get_earliest() {
+ find_rev_imp(dfa, input, true)
+ } else {
+ find_rev_imp(dfa, input, false)
+ }
}
-#[inline(never)]
-pub fn find_leftmost_rev<A: Automaton + ?Sized>(
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn find_rev_imp<A: Automaton + ?Sized>(
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
-) -> Result<Option<HalfMatch>, MatchError> {
- find_rev(false, dfa, pattern_id, bytes, start, end)
-}
-
-/// This is marked as `inline(always)` specifically because it supports
-/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
-/// permits eliminating a few crucial branches.
-#[inline(always)]
-fn find_rev<A: Automaton + ?Sized>(
+ input: &Input<'_>,
earliest: bool,
- dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
- assert!(start <= end);
- assert!(start <= bytes.len());
- assert!(end <= bytes.len());
+ let mut mat = None;
+ let mut sid = init_rev(dfa, input)?;
+ // In reverse search, the loop below can't handle the case of searching an
+ // empty slice. Ideally we could write something congruent to the forward
+ // search, i.e., 'while at >= start', but 'start' might be 0. Since we use
+ // an unsigned offset, 'at >= 0' is trivially always true. We could avoid
+ // this extra case handling by using a signed offset, but Rust makes it
+ // annoying to do. So... We just handle the empty case separately.
+ if input.start() == input.end() {
+ eoi_rev(dfa, input, &mut sid, &mut mat)?;
+ return Ok(mat);
+ }
- let mut state = init_rev(dfa, pattern_id, bytes, start, end)?;
- let mut last_match = None;
- let mut at = end;
- while at > start {
- at -= 1;
- while at > start && dfa.next_state(state, bytes[at]) == state {
+ let mut at = input.end() - 1;
+ macro_rules! next_unchecked {
+ ($sid:expr, $at:expr) => {{
+ let byte = *input.haystack().get_unchecked($at);
+ dfa.next_state_unchecked($sid, byte)
+ }};
+ }
+ loop {
+ // SAFETY: See comments in 'find_fwd' for a safety argument.
+ let mut prev_sid;
+ while at >= input.start() {
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if dfa.is_special_state(prev_sid)
+ || at <= input.start().saturating_add(3)
+ {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at -= 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if dfa.is_special_state(sid) {
+ break;
+ }
at -= 1;
- }
- let byte = bytes[at];
- state = dfa.next_state(state, byte);
- if dfa.is_special_state(state) {
- if dfa.is_start_state(state) {
- if dfa.is_accel_state(state) {
- let needles = dfa.accelerator(state);
- at = accel::find_rev(needles, bytes, at)
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if dfa.is_special_state(prev_sid) {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at -= 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if dfa.is_special_state(sid) {
+ break;
+ }
+ at -= 1;
+ }
+ if dfa.is_special_state(sid) {
+ if dfa.is_start_state(sid) {
+ if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ at = accel::find_rev(needles, input.haystack(), at)
.map(|i| i + 1)
- .unwrap_or(0);
+ .unwrap_or(input.start());
}
- } else if dfa.is_match_state(state) {
- last_match = Some(HalfMatch {
- pattern: dfa.match_pattern(state, 0),
- offset: at + MATCH_OFFSET,
- });
+ } else if dfa.is_match_state(sid) {
+ let pattern = dfa.match_pattern(sid, 0);
+                // Since reverse searches report the *inclusive* beginning
+                // of a match, and since match states are delayed by one
+                // byte, the match actually begins at 'at + 1'.
+ mat = Some(HalfMatch::new(pattern, at + 1));
if earliest {
- return Ok(last_match);
+ return Ok(mat);
}
- if dfa.is_accel_state(state) {
- let needles = dfa.accelerator(state);
- at = accel::find_rev(needles, bytes, at)
+ if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ at = accel::find_rev(needles, input.haystack(), at)
.map(|i| i + 1)
- .unwrap_or(0);
+ .unwrap_or(input.start());
}
- } else if dfa.is_accel_state(state) {
- let needles = dfa.accelerator(state);
- at = accel::find_rev(needles, bytes, at)
+ } else if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+                // If the accelerator returns nothing, why don't we quit the
+                // search? Well, if the accelerator doesn't find anything, that
+                // doesn't mean we don't have a match. It just means that we
+                // can't leave the current state given any of the 256 possible
+                // byte values. However, there might be an EOI transition. So
+                // we set 'at' to the start of the search, which will cause
+                // this loop to stop and fall down into the EOI transition.
+ at = accel::find_rev(needles, input.haystack(), at)
.map(|i| i + 1)
- .unwrap_or(0);
- } else if dfa.is_dead_state(state) {
- return Ok(last_match);
+ .unwrap_or(input.start());
+ } else if dfa.is_dead_state(sid) {
+ return Ok(mat);
} else {
- debug_assert!(dfa.is_quit_state(state));
- if last_match.is_some() {
- return Ok(last_match);
- }
- return Err(MatchError::Quit { byte, offset: at });
+ debug_assert!(dfa.is_quit_state(sid));
+ return Err(MatchError::quit(input.haystack()[at], at));
}
}
+ if at == input.start() {
+ break;
+ }
+ at -= 1;
}
- Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match))
+ eoi_rev(dfa, input, &mut sid, &mut mat)?;
+ Ok(mat)
}
#[inline(never)]
pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
- pre: Option<&mut prefilter::Scanner>,
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- caller_state: &mut OverlappingState,
-) -> Result<Option<HalfMatch>, MatchError> {
- // Searching with a pattern ID is always anchored, so we should only ever
- // use a prefilter when no pattern ID is given.
- if pre.is_some() && pattern_id.is_none() {
- find_overlapping_fwd_imp(
- pre,
- dfa,
- pattern_id,
- bytes,
- start,
- end,
- caller_state,
- )
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ state.mat = None;
+ if input.is_done() {
+ return Ok(());
+ }
+ let pre = if input.get_anchored().is_anchored() {
+ None
+ } else {
+ dfa.get_prefilter()
+ };
+ if pre.is_some() {
+ find_overlapping_fwd_imp(dfa, input, pre, state)
} else {
- find_overlapping_fwd_imp(
- None,
- dfa,
- pattern_id,
- bytes,
- start,
- end,
- caller_state,
- )
+ find_overlapping_fwd_imp(dfa, input, None, state)
}
}
-/// This is marked as `inline(always)` specifically because it supports
-/// multiple modes of searching. Namely, the 'pre' prefilter getting inlined
-/// permits eliminating a few crucial branches and reduces code size when it is
-/// not used.
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
- mut pre: Option<&mut prefilter::Scanner>,
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- mut start: usize,
- end: usize,
- caller_state: &mut OverlappingState,
-) -> Result<Option<HalfMatch>, MatchError> {
- assert!(start <= end);
- assert!(start <= bytes.len());
- assert!(end <= bytes.len());
-
- let mut state = match caller_state.id() {
- None => init_fwd(dfa, pattern_id, bytes, start, end)?,
- Some(id) => {
- if let Some(last) = caller_state.last_match() {
- let match_count = dfa.match_count(id);
- if last.match_index < match_count {
- let m = HalfMatch {
- pattern: dfa.match_pattern(id, last.match_index),
- offset: last.offset,
- };
- last.match_index += 1;
- return Ok(Some(m));
+ input: &Input<'_>,
+ pre: Option<&'_ Prefilter>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ // See 'prefilter_restart' docs for explanation.
+ let universal_start = dfa.universal_start_state(Anchored::No).is_some();
+ let mut sid = match state.id {
+ None => {
+ state.at = input.start();
+ init_fwd(dfa, input)?
+ }
+ Some(sid) => {
+ if let Some(match_index) = state.next_match_index {
+ let match_len = dfa.match_len(sid);
+ if match_index < match_len {
+ state.next_match_index = Some(match_index + 1);
+ let pattern = dfa.match_pattern(sid, match_index);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
}
}
-
- // This is a subtle but critical detail. If the caller provides a
- // non-None state ID, then it must be the case that the state ID
- // corresponds to one set by this function. The state ID therefore
- // corresponds to a match state, a dead state or some other state.
- // However, "some other" state _only_ occurs when the input has
- // been exhausted because the only way to stop before then is to
- // see a match or a dead/quit state.
- //
- // If the input is exhausted or if it's a dead state, then
- // incrementing the starting position has no relevance on
- // correctness, since the loop below will either not execute
- // at all or will immediately stop due to being in a dead state.
- // (Once in a dead state it is impossible to leave it.)
- //
- // Therefore, the only case we need to consider is when
- // caller_state is a match state. In this case, since our machines
- // support the ability to delay a match by a certain number of
- // bytes (to support look-around), it follows that we actually
- // consumed that many additional bytes on our previous search. When
- // the caller resumes their search to find subsequent matches, they
- // will use the ending location from the previous match as the next
- // starting point, which is `MATCH_OFFSET` bytes PRIOR to where
- // we scanned to on the previous search. Therefore, we need to
- // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
- //
- // Incidentally, since MATCH_OFFSET is non-zero, this also makes
- // dealing with empty matches convenient. Namely, callers needn't
- // special case them when implementing an iterator. Instead, this
- // ensures that forward progress is always made.
- start += MATCH_OFFSET;
- id
+ // Once we've reported all matches at a given position, we need to
+ // advance the search to the next position.
+ state.at += 1;
+ if state.at > input.end() {
+ return Ok(());
+ }
+ sid
}
};
- let mut at = start;
- while at < end {
- let byte = bytes[at];
- state = dfa.next_state(state, byte);
- at += 1;
- if dfa.is_special_state(state) {
- caller_state.set_id(state);
- if dfa.is_start_state(state) {
- if let Some(ref mut pre) = pre {
- if pre.is_effective(at) {
- match pre.next_candidate(bytes, at).into_option() {
- None => return Ok(None),
- Some(i) => {
- at = i;
+ // NOTE: We don't optimize the crap out of this routine primarily because
+ // it seems like most find_overlapping searches will have higher match
+ // counts, and thus, throughput is perhaps not as important. But if you
+ // have a use case for something faster, feel free to file an issue.
+ while state.at < input.end() {
+ sid = dfa.next_state(sid, input.haystack()[state.at]);
+ if dfa.is_special_state(sid) {
+ state.id = Some(sid);
+ if dfa.is_start_state(sid) {
+ if let Some(ref pre) = pre {
+ let span = Span::from(state.at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => return Ok(()),
+ Some(ref span) => {
+ if span.start > state.at {
+ state.at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(
+ dfa, &input, state.at,
+ )?;
+ }
+ continue;
}
}
}
- } else if dfa.is_accel_state(state) {
- let needles = dfa.accelerator(state);
- at = accel::find_fwd(needles, bytes, at)
- .unwrap_or(bytes.len());
+ } else if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ state.at = accel::find_fwd(
+ needles,
+ input.haystack(),
+ state.at + 1,
+ )
+ .unwrap_or(input.end());
+ continue;
+ }
+ } else if dfa.is_match_state(sid) {
+ state.next_match_index = Some(1);
+ let pattern = dfa.match_pattern(sid, 0);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
+ } else if dfa.is_accel_state(sid) {
+ let needs = dfa.accelerator(sid);
+ // If the accelerator returns nothing, why don't we quit the
+ // search? Well, if the accelerator doesn't find anything, that
+ // doesn't mean we don't have a match. It just means that we
+            // can't leave the current state given any of the 256 possible
+ // byte values. However, there might be an EOI transition. So
+ // we set 'at' to the end of the haystack, which will cause
+ // this loop to stop and fall down into the EOI transition.
+ state.at =
+ accel::find_fwd(needs, input.haystack(), state.at + 1)
+ .unwrap_or(input.end());
+ continue;
+ } else if dfa.is_dead_state(sid) {
+ return Ok(());
+ } else {
+ debug_assert!(dfa.is_quit_state(sid));
+ return Err(MatchError::quit(
+ input.haystack()[state.at],
+ state.at,
+ ));
+ }
+ }
+ state.at += 1;
+ }
+
+ let result = eoi_fwd(dfa, input, &mut sid, &mut state.mat);
+ state.id = Some(sid);
+ if state.mat.is_some() {
+ // '1' is always correct here since if we get to this point, this
+ // always corresponds to the first (index '0') match discovered at
+ // this position. So the next match to report at this position (if
+ // it exists) is at index '1'.
+ state.next_match_index = Some(1);
+ }
+ result
+}
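
A sketch (not part of the patch) of how callers drive the new overlapping API:
the search is resumed with the same `OverlappingState` until `get_match`
reports nothing more, mirroring the updated sparse DFA module docs later in
this patch. The pattern `sam|samwise` and its offsets are illustrative:

```
use regex_automata::{
    dfa::{dense, Automaton, OverlappingState},
    HalfMatch, Input, MatchKind,
};

let dfa = dense::Builder::new()
    .configure(dense::Config::new().match_kind(MatchKind::All))
    .build(r"sam|samwise")?;
let input = Input::new("samwise");
let mut state = OverlappingState::start();
let mut got = vec![];
loop {
    dfa.try_search_overlapping_fwd(&input, &mut state)?;
    match state.get_match() {
        None => break,
        Some(hm) => got.push(hm),
    }
}
assert_eq!(vec![HalfMatch::must(0, 3), HalfMatch::must(0, 7)], got);
# Ok::<(), Box<dyn std::error::Error>>(())
```
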
+
+#[inline(never)]
+pub(crate) fn find_overlapping_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ state.mat = None;
+ if input.is_done() {
+ return Ok(());
+ }
+ let mut sid = match state.id {
+ None => {
+ let sid = init_rev(dfa, input)?;
+ state.id = Some(sid);
+ if input.start() == input.end() {
+ state.rev_eoi = true;
+ } else {
+ state.at = input.end() - 1;
+ }
+ sid
+ }
+ Some(sid) => {
+ if let Some(match_index) = state.next_match_index {
+ let match_len = dfa.match_len(sid);
+ if match_index < match_len {
+ state.next_match_index = Some(match_index + 1);
+ let pattern = dfa.match_pattern(sid, match_index);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
+ }
+ }
+ // Once we've reported all matches at a given position, we need
+ // to advance the search to the next position. However, if we've
+ // already followed the EOI transition, then we know we're done
+ // with the search and there cannot be any more matches to report.
+ if state.rev_eoi {
+ return Ok(());
+ } else if state.at == input.start() {
+ // At this point, we should follow the EOI transition. This
+                // will cause us to skip the main loop below and fall through
+ // to the final 'eoi_rev' transition.
+ state.rev_eoi = true;
+ } else {
+ // We haven't hit the end of the search yet, so move on.
+ state.at -= 1;
+ }
+ sid
+ }
+ };
+ while !state.rev_eoi {
+ sid = dfa.next_state(sid, input.haystack()[state.at]);
+ if dfa.is_special_state(sid) {
+ state.id = Some(sid);
+ if dfa.is_start_state(sid) {
+ if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ state.at =
+ accel::find_rev(needles, input.haystack(), state.at)
+ .map(|i| i + 1)
+ .unwrap_or(input.start());
}
- } else if dfa.is_match_state(state) {
- let offset = at - MATCH_OFFSET;
- caller_state
- .set_last_match(StateMatch { match_index: 1, offset });
- return Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(state, 0),
- offset,
- }));
- } else if dfa.is_accel_state(state) {
- let needs = dfa.accelerator(state);
- at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len());
- } else if dfa.is_dead_state(state) {
- return Ok(None);
+ } else if dfa.is_match_state(sid) {
+ state.next_match_index = Some(1);
+ let pattern = dfa.match_pattern(sid, 0);
+ state.mat = Some(HalfMatch::new(pattern, state.at + 1));
+ return Ok(());
+ } else if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+                // If the accelerator returns nothing, why don't we quit the
+                // search? Well, if the accelerator doesn't find anything, that
+                // doesn't mean we don't have a match. It just means that we
+                // can't leave the current state given any of the 256 possible
+                // byte values. However, there might be an EOI transition. So
+                // we set 'at' to the start of the search, which will cause
+                // this loop to stop and fall down into the EOI transition.
+ state.at =
+ accel::find_rev(needles, input.haystack(), state.at)
+ .map(|i| i + 1)
+ .unwrap_or(input.start());
+ } else if dfa.is_dead_state(sid) {
+ return Ok(());
} else {
- debug_assert!(dfa.is_quit_state(state));
- return Err(MatchError::Quit { byte, offset: at - 1 });
+ debug_assert!(dfa.is_quit_state(sid));
+ return Err(MatchError::quit(
+ input.haystack()[state.at],
+ state.at,
+ ));
}
}
+ if state.at == input.start() {
+ break;
+ }
+ state.at -= 1;
}
- let result = eoi_fwd(dfa, bytes, end, &mut state);
- caller_state.set_id(state);
- if let Ok(Some(ref last_match)) = result {
- caller_state.set_last_match(StateMatch {
- match_index: 1,
- offset: last_match.offset(),
- });
+ let result = eoi_rev(dfa, input, &mut sid, &mut state.mat);
+ state.rev_eoi = true;
+ state.id = Some(sid);
+ if state.mat.is_some() {
+ // '1' is always correct here since if we get to this point, this
+ // always corresponds to the first (index '0') match discovered at
+ // this position. So the next match to report at this position (if
+ // it exists) is at index '1'.
+ state.next_match_index = Some(1);
}
result
}
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_fwd<A: Automaton + ?Sized>(
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<StateID, MatchError> {
- let state = dfa.start_state_forward(pattern_id, bytes, start, end);
+ let sid = dfa.start_state_forward(input)?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
- assert!(!dfa.is_match_state(state));
- Ok(state)
+ debug_assert!(!dfa.is_match_state(sid));
+ Ok(sid)
}
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_rev<A: Automaton + ?Sized>(
dfa: &A,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<StateID, MatchError> {
- let state = dfa.start_state_reverse(pattern_id, bytes, start, end);
+ let sid = dfa.start_state_reverse(input)?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
- assert!(!dfa.is_match_state(state));
- Ok(state)
+ debug_assert!(!dfa.is_match_state(sid));
+ Ok(sid)
}
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_fwd<A: Automaton + ?Sized>(
dfa: &A,
- bytes: &[u8],
- end: usize,
- state: &mut StateID,
-) -> Result<Option<HalfMatch>, MatchError> {
- match bytes.get(end) {
+ input: &Input<'_>,
+ sid: &mut StateID,
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ let sp = input.get_span();
+ match input.haystack().get(sp.end) {
Some(&b) => {
- *state = dfa.next_state(*state, b);
- if dfa.is_match_state(*state) {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(*state, 0),
- offset: end,
- }))
- } else {
- Ok(None)
+ *sid = dfa.next_state(*sid, b);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.end));
+ } else if dfa.is_quit_state(*sid) {
+ return Err(MatchError::quit(b, sp.end));
}
}
None => {
- *state = dfa.next_eoi_state(*state);
- if dfa.is_match_state(*state) {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(*state, 0),
- offset: bytes.len(),
- }))
- } else {
- Ok(None)
+ *sid = dfa.next_eoi_state(*sid);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, input.haystack().len()));
}
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!dfa.is_quit_state(*sid));
}
}
+ Ok(())
}
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_rev<A: Automaton + ?Sized>(
dfa: &A,
- bytes: &[u8],
- start: usize,
- state: StateID,
-) -> Result<Option<HalfMatch>, MatchError> {
- if start > 0 {
- let state = dfa.next_state(state, bytes[start - 1]);
- if dfa.is_match_state(state) {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(state, 0),
- offset: start,
- }))
- } else {
- Ok(None)
+ input: &Input<'_>,
+ sid: &mut StateID,
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ let sp = input.get_span();
+ if sp.start > 0 {
+ let byte = input.haystack()[sp.start - 1];
+ *sid = dfa.next_state(*sid, byte);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.start));
+ } else if dfa.is_quit_state(*sid) {
+ return Err(MatchError::quit(byte, sp.start - 1));
}
} else {
- let state = dfa.next_eoi_state(state);
- if dfa.is_match_state(state) {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(state, 0),
- offset: 0,
- }))
- } else {
- Ok(None)
+ *sid = dfa.next_eoi_state(*sid);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, 0));
}
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!dfa.is_quit_state(*sid));
}
+ Ok(())
}
-// Currently unused, but is useful to keep around. This was originally used
-// when the code above used raw pointers for its main loop.
-// /// Returns the distance between the given pointer and the start of `bytes`.
-// /// This assumes that the given pointer points to somewhere in the `bytes`
-// /// slice given.
-// fn offset(bytes: &[u8], p: *const u8) -> usize {
-// debug_assert!(bytes.as_ptr() <= p);
-// debug_assert!(bytes[bytes.len()..].as_ptr() >= p);
-// ((p as isize) - (bytes.as_ptr() as isize)) as usize
-// }
+/// Re-compute the starting state that a DFA should be in after finding a
+/// prefilter candidate match at the position `at`.
+///
+/// The function with the same name has a bit more docs in hybrid/search.rs.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn prefilter_restart<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+ at: usize,
+) -> Result<StateID, MatchError> {
+ let mut input = input.clone();
+ input.set_start(at);
+ init_fwd(dfa, &input)
+}
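
A sketch (not part of the patch) of the new entry points from the caller's
side: the `earliest` flag now travels with `Input` instead of selecting a
separate `find_earliest_*`/`find_leftmost_*` routine. Offsets are
illustrative, assuming default crate features:

```
use regex_automata::{dfa::{dense, Automaton}, HalfMatch, Input};

let dfa = dense::DFA::new("foo[0-9]+")?;

// The default search runs to the end of the leftmost match.
let input = Input::new("foo12345");
assert_eq!(Some(HalfMatch::must(0, 8)), dfa.try_search_fwd(&input)?);

// With 'earliest' set, the search stops at the first match state seen.
let input = Input::new("foo12345").earliest(true);
assert_eq!(Some(HalfMatch::must(0, 4)), dfa.try_search_fwd(&input)?);
# Ok::<(), Box<dyn std::error::Error>>(())
```
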
diff --git a/vendor/regex-automata/src/dfa/search_unsafe.rs b/vendor/regex-automata/src/dfa/search_unsafe.rs
deleted file mode 100644
index ea1c29ff7..000000000
--- a/vendor/regex-automata/src/dfa/search_unsafe.rs
+++ /dev/null
@@ -1,321 +0,0 @@
-use crate::dfa::automaton::{Automaton, State};
-use crate::MatchError;
-
-/// This is marked as `inline(always)` specifically because it supports
-/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
-/// permits eliminating a few crucial branches.
-#[inline(always)]
-pub fn find_fwd<A: Automaton + ?Sized>(
- dfa: &A,
- bytes: &[u8],
- start: usize,
- end: usize,
- earliest: bool,
-) -> Result<Option<usize>, MatchError> {
- assert!(start <= end);
- assert!(start <= bytes.len());
- assert!(end <= bytes.len());
-
- let (mut state, mut last_match) = init_fwd(dfa, bytes, start, end)?;
- if earliest && last_match.is_some() {
- return Ok(last_match);
- }
-
- let mut at = start;
- while at < end {
- let byte = bytes[at];
- state = dfa.next_state(state, byte);
- at += 1;
- if dfa.is_special_state(state) {
- if dfa.is_dead_state(state) {
- return Ok(last_match);
- } else if dfa.is_quit_state(state) {
- return Err(MatchError::Quit { byte, offset: at - 1 });
- }
- last_match = Some(at - dfa.match_offset());
- if earliest {
- return Ok(last_match);
- }
- }
- }
- /*
- unsafe {
- let mut p = bytes.as_ptr().add(start);
- while p < bytes[end..].as_ptr() {
- let byte = *p;
- state = dfa.next_state_unchecked(state, byte);
- p = p.add(1);
- if dfa.is_special_state(state) {
- if dfa.is_dead_state(state) {
- return Ok(last_match);
- } else if dfa.is_quit_state(state) {
- return Err(MatchError::Quit {
- byte,
- offset: offset(bytes, p) - 1,
- });
- }
- last_match = Some(offset(bytes, p) - dfa.match_offset());
- if earliest {
- return Ok(last_match);
- }
- }
- }
- }
- */
- Ok(eof_fwd(dfa, bytes, end, &mut state)?.or(last_match))
-}
-
-/// This is marked as `inline(always)` specifically because it supports
-/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
-/// permits eliminating a few crucial branches.
-#[inline(always)]
-pub fn find_rev<A: Automaton + ?Sized>(
- dfa: &A,
- bytes: &[u8],
- start: usize,
- end: usize,
- earliest: bool,
-) -> Result<Option<usize>, MatchError> {
- assert!(start <= end);
- assert!(start <= bytes.len());
- assert!(end <= bytes.len());
-
- let (mut state, mut last_match) = init_rev(dfa, bytes, start, end)?;
- if earliest && last_match.is_some() {
- return Ok(last_match);
- }
-
- let mut at = end;
- while at > start {
- at -= 1;
- let byte = bytes[at];
- state = dfa.next_state(state, byte);
- if dfa.is_special_state(state) {
- if dfa.is_dead_state(state) {
- return Ok(last_match);
- } else if dfa.is_quit_state(state) {
- return Err(MatchError::Quit { byte, offset: at });
- }
- last_match = Some(at + dfa.match_offset());
- if earliest {
- return Ok(last_match);
- }
- }
- }
- /*
- unsafe {
- let mut p = bytes.as_ptr().add(end);
- while p > bytes[start..].as_ptr() {
- p = p.sub(1);
- let byte = *p;
- state = dfa.next_state_unchecked(state, byte);
- if dfa.is_special_state(state) {
- if dfa.is_dead_state(state) {
- return Ok(last_match);
- } else if dfa.is_quit_state(state) {
- return Err(MatchError::Quit {
- byte,
- offset: offset(bytes, p),
- });
- }
- last_match = Some(offset(bytes, p) + dfa.match_offset());
- if earliest {
- return Ok(last_match);
- }
- }
- }
- }
- */
- Ok(eof_rev(dfa, state, bytes, start)?.or(last_match))
-}
-
-pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
- dfa: &A,
- bytes: &[u8],
- mut start: usize,
- end: usize,
- caller_state: &mut State<A::ID>,
-) -> Result<Option<usize>, MatchError> {
- assert!(start <= end);
- assert!(start <= bytes.len());
- assert!(end <= bytes.len());
-
- let (mut state, mut last_match) = match caller_state.as_option() {
- None => init_fwd(dfa, bytes, start, end)?,
- Some(id) => {
- // This is a subtle but critical detail. If the caller provides a
- // non-None state ID, then it must be the case that the state ID
- // corresponds to one set by this function. The state ID therefore
- // corresponds to a match state, a dead state or some other state.
- // However, "some other" state _only_ occurs when the input has
- // been exhausted because the only way to stop before then is to
- // see a match or a dead/quit state.
- //
- // If the input is exhausted or if it's a dead state, then
- // incrementing the starting position has no relevance on
- // correctness, since the loop below will either not execute
- // at all or will immediately stop due to being in a dead state.
- // (Once in a dead state it is impossible to leave it.)
- //
- // Therefore, the only case we need to consider is when
- // caller_state is a match state. In this case, since our machines
- // support the ability to delay a match by a certain number of
- // bytes (to support look-around), it follows that we actually
- // consumed that many additional bytes on our previous search. When
- // the caller resumes their search to find subsequent matches, they
- // will use the ending location from the previous match as the next
- // starting point, which is `match_offset` bytes PRIOR to where
- // we scanned to on the previous search. Therefore, we need to
- // compensate by bumping `start` up by `match_offset` bytes.
- start += dfa.match_offset();
- // Since match_offset could be any arbitrary value and we use
- // `start` in pointer arithmetic below, we check that we are still
- // in bounds. Otherwise, we could materialize a pointer that is
- // more than one past the end point of `bytes`, which is UB.
- if start > end {
- return Ok(None);
- }
- (id, None)
- }
- };
- if last_match.is_some() {
- caller_state.set(state);
- return Ok(last_match);
- }
-
- let mut at = start;
- while at < end {
- let byte = bytes[at];
- state = dfa.next_state(state, byte);
- at += 1;
- if dfa.is_special_state(state) {
- caller_state.set(state);
- if dfa.is_dead_state(state) {
- return Ok(None);
- } else if dfa.is_quit_state(state) {
- return Err(MatchError::Quit { byte, offset: at - 1 });
- } else {
- return Ok(Some(at - dfa.match_offset()));
- }
- }
- }
- /*
- // SAFETY: Other than the normal pointer arithmetic happening here, a
- // unique aspect of safety for this function is the fact that the caller
- // can provide the state that the search routine will start with. If this
- // state were invalid, it would be possible to incorrectly index the
- // transition table. We however prevent this from happening by guaranteeing
- // that State is valid. Namely, callers cannot mutate a State. All they can
- // do is create a "start" state or otherwise reuse a previously set state.
- // Since callers can't mutate a state, it follows that a previously set
- // state can only be retrieved by crate internal functions. Therefore, our
- // use of it is safe since this code will only ever set the provided state
- // to a valid state.
- unsafe {
- let mut p = bytes.as_ptr().add(start);
- while p < bytes[end..].as_ptr() {
- let byte = *p;
- state = dfa.next_state_unchecked(state, byte);
- p = p.add(1);
- if dfa.is_special_state(state) {
- caller_state.set(state);
- return if dfa.is_dead_state(state) {
- Ok(None)
- } else if dfa.is_quit_state(state) {
- Err(MatchError::Quit { byte, offset: offset(bytes, p) - 1 })
- } else {
- Ok(Some(offset(bytes, p) - dfa.match_offset()))
- };
- }
- }
- }
- */
-
- let result = eof_fwd(dfa, bytes, end, &mut state);
- caller_state.set(state);
- result
-}
-
-fn init_fwd<A: Automaton + ?Sized>(
- dfa: &A,
- bytes: &[u8],
- start: usize,
- end: usize,
-) -> Result<(A::ID, Option<usize>), MatchError> {
- let state = dfa.start_state_forward(bytes, start, end);
- if dfa.is_match_state(state) {
- Ok((state, Some(start - dfa.match_offset())))
- } else {
- Ok((state, None))
- }
-}
-
-fn init_rev<A: Automaton + ?Sized>(
- dfa: &A,
- bytes: &[u8],
- start: usize,
- end: usize,
-) -> Result<(A::ID, Option<usize>), MatchError> {
- let state = dfa.start_state_reverse(bytes, start, end);
- if dfa.is_match_state(state) {
- Ok((state, Some(end + dfa.match_offset())))
- } else {
- Ok((state, None))
- }
-}
-
-fn eof_fwd<A: Automaton + ?Sized>(
- dfa: &A,
- bytes: &[u8],
- end: usize,
- state: &mut A::ID,
-) -> Result<Option<usize>, MatchError> {
- match bytes.get(end) {
- Some(&b) => {
- *state = dfa.next_state(*state, b);
- if dfa.is_match_state(*state) {
- Ok(Some(end))
- } else {
- Ok(None)
- }
- }
- None => {
- *state = dfa.next_eof_state(*state);
- if dfa.is_match_state(*state) {
- Ok(Some(bytes.len()))
- } else {
- Ok(None)
- }
- }
- }
-}
-
-fn eof_rev<A: Automaton + ?Sized>(
- dfa: &A,
- state: A::ID,
- bytes: &[u8],
- start: usize,
-) -> Result<Option<usize>, MatchError> {
- if start > 0 {
- if dfa.is_match_state(dfa.next_state(state, bytes[start - 1])) {
- Ok(Some(start))
- } else {
- Ok(None)
- }
- } else {
- if dfa.is_match_state(dfa.next_eof_state(state)) {
- Ok(Some(0))
- } else {
- Ok(None)
- }
- }
-}
-
-/// Returns the distance between the given pointer and the start of `bytes`.
-/// This assumes that the given pointer points to somewhere in the `bytes`
-/// slice given.
-fn offset(bytes: &[u8], p: *const u8) -> usize {
- debug_assert!(bytes.as_ptr() <= p);
- debug_assert!(bytes[bytes.len()..].as_ptr() >= p);
- ((p as isize) - (bytes.as_ptr() as isize)) as usize
-}
diff --git a/vendor/regex-automata/src/dfa/sparse.rs b/vendor/regex-automata/src/dfa/sparse.rs
index 346606987..5d8ec2340 100644
--- a/vendor/regex-automata/src/dfa/sparse.rs
+++ b/vendor/regex-automata/src/dfa/sparse.rs
@@ -14,7 +14,7 @@ example, this configures a sparse DFA to do an overlapping search:
```
use regex_automata::{
dfa::{Automaton, OverlappingState, dense},
- HalfMatch, MatchKind,
+ HalfMatch, Input, MatchKind,
};
let dense_re = dense::Builder::new()
@@ -23,25 +23,21 @@ let dense_re = dense::Builder::new()
let sparse_re = dense_re.to_sparse()?;
// Setup our haystack and initial start state.
-let haystack = b"Samwise";
+let input = Input::new("Samwise");
let mut state = OverlappingState::start();
// First, 'Sam' will match.
-let end1 = sparse_re.find_overlapping_fwd_at(
- None, None, haystack, 0, haystack.len(), &mut state,
-)?;
-assert_eq!(end1, Some(HalfMatch::must(0, 3)));
+sparse_re.try_search_overlapping_fwd(&input, &mut state)?;
+assert_eq!(Some(HalfMatch::must(0, 3)), state.get_match());
// And now 'Samwise' will match.
-let end2 = sparse_re.find_overlapping_fwd_at(
- None, None, haystack, 3, haystack.len(), &mut state,
-)?;
-assert_eq!(end2, Some(HalfMatch::must(0, 7)));
+sparse_re.try_search_overlapping_fwd(&input, &mut state)?;
+assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match());
# Ok::<(), Box<dyn std::error::Error>>(())
```
*/
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
use core::iter;
use core::{
convert::{TryFrom, TryInto},
@@ -49,23 +45,27 @@ use core::{
mem::size_of,
};
-#[cfg(feature = "alloc")]
-use alloc::{collections::BTreeSet, vec, vec::Vec};
+#[cfg(feature = "dfa-build")]
+use alloc::{vec, vec::Vec};
-#[cfg(feature = "alloc")]
-use crate::dfa::{dense, error::Error};
+#[cfg(feature = "dfa-build")]
+use crate::dfa::dense::{self, BuildError};
use crate::{
dfa::{
automaton::{fmt_state_indicator, Automaton},
+ dense::Flags,
special::Special,
- DEAD,
+ StartKind, DEAD,
},
util::{
- alphabet::ByteClasses,
- bytes::{self, DeserializeError, Endian, SerializeError},
- id::{PatternID, StateID},
- start::Start,
- DebugByte,
+ alphabet::{ByteClasses, ByteSet},
+ escape::DebugByte,
+ int::{Pointer, Usize, U16, U32},
+ prefilter::Prefilter,
+ primitives::{PatternID, StateID},
+ search::{Anchored, Input, MatchError},
+ start::{Start, StartByteMap},
+ wire::{self, DeserializeError, Endian, SerializeError},
},
};
@@ -107,14 +107,11 @@ const VERSION: u32 = 2;
/// for searching. For example:
///
/// ```
-/// use regex_automata::{
-/// dfa::{Automaton, sparse::DFA},
-/// HalfMatch,
-/// };
+/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+")?;
-/// let expected = HalfMatch::must(0, 8);
-/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+/// let expected = Some(HalfMatch::must(0, 8));
+/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone)]
@@ -130,12 +127,15 @@ pub struct DFA<T> {
//
// That is, a lot of the complexity is pushed down into how each state
// itself is represented.
- trans: Transitions<T>,
- starts: StartTable<T>,
+ tt: Transitions<T>,
+ st: StartTable<T>,
special: Special,
+ pre: Option<Prefilter>,
+ quitset: ByteSet,
+ flags: Flags,
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
/// Parse the given regular expression using a default configuration and
/// return the corresponding sparse DFA.
@@ -149,18 +149,16 @@ impl DFA<Vec<u8>> {
/// # Example
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, sparse},
- /// HalfMatch,
- /// };
+ /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
///
/// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
///
- /// let expected = HalfMatch::must(0, 11);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
+ /// let expected = Some(HalfMatch::must(0, 11));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, Error> {
+ #[cfg(feature = "syntax")]
+ pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, BuildError> {
dense::Builder::new()
.build(pattern)
.and_then(|dense| dense.to_sparse())
@@ -178,26 +176,24 @@ impl DFA<Vec<u8>> {
/// # Example
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, sparse},
- /// HalfMatch,
- /// };
+ /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
///
/// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
- /// let expected = HalfMatch::must(1, 3);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
+ /// let expected = Some(HalfMatch::must(1, 3));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
+ #[cfg(feature = "syntax")]
pub fn new_many<P: AsRef<str>>(
patterns: &[P],
- ) -> Result<DFA<Vec<u8>>, Error> {
+ ) -> Result<DFA<Vec<u8>>, BuildError> {
dense::Builder::new()
.build_many(patterns)
.and_then(|dense| dense.to_sparse())
}
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
/// Create a new DFA that matches every input.
///
@@ -206,17 +202,17 @@ impl DFA<Vec<u8>> {
/// ```
/// use regex_automata::{
/// dfa::{Automaton, sparse},
- /// HalfMatch,
+ /// HalfMatch, Input,
/// };
///
/// let dfa = sparse::DFA::always_match()?;
///
- /// let expected = HalfMatch::must(0, 0);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?);
+ /// let expected = Some(HalfMatch::must(0, 0));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?);
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn always_match() -> Result<DFA<Vec<u8>>, Error> {
+ pub fn always_match() -> Result<DFA<Vec<u8>>, BuildError> {
dense::DFA::always_match()?.to_sparse()
}
@@ -225,21 +221,21 @@ impl DFA<Vec<u8>> {
/// # Example
///
/// ```
- /// use regex_automata::dfa::{Automaton, sparse};
+ /// use regex_automata::{dfa::{Automaton, sparse}, Input};
///
/// let dfa = sparse::DFA::never_match()?;
- /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?);
- /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?);
+ /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?);
+ /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn never_match() -> Result<DFA<Vec<u8>>, Error> {
+ pub fn never_match() -> Result<DFA<Vec<u8>>, BuildError> {
dense::DFA::never_match()?.to_sparse()
}
/// The implementation for constructing a sparse DFA from a dense DFA.
pub(crate) fn from_dense<T: AsRef<[u32]>>(
dfa: &dense::DFA<T>,
- ) -> Result<DFA<Vec<u8>>, Error> {
+ ) -> Result<DFA<Vec<u8>>, BuildError> {
// In order to build the transition table, we need to be able to write
// state identifiers for each of the "next" transitions in each state.
// Our state identifiers correspond to the byte offset in the
@@ -249,35 +245,35 @@ impl DFA<Vec<u8>> {
// of the transition table happens in two passes.
//
// In the first pass, we fill out the shell of each state, which
- // includes the transition count, the input byte ranges and zero-filled
- // space for the transitions and accelerators, if present. In this
- // first pass, we also build up a map from the state identifier index
- // of the dense DFA to the state identifier in this sparse DFA.
+ // includes the transition length, the input byte ranges and
+ // zero-filled space for the transitions and accelerators, if present.
+ // In this first pass, we also build up a map from the state identifier
+ // index of the dense DFA to the state identifier in this sparse DFA.
//
// In the second pass, we fill in the transitions based on the map
// built in the first pass.
// The capacity given here reflects a minimum. (Well, the true minimum
// is likely even bigger, but hopefully this saves a few reallocs.)
- let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_count());
+ let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_len());
// This maps state indices from the dense DFA to StateIDs in the sparse
// DFA. We build out this map on the first pass, and then use it in the
// second pass to back-fill our transitions.
- let mut remap: Vec<StateID> = vec![DEAD; dfa.state_count()];
+ let mut remap: Vec<StateID> = vec![DEAD; dfa.state_len()];
for state in dfa.states() {
let pos = sparse.len();
- remap[dfa.to_index(state.id())] =
- StateID::new(pos).map_err(|_| Error::too_many_states())?;
- // zero-filled space for the transition count
+ remap[dfa.to_index(state.id())] = StateID::new(pos)
+ .map_err(|_| BuildError::too_many_states())?;
+ // zero-filled space for the transition length
sparse.push(0);
sparse.push(0);
- let mut transition_count = 0;
+ let mut transition_len = 0;
for (unit1, unit2, _) in state.sparse_transitions() {
match (unit1.as_u8(), unit2.as_u8()) {
(Some(b1), Some(b2)) => {
- transition_count += 1;
+ transition_len += 1;
sparse.push(b1);
sparse.push(b2);
}
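The `remap` vector above is the heart of the two-pass conversion: pass one records, for each dense state index, the byte offset at which that state's sparse record begins, and pass two rewrites every transition through it. A sketch of the idea, with plain `usize`s standing in for `StateID`s (names here are illustrative, not the crate's):

```
// Illustrative two-pass remapping: `record_len[i]` is the encoded
// size of dense state `i` in the sparse stream, and `dense_next[i]`
// is the dense index that state `i` transitions to.
fn remap_sketch(dense_next: &[usize], record_len: &[usize]) -> Vec<usize> {
    // Pass 1: compute each state's byte offset in the sparse stream.
    let mut remap = vec![0; record_len.len()];
    let mut pos = 0;
    for (i, &len) in record_len.iter().enumerate() {
        remap[i] = pos;
        pos += len;
    }
    // Pass 2: rewrite transitions in terms of sparse offsets.
    dense_next.iter().map(|&to| remap[to]).collect()
}
```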
@@ -298,40 +294,40 @@ impl DFA<Vec<u8>> {
// N.B. The loop above is not guaranteed to yield the EOI
// transition, since it may point to a DEAD state. By putting
// it here, we always write the EOI transition, and thus
- // guarantee that our transition count is >0. Why do we always
+ // guarantee that our transition length is >0. Why do we always
// need the EOI transition? Because in order to implement
// Automaton::next_eoi_state, this lets us just ask for the last
// transition. There are probably other/better ways to do this.
- transition_count += 1;
+ transition_len += 1;
sparse.push(0);
sparse.push(0);
- // Check some assumptions about transition count.
+ // Check some assumptions about transition length.
assert_ne!(
- transition_count, 0,
- "transition count should be non-zero",
+ transition_len, 0,
+ "transition length should be non-zero",
);
assert!(
- transition_count <= 257,
- "expected transition count {} to be <= 257",
- transition_count,
+ transition_len <= 257,
+ "expected transition length {} to be <= 257",
+ transition_len,
);
- // Fill in the transition count.
- // Since transition count is always <= 257, we use the most
+ // Fill in the transition length.
+ // Since transition length is always <= 257, we use the most
// significant bit to indicate whether this is a match state or
// not.
let ntrans = if dfa.is_match_state(state.id()) {
- transition_count | (1 << 15)
+ transition_len | (1 << 15)
} else {
- transition_count
+ transition_len
};
- bytes::NE::write_u16(ntrans, &mut sparse[pos..]);
+ wire::NE::write_u16(ntrans, &mut sparse[pos..]);
// zero-fill the actual transitions.
- // Unwraps are OK since transition_count <= 257 and our minimum
+ // Unwraps are OK since transition_len <= 257 and our minimum
// support usize size is 16-bits.
- let zeros = usize::try_from(transition_count)
+ let zeros = usize::try_from(transition_len)
.unwrap()
.checked_mul(StateID::SIZE)
.unwrap();
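Because the builder here and the deserializer below must agree on this packing, it helps to see the round trip in isolation. A small sketch of the scheme (bit 15 flags a match state; the low 15 bits hold the transition length, which never exceeds 257, i.e. at most 256 byte classes plus the EOI transition):

```
fn pack_ntrans(transition_len: u16, is_match: bool) -> u16 {
    // transition_len <= 257 always fits in the low 15 bits.
    debug_assert!(transition_len <= 257);
    if is_match {
        transition_len | (1 << 15)
    } else {
        transition_len
    }
}

fn unpack_ntrans(packed: u16) -> (u16, bool) {
    (packed & !(1 << 15), packed & (1 << 15) != 0)
}

assert_eq!((257, true), unpack_ntrans(pack_ntrans(257, true)));
assert_eq!((3, false), unpack_ntrans(pack_ntrans(3, false)));
```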
@@ -355,18 +351,18 @@ impl DFA<Vec<u8>> {
sparse.extend(iter::repeat(0).take(zeros));
// Now write the length prefix.
- bytes::NE::write_u32(
+ wire::NE::write_u32(
// Will never fail since u32::MAX is invalid pattern ID.
// Thus, the number of pattern IDs is representable by a
// u32.
- plen.try_into().expect("pattern ID count fits in u32"),
+ plen.try_into().expect("pattern ID length fits in u32"),
&mut sparse[pos..],
);
pos += size_of::<u32>();
// Now write the pattern IDs.
for &pid in dfa.pattern_id_slice(state.id()) {
- pos += bytes::write_pattern_id::<bytes::NE>(
+ pos += wire::write_pattern_id::<wire::NE>(
pid,
&mut sparse[pos..],
);
@@ -384,28 +380,31 @@ impl DFA<Vec<u8>> {
}
let mut new = DFA {
- trans: Transitions {
+ tt: Transitions {
sparse,
classes: dfa.byte_classes().clone(),
- count: dfa.state_count(),
- patterns: dfa.pattern_count(),
+ state_len: dfa.state_len(),
+ pattern_len: dfa.pattern_len(),
},
- starts: StartTable::from_dense_dfa(dfa, &remap)?,
+ st: StartTable::from_dense_dfa(dfa, &remap)?,
special: dfa.special().remap(|id| remap[dfa.to_index(id)]),
+ pre: dfa.get_prefilter().map(|p| p.clone()),
+ quitset: dfa.quitset().clone(),
+ flags: dfa.flags().clone(),
};
// And here's our second pass. Iterate over all of the dense states
// again, and update the transitions in each of the states in the
// sparse DFA.
for old_state in dfa.states() {
let new_id = remap[dfa.to_index(old_state.id())];
- let mut new_state = new.trans.state_mut(new_id);
+ let mut new_state = new.tt.state_mut(new_id);
let sparse = old_state.sparse_transitions();
for (i, (_, _, next)) in sparse.enumerate() {
let next = remap[dfa.to_index(next)];
new_state.set_next_at(i, next);
}
}
- trace!(
+ debug!(
"created sparse DFA, memory usage: {} (dense memory usage: {})",
new.memory_usage(),
dfa.memory_usage(),
@@ -419,9 +418,12 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// DFA returned always uses `&[u8]` for its transitions.
pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> {
DFA {
- trans: self.trans.as_ref(),
- starts: self.starts.as_ref(),
+ tt: self.tt.as_ref(),
+ st: self.st.as_ref(),
special: self.special,
+ pre: self.pre.clone(),
+ quitset: self.quitset,
+ flags: self.flags,
}
}
@@ -431,36 +433,67 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// Effectively, this returns a sparse DFA whose transitions live on the
/// heap.
#[cfg(feature = "alloc")]
- pub fn to_owned(&self) -> DFA<Vec<u8>> {
+ pub fn to_owned(&self) -> DFA<alloc::vec::Vec<u8>> {
DFA {
- trans: self.trans.to_owned(),
- starts: self.starts.to_owned(),
+ tt: self.tt.to_owned(),
+ st: self.st.to_owned(),
special: self.special,
+ pre: self.pre.clone(),
+ quitset: self.quitset,
+ flags: self.flags,
}
}
- /// Returns the memory usage, in bytes, of this DFA.
+ /// Returns the starting state configuration for this DFA.
///
- /// The memory usage is computed based on the number of bytes used to
- /// represent this DFA.
- ///
- /// This does **not** include the stack size used up by this DFA. To
- /// compute that, use `std::mem::size_of::<sparse::DFA>()`.
- pub fn memory_usage(&self) -> usize {
- self.trans.memory_usage() + self.starts.memory_usage()
+ /// The default is [`StartKind::Both`], which means the DFA supports both
+ /// unanchored and anchored searches. However, this can generally lead to
+ /// bigger DFAs. Therefore, a DFA might be compiled with support for just
+ /// unanchored or anchored searches. In that case, running a search with
+ /// an unsupported configuration will panic.
+ pub fn start_kind(&self) -> StartKind {
+ self.st.kind
}
/// Returns true only if this DFA has starting states for each pattern.
///
/// When a DFA has starting states for each pattern, then a search with the
/// DFA can be configured to only look for anchored matches of a specific
- /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`]
- /// can accept a non-None `pattern_id` if and only if this method returns
- /// true. Otherwise, calling `find_earliest_fwd_at` will panic.
+ /// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can
+ /// accept a [`Anchored::Pattern`] if and only if this method returns true.
+ /// Otherwise, an error will be returned.
///
/// Note that if the DFA is empty, this always returns false.
- pub fn has_starts_for_each_pattern(&self) -> bool {
- self.starts.patterns > 0
+ pub fn starts_for_each_pattern(&self) -> bool {
+ self.st.pattern_len.is_some()
+ }
+
+ /// Returns the equivalence classes that make up the alphabet for this DFA.
+ ///
+ /// Unless [`dense::Config::byte_classes`] was disabled, it is possible
+ /// that multiple distinct bytes are grouped into the same equivalence
+ /// class if it is impossible for them to discriminate between a match and
+ /// a non-match. This has the effect of reducing the overall alphabet size
+ /// and in turn potentially substantially reducing the size of the DFA's
+ /// transition table.
+ ///
+ /// The downside of using equivalence classes like this is that every state
+ /// transition will automatically use this map to convert an arbitrary
+ /// byte to its corresponding equivalence class. In practice this has a
+ /// negligible impact on performance.
+ pub fn byte_classes(&self) -> &ByteClasses {
+ &self.tt.classes
+ }
+
+ /// Returns the memory usage, in bytes, of this DFA.
+ ///
+ /// The memory usage is computed based on the number of bytes used to
+ /// represent this DFA.
+ ///
+ /// This does **not** include the stack size used up by this DFA. To
+ /// compute that, use `std::mem::size_of::<sparse::DFA>()`.
+ pub fn memory_usage(&self) -> usize {
+ self.tt.memory_usage() + self.st.memory_usage()
}
}
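As a usage note for the methods above: anchored-by-pattern searches only work when the dense DFA was built with start states for each pattern before conversion to sparse. A sketch using the dense builder's `starts_for_each_pattern` option (the assertion values follow from the patterns chosen here):

```
use regex_automata::{
    dfa::{Automaton, dense},
    Anchored, HalfMatch, Input, PatternID,
};

let dense_re = dense::Builder::new()
    .configure(dense::Config::new().starts_for_each_pattern(true))
    .build_many(&["[0-9]+", "[a-z]+"])?;
let sparse_re = dense_re.to_sparse()?;
assert!(sparse_re.starts_for_each_pattern());

// Anchor the search to pattern 1 ('[a-z]+') only.
let input = Input::new("abc")
    .anchored(Anchored::Pattern(PatternID::must(1)));
assert_eq!(
    Some(HalfMatch::must(1, 3)),
    sparse_re.try_search_fwd(&input)?,
);
# Ok::<(), Box<dyn std::error::Error>>(())
```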
@@ -488,10 +521,7 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// This example shows how to serialize and deserialize a DFA:
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, sparse::DFA},
- /// HalfMatch,
- /// };
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
@@ -503,13 +533,13 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// // ignore it.
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub fn to_bytes_little_endian(&self) -> Vec<u8> {
- self.to_bytes::<bytes::LE>()
+ self.to_bytes::<wire::LE>()
}
/// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
@@ -533,10 +563,7 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// This example shows how to serialize and deserialize a DFA:
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, sparse::DFA},
- /// HalfMatch,
- /// };
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
@@ -548,13 +575,13 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// // ignore it.
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub fn to_bytes_big_endian(&self) -> Vec<u8> {
- self.to_bytes::<bytes::BE>()
+ self.to_bytes::<wire::BE>()
}
/// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
@@ -587,10 +614,7 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// This example shows how to serialize and deserialize a DFA:
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, sparse::DFA},
- /// HalfMatch,
- /// };
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
@@ -600,18 +624,18 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// // ignore it.
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
pub fn to_bytes_native_endian(&self) -> Vec<u8> {
- self.to_bytes::<bytes::NE>()
+ self.to_bytes::<wire::NE>()
}
/// The implementation of the public `to_bytes` serialization methods,
/// which is generic over endianness.
- #[cfg(feature = "alloc")]
+ #[cfg(feature = "dfa-build")]
fn to_bytes<E: Endian>(&self) -> Vec<u8> {
let mut buf = vec![0; self.write_to_len()];
// This should always succeed since the only possible serialization
@@ -645,10 +669,7 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// dynamic memory allocation.
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, sparse::DFA},
- /// HalfMatch,
- /// };
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
@@ -660,15 +681,15 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// let written = original_dfa.write_to_native_endian(&mut buf)?;
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn write_to_little_endian(
&self,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
- self.write_to::<bytes::LE>(dst)
+ self.write_to::<wire::LE>(dst)
}
/// Serialize this DFA as raw bytes to the given slice, in big endian
@@ -695,10 +716,7 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// dynamic memory allocation.
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, sparse::DFA},
- /// HalfMatch,
- /// };
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
@@ -710,15 +728,15 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// let written = original_dfa.write_to_native_endian(&mut buf)?;
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn write_to_big_endian(
&self,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
- self.write_to::<bytes::BE>(dst)
+ self.write_to::<wire::BE>(dst)
}
/// Serialize this DFA as raw bytes to the given slice, in native endian
@@ -754,10 +772,7 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// dynamic memory allocation.
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, sparse::DFA},
- /// HalfMatch,
- /// };
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
@@ -767,15 +782,15 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// let written = original_dfa.write_to_native_endian(&mut buf)?;
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn write_to_native_endian(
&self,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
- self.write_to::<bytes::NE>(dst)
+ self.write_to::<wire::NE>(dst)
}
/// The implementation of the public `write_to` serialization methods,
@@ -785,17 +800,19 @@ impl<T: AsRef<[u8]>> DFA<T> {
dst: &mut [u8],
) -> Result<usize, SerializeError> {
let mut nw = 0;
- nw += bytes::write_label(LABEL, &mut dst[nw..])?;
- nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?;
- nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?;
+ nw += wire::write_label(LABEL, &mut dst[nw..])?;
+ nw += wire::write_endianness_check::<E>(&mut dst[nw..])?;
+ nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?;
nw += {
// Currently unused, intended for future flexibility
E::write_u32(0, &mut dst[nw..]);
size_of::<u32>()
};
- nw += self.trans.write_to::<E>(&mut dst[nw..])?;
- nw += self.starts.write_to::<E>(&mut dst[nw..])?;
+ nw += self.flags.write_to::<E>(&mut dst[nw..])?;
+ nw += self.tt.write_to::<E>(&mut dst[nw..])?;
+ nw += self.st.write_to::<E>(&mut dst[nw..])?;
nw += self.special.write_to::<E>(&mut dst[nw..])?;
+ nw += self.quitset.write_to::<E>(&mut dst[nw..])?;
Ok(nw)
}
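Taken together with `write_to_len` just below, the serialized record order is: label, endianness check, version, an unused `u32`, flags, transition table, start table, special state ranges, and the quit set. A minimal round-trip sketch, using only calls that appear elsewhere in this file:

```
use regex_automata::dfa::sparse::DFA;

let dfa = DFA::new("foo[0-9]+")?;
// `write_to_len` must account for every section emitted by
// `write_to`, or the buffer below would be reported as too small.
let mut buf = vec![0u8; dfa.write_to_len()];
let written = dfa.write_to_native_endian(&mut buf)?;
let (_, nread) = DFA::from_bytes(&buf[..written])?;
assert_eq!(written, nread);
# Ok::<(), Box<dyn std::error::Error>>(())
```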
@@ -817,10 +834,7 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// a sparse DFA.
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, sparse::DFA},
- /// HalfMatch,
- /// };
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// // Compile our original DFA.
/// let original_dfa = DFA::new("foo[0-9]+")?;
@@ -829,18 +843,20 @@ impl<T: AsRef<[u8]>> DFA<T> {
/// let written = original_dfa.write_to_native_endian(&mut buf)?;
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn write_to_len(&self) -> usize {
- bytes::write_label_len(LABEL)
- + bytes::write_endianness_check_len()
- + bytes::write_version_len()
+ wire::write_label_len(LABEL)
+ + wire::write_endianness_check_len()
+ + wire::write_version_len()
+ size_of::<u32>() // unused, intended for future flexibility
- + self.trans.write_to_len()
- + self.starts.write_to_len()
+ + self.flags.write_to_len()
+ + self.tt.write_to_len()
+ + self.st.write_to_len()
+ self.special.write_to_len()
+ + self.quitset.write_to_len()
}
}
@@ -901,17 +917,14 @@ impl<'a> DFA<&'a [u8]> {
/// and then use it for searching.
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, sparse::DFA},
- /// HalfMatch,
- /// };
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// let initial = DFA::new("foo[0-9]+")?;
/// let bytes = initial.to_bytes_native_endian();
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
@@ -927,7 +940,7 @@ impl<'a> DFA<&'a [u8]> {
/// a file:
///
/// ```no_run
- /// use regex_automata::dfa::{Automaton, sparse::DFA};
+ /// use regex_automata::dfa::sparse::DFA;
///
/// let dfa = DFA::new("foo[0-9]+")?;
///
@@ -943,23 +956,22 @@ impl<'a> DFA<&'a [u8]> {
///
/// And now the second part is embedding the DFA into the compiled program
/// and deserializing it at runtime on first use. We use conditional
- /// compilation to choose the correct endianness. As mentioned above, we
- /// do not need to employ any special tricks to ensure a proper alignment,
- /// since a sparse DFA has no alignment requirements.
+ /// compilation to choose the correct endianness. We do not need to employ
+ /// any special tricks to ensure a proper alignment, since a sparse DFA has
+ /// no alignment requirements.
///
/// ```no_run
/// use regex_automata::{
- /// dfa::{Automaton, sparse},
- /// HalfMatch,
+ /// dfa::{Automaton, sparse::DFA},
+ /// util::lazy::Lazy,
+ /// HalfMatch, Input,
/// };
///
- /// type DFA = sparse::DFA<&'static [u8]>;
- ///
- /// fn get_foo() -> &'static DFA {
- /// use std::cell::Cell;
- /// use std::mem::MaybeUninit;
- /// use std::sync::Once;
- ///
+ /// // This crate provides its own "lazy" type, kind of like
+ /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc
+ /// // no-std environments and lets us write this using completely
+ /// // safe code.
+ /// static RE: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
/// # const _: &str = stringify! {
/// #[cfg(target_endian = "big")]
/// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
@@ -968,33 +980,13 @@ impl<'a> DFA<&'a [u8]> {
/// # };
/// # static BYTES: &[u8] = b"";
///
- /// struct Lazy(Cell<MaybeUninit<DFA>>);
- /// // SAFETY: This is safe because DFA impls Sync.
- /// unsafe impl Sync for Lazy {}
- ///
- /// static INIT: Once = Once::new();
- /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit()));
- ///
- /// INIT.call_once(|| {
- /// let (dfa, _) = DFA::from_bytes(BYTES)
- /// .expect("serialized DFA should be valid");
- /// // SAFETY: This is guaranteed to only execute once, and all
- /// // we do with the pointer is write the DFA to it.
- /// unsafe {
- /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa);
- /// }
- /// });
- /// // SAFETY: DFA is guaranteed to by initialized via INIT and is
- /// // stored in static memory.
- /// unsafe {
- /// let dfa = (*DFA.0.as_ptr()).as_ptr();
- /// std::mem::transmute::<*const DFA, &'static DFA>(dfa)
- /// }
- /// }
- ///
- /// let dfa = get_foo();
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345"));
+ /// let (dfa, _) = DFA::from_bytes(BYTES)
+ /// .expect("serialized DFA should be valid");
+ /// dfa
+ /// });
+ ///
+ /// let expected = Ok(Some(HalfMatch::must(0, 8)));
+ /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345")));
/// ```
///
/// Alternatively, consider using
@@ -1009,8 +1001,8 @@ impl<'a> DFA<&'a [u8]> {
// (by trying to decode every state) and start state ID list below. If
// either validation fails, then we return an error.
let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
- dfa.trans.validate()?;
- dfa.starts.validate(&dfa.trans)?;
+ dfa.tt.validate(&dfa.special)?;
+ dfa.st.validate(&dfa.special, &dfa.tt)?;
// N.B. dfa.special doesn't have a way to do unchecked deserialization,
// so it has already been validated.
Ok((dfa, nread))
@@ -1029,23 +1021,20 @@ impl<'a> DFA<&'a [u8]> {
///
/// # Safety
///
- /// This routine is unsafe because it permits callers to provide
+ /// This routine is not safe because it permits callers to provide
/// arbitrary transitions with possibly incorrect state identifiers. While
/// the various serialization routines will never return an incorrect
- /// DFA, there is no guarantee that the bytes provided here
- /// are correct. While `from_bytes_unchecked` will still do several forms
- /// of basic validation, this routine does not check that the transitions
- /// themselves are correct. Given an incorrect transition table, it is
- /// possible for the search routines to access out-of-bounds memory because
- /// of explicit bounds check elision.
+ /// DFA, there is no guarantee that the bytes provided here are correct.
+ /// While `from_bytes_unchecked` will still do several forms of basic
+ /// validation, this routine does not check that the transitions themselves
+ /// are correct. Given an incorrect transition table, it is possible for
+ /// the search routines to access out-of-bounds memory because of explicit
+ /// bounds check elision.
///
/// # Example
///
/// ```
- /// use regex_automata::{
- /// dfa::{Automaton, sparse::DFA},
- /// HalfMatch,
- /// };
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// let initial = DFA::new("foo[0-9]+")?;
/// let bytes = initial.to_bytes_native_endian();
@@ -1053,8 +1042,8 @@ impl<'a> DFA<&'a [u8]> {
/// // directly from a compatible serialization routine.
/// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
///
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub unsafe fn from_bytes_unchecked(
@@ -1062,56 +1051,70 @@ impl<'a> DFA<&'a [u8]> {
) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
let mut nr = 0;
- nr += bytes::read_label(&slice[nr..], LABEL)?;
- nr += bytes::read_endianness_check(&slice[nr..])?;
- nr += bytes::read_version(&slice[nr..], VERSION)?;
+ nr += wire::read_label(&slice[nr..], LABEL)?;
+ nr += wire::read_endianness_check(&slice[nr..])?;
+ nr += wire::read_version(&slice[nr..], VERSION)?;
- let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?;
+ let _unused = wire::try_read_u32(&slice[nr..], "unused space")?;
nr += size_of::<u32>();
- let (trans, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
+ let (flags, nread) = Flags::from_bytes(&slice[nr..])?;
+ nr += nread;
+
+ let (tt, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
nr += nread;
- let (starts, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
+ let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
nr += nread;
let (special, nread) = Special::from_bytes(&slice[nr..])?;
nr += nread;
- if special.max.as_usize() >= trans.sparse().len() {
+ if special.max.as_usize() >= tt.sparse().len() {
return Err(DeserializeError::generic(
"max should not be greater than or equal to sparse bytes",
));
}
- Ok((DFA { trans, starts, special }, nr))
+ let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?;
+ nr += nread;
+
+ // Prefilters don't support serialization, so they're always absent.
+ let pre = None;
+ Ok((DFA { tt, st, special, pre, quitset, flags }, nr))
}
}
impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "sparse::DFA(")?;
- for state in self.trans.states() {
+ for state in self.tt.states() {
fmt_state_indicator(f, self, state.id())?;
- writeln!(f, "{:06?}: {:?}", state.id(), state)?;
+ writeln!(f, "{:06?}: {:?}", state.id().as_usize(), state)?;
}
writeln!(f, "")?;
- for (i, (start_id, sty, pid)) in self.starts.iter().enumerate() {
- if i % self.starts.stride == 0 {
- match pid {
- None => writeln!(f, "START-GROUP(ALL)")?,
- Some(pid) => {
- writeln!(f, "START_GROUP(pattern: {:?})", pid)?
- }
+ for (i, (start_id, anchored, sty)) in self.st.iter().enumerate() {
+ if i % self.st.stride == 0 {
+ match anchored {
+ Anchored::No => writeln!(f, "START-GROUP(unanchored)")?,
+ Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?,
+ Anchored::Pattern(pid) => writeln!(
+ f,
+ "START_GROUP(pattern: {:?})",
+ pid.as_usize()
+ )?,
}
}
writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?;
}
- writeln!(f, "state count: {:?}", self.trans.count)?;
+ writeln!(f, "state length: {:?}", self.tt.state_len)?;
+ writeln!(f, "pattern length: {:?}", self.pattern_len())?;
+ writeln!(f, "flags: {:?}", self.flags)?;
writeln!(f, ")")?;
Ok(())
}
}
+// SAFETY: We assert that our implementation of each method is correct.
unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
#[inline]
fn is_special_state(&self, id: StateID) -> bool {
@@ -1145,10 +1148,10 @@ unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
// This is marked as inline to help dramatically boost sparse searching,
// which decodes each state it enters to follow the next transition.
- #[inline(always)]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn next_state(&self, current: StateID, input: u8) -> StateID {
- let input = self.trans.classes.get(input);
- self.trans.state(current).next(input)
+ let input = self.tt.classes.get(input);
+ self.tt.state(current).next(input)
}
#[inline]
@@ -1162,17 +1165,17 @@ unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
#[inline]
fn next_eoi_state(&self, current: StateID) -> StateID {
- self.trans.state(current).next_eoi()
+ self.tt.state(current).next_eoi()
}
#[inline]
- fn pattern_count(&self) -> usize {
- self.trans.patterns
+ fn pattern_len(&self) -> usize {
+ self.tt.pattern_len
}
#[inline]
- fn match_count(&self, id: StateID) -> usize {
- self.trans.state(id).pattern_count()
+ fn match_len(&self, id: StateID) -> usize {
+ self.tt.state(id).pattern_len()
}
#[inline]
@@ -1182,39 +1185,76 @@ unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
// that finds the pattern ID from the state machine, which requires
// a bit of slicing/pointer-chasing. This optimization tends to only
// matter when matches are frequent.
- if self.trans.patterns == 1 {
+ if self.tt.pattern_len == 1 {
return PatternID::ZERO;
}
- self.trans.state(id).pattern_id(match_index)
+ self.tt.state(id).pattern_id(match_index)
+ }
+
+ #[inline]
+ fn has_empty(&self) -> bool {
+ self.flags.has_empty
+ }
+
+ #[inline]
+ fn is_utf8(&self) -> bool {
+ self.flags.is_utf8
+ }
+
+ #[inline]
+ fn is_always_start_anchored(&self) -> bool {
+ self.flags.is_always_start_anchored
}
#[inline]
fn start_state_forward(
&self,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> StateID {
- let index = Start::from_position_fwd(bytes, start, end);
- self.starts.start(index, pattern_id)
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError> {
+ if !self.quitset.is_empty() && input.start() > 0 {
+ let offset = input.start() - 1;
+ let byte = input.haystack()[offset];
+ if self.quitset.contains(byte) {
+ return Err(MatchError::quit(byte, offset));
+ }
+ }
+ let start = self.st.start_map.fwd(&input);
+ self.st.start(input, start)
}
#[inline]
fn start_state_reverse(
&self,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> StateID {
- let index = Start::from_position_rev(bytes, start, end);
- self.starts.start(index, pattern_id)
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError> {
+ if !self.quitset.is_empty() && input.end() < input.haystack().len() {
+ let offset = input.end();
+ let byte = input.haystack()[offset];
+ if self.quitset.contains(byte) {
+ return Err(MatchError::quit(byte, offset));
+ }
+ }
+ let start = self.st.start_map.rev(&input);
+ self.st.start(input, start)
+ }
+
+ #[inline]
+ fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
+ match mode {
+ Anchored::No => self.st.universal_start_unanchored,
+ Anchored::Yes => self.st.universal_start_anchored,
+ Anchored::Pattern(_) => None,
+ }
}
#[inline]
fn accelerator(&self, id: StateID) -> &[u8] {
- self.trans.state(id).accelerator()
+ self.tt.state(id).accelerator()
+ }
+
+ #[inline]
+ fn get_prefilter(&self) -> Option<&Prefilter> {
+ self.pre.as_ref()
}
}
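The quit set consulted by the two start-state methods above is inherited from the dense DFA at conversion time. A sketch of how it surfaces to callers, assuming the dense builder's `quit` option:

```
use regex_automata::{dfa::{Automaton, dense}, Input, MatchError};

let dense_re = dense::Builder::new()
    .configure(dense::Config::new().quit(b'\n', true))
    .build("foo[0-9]+")?;
let sparse_re = dense_re.to_sparse()?;

// Hitting a quit byte before any match completes makes the search
// return an error rather than an incorrect result.
let got = sparse_re.try_search_fwd(&Input::new("foo\n123"));
assert_eq!(Err(MatchError::quit(b'\n', 3)), got);
# Ok::<(), Box<dyn std::error::Error>>(())
```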
@@ -1278,43 +1318,38 @@ struct Transitions<T> {
/// least one state---the dead state---even the empty DFA. In particular,
/// the dead state always has ID 0 and is correspondingly always the first
/// state. The dead state is never a match state.
- count: usize,
+ state_len: usize,
/// The total number of unique patterns represented by these match states.
- patterns: usize,
+ pattern_len: usize,
}
impl<'a> Transitions<&'a [u8]> {
unsafe fn from_bytes_unchecked(
mut slice: &'a [u8],
) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
- let slice_start = slice.as_ptr() as usize;
+ let slice_start = slice.as_ptr().as_usize();
- let (state_count, nr) =
- bytes::try_read_u32_as_usize(&slice, "state count")?;
+ let (state_len, nr) =
+ wire::try_read_u32_as_usize(&slice, "state length")?;
slice = &slice[nr..];
- let (pattern_count, nr) =
- bytes::try_read_u32_as_usize(&slice, "pattern count")?;
+ let (pattern_len, nr) =
+ wire::try_read_u32_as_usize(&slice, "pattern length")?;
slice = &slice[nr..];
let (classes, nr) = ByteClasses::from_bytes(&slice)?;
slice = &slice[nr..];
let (len, nr) =
- bytes::try_read_u32_as_usize(&slice, "sparse transitions length")?;
+ wire::try_read_u32_as_usize(&slice, "sparse transitions length")?;
slice = &slice[nr..];
- bytes::check_slice_len(slice, len, "sparse states byte length")?;
+ wire::check_slice_len(slice, len, "sparse states byte length")?;
let sparse = &slice[..len];
slice = &slice[len..];
- let trans = Transitions {
- sparse,
- classes,
- count: state_count,
- patterns: pattern_count,
- };
- Ok((trans, slice.as_ptr() as usize - slice_start))
+ let trans = Transitions { sparse, classes, state_len, pattern_len };
+ Ok((trans, slice.as_ptr().as_usize() - slice_start))
}
}
@@ -1334,12 +1369,12 @@ impl<T: AsRef<[u8]>> Transitions<T> {
}
dst = &mut dst[..nwrite];
- // write state count
- E::write_u32(u32::try_from(self.count).unwrap(), dst);
+ // write state length
+ E::write_u32(u32::try_from(self.state_len).unwrap(), dst);
dst = &mut dst[size_of::<u32>()..];
- // write pattern count
- E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+ // write pattern length
+ E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst);
dst = &mut dst[size_of::<u32>()..];
// write byte class map
@@ -1351,15 +1386,22 @@ impl<T: AsRef<[u8]>> Transitions<T> {
dst = &mut dst[size_of::<u32>()..];
// write actual transitions
- dst.copy_from_slice(self.sparse());
+ let mut id = DEAD;
+ while id.as_usize() < self.sparse().len() {
+ let state = self.state(id);
+ let n = state.write_to::<E>(&mut dst)?;
+ dst = &mut dst[n..];
+ // The next ID is the offset immediately following `state`.
+ id = StateID::new(id.as_usize() + state.write_to_len()).unwrap();
+ }
Ok(nwrite)
}
/// Returns the number of bytes the serialized form of this transition
/// table will use.
fn write_to_len(&self) -> usize {
- size_of::<u32>() // state count
- + size_of::<u32>() // pattern count
+ size_of::<u32>() // state length
+ + size_of::<u32>() // pattern length
+ self.classes.write_to_len()
+ size_of::<u32>() // sparse transitions length
+ self.sparse().len()
@@ -1369,7 +1411,7 @@ impl<T: AsRef<[u8]>> Transitions<T> {
///
/// That is, every state ID can be used to correctly index a state in this
/// table.
- fn validate(&self) -> Result<(), DeserializeError> {
+ fn validate(&self, sp: &Special) -> Result<(), DeserializeError> {
// In order to validate everything, we not only need to make sure we
// can decode every state, but that every transition in every state
// points to a valid state. There are many duplicative transitions, so
@@ -1381,10 +1423,22 @@ impl<T: AsRef<[u8]>> Transitions<T> {
// whether doing something more clever is worth it just yet. If you're
// profiling this code and need it to run faster, please file an issue.
//
+ // OK, so we also use this to record the set of valid state IDs, since
+ // it is possible for a transition to point to an invalid state ID that
+ // still (somehow) deserializes to a valid state. So we need to make
+ // sure our transitions are limited to actually correct state IDs.
+ // The problem is, I'm not sure how to do this verification step in
+ // no-std no-alloc mode. I think we'd *have* to store the set of valid
+ // state IDs in the DFA itself. For now, we don't do this verification
+ // in no-std no-alloc mode. The worst thing that can happen is an
+ // incorrect result. But no panics or memory safety problems should
+ // result. Because we still do validate that the state itself is
+ // "valid" in the sense that everything it points to actually exists.
+ //
// ---AG
struct Seen {
#[cfg(feature = "alloc")]
- set: BTreeSet<StateID>,
+ set: alloc::collections::BTreeSet<StateID>,
#[cfg(not(feature = "alloc"))]
set: core::marker::PhantomData<StateID>,
}
@@ -1392,7 +1446,7 @@ impl<T: AsRef<[u8]>> Transitions<T> {
#[cfg(feature = "alloc")]
impl Seen {
fn new() -> Seen {
- Seen { set: BTreeSet::new() }
+ Seen { set: alloc::collections::BTreeSet::new() }
}
fn insert(&mut self, id: StateID) {
self.set.insert(id);
@@ -1416,38 +1470,78 @@ impl<T: AsRef<[u8]>> Transitions<T> {
let mut verified: Seen = Seen::new();
// We need to make sure that we decode the correct number of states.
// Otherwise, an empty set of transitions would validate even if the
- // recorded state count is non-empty.
- let mut count = 0;
+ // recorded state length is non-empty.
+ let mut len = 0;
// We can't use the self.states() iterator because it assumes the state
// encodings are valid. It could panic if they aren't.
let mut id = DEAD;
while id.as_usize() < self.sparse().len() {
- let state = self.try_state(id)?;
+ // Before we even decode the state, we check that the ID itself
+ // is well formed. That is, if it's a special state then it must
+ // actually be a quit, dead, accel, match or start state.
+ if sp.is_special_state(id) {
+ let is_actually_special = sp.is_dead_state(id)
+ || sp.is_quit_state(id)
+ || sp.is_match_state(id)
+ || sp.is_start_state(id)
+ || sp.is_accel_state(id);
+ if !is_actually_special {
+ // This is kind of a cryptic error message...
+ return Err(DeserializeError::generic(
+ "found sparse state tagged as special but \
+ wasn't actually special",
+ ));
+ }
+ }
+ let state = self.try_state(sp, id)?;
verified.insert(id);
// The next ID should be the offset immediately following `state`.
- id = StateID::new(bytes::add(
+ id = StateID::new(wire::add(
id.as_usize(),
- state.bytes_len(),
+ state.write_to_len(),
"next state ID offset",
)?)
.map_err(|err| {
DeserializeError::state_id_error(err, "next state ID offset")
})?;
- count += 1;
-
- // Now check that all transitions in this state are correct.
+ len += 1;
+ }
+ // Now that we've checked that all top-level states are correct and
+ // importantly, collected a set of valid state IDs, we have all the
+ // information we need to check that all transitions are correct too.
+ //
+ // Note that we can't use `valid_ids` to iterate because it will
+ // be empty in no-std no-alloc contexts. (And yes, that means our
+ // verification isn't quite as good.) We can use `self.states()`
+ // though at least, since we know that all states can at least be
+ // decoded and traversed correctly.
+ for state in self.states() {
+ // Check that all transitions in this state are correct.
for i in 0..state.ntrans {
let to = state.next_at(i);
- if verified.contains(&to) {
- continue;
+ // For no-alloc, we just check that the state can decode. It is
+ // technically possible that the state ID could still point to
+ // a non-existent state even if it decodes (fuzzing proved this
+ // to be true), but it shouldn't result in any memory unsafety
+ // or panics in non-debug mode.
+ #[cfg(not(feature = "alloc"))]
+ {
+ let _ = self.try_state(sp, to)?;
+ }
+ #[cfg(feature = "alloc")]
+ {
+ if !verified.contains(&to) {
+ return Err(DeserializeError::generic(
+ "found transition that points to a \
+ non-existent state",
+ ));
+ }
}
- let _ = self.try_state(to)?;
- verified.insert(id);
}
}
- if count != self.count {
+ if len != self.state_len {
return Err(DeserializeError::generic(
- "mismatching sparse state count",
+ "mismatching sparse state length",
));
}
Ok(())
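In outline, the validation above has two phases: decode every state once, collecting the set of valid state IDs, and then re-walk the states checking each transition against that set (without `alloc`, the second phase can only re-decode the target). A compressed sketch of that shape over illustrative types:

```
use std::collections::BTreeSet;

// Illustrative shape of the two-phase check: each state is its ID
// plus the IDs of its transition targets.
fn validate_sketch(
    states: &[(usize, Vec<usize>)],
) -> Result<(), &'static str> {
    // Phase 1: every state must decode; record its ID as valid.
    let mut seen = BTreeSet::new();
    for (id, _) in states {
        seen.insert(*id);
    }
    // Phase 2: every transition must target a known state.
    for (_, nexts) in states {
        for to in nexts {
            if !seen.contains(to) {
                return Err("transition points to a non-existent state");
            }
        }
    }
    Ok(())
}
```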
@@ -1458,19 +1552,19 @@ impl<T: AsRef<[u8]>> Transitions<T> {
Transitions {
sparse: self.sparse(),
classes: self.classes.clone(),
- count: self.count,
- patterns: self.patterns,
+ state_len: self.state_len,
+ pattern_len: self.pattern_len,
}
}
/// Converts these transitions to an owned value.
#[cfg(feature = "alloc")]
- fn to_owned(&self) -> Transitions<Vec<u8>> {
+ fn to_owned(&self) -> Transitions<alloc::vec::Vec<u8>> {
Transitions {
sparse: self.sparse().to_vec(),
classes: self.classes.clone(),
- count: self.count,
- patterns: self.patterns,
+ state_len: self.state_len,
+ pattern_len: self.pattern_len,
}
}
@@ -1483,10 +1577,10 @@ impl<T: AsRef<[u8]>> Transitions<T> {
/// functions involved are also inlined, which should hopefully eliminate
/// a lot of the extraneous decoding that is never needed just to follow
/// the next transition.
- #[inline(always)]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn state(&self, id: StateID) -> State<'_> {
let mut state = &self.sparse()[id.as_usize()..];
- let mut ntrans = bytes::read_u16(&state) as usize;
+ let mut ntrans = wire::read_u16(&state).as_usize();
let is_match = (1 << 15) & ntrans != 0;
ntrans &= !(1 << 15);
state = &state[2..];
@@ -1494,13 +1588,13 @@ impl<T: AsRef<[u8]>> Transitions<T> {
let (input_ranges, state) = state.split_at(ntrans * 2);
let (next, state) = state.split_at(ntrans * StateID::SIZE);
let (pattern_ids, state) = if is_match {
- let npats = bytes::read_u32(&state) as usize;
+ let npats = wire::read_u32(&state).as_usize();
state[4..].split_at(npats * 4)
} else {
(&[][..], state)
};
- let accel_len = state[0] as usize;
+ let accel_len = usize::from(state[0]);
let accel = &state[1..accel_len + 1];
State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
}
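For readers tracing the slicing above, here is the same record layout decoded over a raw `&[u8]` in one place. This is an illustrative sketch, not the crate's API; the real decoding is `state()` above and `try_state()` below (the crate serializes in native endianness after the endianness check, hence `from_ne_bytes`):

```
// Sketch of the sparse state record layout. Names are illustrative.
fn decode_state(sparse: &[u8], sid: usize) -> (usize, bool) {
    let state = &sparse[sid..];
    // u16: low 15 bits = transition length, bit 15 = match flag.
    let packed = u16::from_ne_bytes([state[0], state[1]]);
    let is_match = packed & (1 << 15) != 0;
    let ntrans = usize::from(packed & !(1 << 15));
    let state = &state[2..];
    // `ntrans` inclusive byte ranges (two bytes each)...
    let (_input_ranges, state) = state.split_at(ntrans * 2);
    // ...then `ntrans` next-state IDs (StateID::SIZE = 4 bytes each)...
    let (_next, state) = state.split_at(ntrans * 4);
    // ...then, for match states only, a u32 pattern count followed
    // by that many u32 pattern IDs...
    let state = if is_match {
        let npats = u32::from_ne_bytes([
            state[0], state[1], state[2], state[3],
        ]) as usize;
        &state[4 + npats * 4..]
    } else {
        state
    };
    // ...and finally a one-byte accelerator length plus its bytes.
    let accel_len = usize::from(state[0]);
    let _accel = &state[1..1 + accel_len];
    (ntrans, is_match)
}
```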
@@ -1513,27 +1607,44 @@ impl<T: AsRef<[u8]>> Transitions<T> {
/// all of its data is consistent. It does not verify that its state ID
/// transitions point to valid states themselves, nor does it verify that
/// every pattern ID is valid.
- fn try_state(&self, id: StateID) -> Result<State<'_>, DeserializeError> {
+ fn try_state(
+ &self,
+ sp: &Special,
+ id: StateID,
+ ) -> Result<State<'_>, DeserializeError> {
if id.as_usize() > self.sparse().len() {
- return Err(DeserializeError::generic("invalid sparse state ID"));
+ return Err(DeserializeError::generic(
+ "invalid caller provided sparse state ID",
+ ));
}
let mut state = &self.sparse()[id.as_usize()..];
// Encoding format starts with a u16 that stores the total number of
// transitions in this state.
let (mut ntrans, _) =
- bytes::try_read_u16_as_usize(state, "state transition count")?;
+ wire::try_read_u16_as_usize(state, "state transition length")?;
let is_match = ((1 << 15) & ntrans) != 0;
ntrans &= !(1 << 15);
state = &state[2..];
if ntrans > 257 || ntrans == 0 {
- return Err(DeserializeError::generic("invalid transition count"));
+ return Err(DeserializeError::generic(
+ "invalid transition length",
+ ));
+ }
+ if is_match && !sp.is_match_state(id) {
+ return Err(DeserializeError::generic(
+ "state marked as match but not in match ID range",
+ ));
+ } else if !is_match && sp.is_match_state(id) {
+ return Err(DeserializeError::generic(
+ "state in match ID range but not marked as match state",
+ ));
}
// Each transition has two pieces: an inclusive range of bytes on which
// it is defined, and the state ID that those bytes transition to. The
// pairs come first, followed by a corresponding sequence of state IDs.
let input_ranges_len = ntrans.checked_mul(2).unwrap();
- bytes::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
+ wire::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
let (input_ranges, state) = state.split_at(input_ranges_len);
// Every range should be of the form A-B, where A<=B.
for pair in input_ranges.chunks(2) {
@@ -1549,13 +1660,13 @@ impl<T: AsRef<[u8]>> Transitions<T> {
let next_len = ntrans
.checked_mul(self.id_len())
.expect("state size * #trans should always fit in a usize");
- bytes::check_slice_len(state, next_len, "sparse trans state IDs")?;
+ wire::check_slice_len(state, next_len, "sparse trans state IDs")?;
let (next, state) = state.split_at(next_len);
// We can at least verify that every state ID is in bounds.
for idbytes in next.chunks(self.id_len()) {
let (id, _) =
- bytes::read_state_id(idbytes, "sparse state ID in try_state")?;
- bytes::check_slice_len(
+ wire::read_state_id(idbytes, "sparse state ID in try_state")?;
+ wire::check_slice_len(
self.sparse(),
id.as_usize(),
"invalid sparse state ID",
@@ -1567,19 +1678,24 @@ impl<T: AsRef<[u8]>> Transitions<T> {
// encoded 32-bit integers.
let (pattern_ids, state) = if is_match {
let (npats, nr) =
- bytes::try_read_u32_as_usize(state, "pattern ID count")?;
+ wire::try_read_u32_as_usize(state, "pattern ID length")?;
let state = &state[nr..];
+ if npats == 0 {
+ return Err(DeserializeError::generic(
+ "state marked as a match, but has no pattern IDs",
+ ));
+ }
let pattern_ids_len =
- bytes::mul(npats, 4, "sparse pattern ID byte length")?;
- bytes::check_slice_len(
+ wire::mul(npats, 4, "sparse pattern ID byte length")?;
+ wire::check_slice_len(
state,
pattern_ids_len,
"sparse pattern IDs",
)?;
let (pattern_ids, state) = state.split_at(pattern_ids_len);
for patbytes in pattern_ids.chunks(PatternID::SIZE) {
- bytes::read_pattern_id(
+ wire::read_pattern_id(
patbytes,
"sparse pattern ID in try_state",
)?;
@@ -1597,21 +1713,30 @@ impl<T: AsRef<[u8]>> Transitions<T> {
if state.is_empty() {
return Err(DeserializeError::generic("no accelerator length"));
}
- let (accel_len, state) = (state[0] as usize, &state[1..]);
+ let (accel_len, state) = (usize::from(state[0]), &state[1..]);
if accel_len > 3 {
return Err(DeserializeError::generic(
"sparse invalid accelerator length",
));
+ } else if accel_len == 0 && sp.is_accel_state(id) {
+ return Err(DeserializeError::generic(
+ "got no accelerators in state, but in accelerator ID range",
+ ));
+ } else if accel_len > 0 && !sp.is_accel_state(id) {
+ return Err(DeserializeError::generic(
+ "state in accelerator ID range, but has no accelerators",
+ ));
}
- bytes::check_slice_len(
+
+ wire::check_slice_len(
state,
accel_len,
"sparse corrupt accelerator length",
)?;
let (accel, _) = (&state[..accel_len], &state[accel_len..]);
- Ok(State {
+ let state = State {
id,
is_match,
ntrans,
@@ -1619,7 +1744,13 @@ impl<T: AsRef<[u8]>> Transitions<T> {
next,
pattern_ids,
accel,
- })
+ };
+ if sp.is_quit_state(state.next_at(state.ntrans - 1)) {
+ return Err(DeserializeError::generic(
+ "state with EOI transition to quit state is illegal",
+ ));
+ }
+ Ok(state)
}
/// Return an iterator over all of the states in this DFA.
@@ -1648,13 +1779,13 @@ impl<T: AsRef<[u8]>> Transitions<T> {
}
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u8]>> Transitions<T> {
/// Return a convenient mutable representation of the given state.
/// This panics if the state is invalid.
fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
let mut state = &mut self.sparse_mut()[id.as_usize()..];
- let mut ntrans = bytes::read_u16(&state) as usize;
+ let mut ntrans = wire::read_u16(&state).as_usize();
let is_match = (1 << 15) & ntrans != 0;
ntrans &= !(1 << 15);
state = &mut state[2..];
@@ -1662,13 +1793,13 @@ impl<T: AsMut<[u8]>> Transitions<T> {
let (input_ranges, state) = state.split_at_mut(ntrans * 2);
let (next, state) = state.split_at_mut(ntrans * StateID::SIZE);
let (pattern_ids, state) = if is_match {
- let npats = bytes::read_u32(&state) as usize;
+ let npats = wire::read_u32(&state).as_usize();
state[4..].split_at_mut(npats * 4)
} else {
(&mut [][..], state)
};
- let accel_len = state[0] as usize;
+ let accel_len = usize::from(state[0]);
let accel = &mut state[1..accel_len + 1];
StateMut {
id,
@@ -1702,53 +1833,85 @@ struct StartTable<T> {
/// In practice, T is either Vec<u8> or &[u8] and has no alignment
/// requirements.
///
- /// The first `stride` (currently always 4) entries always correspond to
- /// the start states for the entire DFA. After that, there are
- /// `stride * patterns` state IDs, where `patterns` may be zero in the
- /// case of a DFA with no patterns or in the case where the DFA was built
- /// without enabling starting states for each pattern.
+ /// The first `2 * stride` (currently always 8) entries always correspond
+ /// to the start states for the entire DFA, with the first 4 entries being
+ /// for unanchored searches and the second 4 entries being for anchored
+ /// searches. To keep things simple, we always use 8 entries even if the
+ /// `StartKind` is not both.
+ ///
+ /// After that, there are `stride * patterns` state IDs, where `patterns`
+ /// may be zero in the case of a DFA with no patterns or in the case where
+ /// the DFA was built without enabling starting states for each pattern.
table: T,
+ /// The starting state configuration supported. When 'both', both
+ /// unanchored and anchored searches work. When 'unanchored', anchored
+ /// searches panic. When 'anchored', unanchored searches panic.
+ kind: StartKind,
+ /// The start state configuration for every possible byte.
+ start_map: StartByteMap,
/// The number of starting state IDs per pattern.
stride: usize,
/// The total number of patterns for which starting states are encoded.
- /// This may be zero for non-empty DFAs when the DFA was built without
- /// start states for each pattern.
- patterns: usize,
+ /// This is `None` for DFAs that were built without start states for each
+ /// pattern. Thus, one cannot use this field to say how many patterns
+ /// are in the DFA in all cases. It is specific to how many patterns are
+ /// represented in this start table.
+ pattern_len: Option<usize>,
+ /// The universal starting state for unanchored searches. This is only
+ /// present when the DFA supports unanchored searches and when all starting
+ /// state IDs for an unanchored search are equivalent.
+ universal_start_unanchored: Option<StateID>,
+ /// The universal starting state for anchored searches. This is only
+ /// present when the DFA supports anchored searches and when all starting
+ /// state IDs for an anchored search are equivalent.
+ universal_start_anchored: Option<StateID>,
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl StartTable<Vec<u8>> {
- fn new(patterns: usize) -> StartTable<Vec<u8>> {
- let stride = Start::count();
+ fn new<T: AsRef<[u32]>>(
+ dfa: &dense::DFA<T>,
+ pattern_len: Option<usize>,
+ ) -> StartTable<Vec<u8>> {
+ let stride = Start::len();
// This is OK since the only way we're here is if a dense DFA could be
// constructed successfully, which uses the same space.
let len = stride
- .checked_mul(patterns)
+ .checked_mul(pattern_len.unwrap_or(0))
.unwrap()
- .checked_add(stride)
+ .checked_add(stride.checked_mul(2).unwrap())
.unwrap()
.checked_mul(StateID::SIZE)
.unwrap();
- StartTable { table: vec![0; len], stride, patterns }
+ StartTable {
+ table: vec![0; len],
+ kind: dfa.start_kind(),
+ start_map: dfa.start_map().clone(),
+ stride,
+ pattern_len,
+ universal_start_unanchored: dfa
+ .universal_start_state(Anchored::No),
+ universal_start_anchored: dfa.universal_start_state(Anchored::Yes),
+ }
}
fn from_dense_dfa<T: AsRef<[u32]>>(
dfa: &dense::DFA<T>,
remap: &[StateID],
- ) -> Result<StartTable<Vec<u8>>, Error> {
+ ) -> Result<StartTable<Vec<u8>>, BuildError> {
// Unless the DFA has start states compiled for each pattern, then
// as far as the starting state table is concerned, there are zero
// patterns to account for. It will instead only store starting states
// for the entire DFA.
- let start_pattern_count = if dfa.has_starts_for_each_pattern() {
- dfa.pattern_count()
+ let start_pattern_len = if dfa.starts_for_each_pattern() {
+ Some(dfa.pattern_len())
} else {
- 0
+ None
};
- let mut sl = StartTable::new(start_pattern_count);
- for (old_start_id, sty, pid) in dfa.starts() {
+ let mut sl = StartTable::new(dfa, start_pattern_len);
+ for (old_start_id, anchored, sty) in dfa.starts() {
let new_start_id = remap[dfa.to_index(old_start_id)];
- sl.set_start(sty, pid, new_start_id);
+ sl.set_start(anchored, sty, new_start_id);
}
Ok(sl)
}
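The layout described above implies straightforward slot arithmetic when fetching a start state: one stride of unanchored starts, one stride of anchored starts, then one stride per pattern. A sketch of that mapping (illustrative, not the crate's actual lookup; the byte offset into `table` would be the slot times `StateID::SIZE`):

```
use regex_automata::Anchored;

// `stride` is the number of `Start` configurations (currently 4)
// and `start_index` is a `Start` configuration in `0..stride`.
fn start_slot(stride: usize, anchored: Anchored, start_index: usize) -> usize {
    match anchored {
        Anchored::No => start_index,
        Anchored::Yes => stride + start_index,
        // Only meaningful when the table was built with start
        // states for each pattern.
        Anchored::Pattern(pid) => {
            (2 + pid.as_usize()) * stride + start_index
        }
    }
}
```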
@@ -1758,53 +1921,98 @@ impl<'a> StartTable<&'a [u8]> {
unsafe fn from_bytes_unchecked(
mut slice: &'a [u8],
) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> {
- let slice_start = slice.as_ptr() as usize;
+ let slice_start = slice.as_ptr().as_usize();
- let (stride, nr) =
- bytes::try_read_u32_as_usize(slice, "sparse start table stride")?;
+ let (kind, nr) = StartKind::from_bytes(slice)?;
slice = &slice[nr..];
- let (patterns, nr) = bytes::try_read_u32_as_usize(
- slice,
- "sparse start table patterns",
- )?;
+ let (start_map, nr) = StartByteMap::from_bytes(slice)?;
slice = &slice[nr..];
- if stride != Start::count() {
+ let (stride, nr) =
+ wire::try_read_u32_as_usize(slice, "sparse start table stride")?;
+ slice = &slice[nr..];
+ if stride != Start::len() {
return Err(DeserializeError::generic(
"invalid sparse starting table stride",
));
}
- if patterns > PatternID::LIMIT {
+
+ let (maybe_pattern_len, nr) =
+ wire::try_read_u32_as_usize(slice, "sparse start table patterns")?;
+ slice = &slice[nr..];
+ let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX {
+ None
+ } else {
+ Some(maybe_pattern_len)
+ };
+ if pattern_len.map_or(false, |len| len > PatternID::LIMIT) {
return Err(DeserializeError::generic(
"sparse invalid number of patterns",
));
}
- let pattern_table_size =
- bytes::mul(stride, patterns, "sparse invalid pattern count")?;
+
+ let (universal_unanchored, nr) =
+ wire::try_read_u32(slice, "universal unanchored start")?;
+ slice = &slice[nr..];
+ let universal_start_unanchored = if universal_unanchored == u32::MAX {
+ None
+ } else {
+ Some(StateID::try_from(universal_unanchored).map_err(|e| {
+ DeserializeError::state_id_error(
+ e,
+ "universal unanchored start",
+ )
+ })?)
+ };
+
+ let (universal_anchored, nr) =
+ wire::try_read_u32(slice, "universal anchored start")?;
+ slice = &slice[nr..];
+ let universal_start_anchored = if universal_anchored == u32::MAX {
+ None
+ } else {
+ Some(StateID::try_from(universal_anchored).map_err(|e| {
+ DeserializeError::state_id_error(e, "universal anchored start")
+ })?)
+ };
+
+ let pattern_table_size = wire::mul(
+ stride,
+ pattern_len.unwrap_or(0),
+ "sparse invalid pattern length",
+ )?;
// Our start states always begin with two strides of start states for
// the entire automaton: one for unanchored searches and one for
// anchored searches. What follows is an optional set of start states
// for each pattern.
- let start_state_count = bytes::add(
- stride,
+ let start_state_len = wire::add(
+ wire::mul(2, stride, "start state stride too big")?,
pattern_table_size,
"sparse invalid 'any' pattern starts size",
)?;
- let table_bytes_len = bytes::mul(
- start_state_count,
+ let table_bytes_len = wire::mul(
+ start_state_len,
StateID::SIZE,
"sparse pattern table bytes length",
)?;
- bytes::check_slice_len(
+ wire::check_slice_len(
slice,
table_bytes_len,
"sparse start ID table",
)?;
- let table_bytes = &slice[..table_bytes_len];
+ let table = &slice[..table_bytes_len];
slice = &slice[table_bytes_len..];
- let sl = StartTable { table: table_bytes, stride, patterns };
- Ok((sl, slice.as_ptr() as usize - slice_start))
+ let sl = StartTable {
+ table,
+ kind,
+ start_map,
+ stride,
+ pattern_len,
+ universal_start_unanchored,
+ universal_start_anchored,
+ };
+ Ok((sl, slice.as_ptr().as_usize() - slice_start))
}
}
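
Both `pattern_len` and the two universal start states are serialized with `u32::MAX` as an "absent" sentinel, as the deserialization above shows. A small sketch of that convention in isolation (assumed to mirror the wire format):

    // Option round-trip using u32::MAX as the "absent" sentinel.
    fn encode_opt(v: Option<u32>) -> u32 {
        v.unwrap_or(u32::MAX)
    }

    fn decode_opt(n: u32) -> Option<u32> {
        if n == u32::MAX { None } else { Some(n) }
    }

The sentinel is unambiguous because pattern counts and state IDs are both capped below `u32::MAX` (`PatternID::LIMIT` and `StateID::LIMIT`, respectively).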
@@ -1821,22 +2029,51 @@ impl<T: AsRef<[u8]>> StartTable<T> {
}
dst = &mut dst[..nwrite];
+ // write start kind
+ let nw = self.kind.write_to::<E>(dst)?;
+ dst = &mut dst[nw..];
+ // write start byte map
+ let nw = self.start_map.write_to(dst)?;
+ dst = &mut dst[nw..];
// write stride
E::write_u32(u32::try_from(self.stride).unwrap(), dst);
dst = &mut dst[size_of::<u32>()..];
- // write pattern count
- E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+ // write pattern length
+ E::write_u32(
+ u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(),
+ dst,
+ );
+ dst = &mut dst[size_of::<u32>()..];
+ // write universal start unanchored state id, u32::MAX if absent
+ E::write_u32(
+ self.universal_start_unanchored
+ .map_or(u32::MAX, |sid| sid.as_u32()),
+ dst,
+ );
+ dst = &mut dst[size_of::<u32>()..];
+ // write universal start anchored state id, u32::MAX if absent
+ E::write_u32(
+ self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()),
+ dst,
+ );
dst = &mut dst[size_of::<u32>()..];
// write start IDs
- dst.copy_from_slice(self.table());
+ for (sid, _, _) in self.iter() {
+ E::write_u32(sid.as_u32(), dst);
+ dst = &mut dst[StateID::SIZE..];
+ }
Ok(nwrite)
}
/// Returns the number of bytes the serialized form of this transition
/// table will use.
fn write_to_len(&self) -> usize {
- size_of::<u32>() // stride
+ self.kind.write_to_len()
+ + self.start_map.write_to_len()
+ + size_of::<u32>() // stride
+ size_of::<u32>() // # patterns
+ + size_of::<u32>() // universal unanchored start
+ + size_of::<u32>() // universal anchored start
+ self.table().len()
}
@@ -1846,10 +2083,29 @@ impl<T: AsRef<[u8]>> StartTable<T> {
/// state in the DFA's sparse transitions.
fn validate(
&self,
+ sp: &Special,
trans: &Transitions<T>,
) -> Result<(), DeserializeError> {
for (id, _, _) in self.iter() {
- let _ = trans.try_state(id)?;
+ if sp.is_match_state(id) {
+ return Err(DeserializeError::generic(
+ "start states cannot be match states",
+ ));
+ }
+ // Confirm that the start state points to a valid state.
+ let state = trans.try_state(sp, id)?;
+ // And like for the transition table, confirm that the transitions
+ // on all start states themselves point to a valid state.
+ //
+ // It'd probably be better to integrate this validation with the
+ // transition table, or otherwise store a sorted sequence of all
+ // valid state IDs in the sparse DFA itself. That way, we could
+ // check that every pointer to a state corresponds precisely to a
+ // correct and valid state.
+ for i in 0..state.ntrans {
+ let to = state.next_at(i);
+ let _ = trans.try_state(sp, to)?;
+ }
}
Ok(())
}
@@ -1858,18 +2114,26 @@ impl<T: AsRef<[u8]>> StartTable<T> {
fn as_ref(&self) -> StartTable<&'_ [u8]> {
StartTable {
table: self.table(),
+ kind: self.kind,
+ start_map: self.start_map.clone(),
stride: self.stride,
- patterns: self.patterns,
+ pattern_len: self.pattern_len,
+ universal_start_unanchored: self.universal_start_unanchored,
+ universal_start_anchored: self.universal_start_anchored,
}
}
/// Converts this start list to an owned value.
#[cfg(feature = "alloc")]
- fn to_owned(&self) -> StartTable<Vec<u8>> {
+ fn to_owned(&self) -> StartTable<alloc::vec::Vec<u8>> {
StartTable {
table: self.table().to_vec(),
+ kind: self.kind,
+ start_map: self.start_map.clone(),
stride: self.stride,
- patterns: self.patterns,
+ pattern_len: self.pattern_len,
+ universal_start_unanchored: self.universal_start_unanchored,
+ universal_start_anchored: self.universal_start_anchored,
}
}
@@ -1879,26 +2143,45 @@ impl<T: AsRef<[u8]>> StartTable<T> {
/// starting state for the given pattern is returned. If this start table
/// does not have individual starting states for each pattern, then this
/// panics.
- fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID {
- let start_index = index.as_usize();
- let index = match pattern_id {
- None => start_index,
- Some(pid) => {
- let pid = pid.as_usize();
- assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
- self.stride
- .checked_mul(pid)
- .unwrap()
- .checked_add(self.stride)
- .unwrap()
- .checked_add(start_index)
- .unwrap()
+ fn start(
+ &self,
+ input: &Input<'_>,
+ start: Start,
+ ) -> Result<StateID, MatchError> {
+ let start_index = start.as_usize();
+ let mode = input.get_anchored();
+ let index = match mode {
+ Anchored::No => {
+ if !self.kind.has_unanchored() {
+ return Err(MatchError::unsupported_anchored(mode));
+ }
+ start_index
+ }
+ Anchored::Yes => {
+ if !self.kind.has_anchored() {
+ return Err(MatchError::unsupported_anchored(mode));
+ }
+ self.stride + start_index
+ }
+ Anchored::Pattern(pid) => {
+ let len = match self.pattern_len {
+ None => {
+ return Err(MatchError::unsupported_anchored(mode))
+ }
+ Some(len) => len,
+ };
+ if pid.as_usize() >= len {
+ return Ok(DEAD);
+ }
+ (2 * self.stride)
+ + (self.stride * pid.as_usize())
+ + start_index
}
};
let start = index * StateID::SIZE;
// This is OK since we're allowed to assume that the start table
// contains valid StateIDs.
- bytes::read_state_id_unchecked(&self.table()[start..]).0
+ Ok(wire::read_state_id_unchecked(&self.table()[start..]).0)
}
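
The slot arithmetic in `start` (mirrored by `set_start` below) can be read in isolation. A sketch with a hypothetical `Mode` enum standing in for `Anchored`:

    // The first stride holds unanchored starts, the second holds anchored
    // starts, and the per-pattern strides follow.
    enum Mode { No, Yes, Pattern(usize) }

    fn slot(stride: usize, start_index: usize, mode: Mode) -> usize {
        match mode {
            Mode::No => start_index,
            Mode::Yes => stride + start_index,
            Mode::Pattern(pid) => 2 * stride + stride * pid + start_index,
        }
    }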
/// Return an iterator over all start IDs in this table.
@@ -1924,27 +2207,26 @@ impl<T: AsRef<[u8]>> StartTable<T> {
}
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u8]>> StartTable<T> {
/// Set the start state for the given index and pattern.
///
/// If the pattern ID or state ID are not valid, then this will panic.
- fn set_start(
- &mut self,
- index: Start,
- pattern_id: Option<PatternID>,
- id: StateID,
- ) {
- let start_index = index.as_usize();
- let index = match pattern_id {
- None => start_index,
- Some(pid) => {
+ fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) {
+ let start_index = start.as_usize();
+ let index = match anchored {
+ Anchored::No => start_index,
+ Anchored::Yes => self.stride + start_index,
+ Anchored::Pattern(pid) => {
let pid = pid.as_usize();
- assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
+ let len = self
+ .pattern_len
+ .expect("start states for each pattern enabled");
+ assert!(pid < len, "invalid pattern ID {:?}", pid);
self.stride
.checked_mul(pid)
.unwrap()
- .checked_add(self.stride)
+ .checked_add(self.stride.checked_mul(2).unwrap())
.unwrap()
.checked_add(start_index)
.unwrap()
@@ -1952,7 +2234,7 @@ impl<T: AsMut<[u8]>> StartTable<T> {
};
let start = index * StateID::SIZE;
let end = start + StateID::SIZE;
- bytes::write_state_id::<bytes::NE>(
+ wire::write_state_id::<wire::NE>(
id,
&mut self.table.as_mut()[start..end],
);
@@ -1966,9 +2248,9 @@ struct StartStateIter<'a, T> {
}
impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
- type Item = (StateID, Start, Option<PatternID>);
+ type Item = (StateID, Anchored, Start);
- fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> {
+ fn next(&mut self) -> Option<(StateID, Anchored, Start)> {
let i = self.i;
if i >= self.st.len() {
return None;
@@ -1978,18 +2260,13 @@ impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
// This unwrap is okay since the stride of any DFA must always match
// the number of start state types.
let start_type = Start::from_usize(i % self.st.stride).unwrap();
- let pid = if i < self.st.stride {
- // This means we don't have start states for each pattern.
- None
+ let anchored = if i < self.st.stride {
+ Anchored::No
+ } else if i < (2 * self.st.stride) {
+ Anchored::Yes
} else {
- // These unwraps are OK since we may assume our table and stride
- // is correct.
- let pid = i
- .checked_sub(self.st.stride)
- .unwrap()
- .checked_div(self.st.stride)
- .unwrap();
- Some(PatternID::new(pid).unwrap())
+ let pid = (i - (2 * self.st.stride)) / self.st.stride;
+ Anchored::Pattern(PatternID::new(pid).unwrap())
};
let start = i * StateID::SIZE;
let end = start + StateID::SIZE;
@@ -1997,7 +2274,7 @@ impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
// This is OK since we're allowed to assume that any IDs in this start
// table are correct and valid for this DFA.
let id = StateID::from_ne_bytes_unchecked(bytes);
- Some((id, start_type, pid))
+ Some((id, anchored, start_type))
}
}
@@ -2024,7 +2301,7 @@ impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> {
return None;
}
let state = self.trans.state(StateID::new_unchecked(self.id));
- self.id = self.id + state.bytes_len();
+ self.id = self.id + state.write_to_len();
Some(state)
}
}
@@ -2071,7 +2348,7 @@ impl<'a> State<'a> {
///
/// This is marked as inline to help dramatically boost sparse searching,
/// which decodes each state it enters to follow the next transition.
- #[inline(always)]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn next(&self, input: u8) -> StateID {
// This straight linear search was observed to be much better than
// binary search on ASCII haystacks, likely because a binary search
@@ -2120,19 +2397,66 @@ impl<'a> State<'a> {
/// is invalid, then this panics.
fn pattern_id(&self, match_index: usize) -> PatternID {
let start = match_index * PatternID::SIZE;
- bytes::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
+ wire::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
}
/// Returns the total number of pattern IDs for this state. This is always
/// zero when `is_match` is false.
- fn pattern_count(&self) -> usize {
+ fn pattern_len(&self) -> usize {
assert_eq!(0, self.pattern_ids.len() % 4);
self.pattern_ids.len() / 4
}
+ /// Return an accelerator for this state.
+ fn accelerator(&self) -> &'a [u8] {
+ self.accel
+ }
+
+ /// Write the raw representation of this state to the given buffer using
+ /// the given endianness.
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small(
+ "sparse state transitions",
+ ));
+ }
+
+ let ntrans =
+ if self.is_match { self.ntrans | (1 << 15) } else { self.ntrans };
+ E::write_u16(u16::try_from(ntrans).unwrap(), dst);
+ dst = &mut dst[size_of::<u16>()..];
+
+ dst[..self.input_ranges.len()].copy_from_slice(self.input_ranges);
+ dst = &mut dst[self.input_ranges.len()..];
+
+ for i in 0..self.ntrans {
+ E::write_u32(self.next_at(i).as_u32(), dst);
+ dst = &mut dst[StateID::SIZE..];
+ }
+
+ if self.is_match {
+ E::write_u32(u32::try_from(self.pattern_len()).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+ for i in 0..self.pattern_len() {
+ let pid = self.pattern_id(i);
+ E::write_u32(pid.as_u32(), dst);
+ dst = &mut dst[PatternID::SIZE..];
+ }
+ }
+
+ dst[0] = u8::try_from(self.accel.len()).unwrap();
+ dst[1..][..self.accel.len()].copy_from_slice(self.accel);
+
+ Ok(nwrite)
+ }
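+
In this encoding, the high bit of the 16-bit transition count doubles as the match flag (the `ntrans | (1 << 15)` above). A sketch of the corresponding decode step, under that assumption:

    // Split the serialized u16 back into (is_match, ntrans), inverting
    // the `ntrans | (1 << 15)` encoding used by write_to.
    fn decode_ntrans(raw: u16) -> (bool, usize) {
        let is_match = (raw & (1 << 15)) != 0;
        let ntrans = usize::from(raw & !(1 << 15));
        (is_match, ntrans)
    }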
+
/// Return the total number of bytes that this state consumes in its
/// encoded form.
- fn bytes_len(&self) -> usize {
+ fn write_to_len(&self) -> usize {
let mut len = 2
+ (self.ntrans * 2)
+ (self.ntrans * StateID::SIZE)
@@ -2142,11 +2466,6 @@ impl<'a> State<'a> {
}
len
}
-
- /// Return an accelerator for this state.
- fn accelerator(&self) -> &'a [u8] {
- self.accel
- }
}
impl<'a> fmt::Debug for State<'a> {
@@ -2163,14 +2482,14 @@ impl<'a> fmt::Debug for State<'a> {
}
let (start, end) = self.range(i);
if start == end {
- write!(f, "{:?} => {:?}", DebugByte(start), next)?;
+ write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())?;
} else {
write!(
f,
"{:?}-{:?} => {:?}",
DebugByte(start),
DebugByte(end),
- next,
+ next.as_usize(),
)?;
}
printed = true;
@@ -2180,7 +2499,7 @@ impl<'a> fmt::Debug for State<'a> {
if printed {
write!(f, ", ")?;
}
- write!(f, "EOI => {:?}", eoi)?;
+ write!(f, "EOI => {:?}", eoi.as_usize())?;
}
Ok(())
}
@@ -2188,7 +2507,7 @@ impl<'a> fmt::Debug for State<'a> {
/// A representation of a mutable sparse DFA state that can be cheaply
/// materialized from a state identifier.
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
struct StateMut<'a> {
/// The identifier of this state.
id: StateID,
@@ -2216,17 +2535,17 @@ struct StateMut<'a> {
accel: &'a mut [u8],
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl<'a> StateMut<'a> {
/// Sets the ith transition to the given state.
fn set_next_at(&mut self, i: usize, next: StateID) {
let start = i * StateID::SIZE;
let end = start + StateID::SIZE;
- bytes::write_state_id::<bytes::NE>(next, &mut self.next[start..end]);
+ wire::write_state_id::<wire::NE>(next, &mut self.next[start..end]);
}
}
-#[cfg(feature = "alloc")]
+#[cfg(feature = "dfa-build")]
impl<'a> fmt::Debug for StateMut<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let state = State {
@@ -2242,6 +2561,7 @@ impl<'a> fmt::Debug for StateMut<'a> {
}
}
+/*
/// A binary search routine specialized specifically to a sparse DFA state's
/// transitions. Specifically, the transitions are defined as a set of pairs
/// of input bytes that delineate an inclusive range of bytes. If the input
@@ -2261,8 +2581,7 @@ impl<'a> fmt::Debug for StateMut<'a> {
/// guaranteed to be safe and is thus UB (since I don't think the in-memory
/// representation of `(u8, u8)` has been nailed down). One could define a
/// repr(C) type, but the casting doesn't seem justified.
-#[allow(dead_code)]
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
debug_assert!(ranges.len() <= 512, "ranges should be short");
@@ -2281,3 +2600,57 @@ fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
}
None
}
+*/
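+
Since the original routine is now compiled out, here is a safe sketch of the same idea that sidesteps the `(u8, u8)` cast the comment worries about by indexing the flattened pairs directly (an illustration, not the crate's code):

    // Binary search over [lo0, hi0, lo1, hi1, ...] for the inclusive range
    // containing `needle`, returning that range's index.
    fn binary_search_ranges_safe(ranges: &[u8], needle: u8) -> Option<usize> {
        debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
        let (mut lo, mut hi) = (0usize, ranges.len() / 2);
        while lo < hi {
            let mid = (lo + hi) / 2;
            let (start, end) = (ranges[mid * 2], ranges[mid * 2 + 1]);
            if needle < start {
                hi = mid;
            } else if needle > end {
                lo = mid + 1;
            } else {
                return Some(mid);
            }
        }
        None
    }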
+
+#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
+mod tests {
+ use crate::{
+ dfa::{dense::DFA, Automaton},
+ nfa::thompson,
+ Input, MatchError,
+ };
+
+ // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
+ #[test]
+ fn heuristic_unicode_forward() {
+ let dfa = DFA::builder()
+ .configure(DFA::config().unicode_word_boundary(true))
+ .thompson(thompson::Config::new().reverse(true))
+ .build(r"\b[0-9]+\b")
+ .unwrap()
+ .to_sparse()
+ .unwrap();
+
+ let input = Input::new("β123").range(2..);
+ let expected = MatchError::quit(0xB2, 1);
+ let got = dfa.try_search_fwd(&input);
+ assert_eq!(Err(expected), got);
+
+ let input = Input::new("123β").range(..3);
+ let expected = MatchError::quit(0xCE, 3);
+ let got = dfa.try_search_fwd(&input);
+ assert_eq!(Err(expected), got);
+ }
+
+ // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
+ #[test]
+ fn heuristic_unicode_reverse() {
+ let dfa = DFA::builder()
+ .configure(DFA::config().unicode_word_boundary(true))
+ .thompson(thompson::Config::new().reverse(true))
+ .build(r"\b[0-9]+\b")
+ .unwrap()
+ .to_sparse()
+ .unwrap();
+
+ let input = Input::new("β123").range(2..);
+ let expected = MatchError::quit(0xB2, 1);
+ let got = dfa.try_search_rev(&input);
+ assert_eq!(Err(expected), got);
+
+ let input = Input::new("123β").range(..3);
+ let expected = MatchError::quit(0xCE, 3);
+ let got = dfa.try_search_rev(&input);
+ assert_eq!(Err(expected), got);
+ }
+}
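
Why these expected offsets: with `range(2..)`, the start state is computed from the look-behind byte at offset 1 (`0xB2`, the trailing byte of `β`), and with `range(..3)`, the final transition is fed the look-ahead byte at offset 3 (`0xCE`, the leading byte of `β`). A sketch of that context-byte handling, stated as an assumption about the search internals:

    // A heuristic-Unicode-word-boundary DFA quits on any non-ASCII byte,
    // including the context bytes just outside the search span.
    fn context_bytes(
        haystack: &[u8],
        start: usize,
        end: usize,
    ) -> (Option<u8>, Option<u8>) {
        let look_behind =
            start.checked_sub(1).and_then(|i| haystack.get(i).copied());
        let look_ahead = haystack.get(end).copied();
        (look_behind, look_ahead)
    }

For `Input::new("β123").range(2..)`, the look-behind byte is `0xB2`, hence `MatchError::quit(0xB2, 1)`.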
diff --git a/vendor/regex-automata/src/dfa/special.rs b/vendor/regex-automata/src/dfa/special.rs
index 3db95a707..a831df5c5 100644
--- a/vendor/regex-automata/src/dfa/special.rs
+++ b/vendor/regex-automata/src/dfa/special.rs
@@ -1,8 +1,8 @@
use crate::{
dfa::DEAD,
util::{
- bytes::{self, DeserializeError, Endian, SerializeError},
- id::StateID,
+ primitives::StateID,
+ wire::{self, DeserializeError, Endian, SerializeError},
},
};
@@ -21,7 +21,7 @@ macro_rules! err {
// has run. The dead state always has an ID of 0. i.e., It is always the
// first state in a DFA.
// * quit - A state that is entered whenever a byte is seen that should cause
-// a DFA to give up and stop searching. This results in a MatchError::Quit
+// a DFA to give up and stop searching. This results in a MatchError::quit
// error being returned at search time. The default configuration for a DFA
// has no quit bytes, which means this state is unreachable by default,
// although it is always present for reasons of implementation simplicity.
@@ -101,7 +101,7 @@ macro_rules! err {
// # A quit state means we give up. If the DFA has no quit state,
// # then special.quit_id == 0 == dead, which is handled by the
// # conditional above.
-// return Err(MatchError::Quit { byte, offset: offset - 1 })
+// return Err(MatchError::quit { byte, offset: offset - 1 })
// if special.min_match <= current_state <= special.max_match:
// last_match = Some(offset)
// if special.min_accel <= current_state <= special.max_accel:
@@ -157,34 +157,34 @@ macro_rules! err {
// |----------------------------|------------------------
// special non-special*
#[derive(Clone, Copy, Debug)]
-pub struct Special {
+pub(crate) struct Special {
/// The identifier of the last special state in a DFA. A state is special
/// if and only if its identifier is less than or equal to `max`.
- pub max: StateID,
+ pub(crate) max: StateID,
/// The identifier of the quit state in a DFA. (There is no analogous field
/// for the dead state since the dead state's ID is always zero, regardless
/// of state ID size.)
- pub quit_id: StateID,
+ pub(crate) quit_id: StateID,
/// The identifier of the first match state.
- pub min_match: StateID,
+ pub(crate) min_match: StateID,
/// The identifier of the last match state.
- pub max_match: StateID,
+ pub(crate) max_match: StateID,
/// The identifier of the first accelerated state.
- pub min_accel: StateID,
+ pub(crate) min_accel: StateID,
/// The identifier of the last accelerated state.
- pub max_accel: StateID,
+ pub(crate) max_accel: StateID,
/// The identifier of the first start state.
- pub min_start: StateID,
+ pub(crate) min_start: StateID,
/// The identifier of the last start state.
- pub max_start: StateID,
+ pub(crate) max_start: StateID,
}
impl Special {
/// Creates a new set of special ranges for a DFA. All ranges are initially
/// set to only contain the dead state. This is interpreted as an empty
/// range.
- #[cfg(feature = "alloc")]
- pub fn new() -> Special {
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn new() -> Special {
Special {
max: DEAD,
quit_id: DEAD,
@@ -198,8 +198,8 @@ impl Special {
}
/// Remaps all of the special state identifiers using the function given.
- #[cfg(feature = "alloc")]
- pub fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
Special {
max: map(self.max),
quit_id: map(self.quit_id),
@@ -220,14 +220,14 @@ impl Special {
///
/// Upon success, this returns the number of bytes read in addition to the
/// special state IDs themselves.
- pub fn from_bytes(
+ pub(crate) fn from_bytes(
mut slice: &[u8],
) -> Result<(Special, usize), DeserializeError> {
- bytes::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;
+ wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;
let mut nread = 0;
let mut read_id = |what| -> Result<StateID, DeserializeError> {
- let (id, nr) = bytes::try_read_state_id(slice, what)?;
+ let (id, nr) = wire::try_read_state_id(slice, what)?;
nread += nr;
slice = &slice[StateID::SIZE..];
Ok(id)
@@ -259,7 +259,7 @@ impl Special {
/// Validate that the information describing special states satisfies
/// all known invariants.
- pub fn validate(&self) -> Result<(), DeserializeError> {
+ pub(crate) fn validate(&self) -> Result<(), DeserializeError> {
// Check that both ends of the range are DEAD or neither are.
if self.min_match == DEAD && self.max_match != DEAD {
err!("min_match is DEAD, but max_match is not");
@@ -329,18 +329,18 @@ impl Special {
}
/// Validate that the special state information is compatible with the
- /// given state count.
- pub fn validate_state_count(
+ /// given state len.
+ pub(crate) fn validate_state_len(
&self,
- count: usize,
+ len: usize,
stride2: usize,
) -> Result<(), DeserializeError> {
// We assume that 'validate' has already passed, so we know that 'max'
- // is truly the max. So all we need to check is that the max state
- // ID is less than the state ID count. The max legal value here is
- // count-1, which occurs when there are no non-special states.
- if (self.max.as_usize() >> stride2) >= count {
- err!("max should not be greater than or equal to state count");
+ // is truly the max. So all we need to check is that the max state ID
+ // is less than the state ID len. The max legal value here is len-1,
+ // which occurs when there are no non-special states.
+ if (self.max.as_usize() >> stride2) >= len {
+ err!("max should not be greater than or equal to state length");
}
Ok(())
}
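
The shift works because dense state identifiers are premultiplied by the transition table's stride, which is always a power of two (`stride = 1 << stride2`). A sketch of the identity this check relies on:

    // Converting between premultiplied state IDs and state indexes.
    fn id_to_index(id: usize, stride2: usize) -> usize {
        id >> stride2
    }

    fn index_to_id(index: usize, stride2: usize) -> usize {
        index << stride2
    }

So `self.max.as_usize() >> stride2` recovers the index of the maximum special state, which must be at most `len - 1`.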
@@ -350,11 +350,11 @@ impl Special {
/// this will return an error. The number of bytes written is returned
/// on success. The number of bytes written is guaranteed to be a multiple
/// of 8.
- pub fn write_to<E: Endian>(
+ pub(crate) fn write_to<E: Endian>(
&self,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
- use crate::util::bytes::write_state_id as write;
+ use crate::util::wire::write_state_id as write;
if dst.len() < self.write_to_len() {
return Err(SerializeError::buffer_too_small("special state ids"));
@@ -384,14 +384,14 @@ impl Special {
}
/// Returns the total number of bytes written by `write_to`.
- pub fn write_to_len(&self) -> usize {
+ pub(crate) fn write_to_len(&self) -> usize {
8 * StateID::SIZE
}
/// Sets the maximum special state ID based on the current values. This
/// should be used once all possible state IDs are set.
- #[cfg(feature = "alloc")]
- pub fn set_max(&mut self) {
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn set_max(&mut self) {
use core::cmp::max;
self.max = max(
self.quit_id,
@@ -399,45 +399,62 @@ impl Special {
);
}
+ /// Sets the maximum special state ID such that starting states are not
+ /// considered "special." This also marks the min/max starting states as
+ /// DEAD such that 'is_start_state' always returns false, even if the state
+ /// is actually a starting state.
+ ///
+ /// This is useful when there is no prefilter set. It will avoid
+ /// ping-ponging between the hot path in the DFA search code and the start
+ /// state handling code, which is typically only useful for executing a
+ /// prefilter.
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn set_no_special_start_states(&mut self) {
+ use core::cmp::max;
+ self.max = max(self.quit_id, max(self.max_match, self.max_accel));
+ self.min_start = DEAD;
+ self.max_start = DEAD;
+ }
+
/// Returns true if and only if the given state ID is a special state.
#[inline]
- pub fn is_special_state(&self, id: StateID) -> bool {
+ pub(crate) fn is_special_state(&self, id: StateID) -> bool {
id <= self.max
}
/// Returns true if and only if the given state ID is a dead state.
#[inline]
- pub fn is_dead_state(&self, id: StateID) -> bool {
+ pub(crate) fn is_dead_state(&self, id: StateID) -> bool {
id == DEAD
}
/// Returns true if and only if the given state ID is a quit state.
#[inline]
- pub fn is_quit_state(&self, id: StateID) -> bool {
+ pub(crate) fn is_quit_state(&self, id: StateID) -> bool {
!self.is_dead_state(id) && self.quit_id == id
}
/// Returns true if and only if the given state ID is a match state.
#[inline]
- pub fn is_match_state(&self, id: StateID) -> bool {
+ pub(crate) fn is_match_state(&self, id: StateID) -> bool {
!self.is_dead_state(id) && self.min_match <= id && id <= self.max_match
}
/// Returns true if and only if the given state ID is an accel state.
#[inline]
- pub fn is_accel_state(&self, id: StateID) -> bool {
+ pub(crate) fn is_accel_state(&self, id: StateID) -> bool {
!self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel
}
/// Returns true if and only if the given state ID is a start state.
#[inline]
- pub fn is_start_state(&self, id: StateID) -> bool {
+ pub(crate) fn is_start_state(&self, id: StateID) -> bool {
!self.is_dead_state(id) && self.min_start <= id && id <= self.max_start
}
/// Returns the total number of match states for a dense table based DFA.
#[inline]
- pub fn match_len(&self, stride: usize) -> usize {
+ pub(crate) fn match_len(&self, stride: usize) -> usize {
if self.matches() {
(self.max_match.as_usize() - self.min_match.as_usize() + stride)
/ stride
@@ -448,13 +465,13 @@ impl Special {
/// Returns true if and only if there is at least one match state.
#[inline]
- pub fn matches(&self) -> bool {
+ pub(crate) fn matches(&self) -> bool {
self.min_match != DEAD
}
/// Returns the total number of accel states.
- #[cfg(feature = "alloc")]
- pub fn accel_len(&self, stride: usize) -> usize {
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn accel_len(&self, stride: usize) -> usize {
if self.accels() {
(self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
/ stride
@@ -465,13 +482,13 @@ impl Special {
/// Returns true if and only if there is at least one accel state.
#[inline]
- pub fn accels(&self) -> bool {
+ pub(crate) fn accels(&self) -> bool {
self.min_accel != DEAD
}
/// Returns true if and only if there is at least one start state.
#[inline]
- pub fn starts(&self) -> bool {
+ pub(crate) fn starts(&self) -> bool {
self.min_start != DEAD
}
}
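
Taken together, these predicates let a search loop pay for a single `id <= max` comparison on the hot path and only disambiguate further once a state is known to be special. A sketch of that dispatch, under the invariants `validate` enforces (stand-in types, not the crate's):

    struct SpecialLike {
        max: usize,
        quit_id: usize,
        min_match: usize,
        max_match: usize,
        min_accel: usize,
        max_accel: usize,
    }

    enum Kind { Normal, Dead, Quit, Match, Accel, Start }

    fn classify(sp: &SpecialLike, id: usize) -> Kind {
        if id > sp.max {
            return Kind::Normal; // hot path: one comparison per transition
        }
        if id == 0 {
            Kind::Dead
        } else if id == sp.quit_id {
            Kind::Quit
        } else if sp.min_match <= id && id <= sp.max_match {
            Kind::Match
        } else if sp.min_accel <= id && id <= sp.max_accel {
            Kind::Accel
        } else {
            Kind::Start
        }
    }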
diff --git a/vendor/regex-automata/src/dfa/start.rs b/vendor/regex-automata/src/dfa/start.rs
new file mode 100644
index 000000000..fddc702df
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/start.rs
@@ -0,0 +1,74 @@
+use core::mem::size_of;
+
+use crate::util::wire::{self, DeserializeError, Endian, SerializeError};
+
+/// The kind of anchored starting configurations to support in a DFA.
+///
+/// Fully compiled DFAs need to be explicitly configured as to which anchored
+/// starting configurations to support. The reason for not just supporting
+/// everything unconditionally is that it can use more resources (such as
+/// memory and build time). The downside of this is that if you try to execute
+/// a search using an [`Anchored`](crate::Anchored) mode that is not supported
+/// by the DFA, then the search will return an error.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum StartKind {
+ /// Support both anchored and unanchored searches.
+ Both,
+ /// Support only unanchored searches. Requesting an anchored search will
+ /// panic.
+ ///
+ /// Note that even if an unanchored search is requested, the pattern itself
+ /// may still be anchored. For example, `^abc` will only match `abc` at the
+ /// start of a haystack. This will remain true, even if the regex engine
+ /// only supported unanchored searches.
+ Unanchored,
+ /// Support only anchored searches. Requesting an unanchored search will
+ /// panic.
+ Anchored,
+}
+
+impl StartKind {
+ pub(crate) fn from_bytes(
+ slice: &[u8],
+ ) -> Result<(StartKind, usize), DeserializeError> {
+ wire::check_slice_len(slice, size_of::<u32>(), "start kind bytes")?;
+ let (n, nr) = wire::try_read_u32(slice, "start kind integer")?;
+ match n {
+ 0 => Ok((StartKind::Both, nr)),
+ 1 => Ok((StartKind::Unanchored, nr)),
+ 2 => Ok((StartKind::Anchored, nr)),
+ _ => Err(DeserializeError::generic("unrecognized start kind")),
+ }
+ }
+
+ pub(crate) fn write_to<E: Endian>(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("start kind"));
+ }
+ let n = match *self {
+ StartKind::Both => 0,
+ StartKind::Unanchored => 1,
+ StartKind::Anchored => 2,
+ };
+ E::write_u32(n, dst);
+ Ok(nwrite)
+ }
+
+ pub(crate) fn write_to_len(&self) -> usize {
+ size_of::<u32>()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn has_unanchored(&self) -> bool {
+ matches!(*self, StartKind::Both | StartKind::Unanchored)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn has_anchored(&self) -> bool {
+ matches!(*self, StartKind::Both | StartKind::Anchored)
+ }
+}
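+
As a sanity check on the wire format, the tag mapping round-trips as follows (a sketch with stand-in types, since `StartKind`'s serialization methods are crate-private):

    #[derive(Debug, PartialEq)]
    enum Kind { Both, Unanchored, Anchored }

    fn tag(kind: &Kind) -> u32 {
        match kind {
            Kind::Both => 0,
            Kind::Unanchored => 1,
            Kind::Anchored => 2,
        }
    }

    fn untag(n: u32) -> Option<Kind> {
        match n {
            0 => Some(Kind::Both),
            1 => Some(Kind::Unanchored),
            2 => Some(Kind::Anchored),
            _ => None, // maps to the "unrecognized start kind" error
        }
    }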
diff --git a/vendor/regex-automata/src/dfa/transducer.rs b/vendor/regex-automata/src/dfa/transducer.rs
deleted file mode 100644
index 58b34e00a..000000000
--- a/vendor/regex-automata/src/dfa/transducer.rs
+++ /dev/null
@@ -1,207 +0,0 @@
-use crate::{
- dfa::{automaton::Automaton, dense, sparse},
- util::id::StateID,
-};
-
-impl<T: AsRef<[u32]>> fst::Automaton for dense::DFA<T> {
- type State = StateID;
-
- #[inline]
- fn start(&self) -> StateID {
- self.start_state_forward(None, &[], 0, 0)
- }
-
- #[inline]
- fn is_match(&self, state: &StateID) -> bool {
- self.is_match_state(*state)
- }
-
- #[inline]
- fn accept(&self, state: &StateID, byte: u8) -> StateID {
- if fst::Automaton::is_match(self, state) {
- return *state;
- }
- self.next_state(*state, byte)
- }
-
- #[inline]
- fn accept_eof(&self, state: &StateID) -> Option<StateID> {
- if fst::Automaton::is_match(self, state) {
- return Some(*state);
- }
- Some(self.next_eoi_state(*state))
- }
-
- #[inline]
- fn can_match(&self, state: &StateID) -> bool {
- !self.is_dead_state(*state)
- }
-}
-
-impl<T: AsRef<[u8]>> fst::Automaton for sparse::DFA<T> {
- type State = StateID;
-
- #[inline]
- fn start(&self) -> StateID {
- self.start_state_forward(None, &[], 0, 0)
- }
-
- #[inline]
- fn is_match(&self, state: &StateID) -> bool {
- self.is_match_state(*state)
- }
-
- #[inline]
- fn accept(&self, state: &StateID, byte: u8) -> StateID {
- if fst::Automaton::is_match(self, state) {
- return *state;
- }
- self.next_state(*state, byte)
- }
-
- #[inline]
- fn accept_eof(&self, state: &StateID) -> Option<StateID> {
- if fst::Automaton::is_match(self, state) {
- return Some(*state);
- }
- Some(self.next_eoi_state(*state))
- }
-
- #[inline]
- fn can_match(&self, state: &StateID) -> bool {
- !self.is_dead_state(*state)
- }
-}
-
-#[cfg(test)]
-mod tests {
- use bstr::BString;
- use fst::{Automaton, IntoStreamer, Set, Streamer};
-
- use crate::dfa::{dense, sparse};
-
- fn search<A: Automaton, D: AsRef<[u8]>>(
- set: &Set<D>,
- aut: A,
- ) -> Vec<BString> {
- let mut stream = set.search(aut).into_stream();
-
- let mut results = vec![];
- while let Some(key) = stream.next() {
- results.push(BString::from(key));
- }
- results
- }
-
- #[test]
- fn dense_anywhere() {
- let set =
- Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
- .unwrap();
- let dfa = dense::DFA::new("ba.*").unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]);
- }
-
- #[test]
- fn dense_anchored() {
- let set =
- Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
- .unwrap();
- let dfa = dense::Builder::new()
- .configure(dense::Config::new().anchored(true))
- .build("ba.*")
- .unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["bar", "baz"]);
- }
-
- #[test]
- fn dense_assertions_start() {
- let set =
- Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
- .unwrap();
- let dfa = dense::Builder::new().build("^ba.*").unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["bar", "baz"]);
- }
-
- #[test]
- fn dense_assertions_end() {
- let set =
- Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"])
- .unwrap();
- let dfa = dense::Builder::new().build(".*x$").unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["bax", "xbax"]);
- }
-
- #[test]
- fn dense_assertions_word() {
- let set =
- Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap();
- let dfa = dense::Builder::new().build(r"(?-u)\bfoo\b").unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["foo", "zzz foo zzz"]);
- }
-
- #[test]
- fn sparse_anywhere() {
- let set =
- Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
- .unwrap();
- let dfa = sparse::DFA::new("ba.*").unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]);
- }
-
- #[test]
- fn sparse_anchored() {
- let set =
- Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
- .unwrap();
- let dfa = dense::Builder::new()
- .configure(dense::Config::new().anchored(true))
- .build("ba.*")
- .unwrap()
- .to_sparse()
- .unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["bar", "baz"]);
- }
-
- #[test]
- fn sparse_assertions_start() {
- let set =
- Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
- .unwrap();
- let dfa =
- dense::Builder::new().build("^ba.*").unwrap().to_sparse().unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["bar", "baz"]);
- }
-
- #[test]
- fn sparse_assertions_end() {
- let set =
- Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"])
- .unwrap();
- let dfa =
- dense::Builder::new().build(".*x$").unwrap().to_sparse().unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["bax", "xbax"]);
- }
-
- #[test]
- fn sparse_assertions_word() {
- let set =
- Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap();
- let dfa = dense::Builder::new()
- .build(r"(?-u)\bfoo\b")
- .unwrap()
- .to_sparse()
- .unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["foo", "zzz foo zzz"]);
- }
-}
diff --git a/vendor/regex-automata/src/hybrid/dfa.rs b/vendor/regex-automata/src/hybrid/dfa.rs
index 1fbce5f5f..67261c1a3 100644
--- a/vendor/regex-automata/src/hybrid/dfa.rs
+++ b/vendor/regex-automata/src/hybrid/dfa.rs
@@ -7,36 +7,47 @@ This module also contains a [`hybrid::dfa::Builder`](Builder) and a
[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA.
*/
-use core::{borrow::Borrow, iter, mem::size_of};
+use core::{iter, mem::size_of};
-use alloc::{sync::Arc, vec::Vec};
+use alloc::vec::Vec;
use crate::{
hybrid::{
error::{BuildError, CacheError},
- id::{LazyStateID, LazyStateIDError, OverlappingState},
+ id::{LazyStateID, LazyStateIDError},
search,
},
nfa::thompson,
util::{
alphabet::{self, ByteClasses, ByteSet},
determinize::{self, State, StateBuilderEmpty, StateBuilderNFA},
- id::{PatternID, StateID as NFAStateID},
- matchtypes::{HalfMatch, MatchError, MatchKind},
- prefilter,
+ empty,
+ prefilter::Prefilter,
+ primitives::{PatternID, StateID as NFAStateID},
+ search::{
+ Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet,
+ },
sparse_set::SparseSets,
- start::Start,
+ start::{Start, StartByteMap},
},
};
-/// The mininum number of states that a lazy DFA's cache size must support.
+/// The minimum number of states that a lazy DFA's cache size must support.
///
/// This is checked at time of construction to ensure that at least some small
/// number of states can fit in the given capacity allotment. If we can't fit
/// at least this number of states, then the thinking is that it's pretty
/// senseless to use the lazy DFA. More to the point, parts of the code do
/// assume that the cache can fit at least some small number of states.
-const MIN_STATES: usize = 5;
+const MIN_STATES: usize = SENTINEL_STATES + 2;
+
+/// The number of "sentinel" states that get added to every lazy DFA.
+///
+/// These are special states indicating status conditions of a search: unknown,
+/// dead and quit. These states in particular also use zero NFA states, so
+/// their memory usage is quite small. This is relevant for computing the
+/// minimum memory needed for a lazy DFA cache.
+const SENTINEL_STATES: usize = 3;
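+
Putting the two constants together: a cache must hold the three sentinel states plus at least two real states before the lazy DFA is worth using at all. A sketch of that floor (hypothetical helper, not the crate's API):

    const SENTINEL_STATES: usize = 3; // unknown, dead and quit
    const MIN_STATES: usize = SENTINEL_STATES + 2;

    // Reject cache capacities that cannot hold even the minimum number
    // of states.
    fn usable(capacity_in_states: usize) -> Result<(), &'static str> {
        if capacity_in_states < MIN_STATES {
            Err("cache capacity too small for a lazy DFA")
        } else {
            Ok(())
        }
    }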
/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
///
@@ -92,26 +103,26 @@ const MIN_STATES: usize = 5;
/// a cache and pass it to our search routine.
///
/// ```
-/// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+")?;
/// let mut cache = dfa.create_cache();
///
/// let expected = Some(HalfMatch::must(0, 8));
-/// assert_eq!(expected, dfa.find_leftmost_fwd(&mut cache, b"foo12345")?);
+/// assert_eq!(expected, dfa.try_search_fwd(
+/// &mut cache, &Input::new("foo12345"))?,
+/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct DFA {
- nfa: Arc<thompson::NFA>,
+ config: Config,
+ nfa: thompson::NFA,
stride2: usize,
+ start_map: StartByteMap,
classes: ByteClasses,
quitset: ByteSet,
- anchored: bool,
- match_kind: MatchKind,
- starts_for_each_pattern: bool,
cache_capacity: usize,
- minimum_cache_clear_count: Option<usize>,
}
impl DFA {
@@ -124,7 +135,7 @@ impl DFA {
/// # Example
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+bar")?;
/// let mut cache = dfa.create_cache();
@@ -132,10 +143,11 @@ impl DFA {
/// let expected = HalfMatch::must(0, 11);
/// assert_eq!(
/// Some(expected),
- /// dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?,
+ /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
+ #[cfg(feature = "syntax")]
pub fn new(pattern: &str) -> Result<DFA, BuildError> {
DFA::builder().build(pattern)
}
@@ -149,7 +161,7 @@ impl DFA {
/// # Example
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
/// let mut cache = dfa.create_cache();
@@ -157,10 +169,11 @@ impl DFA {
/// let expected = HalfMatch::must(1, 3);
/// assert_eq!(
/// Some(expected),
- /// dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?,
+ /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
+ #[cfg(feature = "syntax")]
pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
DFA::builder().build_many(patterns)
}
@@ -170,19 +183,23 @@ impl DFA {
/// # Example
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::always_match()?;
/// let mut cache = dfa.create_cache();
///
/// let expected = HalfMatch::must(0, 0);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"")?);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"foo")?);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(
+ /// &mut cache, &Input::new(""))?,
+ /// );
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(
+ /// &mut cache, &Input::new("foo"))?,
+ /// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn always_match() -> Result<DFA, BuildError> {
let nfa = thompson::NFA::always_match();
- Builder::new().build_from_nfa(Arc::new(nfa))
+ Builder::new().build_from_nfa(nfa)
}
/// Create a new lazy DFA that never matches any input.
@@ -190,44 +207,51 @@ impl DFA {
/// # Example
///
/// ```
- /// use regex_automata::hybrid::dfa::DFA;
+ /// use regex_automata::{hybrid::dfa::DFA, Input};
///
/// let dfa = DFA::never_match()?;
/// let mut cache = dfa.create_cache();
///
- /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"")?);
- /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"foo")?);
+ /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new(""))?);
+ /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn never_match() -> Result<DFA, BuildError> {
let nfa = thompson::NFA::never_match();
- Builder::new().build_from_nfa(Arc::new(nfa))
+ Builder::new().build_from_nfa(nfa)
}
/// Return a default configuration for a `DFA`.
///
- /// This is a convenience routine to avoid needing to import the `Config`
+ /// This is a convenience routine to avoid needing to import the [`Config`]
/// type when customizing the construction of a lazy DFA.
///
/// # Example
///
- /// This example shows how to build a lazy DFA that only executes searches
- /// in anchored mode.
+ /// This example shows how to build a lazy DFA that heuristically supports
+ /// Unicode word boundaries.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError, Input};
///
/// let re = DFA::builder()
- /// .configure(DFA::config().anchored(true))
- /// .build(r"[0-9]+")?;
+ /// .configure(DFA::config().unicode_word_boundary(true))
+ /// .build(r"\b\w+\b")?;
/// let mut cache = re.create_cache();
///
- /// let haystack = "abc123xyz".as_bytes();
- /// assert_eq!(None, re.find_leftmost_fwd(&mut cache, haystack)?);
- /// assert_eq!(
- /// Some(HalfMatch::must(0, 3)),
- /// re.find_leftmost_fwd(&mut cache, &haystack[3..6])?,
- /// );
+ /// // Since our haystack is all ASCII, the DFA search sees that and knows
+ /// // it is legal to interpret Unicode word boundaries as ASCII word
+ /// // boundaries.
+ /// let input = Input::new("!!foo!!");
+ /// let expected = HalfMatch::must(0, 5);
+ /// assert_eq!(Some(expected), re.try_search_fwd(&mut cache, &input)?);
+ ///
+ /// // But if our haystack contains non-ASCII, then the search will fail
+ /// // with an error.
+ /// let input = Input::new("!!βββ!!");
+ /// let expected = MatchError::quit(b'\xCE', 2);
+ /// assert_eq!(Err(expected), re.try_search_fwd(&mut cache, &input));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
@@ -243,29 +267,20 @@ impl DFA {
/// # Example
///
/// This example shows how to use the builder to disable UTF-8 mode
- /// everywhere for lazy DFAs. This includes disabling it for both the
- /// concrete syntax (e.g., `.` matches any byte and Unicode character
- /// classes like `\p{Letter}` are not allowed) and for the unanchored
- /// search prefix. The latter enables the regex to match anywhere in a
- /// sequence of arbitrary bytes. (Typically, the unanchored search prefix
- /// will only permit matching valid UTF-8.)
+ /// everywhere for lazy DFAs.
///
/// ```
- /// use regex_automata::{
- /// hybrid::dfa::DFA,
- /// nfa::thompson,
- /// HalfMatch, SyntaxConfig,
- /// };
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input};
///
/// let re = DFA::builder()
- /// .syntax(SyntaxConfig::new().utf8(false))
- /// .thompson(thompson::Config::new().utf8(false))
+ /// .syntax(syntax::Config::new().utf8(false))
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let mut cache = re.create_cache();
///
- /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+ /// let input = Input::new(b"\xFEfoo\xFFarzz\xE2\x98\xFF\n");
/// let expected = Some(HalfMatch::must(0, 9));
- /// let got = re.find_leftmost_fwd(&mut cache, haystack)?;
+ /// let got = re.try_search_fwd(&mut cache, &input)?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -302,7 +317,8 @@ impl DFA {
/// This shows how to re-purpose a cache for use with a different DFA.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa1 = DFA::new(r"\w")?;
/// let dfa2 = DFA::new(r"\W")?;
@@ -310,7 +326,7 @@ impl DFA {
/// let mut cache = dfa1.create_cache();
/// assert_eq!(
/// Some(HalfMatch::must(0, 2)),
- /// dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?,
+ /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
/// );
///
/// // Using 'cache' with dfa2 is not allowed. It may result in panics or
@@ -322,7 +338,7 @@ impl DFA {
/// dfa2.reset_cache(&mut cache);
/// assert_eq!(
/// Some(HalfMatch::must(0, 3)),
- /// dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?,
+ /// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -337,13 +353,13 @@ impl DFA {
///
/// # Example
///
- /// This example shows the pattern count for a DFA that never matches:
+ /// This example shows the pattern length for a DFA that never matches:
///
/// ```
/// use regex_automata::hybrid::dfa::DFA;
///
/// let dfa = DFA::never_match()?;
- /// assert_eq!(dfa.pattern_count(), 0);
+ /// assert_eq!(dfa.pattern_len(), 0);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
@@ -353,7 +369,7 @@ impl DFA {
/// use regex_automata::hybrid::dfa::DFA;
///
/// let dfa = DFA::always_match()?;
- /// assert_eq!(dfa.pattern_count(), 1);
+ /// assert_eq!(dfa.pattern_len(), 1);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
@@ -363,15 +379,37 @@ impl DFA {
/// use regex_automata::hybrid::dfa::DFA;
///
/// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
- /// assert_eq!(dfa.pattern_count(), 3);
+ /// assert_eq!(dfa.pattern_len(), 3);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn pattern_count(&self) -> usize {
+ pub fn pattern_len(&self) -> usize {
self.nfa.pattern_len()
}
+ /// Returns the equivalence classes that make up the alphabet for this DFA.
+ ///
+ /// Unless [`Config::byte_classes`] was disabled, it is possible that
+ /// multiple distinct bytes are grouped into the same equivalence class
+ /// if it is impossible for them to discriminate between a match and a
+ /// non-match. This has the effect of reducing the overall alphabet size
+ /// and in turn potentially substantially reducing the size of the DFA's
+ /// transition table.
+ ///
+ /// The downside of using equivalence classes like this is that every state
+ /// transition will automatically use this map to convert an arbitrary
+ /// byte to its corresponding equivalence class. In practice this has a
+ /// negligible impact on performance.
+ pub fn byte_classes(&self) -> &ByteClasses {
+ &self.classes
+ }
+
+ /// Returns this lazy DFA's configuration.
+ pub fn get_config(&self) -> &Config {
+ &self.config
+ }
+
/// Returns a reference to the underlying NFA.
- pub fn nfa(&self) -> &Arc<thompson::NFA> {
+ pub fn get_nfa(&self) -> &thompson::NFA {
&self.nfa
}
@@ -393,253 +431,231 @@ impl DFA {
1 << self.stride2()
}
- /// Returns the total number of elements in the alphabet for this
- /// transition table. This is always less than or equal to `self.stride()`.
- /// It is only equal when the alphabet length is a power of 2. Otherwise,
- /// it is always strictly less.
- fn alphabet_len(&self) -> usize {
- self.classes.alphabet_len()
- }
-
/// Returns the memory usage, in bytes, of this lazy DFA.
///
/// This does **not** include the stack size used up by this lazy DFA. To
- /// compute that, use `std::mem::size_of::<DFA>()`. This also does
- /// not include the size of the `Cache` used.
+ /// compute that, use `std::mem::size_of::<DFA>()`. This also does not
+ /// include the size of the `Cache` used.
+ ///
+ /// This also does not include any heap memory used by the NFA inside of
+ /// this hybrid NFA/DFA. This is because the NFA is shared, and thus not
+ /// exclusively owned by this hybrid NFA/DFA. More practically, several regex
+ /// engines in this crate embed an NFA, and reporting the NFA's memory
+ /// usage in all of them would likely result in reporting higher heap
+ /// memory than is actually used.
pub fn memory_usage(&self) -> usize {
- // Everything else is on the stack.
- self.nfa.memory_usage()
+ // The only thing that uses heap memory in a DFA is the NFA. But the
+ // NFA has shared ownership, so reporting its memory as part of the
+ // hybrid DFA is likely to lead to double-counting the NFA memory
+ // somehow. In particular, this DFA does not really own an NFA, so
+ // including it in the DFA's memory usage doesn't seem semantically
+ // correct.
+ 0
}
}
impl DFA {
- /// Executes a forward search and returns the end position of the first
- /// match that is found as early as possible. If no match exists, then
- /// `None` is returned.
- ///
- /// This routine stops scanning input as soon as the search observes a
- /// match state. This is useful for implementing boolean `is_match`-like
- /// routines, where as little work is done as possible.
+ /// Executes a forward search and returns the end position of the leftmost
+ /// match that is found. If no match exists, then `None` is returned.
///
- /// See [`DFA::find_earliest_fwd_at`] for additional functionality, such as
- /// providing a prefilter, a specific pattern to match and the bounds of
- /// the search within the haystack. This routine is meant as a convenience
- /// for common cases where the additional functionality is not needed.
+ /// In particular, this method continues searching even after it enters
+ /// a match state. The search only terminates once it has reached the
+ /// end of the input or when it has entered a dead or quit state. Upon
+ /// termination, the position of the last byte seen while still in a match
+ /// state is returned.
///
/// # Errors
///
- /// This routine only errors if the search could not complete. For
- /// lazy DFAs generated by this crate, this only occurs in non-default
- /// configurations where quit bytes are used, Unicode word boundaries are
- /// heuristically enabled or limits are set on the number of times the lazy
- /// DFA's cache may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+ /// cache. The default configuration does not enable this, although it is
+ /// typically a good idea to.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
/// exists or not.
///
/// # Example
///
- /// This example demonstrates how the position returned might differ from
- /// what one might expect when executing a traditional leftmost search.
+ /// This example shows how to run a basic search.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+")?;
/// let mut cache = dfa.create_cache();
- /// // Normally, the end of the leftmost first match here would be 8,
- /// // corresponding to the end of the input. But the "earliest" semantics
- /// // this routine cause it to stop as soon as a match is known, which
- /// // occurs once 'foo[0-9]' has matched.
- /// let expected = HalfMatch::must(0, 4);
- /// assert_eq!(
- /// Some(expected),
- /// dfa.find_earliest_fwd(&mut cache, b"foo12345")?,
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(
+ /// &mut cache, &Input::new("foo12345"))?,
/// );
///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over later parts.
/// let dfa = DFA::new("abc|a")?;
/// let mut cache = dfa.create_cache();
- /// // Normally, the end of the leftmost first match here would be 3,
- /// // but the shortest match semantics detect a match earlier.
- /// let expected = HalfMatch::must(0, 1);
- /// assert_eq!(Some(expected), dfa.find_earliest_fwd(&mut cache, b"abc")?);
+ /// let expected = HalfMatch::must(0, 3);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(
+ /// &mut cache, &Input::new("abc"))?,
+ /// );
+ ///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- #[inline]
- pub fn find_earliest_fwd(
- &self,
- cache: &mut Cache,
- bytes: &[u8],
- ) -> Result<Option<HalfMatch>, MatchError> {
- self.find_earliest_fwd_at(cache, None, None, bytes, 0, bytes.len())
- }
-
- /// Executes a reverse search and returns the start position of the first
- /// match that is found as early as possible. If no match exists, then
- /// `None` is returned.
- ///
- /// This routine stops scanning input as soon as the search observes a
- /// match state.
- ///
- /// Note that while it is not technically necessary to build a reverse
- /// automaton to use a reverse search, it is likely that you'll want to do
- /// so. Namely, the typical use of a reverse search is to find the starting
- /// location of a match once its end is discovered from a forward search. A
- /// reverse DFA automaton can be built by configuring the intermediate NFA
- /// to be reversed via
- /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse).
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// lazy DFAs generated by this crate, this only occurs in non-default
- /// configurations where quit bytes are used, Unicode word boundaries are
- /// heuristically enabled or limits are set on the number of times the lazy
- /// DFA's cache may be cleared.
///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// # Example
+ /// # Example: specific pattern search
///
- /// This example demonstrates how the position returned might differ from
- /// what one might expect when executing a traditional leftmost reverse
- /// search.
+ /// This example shows how to build a lazy multi-DFA that permits searching
+ /// for specific patterns.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch};
- ///
- /// let dfa = DFA::builder()
- /// .thompson(thompson::Config::new().reverse(true))
- /// .build("[a-z]+[0-9]+")?;
- /// let mut cache = dfa.create_cache();
- /// // Normally, the end of the leftmost first match here would be 0,
- /// // corresponding to the beginning of the input. But the "earliest"
- /// // semantics of this routine cause it to stop as soon as a match is
- /// // known, which occurs once '[a-z][0-9]+' has matched.
- /// let expected = HalfMatch::must(0, 2);
- /// assert_eq!(
- /// Some(expected),
- /// dfa.find_earliest_rev(&mut cache, b"foo12345")?,
- /// );
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// Anchored, HalfMatch, PatternID, Input,
+ /// };
///
/// let dfa = DFA::builder()
- /// .thompson(thompson::Config::new().reverse(true))
- /// .build("abc|c")?;
+ /// .configure(DFA::config().starts_for_each_pattern(true))
+ /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
/// let mut cache = dfa.create_cache();
- /// // Normally, the end of the leftmost first match here would be 0,
- /// // but the shortest match semantics detect a match earlier.
- /// let expected = HalfMatch::must(0, 2);
- /// assert_eq!(Some(expected), dfa.find_earliest_rev(&mut cache, b"abc")?);
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- #[inline]
- pub fn find_earliest_rev(
- &self,
- cache: &mut Cache,
- bytes: &[u8],
- ) -> Result<Option<HalfMatch>, MatchError> {
- self.find_earliest_rev_at(cache, None, bytes, 0, bytes.len())
- }
-
- /// Executes a forward search and returns the end position of the leftmost
- /// match that is found. If no match exists, then `None` is returned.
+ /// let haystack = "foo123";
///
- /// In particular, this method continues searching even after it enters
- /// a match state. The search only terminates once it has reached the
- /// end of the input or when it has entered a dead or quit state. Upon
- /// termination, the position of the last byte seen while still in a match
- /// state is returned.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// lazy DFAs generated by this crate, this only occurs in non-default
- /// configurations where quit bytes are used, Unicode word boundaries are
- /// heuristically enabled or limits are set on the number of times the lazy
- /// DFA's cache may be cleared.
+ /// // Since we are using the default leftmost-first match and both
+ /// // patterns match at the same starting position, only the first pattern
+ /// // will be returned in this case when doing a search for any of the
+ /// // patterns.
+ /// let expected = Some(HalfMatch::must(0, 6));
+ /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
+ /// assert_eq!(expected, got);
///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
+ /// // But if we want to check whether some other pattern matches, then we
+ /// // can provide its pattern ID.
+ /// let expected = Some(HalfMatch::must(1, 6));
+ /// let input = Input::new(haystack)
+ /// .anchored(Anchored::Pattern(PatternID::must(1)));
+ /// let got = dfa.try_search_fwd(&mut cache, &input)?;
+ /// assert_eq!(expected, got);
///
- /// # Example
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
///
- /// Leftmost first match semantics corresponds to the match with the
- /// smallest starting offset, but where the end offset is determined by
- /// preferring earlier branches in the original regular expression. For
- /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
- /// will match `Samwise` in `Samwise`.
+ /// # Example: specifying the bounds of a search
///
- /// Generally speaking, the "leftmost first" match is how most backtracking
- /// regular expressions tend to work. This is in contrast to POSIX-style
- /// regular expressions that yield "leftmost longest" matches. Namely,
- /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
- /// leftmost longest semantics. (This crate does not currently support
- /// leftmost longest semantics.)
+ /// This example shows how providing the bounds of a search can produce
+ /// different results than simply sub-slicing the haystack.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
- /// let dfa = DFA::new("foo[0-9]+")?;
+ /// // N.B. We disable Unicode here so that we use a simple ASCII word
+ /// // boundary. Alternatively, we could enable heuristic support for
+ /// // Unicode word boundaries since our haystack is pure ASCII.
+ /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?;
/// let mut cache = dfa.create_cache();
- /// let expected = HalfMatch::must(0, 8);
- /// assert_eq!(
- /// Some(expected),
- /// dfa.find_leftmost_fwd(&mut cache, b"foo12345")?,
- /// );
+ /// let haystack = "foo123bar";
///
- /// // Even though a match is found after reading the first byte (`a`),
- /// // the leftmost first match semantics demand that we find the earliest
- /// // match that prefers earlier parts of the pattern over latter parts.
- /// let dfa = DFA::new("abc|a")?;
- /// let mut cache = dfa.create_cache();
- /// let expected = HalfMatch::must(0, 3);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"abc")?);
+ /// // Since we sub-slice the haystack, the search doesn't know about the
+ /// // larger context and assumes that `123` is surrounded by word
+ /// // boundaries. And of course, the match position is reported relative
+ /// // to the sub-slice as well, which means we get `3` instead of `6`.
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// let got = dfa.try_search_fwd(
+ /// &mut cache,
+ /// &Input::new(&haystack[3..6]),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // But if we provide the bounds of the search within the context of the
+ /// // entire haystack, then the search can take the surrounding context
+ /// // into account. (And if we did find a match, it would be reported
+ /// // as a valid offset into `haystack` instead of its sub-slice.)
+ /// let expected = None;
+ /// let got = dfa.try_search_fwd(
+ /// &mut cache,
+ /// &Input::new(haystack).range(3..6),
+ /// )?;
+ /// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
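+ ///
+ /// # Example: UTF-8 mode and empty matches
+ ///
+ /// The following is an illustrative sketch (an editorial addition, not
+ /// upstream documentation) of how UTF-8 mode on the underlying NFA
+ /// (enabled by default) interacts with patterns that can match the empty
+ /// string: a zero-width match is never reported in the middle of a
+ /// codepoint.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
+ ///
+ /// let dfa = DFA::new(r"")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// // Starting the search at offset 1 would permit a zero-width match
+ /// // inside the snowman codepoint, so the match is reported at the next
+ /// // codepoint boundary instead.
+ /// let input = Input::new("☃").range(1..);
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// assert_eq!(expected, dfa.try_search_fwd(&mut cache, &input)?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```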
#[inline]
- pub fn find_leftmost_fwd(
+ pub fn try_search_fwd(
&self,
cache: &mut Cache,
- bytes: &[u8],
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- self.find_leftmost_fwd_at(cache, None, None, bytes, 0, bytes.len())
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ let hm = match search::find_fwd(self, cache, input)? {
+ None => return Ok(None),
+ Some(hm) if !utf8empty => return Ok(Some(hm)),
+ Some(hm) => hm,
+ };
+ // We get to this point when we know our DFA can match the empty string
+ // AND when UTF-8 mode is enabled. In this case, we skip any matches
+ // whose offset splits a codepoint. Such a match is necessarily a
+ // zero-width match, because UTF-8 mode requires the underlying NFA
+ // to be built such that all non-empty matches span valid UTF-8.
+ // Therefore, any match that ends in the middle of a codepoint cannot
+ // be part of a span of valid UTF-8 and thus must be an empty match.
+ // In such cases, we skip it, so as not to report matches that split a
+ // codepoint.
+ //
+ // Note that this is not a checked assumption. Callers *can* provide an
+ // NFA with UTF-8 mode enabled but produces non-empty matches that span
+ // invalid UTF-8. But doing so is documented to result in unspecified
+ // behavior.
+ empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
+ let got = search::find_fwd(self, cache, input)?;
+ Ok(got.map(|hm| (hm, hm.offset())))
+ })
}
/// Executes a reverse search and returns the start position of the
/// leftmost match that is found. If no match exists, then `None` is
/// returned.
///
- /// In particular, this method continues searching even after it enters
- /// a match state. The search only terminates once it has reached the
- /// end of the input or when it has entered a dead or quit state. Upon
- /// termination, the position of the last byte seen while still in a match
- /// state is returned.
- ///
/// # Errors
///
- /// This routine only errors if the search could not complete. For
- /// lazy DFAs generated by this crate, this only occurs in non-default
- /// configurations where quit bytes are used, Unicode word boundaries are
- /// heuristically enabled or limits are set on the number of times the lazy
- /// DFA's cache may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+ /// cache. This is not enabled in the default configuration, although it
+ /// is typically a good idea to enable it.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
/// exists or not.
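+ ///
+ /// As a sketch of the first case above (an editorial addition), a
+ /// configured quit byte causes a search over a haystack containing that
+ /// byte to return an error:
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, Input};
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().quit(b'\n', true))
+ /// .build(r"foo")?;
+ /// let mut cache = dfa.create_cache();
+ /// let result = dfa.try_search_rev(&mut cache, &Input::new("foo\nbar"));
+ /// assert!(result.is_err());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```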
///
/// # Example
///
- /// In particular, this routine is principally
- /// useful when used in conjunction with the
- /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
- /// configuration. In general, it's unlikely to be correct to use both
- /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since
- /// any particular DFA will only support searching in one direction with
+ /// This routine is principally useful when used in
+ /// conjunction with the
+ /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
+ /// configuration. In general, it's unlikely to be correct to use both
+ /// `try_search_fwd` and `try_search_rev` with the same DFA since any
+ /// particular DFA will only support searching in one direction with
/// respect to the pattern.
///
/// ```
- /// use regex_automata::{nfa::thompson, hybrid::dfa::DFA, HalfMatch};
+ /// use regex_automata::{
+ /// nfa::thompson,
+ /// hybrid::dfa::DFA,
+ /// HalfMatch, Input,
+ /// };
///
/// let dfa = DFA::builder()
/// .thompson(thompson::Config::new().reverse(true))
@@ -648,7 +664,7 @@ impl DFA {
/// let expected = HalfMatch::must(0, 0);
/// assert_eq!(
/// Some(expected),
- /// dfa.find_leftmost_rev(&mut cache, b"foo12345")?,
+ /// dfa.try_search_rev(&mut cache, &Input::new("foo12345"))?,
/// );
///
/// // Even though a match is found after reading the last byte (`c`),
@@ -659,17 +675,133 @@ impl DFA {
/// .build("abc|c")?;
/// let mut cache = dfa.create_cache();
/// let expected = HalfMatch::must(0, 0);
- /// assert_eq!(Some(expected), dfa.find_leftmost_rev(&mut cache, b"abc")?);
+ /// assert_eq!(Some(expected), dfa.try_search_rev(
+ /// &mut cache, &Input::new("abc"))?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: UTF-8 mode
+ ///
+ /// This example demonstrates that UTF-8 mode applies to reverse
+ /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
+ /// matches reported must correspond to valid UTF-8 spans. This includes
+ /// prohibiting zero-width matches that split a codepoint.
+ ///
+ /// UTF-8 mode is enabled by default. Notice below how the only zero-width
+ /// matches reported are those at UTF-8 boundaries:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// nfa::thompson,
+ /// HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build(r"")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// // Run the reverse DFA to collect all matches.
+ /// let mut input = Input::new("☃");
+ /// let mut matches = vec![];
+ /// loop {
+ /// match dfa.try_search_rev(&mut cache, &input)? {
+ /// None => break,
+ /// Some(hm) => {
+ /// matches.push(hm);
+ /// if hm.offset() == 0 || input.end() == 0 {
+ /// break;
+ /// } else if hm.offset() < input.end() {
+ /// input.set_end(hm.offset());
+ /// } else {
+ /// // This is only necessary to handle zero-width
+ /// // matches, which of course occur in this example.
+ /// // Without this, the search would never advance
+ /// // backwards beyond the initial match.
+ /// input.set_end(input.end() - 1);
+ /// }
+ /// }
+ /// }
+ /// }
+ ///
+ /// // No matches split a codepoint.
+ /// let expected = vec![
+ /// HalfMatch::must(0, 3),
+ /// HalfMatch::must(0, 0),
+ /// ];
+ /// assert_eq!(expected, matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Now let's look at the same example, but with UTF-8 mode on the
+ /// underlying NFA disabled:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// nfa::thompson,
+ /// HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true).utf8(false))
+ /// .build(r"")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// // Run the reverse DFA to collect all matches.
+ /// let mut input = Input::new("☃");
+ /// let mut matches = vec![];
+ /// loop {
+ /// match dfa.try_search_rev(&mut cache, &input)? {
+ /// None => break,
+ /// Some(hm) => {
+ /// matches.push(hm);
+ /// if hm.offset() == 0 || input.end() == 0 {
+ /// break;
+ /// } else if hm.offset() < input.end() {
+ /// input.set_end(hm.offset());
+ /// } else {
+ /// // This is only necessary to handle zero-width
+ /// // matches, which of course occur in this example.
+ /// // Without this, the search would never advance
+ /// // backwards beyond the initial match.
+ /// input.set_end(input.end() - 1);
+ /// }
+ /// }
+ /// }
+ /// }
+ ///
+ /// // No matches split a codepoint.
+ /// let expected = vec![
+ /// HalfMatch::must(0, 3),
+ /// HalfMatch::must(0, 2),
+ /// HalfMatch::must(0, 1),
+ /// HalfMatch::must(0, 0),
+ /// ];
+ /// assert_eq!(expected, matches);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
- pub fn find_leftmost_rev(
+ pub fn try_search_rev(
&self,
cache: &mut Cache,
- bytes: &[u8],
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- self.find_leftmost_rev_at(cache, None, bytes, 0, bytes.len())
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ let hm = match search::find_rev(self, cache, input)? {
+ None => return Ok(None),
+ Some(hm) if !utf8empty => return Ok(Some(hm)),
+ Some(hm) => hm,
+ };
+ empty::skip_splits_rev(input, hm, hm.offset(), |input| {
+ let got = search::find_rev(self, cache, input)?;
+ Ok(got.map(|hm| (hm, hm.offset())))
+ })
}
/// Executes an overlapping forward search and returns the end position of
@@ -681,15 +813,34 @@ impl DFA {
/// state from prior calls so that the implementation knows where the last
/// match occurred.
///
- /// # Errors
+ /// When using this routine to implement an iterator of overlapping
+ /// matches, the `start` of the search should remain invariant throughout
+ /// iteration. The `OverlappingState` given to the search will keep track
+ /// of the current position of the search. (This is because multiple
+ /// matches may be reported at the same position, so only the search
+ /// implementation itself knows when to advance the position.)
+ ///
+ /// If for some reason you want the search to forget about its previous
+ /// state and restart the search at a particular position, then setting the
+ /// state to [`OverlappingState::start`] will accomplish that.
///
- /// This routine only errors if the search could not complete. For
- /// lazy DFAs generated by this crate, this only occurs in non-default
- /// configurations where quit bytes are used, Unicode word boundaries are
- /// heuristically enabled or limits are set on the number of times the lazy
- /// DFA's cache may be cleared.
+ /// # Errors
///
- /// When a search cannot complete, callers cannot know whether a match
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+ /// cache. This is not enabled in the default configuration, although it
+ /// is typically a good idea to enable it.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
/// exists or not.
///
/// # Example
@@ -708,10 +859,10 @@ impl DFA {
/// the search to find totally new matches (potentially of other patterns).
///
/// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
- /// hybrid::{dfa::DFA, OverlappingState},
- /// HalfMatch,
- /// MatchKind,
+ /// hybrid::dfa::{DFA, OverlappingState},
+ /// HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = DFA::builder()
@@ -719,12 +870,14 @@ impl DFA {
/// .build_many(&[r"\w+$", r"\S+$"])?;
/// let mut cache = dfa.create_cache();
///
- /// let haystack = "@foo".as_bytes();
+ /// let haystack = "@foo";
/// let mut state = OverlappingState::start();
///
/// let expected = Some(HalfMatch::must(1, 4));
- /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
- /// assert_eq!(expected, got);
+ /// dfa.try_search_overlapping_fwd(
+ /// &mut cache, &Input::new(haystack), &mut state,
+ /// )?;
+ /// assert_eq!(expected, state.get_match());
///
/// // The first pattern also matches at the same position, so re-running
/// // the search will yield another match. Notice also that the first
@@ -732,418 +885,265 @@ impl DFA {
/// // pattern begins its match before the first, is therefore an earlier
/// // match and is thus reported first.
/// let expected = Some(HalfMatch::must(0, 4));
- /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
- /// assert_eq!(expected, got);
+ /// dfa.try_search_overlapping_fwd(
+ /// &mut cache, &Input::new(haystack), &mut state,
+ /// )?;
+ /// assert_eq!(expected, state.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
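+ ///
+ /// As a further sketch (an editorial addition), the same search can be
+ /// driven to completion with a loop that keeps the `Input` fixed and
+ /// reuses one `OverlappingState`:
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// hybrid::dfa::{DFA, OverlappingState},
+ /// HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let input = Input::new("@foo");
+ /// let mut state = OverlappingState::start();
+ /// let mut matches = vec![];
+ /// loop {
+ /// dfa.try_search_overlapping_fwd(&mut cache, &input, &mut state)?;
+ /// match state.get_match() {
+ /// None => break,
+ /// Some(hm) => matches.push(hm),
+ /// }
+ /// }
+ /// let expected = vec![HalfMatch::must(1, 4), HalfMatch::must(0, 4)];
+ /// assert_eq!(expected, matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```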
#[inline]
- pub fn find_overlapping_fwd(
+ pub fn try_search_overlapping_fwd(
&self,
cache: &mut Cache,
- bytes: &[u8],
+ input: &Input<'_>,
state: &mut OverlappingState,
- ) -> Result<Option<HalfMatch>, MatchError> {
- self.find_overlapping_fwd_at(
- cache,
- None,
- None,
- bytes,
- 0,
- bytes.len(),
- state,
- )
+ ) -> Result<(), MatchError> {
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ search::find_overlapping_fwd(self, cache, input, state)?;
+ match state.get_match() {
+ None => Ok(()),
+ Some(_) if !utf8empty => Ok(()),
+ Some(_) => skip_empty_utf8_splits_overlapping(
+ input,
+ state,
+ |input, state| {
+ search::find_overlapping_fwd(self, cache, input, state)
+ },
+ ),
+ }
}
- /// Executes a forward search and returns the end position of the first
- /// match that is found as early as possible. If no match exists, then
+ /// Executes a reverse overlapping search and returns the start position
+ /// of the leftmost match that is found. If no match exists, then
/// `None` is returned.
///
- /// This routine stops scanning input as soon as the search observes a
- /// match state. This is useful for implementing boolean `is_match`-like
- /// routines, where as little work is done as possible.
- ///
- /// This is like [`DFA::find_earliest_fwd`], except it provides some
- /// additional control over how the search is executed:
- ///
- /// * `pre` is a prefilter scanner that, when given, is used whenever the
- /// DFA enters its starting state. This is meant to speed up searches where
- /// one or a small number of literal prefixes are known.
- /// * `pattern_id` specifies a specific pattern in the DFA to run an
- /// anchored search for. If not given, then a search for any pattern is
- /// performed. For lazy DFAs, [`Config::starts_for_each_pattern`] must be
- /// enabled to use this functionality.
- /// * `start` and `end` permit searching a specific region of the haystack
- /// `bytes`. This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `bytes`. (Because the existence of look-around
- /// operations such as `\b`, `^` and `$` need to take the surrounding
- /// context into account. This cannot be done if the haystack doesn't
- /// contain it.)
- ///
- /// The examples below demonstrate each of these additional parameters.
+ /// When using this routine to implement an iterator of overlapping
+ /// matches, the `start` of the search should remain invariant throughout
+ /// iteration. The `OverlappingState` given to the search will keep track
+ /// of the current position of the search. (This is because multiple
+ /// matches may be reported at the same position, so only the search
+ /// implementation itself knows when to advance the position.)
///
- /// # Errors
+ /// If for some reason you want the search to forget about its previous
+ /// state and restart the search at a particular position, then setting the
+ /// state to [`OverlappingState::start`] will accomplish that.
///
- /// This routine only errors if the search could not complete. For
- /// lazy DFAs generated by this crate, this only occurs in non-default
- /// configurations where quit bytes are used, Unicode word boundaries are
- /// heuristically enabled or limits are set on the number of times the lazy
- /// DFA's cache may be cleared.
+ /// # Errors
///
- /// When a search cannot complete, callers cannot know whether a match
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+ /// cache. This is not enabled in the default configuration, although it
+ /// is typically a good idea to enable it.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
/// exists or not.
///
- /// # Panics
- ///
- /// This routine panics if a `pattern_id` is given and this lazy DFA does
- /// not support specific pattern searches.
+ /// # Example: UTF-8 mode
///
- /// It also panics if the given haystack range is not valid.
+ /// This example demonstrates that UTF-8 mode applies to reverse
+ /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
+ /// matches reported must correspond to valid UTF-8 spans. This includes
+ /// prohibiting zero-width matches that split a codepoint.
///
- /// # Example: prefilter
- ///
- /// This example shows how to provide a prefilter for a pattern where all
- /// matches start with a `z` byte.
+ /// UTF-8 mode is enabled by default. Notice below how the only zero-width
+ /// matches reported are those at UTF-8 boundaries:
///
/// ```
/// use regex_automata::{
- /// hybrid::dfa::DFA,
- /// util::prefilter::{Candidate, Prefilter, Scanner, State},
- /// HalfMatch,
+ /// hybrid::dfa::{DFA, OverlappingState},
+ /// nfa::thompson,
+ /// HalfMatch, Input, MatchKind,
/// };
///
- /// #[derive(Debug)]
- /// pub struct ZPrefilter;
- ///
- /// impl Prefilter for ZPrefilter {
- /// fn next_candidate(
- /// &self,
- /// _: &mut State,
- /// haystack: &[u8],
- /// at: usize,
- /// ) -> Candidate {
- /// // Try changing b'z' to b'q' and observe this test fail since
- /// // the prefilter will skip right over the match.
- /// match haystack.iter().position(|&b| b == b'z') {
- /// None => Candidate::None,
- /// Some(i) => Candidate::PossibleStartOfMatch(at + i),
- /// }
- /// }
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build_many(&[r"", r"☃"])?;
+ /// let mut cache = dfa.create_cache();
///
- /// fn heap_bytes(&self) -> usize {
- /// 0
+ /// // Run the reverse DFA to collect all matches.
+ /// let input = Input::new("☃");
+ /// let mut state = OverlappingState::start();
+ /// let mut matches = vec![];
+ /// loop {
+ /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
+ /// match state.get_match() {
+ /// None => break,
+ /// Some(hm) => matches.push(hm),
/// }
/// }
///
- /// let dfa = DFA::new("z[0-9]{3}")?;
- /// let mut cache = dfa.create_cache();
- ///
- /// let haystack = "foobar z123 q123".as_bytes();
- /// // A scanner executes a prefilter while tracking some state that helps
- /// // determine whether a prefilter is still "effective" or not.
- /// let mut scanner = Scanner::new(&ZPrefilter);
- ///
- /// let expected = Some(HalfMatch::must(0, 11));
- /// let got = dfa.find_earliest_fwd_at(
- /// &mut cache,
- /// Some(&mut scanner),
- /// None,
- /// haystack,
- /// 0,
- /// haystack.len(),
- /// )?;
- /// assert_eq!(expected, got);
+ /// // No matches split a codepoint.
+ /// let expected = vec![
+ /// HalfMatch::must(0, 3),
+ /// HalfMatch::must(1, 0),
+ /// HalfMatch::must(0, 0),
+ /// ];
+ /// assert_eq!(expected, matches);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
- /// # Example: specific pattern search
- ///
- /// This example shows how to build a lazy multi-DFA that permits searching
- /// for specific patterns.
+ /// Now let's look at the same example, but with UTF-8 mode on the
+ /// underlying NFA disabled:
///
/// ```
/// use regex_automata::{
- /// hybrid::dfa::DFA,
- /// HalfMatch,
- /// PatternID,
+ /// hybrid::dfa::{DFA, OverlappingState},
+ /// nfa::thompson,
+ /// HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = DFA::builder()
- /// .configure(DFA::config().starts_for_each_pattern(true))
- /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
- /// let mut cache = dfa.create_cache();
- /// let haystack = "foo123".as_bytes();
- ///
- /// // Since we are using the default leftmost-first match and both
- /// // patterns match at the same starting position, only the first pattern
- /// // will be returned in this case when doing a search for any of the
- /// // patterns.
- /// let expected = Some(HalfMatch::must(0, 6));
- /// let got = dfa.find_earliest_fwd_at(
- /// &mut cache,
- /// None,
- /// None,
- /// haystack,
- /// 0,
- /// haystack.len(),
- /// )?;
- /// assert_eq!(expected, got);
- ///
- /// // But if we want to check whether some other pattern matches, then we
- /// // can provide its pattern ID.
- /// let expected = Some(HalfMatch::must(1, 6));
- /// let got = dfa.find_earliest_fwd_at(
- /// &mut cache,
- /// None,
- /// Some(PatternID::must(1)),
- /// haystack,
- /// 0,
- /// haystack.len(),
- /// )?;
- /// assert_eq!(expected, got);
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- ///
- /// # Example: specifying the bounds of a search
- ///
- /// This example shows how providing the bounds of a search can produce
- /// different results than simply sub-slicing the haystack.
- ///
- /// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
- ///
- /// // N.B. We disable Unicode here so that we use a simple ASCII word
- /// // boundary. Alternatively, we could enable heuristic support for
- /// // Unicode word boundaries since our haystack is pure ASCII.
- /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?;
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .thompson(thompson::Config::new().reverse(true).utf8(false))
+ /// .build_many(&[r"", r"☃"])?;
/// let mut cache = dfa.create_cache();
- /// let haystack = "foo123bar".as_bytes();
///
- /// // Since we sub-slice the haystack, the search doesn't know about the
- /// // larger context and assumes that `123` is surrounded by word
- /// // boundaries. And of course, the match position is reported relative
- /// // to the sub-slice as well, which means we get `3` instead of `6`.
- /// let expected = Some(HalfMatch::must(0, 3));
- /// let got = dfa.find_earliest_fwd_at(
- /// &mut cache,
- /// None,
- /// None,
- /// &haystack[3..6],
- /// 0,
- /// haystack[3..6].len(),
- /// )?;
- /// assert_eq!(expected, got);
+ /// // Run the reverse DFA to collect all matches.
+ /// let input = Input::new("☃");
+ /// let mut state = OverlappingState::start();
+ /// let mut matches = vec![];
+ /// loop {
+ /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
+ /// match state.get_match() {
+ /// None => break,
+ /// Some(hm) => matches.push(hm),
+ /// }
+ /// }
///
- /// // But if we provide the bounds of the search within the context of the
- /// // entire haystack, then the search can take the surrounding context
- /// // into account. (And if we did find a match, it would be reported
- /// // as a valid offset into `haystack` instead of its sub-slice.)
- /// let expected = None;
- /// let got = dfa.find_earliest_fwd_at(
- /// &mut cache,
- /// None,
- /// None,
- /// haystack,
- /// 3,
- /// 6,
- /// )?;
- /// assert_eq!(expected, got);
+ /// // Now *all* positions match, even within a codepoint,
+ /// // because we lifted the requirement that matches
+ /// // correspond to valid UTF-8 spans.
+ /// let expected = vec![
+ /// HalfMatch::must(0, 3),
+ /// HalfMatch::must(0, 2),
+ /// HalfMatch::must(0, 1),
+ /// HalfMatch::must(1, 0),
+ /// HalfMatch::must(0, 0),
+ /// ];
+ /// assert_eq!(expected, matches);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
- pub fn find_earliest_fwd_at(
+ pub fn try_search_overlapping_rev(
&self,
cache: &mut Cache,
- pre: Option<&mut prefilter::Scanner>,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<HalfMatch>, MatchError> {
- search::find_earliest_fwd(
- pre, self, cache, pattern_id, bytes, start, end,
- )
- }
-
- /// Executes a reverse search and returns the start position of the first
- /// match that is found as early as possible. If no match exists, then
- /// `None` is returned.
- ///
- /// This routine stops scanning input as soon as the search observes a
- /// match state.
- ///
- /// This is like [`DFA::find_earliest_rev`], except it provides some
- /// additional control over how the search is executed. See the
- /// documentation of [`DFA::find_earliest_fwd_at`] for more details
- /// on the additional parameters along with examples of their usage.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// lazy DFAs generated by this crate, this only occurs in non-default
- /// configurations where quit bytes are used, Unicode word boundaries are
- /// heuristically enabled or limits are set on the number of times the lazy
- /// DFA's cache may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// # Panics
- ///
- /// This routine panics if a `pattern_id` is given and the underlying
- /// DFA does not support specific pattern searches.
- ///
- /// It also panics if the given haystack range is not valid.
- #[inline]
- pub fn find_earliest_rev_at(
- &self,
- cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<HalfMatch>, MatchError> {
- search::find_earliest_rev(self, cache, pattern_id, bytes, start, end)
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+ ) -> Result<(), MatchError> {
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ search::find_overlapping_rev(self, cache, input, state)?;
+ match state.get_match() {
+ None => Ok(()),
+ Some(_) if !utf8empty => Ok(()),
+ Some(_) => skip_empty_utf8_splits_overlapping(
+ input,
+ state,
+ |input, state| {
+ search::find_overlapping_rev(self, cache, input, state)
+ },
+ ),
+ }
}
- /// Executes a forward search and returns the end position of the leftmost
- /// match that is found. If no match exists, then `None` is returned.
- ///
- /// This is like [`DFA::find_leftmost_fwd`], except it provides some
- /// additional control over how the search is executed. See the
- /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the
- /// additional parameters along with examples of their usage.
- ///
- /// # Errors
+ /// Writes the set of patterns that match anywhere in the given search
+ /// configuration to `patset`. If multiple patterns match at the same
+ /// position and the underlying DFA supports overlapping matches, then all
+ /// matching patterns are written to the given set.
///
- /// This routine only errors if the search could not complete. For
- /// lazy DFAs generated by this crate, this only occurs in non-default
- /// configurations where quit bytes are used, Unicode word boundaries are
- /// heuristically enabled or limits are set on the number of times the lazy
- /// DFA's cache may be cleared.
+ /// Unless all of the patterns in this DFA are anchored, then generally
+ /// speaking, this will visit every byte in the haystack.
///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
+ /// This search routine *does not* clear the pattern set. This gives some
+ /// flexibility to the caller (e.g., running multiple searches with the
+ /// same pattern set), but does make the API bug-prone if you're reusing
+ /// the same pattern set for multiple searches but intended them to be
+ /// independent.
///
- /// # Panics
- ///
- /// This routine panics if a `pattern_id` is given and the underlying
- /// DFA does not support specific pattern searches.
- ///
- /// It also panics if the given haystack range is not valid.
- #[inline]
- pub fn find_leftmost_fwd_at(
- &self,
- cache: &mut Cache,
- pre: Option<&mut prefilter::Scanner>,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<HalfMatch>, MatchError> {
- search::find_leftmost_fwd(
- pre, self, cache, pattern_id, bytes, start, end,
- )
- }
-
- /// Executes a reverse search and returns the start of the position of the
- /// leftmost match that is found. If no match exists, then `None` is
- /// returned.
- ///
- /// This is like [`DFA::find_leftmost_rev`], except it provides some
- /// additional control over how the search is executed. See the
- /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the
- /// additional parameters along with examples of their usage.
+ /// If a pattern ID matched but the given `PatternSet` does not have
+ /// sufficient capacity to store it, then it is not inserted and silently
+ /// dropped.
///
/// # Errors
///
- /// This routine only errors if the search could not complete. For
- /// lazy DFAs generated by this crate, this only occurs in non-default
- /// configurations where quit bytes are used, Unicode word boundaries are
- /// heuristically enabled or limits are set on the number of times the lazy
- /// DFA's cache may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+ /// cache. This is not enabled in the default configuration, although it
+ /// is typically a good idea to enable it.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
/// exists or not.
///
- /// # Panics
- ///
- /// This routine panics if a `pattern_id` is given and the underlying
- /// DFA does not support specific pattern searches.
- ///
- /// It also panics if the given haystack range is not valid.
- #[inline]
- pub fn find_leftmost_rev_at(
- &self,
- cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<HalfMatch>, MatchError> {
- search::find_leftmost_rev(self, cache, pattern_id, bytes, start, end)
- }
-
- /// Executes an overlapping forward search and returns the end position of
- /// matches as they are found. If no match exists, then `None` is returned.
- ///
- /// This routine is principally only useful when searching for multiple
- /// patterns on inputs where multiple patterns may match the same regions
- /// of text. In particular, callers must preserve the automaton's search
- /// state from prior calls so that the implementation knows where the last
- /// match occurred.
- ///
- /// This is like [`DFA::find_overlapping_fwd`], except it provides
- /// some additional control over how the search is executed. See the
- /// documentation of [`DFA::find_earliest_fwd_at`] for more details
- /// on the additional parameters along with examples of their usage.
- ///
- /// When using this routine to implement an iterator of overlapping
- /// matches, the `start` of the search should always be set to the end
- /// of the last match. If more patterns match at the previous location,
- /// then they will be immediately returned. (This is tracked by the given
- /// overlapping state.) Otherwise, the search continues at the starting
- /// position given.
- ///
- /// If for some reason you want the search to forget about its previous
- /// state and restart the search at a particular position, then setting the
- /// state to [`OverlappingState::start`] will accomplish that.
- ///
- /// # Errors
+ /// # Example
///
- /// This routine only errors if the search could not complete. For
- /// lazy DFAs generated by this crate, this only occurs in non-default
- /// configurations where quit bytes are used, Unicode word boundaries are
- /// heuristically enabled or limits are set on the number of times the lazy
- /// DFA's cache may be cleared.
+ /// This example shows how to find all matching patterns in a haystack,
+ /// even when some patterns match at the same position as other patterns.
///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// Input, MatchKind, PatternSet,
+ /// };
///
- /// # Panics
+ /// let patterns = &[
+ /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
+ /// ];
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(patterns)?;
+ /// let mut cache = dfa.create_cache();
///
- /// This routine panics if a `pattern_id` is given and the underlying
- /// DFA does not support specific pattern searches.
+ /// let input = Input::new("foobar");
+ /// let mut patset = PatternSet::new(dfa.pattern_len());
+ /// dfa.try_which_overlapping_matches(&mut cache, &input, &mut patset)?;
+ /// let expected = vec![0, 2, 3, 4, 6];
+ /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
+ /// assert_eq!(expected, got);
///
- /// It also panics if the given haystack range is not valid.
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
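+ ///
+ /// As a sketch of the reuse caveat above (an editorial addition): because
+ /// the set is not cleared, two independent searches that share a
+ /// `PatternSet` accumulate each other's matches.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// Input, MatchKind, PatternSet,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(&[r"foo", r"bar"])?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let mut patset = PatternSet::new(dfa.pattern_len());
+ /// dfa.try_which_overlapping_matches(
+ /// &mut cache, &Input::new("foo"), &mut patset,
+ /// )?;
+ /// dfa.try_which_overlapping_matches(
+ /// &mut cache, &Input::new("bar"), &mut patset,
+ /// )?;
+ /// // Both patterns are now in the set, even though each haystack matched
+ /// // only one of them.
+ /// assert_eq!(2, patset.len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```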
#[inline]
- pub fn find_overlapping_fwd_at(
+ pub fn try_which_overlapping_matches(
&self,
cache: &mut Cache,
- pre: Option<&mut prefilter::Scanner>,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- state: &mut OverlappingState,
- ) -> Result<Option<HalfMatch>, MatchError> {
- search::find_overlapping_fwd(
- pre, self, cache, pattern_id, bytes, start, end, state,
- )
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) -> Result<(), MatchError> {
+ let mut state = OverlappingState::start();
+ while let Some(m) = {
+ self.try_search_overlapping_fwd(cache, input, &mut state)?;
+ state.get_match()
+ } {
+ let _ = patset.try_insert(m.pattern());
+ // There's nothing left to find, so we can stop. Or the caller
+ // asked us to.
+ if patset.is_full() || input.get_earliest() {
+ break;
+ }
+ }
+ Ok(())
}
}
@@ -1189,7 +1189,7 @@ impl DFA {
/// haystack by using the `next_state` method.
///
/// ```
- /// use regex_automata::hybrid::dfa::DFA;
+ /// use regex_automata::{hybrid::dfa::DFA, Input};
///
/// let dfa = DFA::new(r"[a-z]+r")?;
/// let mut cache = dfa.create_cache();
@@ -1198,7 +1198,7 @@ impl DFA {
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack.
/// let mut sid = dfa.start_state_forward(
- /// &mut cache, None, haystack, 0, haystack.len(),
+ /// &mut cache, &Input::new(haystack),
/// )?;
/// // Walk all the bytes in the haystack.
/// for &b in haystack {
@@ -1289,7 +1289,7 @@ impl DFA {
/// haystack by using the `next_state_untagged` method where possible.
///
/// ```
- /// use regex_automata::hybrid::dfa::DFA;
+ /// use regex_automata::{hybrid::dfa::DFA, Input};
///
/// let dfa = DFA::new(r"[a-z]+r")?;
/// let mut cache = dfa.create_cache();
@@ -1298,7 +1298,7 @@ impl DFA {
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack.
/// let mut sid = dfa.start_state_forward(
- /// &mut cache, None, haystack, 0, haystack.len(),
+ /// &mut cache, &Input::new(haystack),
/// )?;
/// // Walk all the bytes in the haystack.
/// let mut at = 0;
@@ -1478,7 +1478,7 @@ impl DFA {
/// and then finishing the search with the final EOI transition.
///
/// ```
- /// use regex_automata::hybrid::dfa::DFA;
+ /// use regex_automata::{hybrid::dfa::DFA, Input};
///
/// let dfa = DFA::new(r"[a-z]+r")?;
/// let mut cache = dfa.create_cache();
@@ -1487,7 +1487,7 @@ impl DFA {
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack.
/// let mut sid = dfa.start_state_forward(
- /// &mut cache, None, haystack, 0, haystack.len(),
+ /// &mut cache, &Input::new(haystack),
/// )?;
/// // Walk all the bytes in the haystack.
/// for &b in haystack {
@@ -1524,44 +1524,43 @@ impl DFA {
/// Unlike typical DFA implementations, the start state for DFAs in this
/// crate is dependent on a few different factors:
///
- /// * The pattern ID, if present. When the underlying DFA has been
- /// configured with multiple patterns _and_ the DFA has been configured to
- /// build an anchored start state for each pattern, then a pattern ID may
- /// be specified to execute an anchored search for that specific pattern.
- /// If `pattern_id` is invalid or if the DFA isn't configured to build
- /// start states for each pattern, then implementations must panic. DFAs in
- /// this crate can be configured to build start states for each pattern via
- /// [`Config::starts_for_each_pattern`].
- /// * When `start > 0`, the byte at index `start - 1` may influence the
- /// start state if the regex uses `^` or `\b`.
- /// * Similarly, when `start == 0`, it may influence the start state when
- /// the regex uses `^` or `\A`.
- /// * Currently, `end` is unused.
+ /// * The [`Anchored`] mode of the search. Unanchored, anchored and
+ /// anchored searches for a specific [`PatternID`] all use different start
+ /// states.
+ /// * The position at which the search begins, via [`Input::start`]. This
+ /// and the byte immediately preceding the start of the search (if one
+ /// exists) influence which look-behind assertions are true at the start
+ /// of the search. This in turn influences which start state is selected.
/// * Whether the search is a forward or reverse search. This routine can
/// only be used for forward searches.
///
- /// # Panics
+ /// # Errors
///
- /// This panics if `start..end` is not a valid sub-slice of `bytes`. This
- /// also panics if `pattern_id` is non-None and does not refer to a valid
- /// pattern, or if the DFA was not configured to build anchored start
- /// states for each pattern.
- #[inline]
+ /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search
+ /// needs to give up when determining the start state (for example, if
+ /// it sees a "quit" byte or if the cache has been cleared too many
+ /// times). This can also return an error if the given `Input` contains an
+ /// unsupported [`Anchored`] configuration.
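+ ///
+ /// # Example
+ ///
+ /// An illustrative sketch (an editorial addition): requesting an anchored
+ /// search for a specific pattern without enabling
+ /// [`Config::starts_for_each_pattern`] yields an error rather than a
+ /// panic.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, Anchored, Input, PatternID};
+ ///
+ /// let dfa = DFA::new(r"[a-z]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// let input = Input::new("foo")
+ /// .anchored(Anchored::Pattern(PatternID::must(0)));
+ /// assert!(dfa.start_state_forward(&mut cache, &input).is_err());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```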
+ #[cfg_attr(feature = "perf-inline", inline(always))]
pub fn start_state_forward(
&self,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Result<LazyStateID, CacheError> {
- let mut lazy = Lazy::new(self, cache);
- let start_type = Start::from_position_fwd(bytes, start, end);
- let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type);
- if !sid.is_unknown() {
- return Ok(sid);
+ input: &Input<'_>,
+ ) -> Result<LazyStateID, MatchError> {
+ if !self.quitset.is_empty() && input.start() > 0 {
+ let offset = input.start() - 1;
+ let byte = input.haystack()[offset];
+ if self.quitset.contains(byte) {
+ return Err(MatchError::quit(byte, offset));
+ }
+ }
+ let start_type = self.start_map.fwd(input);
+ let start = LazyRef::new(self, cache)
+ .get_cached_start_id(input, start_type)?;
+ if !start.is_unknown() {
+ return Ok(start);
}
- lazy.cache_start_group(pattern_id, start_type)
+ Lazy::new(self, cache).cache_start_group(input, start_type)
}
/// Return the ID of the start state for this lazy DFA when executing a
@@ -1570,44 +1569,43 @@ impl DFA {
/// Unlike typical DFA implementations, the start state for DFAs in this
/// crate is dependent on a few different factors:
///
- /// * The pattern ID, if present. When the underlying DFA has been
- /// configured with multiple patterns _and_ the DFA has been configured to
- /// build an anchored start state for each pattern, then a pattern ID may
- /// be specified to execute an anchored search for that specific pattern.
- /// If `pattern_id` is invalid or if the DFA isn't configured to build
- /// start states for each pattern, then implementations must panic. DFAs in
- /// this crate can be configured to build start states for each pattern via
- /// [`Config::starts_for_each_pattern`].
- /// * When `end < bytes.len()`, the byte at index `end` may influence the
- /// start state if the regex uses `$` or `\b`.
- /// * Similarly, when `end == bytes.len()`, it may influence the start
- /// state when the regex uses `$` or `\z`.
- /// * Currently, `start` is unused.
+ /// * The [`Anchored`] mode of the search. Unanchored, anchored and
+ /// anchored searches for a specific [`PatternID`] all use different start
+ /// states.
+ /// * The position at which the search begins, via [`Input::start`]. This
+ /// and the byte immediately preceding the start of the search (if one
+ /// exists) influence which look-behind assertions are true at the start
+ /// of the search. This in turn influences which start state is selected.
/// * Whether the search is a forward or reverse search. This routine can
/// only be used for reverse searches.
///
- /// # Panics
+ /// # Errors
///
- /// This panics if `start..end` is not a valid sub-slice of `bytes`. This
- /// also panics if `pattern_id` is non-None and does not refer to a valid
- /// pattern, or if the DFA was not configured to build anchored start
- /// states for each pattern.
- #[inline]
+ /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search
+ /// needs to give up when determining the start state (for example, if
+ /// it sees a "quit" byte or if the cache has been cleared too many
+ /// times). This can also return an error if the given `Input` contains an
+ /// unsupported [`Anchored`] configuration.
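+ ///
+ /// # Example
+ ///
+ /// An illustrative sketch (an editorial addition) of the quit check shown
+ /// in the implementation below: when the byte immediately following the
+ /// search range is a configured quit byte, computing the reverse start
+ /// state gives up with an error.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, Input};
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().quit(b'\n', true))
+ /// .build(r"foo")?;
+ /// let mut cache = dfa.create_cache();
+ /// let input = Input::new("foo\nbar").range(..3);
+ /// assert!(dfa.start_state_reverse(&mut cache, &input).is_err());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```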
+ #[cfg_attr(feature = "perf-inline", inline(always))]
pub fn start_state_reverse(
&self,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Result<LazyStateID, CacheError> {
- let mut lazy = Lazy::new(self, cache);
- let start_type = Start::from_position_rev(bytes, start, end);
- let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type);
- if !sid.is_unknown() {
- return Ok(sid);
+ input: &Input<'_>,
+ ) -> Result<LazyStateID, MatchError> {
+ if !self.quitset.is_empty() && input.end() < input.haystack().len() {
+ let offset = input.end();
+ let byte = input.haystack()[offset];
+ if self.quitset.contains(byte) {
+ return Err(MatchError::quit(byte, offset));
+ }
}
- lazy.cache_start_group(pattern_id, start_type)
+ let start_type = self.start_map.rev(input);
+ let start = LazyRef::new(self, cache)
+ .get_cached_start_id(input, start_type)?;
+ if !start.is_unknown() {
+ return Ok(start);
+ }
+ Lazy::new(self, cache).cache_start_group(input, start_type)
}
/// Returns the total number of patterns that match in this state.
@@ -1616,9 +1614,11 @@ impl DFA {
/// necessarily always return `1` for all match states.
///
/// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with
- /// indices up to (but not including) the count returned by this routine
+ /// indices up to (but not including) the length returned by this routine
/// without panicking.
///
+ /// # Panics
+ ///
/// If the given state is not a match state, then this may either panic
/// or return an incorrect result.
///
@@ -1629,12 +1629,10 @@ impl DFA {
/// patterns have matched in a particular state, but also how to access
/// which specific patterns have matched.
///
- /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All)
- /// when building the DFA. If we used
- /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst)
- /// instead, then the DFA would not be constructed in a way that supports
- /// overlapping matches. (It would only report a single pattern that
- /// matches at any particular point in time.)
+ /// Notice that we must use [`MatchKind::All`] when building the DFA. If we
+ /// used [`MatchKind::LeftmostFirst`] instead, then the DFA would not be
+ /// constructed in a way that supports overlapping matches. (It would only
+ /// report a single pattern that matches at any particular point in time.)
///
/// Another thing to take note of is the patterns used and the order in
/// which the pattern IDs are reported. In the example below, pattern `3`
@@ -1645,7 +1643,8 @@ impl DFA {
/// other.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, MatchKind};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::dfa::DFA, Input, MatchKind};
///
/// let dfa = DFA::builder()
/// .configure(DFA::config().match_kind(MatchKind::All))
@@ -1658,7 +1657,7 @@ impl DFA {
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack.
/// let mut sid = dfa.start_state_forward(
- /// &mut cache, None, haystack, 0, haystack.len(),
+ /// &mut cache, &Input::new(haystack),
/// )?;
/// // Walk all the bytes in the haystack.
/// for &b in haystack {
@@ -1667,8 +1666,8 @@ impl DFA {
/// sid = dfa.next_eoi_state(&mut cache, sid)?;
///
/// assert!(sid.is_match());
- /// assert_eq!(dfa.match_count(&mut cache, sid), 3);
- /// // The following calls are guaranteed to not panic since `match_count`
+ /// assert_eq!(dfa.match_len(&mut cache, sid), 3);
+ /// // The following calls are guaranteed to not panic since `match_len`
/// // returned `3` above.
/// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3);
/// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0);
@@ -1677,22 +1676,22 @@ impl DFA {
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
- pub fn match_count(&self, cache: &Cache, id: LazyStateID) -> usize {
+ pub fn match_len(&self, cache: &Cache, id: LazyStateID) -> usize {
assert!(id.is_match());
- LazyRef::new(self, cache).get_cached_state(id).match_count()
+ LazyRef::new(self, cache).get_cached_state(id).match_len()
}
/// Returns the pattern ID corresponding to the given match index in the
/// given state.
///
- /// See [`DFA::match_count`] for an example of how to use this method
+ /// See [`DFA::match_len`] for an example of how to use this method
/// correctly. Note that if you know your lazy DFA is configured with a
/// single pattern, then this routine is never necessary since it will
/// always return a pattern ID of `0` for an index of `0` when `id`
/// corresponds to a match state.
///
/// Typically, this routine is used when implementing an overlapping
- /// search, as the example for `DFA::match_count` does.
+ /// search, as the example for `DFA::match_len` does.
///
/// # Panics
///
@@ -1713,7 +1712,7 @@ impl DFA {
// that finds the pattern ID from the corresponding `State`, which
// requires a bit of slicing/pointer-chasing. This optimization tends
// to only matter when matches are frequent.
- if self.pattern_count() == 1 {
+ if self.pattern_len() == 1 {
return PatternID::ZERO;
}
LazyRef::new(self, cache)
@@ -1809,6 +1808,25 @@ pub struct Cache {
/// clear count is set, then the cache will return an error instead of
/// clearing the cache if the count has been exceeded.
clear_count: usize,
+ /// The total number of bytes searched since the last time this cache was
+ /// cleared, not including the current search.
+ ///
+ /// This can be added to the length of the current search to get the true
+ /// total number of bytes searched.
+ ///
+ /// This is generally only non-zero when the
+ /// `Cache::search_{start,update,finish}` APIs are used to track search
+ /// progress.
+ bytes_searched: usize,
+ /// The progress of the current search.
+ ///
+ /// This is only non-`None` when callers utilize the `Cache::search_start`,
+ /// `Cache::search_update` and `Cache::search_finish` APIs.
+ ///
+ /// The purpose of recording search progress is to be able to make a
+ /// determination about the efficiency of the cache. Namely, by keeping
+ /// track of the number of bytes searched, it can be compared with the
+ /// number of states created to determine whether the cache is being used
+ /// effectively (see [`Config::minimum_bytes_per_state`]).
+ progress: Option<SearchProgress>,
}
impl Cache {
@@ -1823,14 +1841,18 @@ impl Cache {
starts: alloc::vec![],
states: alloc::vec![],
states_to_id: StateMap::new(),
- sparses: SparseSets::new(dfa.nfa.len()),
+ sparses: SparseSets::new(dfa.get_nfa().states().len()),
stack: alloc::vec![],
scratch_state_builder: StateBuilderEmpty::new(),
state_saver: StateSaver::none(),
memory_usage_state: 0,
clear_count: 0,
+ bytes_searched: 0,
+ progress: None,
};
+ debug!("pre-init lazy DFA cache size: {}", cache.memory_usage());
Lazy { dfa, cache: &mut cache }.init_cache();
+ debug!("post-init lazy DFA cache size: {}", cache.memory_usage());
cache
}
@@ -1852,7 +1874,8 @@ impl Cache {
/// This shows how to re-purpose a cache for use with a different DFA.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa1 = DFA::new(r"\w")?;
/// let dfa2 = DFA::new(r"\W")?;
@@ -1860,7 +1883,7 @@ impl Cache {
/// let mut cache = dfa1.create_cache();
/// assert_eq!(
/// Some(HalfMatch::must(0, 2)),
- /// dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?,
+ /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
/// );
///
/// // Using 'cache' with dfa2 is not allowed. It may result in panics or
@@ -1872,7 +1895,7 @@ impl Cache {
/// cache.reset(&dfa2);
/// assert_eq!(
/// Some(HalfMatch::must(0, 3)),
- /// dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?,
+ /// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -1881,6 +1904,69 @@ impl Cache {
Lazy::new(dfa, self).reset_cache()
}
+ /// Initializes a new search starting at the given position.
+ ///
+ /// If a previous search was unfinished, then it is finished automatically
+ /// and a new search is begun.
+ ///
+ /// Note that keeping track of search progress is _not necessary_
+ /// for correct implementations of search using a lazy DFA. Keeping
+ /// track of search progress is only necessary if you want the
+ /// [`Config::minimum_bytes_per_state`] configuration knob to work.
+ #[inline]
+ pub fn search_start(&mut self, at: usize) {
+ // If a previous search wasn't marked as finished, then finish it
+ // now automatically.
+ if let Some(p) = self.progress.take() {
+ self.bytes_searched += p.len();
+ }
+ self.progress = Some(SearchProgress { start: at, at });
+ }
+
+ /// Updates the current search to indicate that it has searched to the
+ /// given position.
+ ///
+ /// No special care needs to be taken for reverse searches. Namely, the
+ /// position given may be _less than_ the starting position of the search.
+ ///
+ /// # Panics
+ ///
+ /// This panics if no search has been started by [`Cache::search_start`].
+ #[inline]
+ pub fn search_update(&mut self, at: usize) {
+ let p =
+ self.progress.as_mut().expect("no in-progress search to update");
+ p.at = at;
+ }
+
+ /// Indicates that a search has finished at the given position.
+ ///
+ /// # Panics
+ ///
+ /// This panics if no search has been started by [`Cache::search_start`].
+ #[inline]
+ pub fn search_finish(&mut self, at: usize) {
+ let mut p =
+ self.progress.take().expect("no in-progress search to finish");
+ p.at = at;
+ self.bytes_searched += p.len();
+ }
+
+ /// Returns the total number of bytes that have been searched since this
+ /// cache was last cleared.
+ ///
+ /// This is useful for determining the efficiency of the cache. For
+ /// example, the lazy DFA uses this value in conjunction with the
+ /// [`Config::minimum_bytes_per_state`] knob to help determine whether it
+ /// should quit searching.
+ ///
+ /// This always returns `0` if search progress isn't being tracked. Note
+ /// that the lazy DFA search routines in this crate always track search
+ /// progress.
+ pub fn search_total_len(&self) -> usize {
+ self.bytes_searched + self.progress.as_ref().map_or(0, |p| p.len())
+ }
+
/// Returns the total number of times this cache has been cleared since it
/// was either created or last reset.
///
@@ -1899,6 +1985,9 @@ impl Cache {
const ID_SIZE: usize = size_of::<LazyStateID>();
const STATE_SIZE: usize = size_of::<State>();
+ // NOTE: If you make changes to the below, then
+ // 'minimum_cache_capacity' should be updated correspondingly.
+
self.trans.len() * ID_SIZE
+ self.starts.len() * ID_SIZE
+ self.states.len() * STATE_SIZE
@@ -1912,6 +2001,32 @@ impl Cache {
}
}
+/// Keeps track of the progress of the current search.
+///
+/// This is updated via the `Cache::search_{start,update,finish}` APIs to
+/// record how many bytes have been searched. This permits computing a
+/// heuristic that represents the efficiency of a cache, and thus helps inform
+/// whether the lazy DFA should give up or not.
+#[derive(Clone, Debug)]
+struct SearchProgress {
+ start: usize,
+ at: usize,
+}
+
+impl SearchProgress {
+ /// Returns the length, in bytes, of this search so far.
+ ///
+ /// This automatically handles the case of a reverse search, where `at`
+ /// is likely to be less than `start`.
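+ ///
+ /// For example, a forward search that began at `start = 2` and is now at
+ /// `at = 7` has length `5`, as does a reverse search that began at
+ /// `start = 7` and is now at `at = 2`.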
+ fn len(&self) -> usize {
+ if self.start <= self.at {
+ self.at - self.start
+ } else {
+ self.start - self.at
+ }
+ }
+}
+
/// A map from states to state identifiers. When using std, we use a standard
/// hashmap, since it's a bit faster for this use case. (Other maps, like
/// ones based on FNV, have not yet been benchmarked.)
@@ -1960,6 +2075,7 @@ impl<'i, 'c> Lazy<'i, 'c> {
///
/// With 'inline(never)' hyperfine reports 1.1s per run. With
/// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement.
+ #[cold]
#[inline(never)]
fn cache_next_state(
&mut self,
@@ -1969,8 +2085,8 @@ impl<'i, 'c> Lazy<'i, 'c> {
let stride2 = self.dfa.stride2();
let empty_builder = self.get_state_builder();
let builder = determinize::next(
- &self.dfa.nfa,
- self.dfa.match_kind,
+ self.dfa.get_nfa(),
+ self.dfa.get_config().get_match_kind(),
&mut self.cache.sparses,
&mut self.cache.stack,
&self.cache.states[current.as_usize_untagged() >> stride2],
@@ -2002,26 +2118,32 @@ impl<'i, 'c> Lazy<'i, 'c> {
///
/// If caching this state would otherwise result in a cache that has been
/// cleared too many times, then an error is returned.
+ #[cold]
+ #[inline(never)]
fn cache_start_group(
&mut self,
- pattern_id: Option<PatternID>,
+ input: &Input<'_>,
start: Start,
- ) -> Result<LazyStateID, CacheError> {
- let nfa_start_id = match pattern_id {
- Some(pid) => {
- assert!(
- self.dfa.starts_for_each_pattern,
- "attempted to search for a specific pattern \
- without enabling starts_for_each_pattern",
- );
- self.dfa.nfa.start_pattern(pid)
+ ) -> Result<LazyStateID, MatchError> {
+ let mode = input.get_anchored();
+ let nfa_start_id = match mode {
+ Anchored::No => self.dfa.get_nfa().start_unanchored(),
+ Anchored::Yes => self.dfa.get_nfa().start_anchored(),
+ Anchored::Pattern(pid) => {
+ if !self.dfa.get_config().get_starts_for_each_pattern() {
+ return Err(MatchError::unsupported_anchored(mode));
+ }
+ match self.dfa.get_nfa().start_pattern(pid) {
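+ // A 'None' here means the pattern ID is out of bounds for
+ // this NFA. Returning the dead state makes such a search
+ // report "no match" instead of raising an error.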
+ None => return Ok(self.as_ref().dead_id()),
+ Some(sid) => sid,
+ }
}
- None if self.dfa.anchored => self.dfa.nfa.start_anchored(),
- None => self.dfa.nfa.start_unanchored(),
};
- let id = self.cache_start_one(nfa_start_id, start)?;
- self.set_start_state(pattern_id, start, id);
+ let id = self
+ .cache_start_one(nfa_start_id, start)
+ .map_err(|_| MatchError::gave_up(input.start()))?;
+ self.set_start_state(input, start, id);
Ok(id)
}
@@ -2042,22 +2164,33 @@ impl<'i, 'c> Lazy<'i, 'c> {
start: Start,
) -> Result<LazyStateID, CacheError> {
let mut builder_matches = self.get_state_builder().into_matches();
- determinize::set_lookbehind_from_start(&start, &mut builder_matches);
+ determinize::set_lookbehind_from_start(
+ self.dfa.get_nfa(),
+ &start,
+ &mut builder_matches,
+ );
self.cache.sparses.set1.clear();
determinize::epsilon_closure(
- self.dfa.nfa.borrow(),
+ self.dfa.get_nfa(),
nfa_start_id,
- *builder_matches.look_have(),
+ builder_matches.look_have(),
&mut self.cache.stack,
&mut self.cache.sparses.set1,
);
let mut builder = builder_matches.into_nfa();
determinize::add_nfa_states(
- self.dfa.nfa.borrow(),
+ &self.dfa.get_nfa(),
&self.cache.sparses.set1,
&mut builder,
);
- self.add_builder_state(builder, |id| id.to_start())
+ let tag_starts = self.dfa.get_config().get_specialize_start_states();
+ self.add_builder_state(builder, |id| {
+ if tag_starts {
+ id.to_start()
+ } else {
+ id
+ }
+ })
}
/// Either add the given builder state to this cache, or return an ID to an
@@ -2164,7 +2297,7 @@ impl<'i, 'c> Lazy<'i, 'c> {
/// clearings, then this will return a cache error. In this case,
/// callers should bubble this up as the cache can't be used until it is
/// reset. Implementations of search should convert this error into a
- /// `MatchError::GaveUp`.
+ /// [`MatchError::gave_up`].
///
/// If 'self.state_saver' is set to save a state, then this state is
/// persisted through cache clearing. Otherwise, the cache is returned to
@@ -2175,21 +2308,68 @@ impl<'i, 'c> Lazy<'i, 'c> {
/// Otherwise, any lazy state ID generated by the cache prior to resetting
/// it is invalid after the reset.
fn try_clear_cache(&mut self) -> Result<(), CacheError> {
- // Currently, the only heuristic we use is the minimum cache clear
- // count. If we pass that minimum, then we give up.
- //
- // It would be good to also add a heuristic based on "bytes searched
- // per generated state," but this requires API design work. Namely,
- // we really do not want to add a counter increment to the transition
- // function, which implies we need to expose APIs to update the number
- // of bytes searched by implementers of the search routines. And that
- // doesn't seem great... But we should do it if this heuristic isn't
- // enough. (The original lazy DFA implementation in the 'regex' crate
- // had this heuristic, since the lazy DFA was coupled with the search
- // routines.)
- if let Some(min_count) = self.dfa.minimum_cache_clear_count {
+ let c = self.dfa.get_config();
+ if let Some(min_count) = c.get_minimum_cache_clear_count() {
if self.cache.clear_count >= min_count {
- return Err(CacheError::too_many_cache_clears());
+ if let Some(min_bytes_per) = c.get_minimum_bytes_per_state() {
+ let len = self.cache.search_total_len();
+ let min_bytes =
+ min_bytes_per.saturating_mul(self.cache.states.len());
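+ // Illustrative arithmetic: with minimum_bytes_per_state
+ // set to 10 and 500 cached states, at least 5,000 bytes
+ // must have been searched since the last cache clearing
+ // for the search to be allowed to continue.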
+ // If we've searched 0 bytes then probably something has
+ // gone wrong and the lazy DFA search implementation isn't
+ // correctly updating the search progress state.
+ if len == 0 {
+ trace!(
+ "number of bytes searched is 0, but \
+ a minimum bytes per state searched ({}) is \
+ enabled, maybe Cache::search_update \
+ is not being used?",
+ min_bytes_per,
+ );
+ }
+ if len < min_bytes {
+ trace!(
+ "lazy DFA cache has been cleared {} times, \
+ which exceeds the limit of {}, \
+ AND its bytes searched per state is less \
+ than the configured minimum of {}, \
+ therefore lazy DFA is giving up \
+ (bytes searched since cache clear = {}, \
+ number of states = {})",
+ self.cache.clear_count,
+ min_count,
+ min_bytes_per,
+ len,
+ self.cache.states.len(),
+ );
+ return Err(CacheError::bad_efficiency());
+ } else {
+ trace!(
+ "lazy DFA cache has been cleared {} times, \
+ which exceeds the limit of {}, \
+ AND its bytes searched per state is greater \
+ than the configured minimum of {}, \
+ therefore lazy DFA is continuing! \
+ (bytes searched since cache clear = {}, \
+ number of states = {})",
+ self.cache.clear_count,
+ min_count,
+ min_bytes_per,
+ len,
+ self.cache.states.len(),
+ );
+ }
+ } else {
+ trace!(
+ "lazy DFA cache has been cleared {} times, \
+ which exceeds the limit of {}, \
+ since there is no configured bytes per state \
+ minimum, lazy DFA is giving up",
+ self.cache.clear_count,
+ min_count,
+ );
+ return Err(CacheError::too_many_cache_clears());
+ }
}
}
self.clear_cache();
@@ -2209,18 +2389,13 @@ impl<'i, 'c> Lazy<'i, 'c> {
// If a new DFA is used, it might have a different number of NFA
// states, so we need to make sure our sparse sets have the appropriate
// size.
- self.cache.sparses.resize(self.dfa.nfa.len());
+ self.cache.sparses.resize(self.dfa.get_nfa().states().len());
self.cache.clear_count = 0;
+ self.cache.progress = None;
}
/// Clear the cache used by this lazy DFA.
///
- /// If clearing the cache exceeds the minimum number of required cache
- /// clearings, then this will return a cache error. In this case,
- /// callers should bubble this up as the cache can't be used until it is
- /// reset. Implementations of search should convert this error into a
- /// `MatchError::GaveUp`.
- ///
/// If 'self.state_saver' is set to save a state, then this state is
/// persisted through cache clearing. Otherwise, the cache is returned to
/// its state after initialization with two exceptions: its clear count
@@ -2236,6 +2411,10 @@ impl<'i, 'c> Lazy<'i, 'c> {
self.cache.states_to_id.clear();
self.cache.memory_usage_state = 0;
self.cache.clear_count += 1;
+ self.cache.bytes_searched = 0;
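+ // If a search is in progress, restart its measurement at the
+ // current position so that bytes searched before this clearing
+ // don't count toward the fresh efficiency statistics.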
+ if let Some(ref mut progress) = self.cache.progress {
+ progress.start = progress.at;
+ }
trace!(
"lazy DFA cache has been cleared (count: {})",
self.cache.clear_count
@@ -2260,6 +2439,10 @@ impl<'i, 'c> Lazy<'i, 'c> {
let new_id = self
.add_state(state, |id| {
if old_id.is_start() {
+ // We don't need to consult the
+ // 'specialize_start_states' config knob here, because
+ // if it's disabled, old_id.is_start() will never
+ // return true.
id.to_start()
} else {
id
@@ -2282,9 +2465,13 @@ impl<'i, 'c> Lazy<'i, 'c> {
/// Primarily, this adds the three sentinel states and allocates some
/// initial memory.
fn init_cache(&mut self) {
- let mut starts_len = Start::count();
- if self.dfa.starts_for_each_pattern {
- starts_len += Start::count() * self.dfa.pattern_count();
+ // Why multiply by 2 here? Because we make room for both the unanchored
+ // and anchored start states. Unanchored is first and then anchored.
+ let mut starts_len = Start::len().checked_mul(2).unwrap();
+ // ... but if we also want start states for every pattern, we make room
+ // for that too.
+ if self.dfa.get_config().get_starts_for_each_pattern() {
+ starts_len += Start::len() * self.dfa.pattern_len();
}
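+ // Illustrative layout: with 'starts_for_each_pattern' enabled and
+ // 3 patterns, this reserves 2*Start::len() + 3*Start::len() slots,
+ // i.e., [unanchored | anchored | pattern 0 | pattern 1 | pattern 2].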
self.cache
.starts
@@ -2357,7 +2544,7 @@ impl<'i, 'c> Lazy<'i, 'c> {
/// Set all transitions on the state 'from' to 'to'.
fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) {
- for unit in self.dfa.classes.representatives() {
+ for unit in self.dfa.classes.representatives(..) {
self.set_transition(from, unit, to);
}
}
@@ -2387,22 +2574,23 @@ impl<'i, 'c> Lazy<'i, 'c> {
/// 'starts_for_each_pattern' is not enabled.
fn set_start_state(
&mut self,
- pattern_id: Option<PatternID>,
+ input: &Input<'_>,
start: Start,
id: LazyStateID,
) {
assert!(self.as_ref().is_valid(id));
let start_index = start.as_usize();
- let index = match pattern_id {
- None => start_index,
- Some(pid) => {
+ let index = match input.get_anchored() {
+ Anchored::No => start_index,
+ Anchored::Yes => Start::len() + start_index,
+ Anchored::Pattern(pid) => {
assert!(
- self.dfa.starts_for_each_pattern,
+ self.dfa.get_config().get_starts_for_each_pattern(),
"attempted to search for a specific pattern \
without enabling starts_for_each_pattern",
);
let pid = pid.as_usize();
- Start::count() + (Start::count() * pid) + start_index
+ (2 * Start::len()) + (Start::len() * pid) + start_index
}
};
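+ // Illustrative index computation: 'Anchored::Pattern(1)' with start
+ // configuration 2 yields (2 * Start::len()) + (Start::len() * 1) + 2,
+ // which lands in pattern 1's block of the layout described in
+ // 'init_cache'.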
self.cache.starts[index] = id;
@@ -2451,25 +2639,30 @@ impl<'i, 'c> LazyRef<'i, 'c> {
///
/// If the start state has not yet been computed, then this returns an
/// unknown lazy state ID.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn get_cached_start_id(
&self,
- pattern_id: Option<PatternID>,
+ input: &Input<'_>,
start: Start,
- ) -> LazyStateID {
+ ) -> Result<LazyStateID, MatchError> {
let start_index = start.as_usize();
- let index = match pattern_id {
- None => start_index,
- Some(pid) => {
- let pid = pid.as_usize();
- assert!(
- pid < self.dfa.pattern_count(),
- "invalid pattern ID: {:?}",
- pid
- );
- Start::count() + (Start::count() * pid) + start_index
+ let mode = input.get_anchored();
+ let index = match mode {
+ Anchored::No => start_index,
+ Anchored::Yes => Start::len() + start_index,
+ Anchored::Pattern(pid) => {
+ if !self.dfa.get_config().get_starts_for_each_pattern() {
+ return Err(MatchError::unsupported_anchored(mode));
+ }
+ if pid.as_usize() >= self.dfa.pattern_len() {
+ return Ok(self.dead_id());
+ }
+ (2 * Start::len())
+ + (Start::len() * pid.as_usize())
+ + start_index
}
};
- self.cache.starts[index]
+ Ok(self.cache.starts[index])
}
/// Return the cached NFA/DFA powerset state for the given ID.
@@ -2530,6 +2723,11 @@ impl<'i, 'c> LazyRef<'i, 'c> {
fn state_fits_in_cache(&self, state: &State) -> bool {
let needed = self.cache.memory_usage()
+ self.memory_usage_for_one_more_state(state.memory_usage());
+ trace!(
+ "lazy DFA cache capacity check: {:?} ?<=? {:?}",
+ needed,
+ self.dfa.cache_capacity
+ );
needed <= self.dfa.cache_capacity
}
@@ -2573,7 +2771,7 @@ enum StateSaver {
/// is stored in 'Saved' since it may have changed.
ToSave { id: LazyStateID, state: State },
/// The ID of a state that has been persisted through a lazy DFA
- /// cache clearing. The ID recorded here corresonds to an ID that was
+ /// cache clearing. The ID recorded here corresponds to an ID that was
/// once marked as ToSave. The IDs are likely not equivalent even though
/// the states they point to are.
Saved(LazyStateID),
@@ -2620,14 +2818,11 @@ impl StateSaver {
/// A lazy DFA configuration is a simple data object that is typically used
/// with [`Builder::configure`].
///
-/// The default configuration guarantees that a search will _never_ return
-/// a [`MatchError`] for any haystack or pattern. Setting a quit byte with
-/// [`Config::quit`], enabling heuristic support for Unicode word boundaries
-/// with [`Config::unicode_word_boundary`], or setting a minimum cache clear
-/// count with [`Config::minimum_cache_clear_count`] can in turn cause a search
-/// to return an error. See the corresponding configuration options for more
-/// details on when those error conditions arise.
-#[derive(Clone, Copy, Debug, Default)]
+/// The default configuration guarantees that a search will never return a
+/// "gave up" or "quit" error, although it is possible for a search to fail
+/// if [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by
+/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`].
+#[derive(Clone, Debug, Default)]
pub struct Config {
// As with other configuration types in this crate, we put all our knobs
// in options so that we can distinguish between "default" and "not set."
@@ -2636,15 +2831,17 @@ pub struct Config {
// 'overwrite' method.
//
// For docs on the fields below, see the corresponding method setters.
- anchored: Option<bool>,
match_kind: Option<MatchKind>,
+ pre: Option<Option<Prefilter>>,
starts_for_each_pattern: Option<bool>,
byte_classes: Option<bool>,
unicode_word_boundary: Option<bool>,
quitset: Option<ByteSet>,
+ specialize_start_states: Option<bool>,
cache_capacity: Option<usize>,
skip_cache_capacity_check: Option<bool>,
minimum_cache_clear_count: Option<Option<usize>>,
+ minimum_bytes_per_state: Option<Option<usize>>,
}
impl Config {
@@ -2653,116 +2850,6 @@ impl Config {
Config::default()
}
- /// Set whether matching must be anchored at the beginning of the input.
- ///
- /// When enabled, a match must begin at the start of a search. When
- /// disabled (the default), the lazy DFA will act as if the pattern started
- /// with a `(?s:.)*?`, which enables a match to appear anywhere.
- ///
- /// Note that if you want to run both anchored and unanchored
- /// searches without building multiple automatons, you can enable the
- /// [`Config::starts_for_each_pattern`] configuration instead. This will
- /// permit unanchored any-pattern searches and pattern-specific anchored
- /// searches. See the documentation for that configuration for an example.
- ///
- /// By default this is disabled.
- ///
- /// **WARNING:** this is subtly different than using a `^` at the start of
- /// your regex. A `^` forces a regex to match exclusively at the start of
- /// input, regardless of where you begin your search. In contrast, enabling
- /// this option will allow your regex to match anywhere in your input,
- /// but the match must start at the beginning of a search. (Most of the
- /// higher level convenience search routines make "start of input" and
- /// "start of search" equivalent, but some routines allow treating these as
- /// orthogonal.)
- ///
- /// For example, consider the haystack `aba` and the following searches:
- ///
- /// 1. The regex `^a` is compiled with `anchored=false` and searches
- /// `aba` starting at position `2`. Since `^` requires the match to
- /// start at the beginning of the input and `2 > 0`, no match is found.
- /// 2. The regex `a` is compiled with `anchored=true` and searches `aba`
- /// starting at position `2`. This reports a match at `[2, 3]` since
- /// the match starts where the search started. Since there is no `^`,
- /// there is no requirement for the match to start at the beginning of
- /// the input.
- /// 3. The regex `a` is compiled with `anchored=true` and searches `aba`
- /// starting at position `1`. Since `b` corresponds to position `1` and
- /// since the regex is anchored, it finds no match.
- /// 4. The regex `a` is compiled with `anchored=false` and searches `aba`
- /// startting at position `1`. Since the regex is neither anchored nor
- /// starts with `^`, the regex is compiled with an implicit `(?s:.)*?`
- /// prefix that permits it to match anywhere. Thus, it reports a match
- /// at `[2, 3]`.
- ///
- /// # Example
- ///
- /// This demonstrates the differences between an anchored search and
- /// a pattern that begins with `^` (as described in the above warning
- /// message).
- ///
- /// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
- ///
- /// let haystack = "aba".as_bytes();
- ///
- /// let dfa = DFA::builder()
- /// .configure(DFA::config().anchored(false)) // default
- /// .build(r"^a")?;
- /// let mut cache = dfa.create_cache();
- /// let got = dfa.find_leftmost_fwd_at(
- /// &mut cache, None, None, haystack, 2, 3,
- /// )?;
- /// // No match is found because 2 is not the beginning of the haystack,
- /// // which is what ^ requires.
- /// let expected = None;
- /// assert_eq!(expected, got);
- ///
- /// let dfa = DFA::builder()
- /// .configure(DFA::config().anchored(true))
- /// .build(r"a")?;
- /// let mut cache = dfa.create_cache();
- /// let got = dfa.find_leftmost_fwd_at(
- /// &mut cache, None, None, haystack, 2, 3,
- /// )?;
- /// // An anchored search can still match anywhere in the haystack, it just
- /// // must begin at the start of the search which is '2' in this case.
- /// let expected = Some(HalfMatch::must(0, 3));
- /// assert_eq!(expected, got);
- ///
- /// let dfa = DFA::builder()
- /// .configure(DFA::config().anchored(true))
- /// .build(r"a")?;
- /// let mut cache = dfa.create_cache();
- /// let got = dfa.find_leftmost_fwd_at(
- /// &mut cache, None, None, haystack, 1, 3,
- /// )?;
- /// // No match is found since we start searching at offset 1 which
- /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
- /// // is found.
- /// let expected = None;
- /// assert_eq!(expected, got);
- ///
- /// let dfa = DFA::builder()
- /// .configure(DFA::config().anchored(false))
- /// .build(r"a")?;
- /// let mut cache = dfa.create_cache();
- /// let got = dfa.find_leftmost_fwd_at(
- /// &mut cache, None, None, haystack, 1, 3,
- /// )?;
- /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the
- /// // pattern. Even though the search starts at 'b', the 'match anything'
- /// // prefix allows the search to match 'a'.
- /// let expected = Some(HalfMatch::must(0, 3));
- /// assert_eq!(expected, got);
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn anchored(mut self, yes: bool) -> Config {
- self.anchored = Some(yes);
- self
- }
-
/// Set the desired match semantics.
///
/// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
@@ -2789,21 +2876,24 @@ impl Config {
/// report overlapping matches.
///
/// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
- /// hybrid::{dfa::DFA, OverlappingState},
- /// HalfMatch, MatchKind,
+ /// hybrid::dfa::{DFA, OverlappingState},
+ /// HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = DFA::builder()
/// .configure(DFA::config().match_kind(MatchKind::All))
/// .build_many(&[r"\w+$", r"\S+$"])?;
/// let mut cache = dfa.create_cache();
- /// let haystack = "@foo".as_bytes();
+ /// let haystack = "@foo";
/// let mut state = OverlappingState::start();
///
/// let expected = Some(HalfMatch::must(1, 4));
- /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
- /// assert_eq!(expected, got);
+ /// dfa.try_search_overlapping_fwd(
+ /// &mut cache, &Input::new(haystack), &mut state,
+ /// )?;
+ /// assert_eq!(expected, state.get_match());
///
/// // The first pattern also matches at the same position, so re-running
/// // the search will yield another match. Notice also that the first
@@ -2811,8 +2901,10 @@ impl Config {
/// // pattern begins its match before the first, is therefore an earlier
/// // match and is thus reported first.
/// let expected = Some(HalfMatch::must(0, 4));
- /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
- /// assert_eq!(expected, got);
+ /// dfa.try_search_overlapping_fwd(
+ /// &mut cache, &Input::new(haystack), &mut state,
+ /// )?;
+ /// assert_eq!(expected, state.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
@@ -2829,36 +2921,38 @@ impl Config {
/// for you, so it's usually not necessary to do this yourself.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchKind};
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// nfa::thompson::NFA,
+ /// Anchored, HalfMatch, Input, MatchKind,
+ /// };
///
- /// let haystack = "123foobar456".as_bytes();
- /// let pattern = r"[a-z]+";
+ /// let input = Input::new("123foobar456");
+ /// let pattern = r"[a-z]+r";
///
/// let dfa_fwd = DFA::new(pattern)?;
/// let dfa_rev = DFA::builder()
- /// .configure(DFA::config()
- /// .anchored(true)
- /// .match_kind(MatchKind::All)
- /// )
+ /// .thompson(NFA::config().reverse(true))
+ /// .configure(DFA::config().match_kind(MatchKind::All))
/// .build(pattern)?;
/// let mut cache_fwd = dfa_fwd.create_cache();
/// let mut cache_rev = dfa_rev.create_cache();
///
/// let expected_fwd = HalfMatch::must(0, 9);
/// let expected_rev = HalfMatch::must(0, 3);
- /// let got_fwd = dfa_fwd.find_leftmost_fwd(
- /// &mut cache_fwd, haystack,
- /// )?.unwrap();
+ /// let got_fwd = dfa_fwd.try_search_fwd(&mut cache_fwd, &input)?.unwrap();
/// // Here we don't specify the pattern to search for since there's only
/// // one pattern and we're doing a leftmost search. But if this were an
/// // overlapping search, you'd need to specify the pattern that matched
/// // in the forward direction. (Otherwise, you might wind up finding the
/// // starting position of a match of some other pattern.) That in turn
/// // requires building the reverse automaton with starts_for_each_pattern
- /// // enabled. Indeed, this is what Regex does internally.
- /// let got_rev = dfa_rev.find_leftmost_rev_at(
- /// &mut cache_rev, None, haystack, 0, got_fwd.offset(),
- /// )?.unwrap();
+ /// // enabled.
+ /// let input = input
+ /// .clone()
+ /// .range(..got_fwd.offset())
+ /// .anchored(Anchored::Yes);
+ /// let got_rev = dfa_rev.try_search_rev(&mut cache_rev, &input)?.unwrap();
/// assert_eq!(expected_fwd, got_fwd);
/// assert_eq!(expected_rev, got_rev);
///
@@ -2869,6 +2963,86 @@ impl Config {
self
}
+ /// Set a prefilter to be used whenever a start state is entered.
+ ///
+ /// A [`Prefilter`] in this context is meant to accelerate searches by
+ /// looking for literal prefixes that every match for the corresponding
+ /// pattern (or patterns) must start with. Once a prefilter produces a
+ /// match, the underlying search routine continues on to try and confirm
+ /// the match.
+ ///
+ /// Be warned that setting a prefilter does not guarantee that the search
+ /// will be faster. While it's usually a good bet, if the prefilter
+ /// produces a lot of false positive candidates (i.e., positions matched
+ /// by the prefilter but not by the regex), then the overall result can
+ /// be slower than if you had just executed the regex engine without any
+ /// prefilters.
+ ///
+ /// Note that unless [`Config::specialize_start_states`] has been
+ /// explicitly set, then setting this will also enable (when `pre` is
+ /// `Some`) or disable (when `pre` is `None`) start state specialization.
+ /// This occurs because without start state specialization, a prefilter
+ /// is likely to be less effective. And without a prefilter, start state
+ /// specialization is usually pointless.
+ ///
+ /// By default no prefilter is set.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// util::prefilter::Prefilter,
+ /// Input, HalfMatch, MatchKind,
+ /// };
+ ///
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().prefilter(pre))
+ /// .build(r"(foo|bar)[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("foo1 barfox bar");
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 11)),
+ /// re.try_search_fwd(&mut cache, &input)?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Be warned though that an incorrect prefilter can lead to incorrect
+ /// results!
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// util::prefilter::Prefilter,
+ /// Input, HalfMatch, MatchKind,
+ /// };
+ ///
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().prefilter(pre))
+ /// .build(r"(foo|bar)[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("foo1 barfox bar");
+ /// assert_eq!(
+ /// // No match reported even though there clearly is one!
+ /// None,
+ /// re.try_search_fwd(&mut cache, &input)?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
+ self.pre = Some(pre);
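+ // Only infer start state specialization when the caller has not
+ // set it explicitly; an explicit setting always takes precedence.
+ // (See Config::specialize_start_states for the rationale.)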
+ if self.specialize_start_states.is_none() {
+ self.specialize_start_states =
+ Some(self.get_prefilter().is_some());
+ }
+ self
+ }
+
/// Whether to compile a separate start state for each pattern in the
/// lazy DFA.
///
@@ -2897,50 +3071,45 @@ impl Config {
/// for matches of any pattern or to search for anchored matches of one
/// particular pattern while using the same DFA. (Otherwise, you would need
/// to compile a new DFA for each pattern.)
- /// 3. Since the start states added for each pattern are anchored, if you
- /// compile an unanchored DFA with one pattern while also enabling this
- /// option, then you can use the same DFA to perform anchored or unanchored
- /// searches. The latter you get with the standard search APIs. The former
- /// you get from the various `_at` search methods that allow you specify a
- /// pattern ID to search for.
///
/// By default this is disabled.
///
/// # Example
///
/// This example shows how to use this option to permit the same lazy DFA
- /// to run both anchored and unanchored searches for a single pattern.
+ /// to run both general searches for any pattern and anchored searches for
+ /// a specific pattern.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, PatternID};
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// Anchored, HalfMatch, Input, PatternID,
+ /// };
///
/// let dfa = DFA::builder()
/// .configure(DFA::config().starts_for_each_pattern(true))
- /// .build(r"foo[0-9]+")?;
+ /// .build_many(&[r"[a-z0-9]{6}", r"[a-z][a-z0-9]{5}"])?;
/// let mut cache = dfa.create_cache();
- /// let haystack = b"quux foo123";
- ///
- /// // Here's a normal unanchored search. Notice that we use 'None' for the
- /// // pattern ID. Since the DFA was built as an unanchored machine, it
- /// // uses its default unanchored starting state.
- /// let expected = HalfMatch::must(0, 11);
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
- /// &mut cache, None, None, haystack, 0, haystack.len(),
- /// )?);
- /// // But now if we explicitly specify the pattern to search ('0' being
- /// // the only pattern in the DFA), then it will use the starting state
- /// // for that specific pattern which is always anchored. Since the
- /// // pattern doesn't have a match at the beginning of the haystack, we
- /// // find nothing.
- /// assert_eq!(None, dfa.find_leftmost_fwd_at(
- /// &mut cache, None, Some(PatternID::must(0)), haystack, 0, haystack.len(),
- /// )?);
- /// // And finally, an anchored search is not the same as putting a '^' at
- /// // beginning of the pattern. An anchored search can only match at the
- /// // beginning of the *search*, which we can change:
- /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
- /// &mut cache, None, Some(PatternID::must(0)), haystack, 5, haystack.len(),
- /// )?);
+ /// let haystack = "bar foo123";
+ ///
+ /// // Here's a normal unanchored search that looks for any pattern.
+ /// let expected = HalfMatch::must(0, 10);
+ /// let input = Input::new(haystack);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
+ /// // We can also do a normal anchored search for any pattern. Since it's
+ /// // an anchored search, we position the start of the search where we
+ /// // know the match will begin.
+ /// let expected = HalfMatch::must(0, 10);
+ /// let input = Input::new(haystack).range(4..);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
+ /// // Since we compiled anchored start states for each pattern, we can
+ /// // also look for matches of other patterns explicitly, even if a
+ /// // different pattern would have normally matched.
+ /// let expected = HalfMatch::must(1, 10);
+ /// let input = Input::new(haystack)
+ /// .range(4..)
+ /// .anchored(Anchored::Pattern(PatternID::must(1)));
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
@@ -2990,7 +3159,7 @@ impl Config {
/// When set, this will attempt to implement Unicode word boundaries as if
/// they were ASCII word boundaries. This only works when the search input
/// is ASCII only. If a non-ASCII byte is observed while searching, then a
- /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned.
+ /// [`MatchError::quit`] error is returned.
///
/// A possible alternative to enabling this option is to simply use an
/// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
@@ -3014,8 +3183,7 @@ impl Config {
/// When using a [`Regex`](crate::hybrid::regex::Regex), this
/// corresponds to using the `try_` suite of methods. Alternatively,
/// if callers can guarantee that their input is ASCII only, then a
- /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be
- /// returned while searching.
+ /// [`MatchError::quit`] error will never be returned while searching.
///
/// This is disabled by default.
///
@@ -3028,7 +3196,7 @@ impl Config {
/// ```
/// use regex_automata::{
/// hybrid::dfa::DFA,
- /// HalfMatch, MatchError, MatchKind,
+ /// HalfMatch, Input, MatchError,
/// };
///
/// let dfa = DFA::builder()
@@ -3038,9 +3206,9 @@ impl Config {
///
/// // The match occurs before the search ever observes the snowman
/// // character, so no error occurs.
- /// let haystack = "foo 123 ☃".as_bytes();
+ /// let haystack = "foo 123 ☃";
/// let expected = Some(HalfMatch::must(0, 7));
- /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
+ /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
/// assert_eq!(expected, got);
///
/// // Notice that this search fails, even though the snowman character
@@ -3048,9 +3216,23 @@ impl Config {
/// // routines read one byte past the end of the search to account for
/// // look-around, and indeed, this is required here to determine whether
/// // the trailing \b matches.
- /// let haystack = "foo 123☃".as_bytes();
- /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 };
- /// let got = dfa.find_leftmost_fwd(&mut cache, haystack);
+ /// let haystack = "foo 123 ☃";
+ /// let expected = MatchError::quit(0xE2, 8);
+ /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack));
+ /// assert_eq!(Err(expected), got);
+ ///
+ /// // Another example is executing a search where the span of the haystack
+ /// // we specify is all ASCII, but there is non-ASCII just before it. This
+ /// // correctly also reports an error.
+ /// let input = Input::new("β123").range(2..);
+ /// let expected = MatchError::quit(0xB2, 1);
+ /// let got = dfa.try_search_fwd(&mut cache, &input);
+ /// assert_eq!(Err(expected), got);
+ ///
+ /// // And similarly for the trailing word boundary.
+ /// let input = Input::new("123β").range(..3);
+ /// let expected = MatchError::quit(0xCE, 3);
+ /// let got = dfa.try_search_fwd(&mut cache, &input);
/// assert_eq!(Err(expected), got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -3066,9 +3248,9 @@ impl Config {
/// Add a "quit" byte to the lazy DFA.
///
- /// When a quit byte is seen during search time, then search will return
- /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the
- /// offset at which the search stopped.
+ /// When a quit byte is seen during search time, then search will return a
+ /// [`MatchError::quit`] error indicating the offset at which the search
+ /// stopped.
///
/// A quit byte will always overrule any other aspects of a regex. For
/// example, if the `x` byte is added as a quit byte and the regex `\w` is
@@ -3109,19 +3291,23 @@ impl Config {
/// a user supplied pattern from matching across a line boundary.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
///
/// let dfa = DFA::builder()
/// .configure(DFA::config().quit(b'\n', true))
/// .build(r"foo\p{any}+bar")?;
/// let mut cache = dfa.create_cache();
///
- /// let haystack = "foo\nbar".as_bytes();
+ /// let haystack = "foo\nbar";
/// // Normally this would produce a match, since \p{any} contains '\n'.
/// // But since we instructed the automaton to enter a quit state if a
/// // '\n' is observed, this produces a match error instead.
- /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
- /// let got = dfa.find_leftmost_fwd(&mut cache, haystack).unwrap_err();
+ /// let expected = MatchError::quit(b'\n', 3);
+ /// let got = dfa.try_search_fwd(
+ /// &mut cache,
+ /// &Input::new(haystack),
+ /// ).unwrap_err();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -3144,6 +3330,89 @@ impl Config {
self
}
+ /// Enable specializing start states in the lazy DFA.
+ ///
+ /// When start states are specialized, an implementor of a search routine
+ /// using a lazy DFA can tell when the search has entered a starting state.
+ /// When start states aren't specialized, then it is impossible to know
+ /// whether the search has entered a start state.
+ ///
+ /// Ideally, this option wouldn't need to exist and we could always
+ /// specialize start states. The problem is that start states can be quite
+ /// active. This in turn means that an efficient search routine is likely
+ /// to ping-pong between a heavily optimized hot loop that handles most
+ /// states and a less optimized, specialized handling of start states.
+ /// This causes branches to get heavily mispredicted and overall can
+ /// materially decrease throughput. Therefore, specializing start states
+ /// should only be enabled when it is needed.
+ ///
+ /// Knowing whether a search is in a start state is typically useful when a
+ /// prefilter is active for the search. A prefilter is typically only run
+ /// when in a start state and a prefilter can greatly accelerate a search.
+ /// Therefore, the possible cost of specializing start states is worth it
+ /// in this case. Otherwise, if you have no prefilter, there is likely no
+ /// reason to specialize start states.
+ ///
+ /// This is disabled by default, but note that it is automatically
+ /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless
+ /// `specialize_start_states` has already been set, [`Config::prefilter`]
+ /// will automatically enable or disable it based on whether a prefilter
+ /// is present or not, respectively. This is done because a prefilter's
+ /// effectiveness is rooted in being executed whenever the DFA is in a
+ /// start state, and that's only possible to do when they are specialized.
+ ///
+ /// Note that it is plausibly reasonable to _disable_ this option
+ /// explicitly while _enabling_ a prefilter. In that case, a prefilter
+ /// will still be run at the beginning of a search, but never again. This
+ /// in theory could strike a good balance if you're in a situation where a
+ /// prefilter is likely to produce many false positive candidates.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to enable start state specialization and then
+ /// shows how to check whether a state is a start state or not.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().specialize_start_states(true))
+ /// .build(r"[a-z]+")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "123 foobar 4567".as_bytes();
+ /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?;
+ /// // The ID returned by 'start_state_forward' will always be tagged as
+ /// // a start state when start state specialization is enabled.
+ /// assert!(sid.is_tagged());
+ /// assert!(sid.is_start());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Compare the above with the default lazy DFA configuration where
+ /// start states are _not_ specialized. In this case, the start state
+ /// is not tagged and `sid.is_start()` returns false.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
+ ///
+ /// let dfa = DFA::new(r"[a-z]+")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "123 foobar 4567".as_bytes();
+ /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?;
+ /// // Start states are not tagged in the default configuration!
+ /// assert!(!sid.is_tagged());
+ /// assert!(!sid.is_start());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn specialize_start_states(mut self, yes: bool) -> Config {
+ self.specialize_start_states = Some(yes);
+ self
+ }
+
/// Sets the maximum amount of heap memory, in bytes, to allocate to the
/// cache for use during a lazy DFA search. If the lazy DFA would otherwise
/// use more heap memory, then, depending on other configuration knobs,
@@ -3157,7 +3426,7 @@ impl Config {
///
/// Note that while building a lazy DFA will do a "minimum" check to ensure
/// the capacity is big enough, this is more or less about correctness.
- /// If the cache is bigger than the minimum but still too small, then the
+ /// If the cache is bigger than the minimum but still "too small," then the
/// lazy DFA could wind up spending a lot of time clearing the cache and
/// recomputing transitions, thus negating the performance benefits of a
/// lazy DFA. Thus, setting the cache capacity is mostly an experimental
@@ -3175,7 +3444,8 @@ impl Config {
/// a smaller cache capacity.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let pattern = r"\p{L}{1000}";
///
@@ -3191,7 +3461,7 @@ impl Config {
///
/// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
/// let expected = Some(HalfMatch::must(0, 2000));
- /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?;
+ /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -3211,7 +3481,11 @@ impl Config {
/// while a minimum cache capacity does permit the lazy DFA to function
/// where it otherwise couldn't, it's plausible that it may not function
/// well if it's constantly running out of room. In that case, the speed
- /// advantages of the lazy DFA may be negated.
+ /// advantages of the lazy DFA may be negated. On the other hand, the
+ /// "minimum" cache capacity computed may not be completely accurate and
+ /// could actually be bigger than what is really necessary. Therefore, it
+ /// is plausible that using the minimum cache capacity could still result
+ /// in very good performance.
///
/// This is disabled by default.
///
@@ -3224,7 +3498,8 @@ impl Config {
/// too small.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let pattern = r"\p{L}{1000}";
///
@@ -3241,7 +3516,7 @@ impl Config {
///
/// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
/// let expected = Some(HalfMatch::must(0, 2000));
- /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?;
+ /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -3254,23 +3529,27 @@ impl Config {
/// Configure a lazy DFA search to quit after a certain number of cache
/// clearings.
///
- /// When a minimum is set, then a lazy DFA search will "give up" after
- /// the minimum number of cache clearings has occurred. This is typically
- /// useful in scenarios where callers want to detect whether the lazy DFA
- /// search is "efficient" or not. If the cache is cleared too many times,
- /// this is a good indicator that it is not efficient, and thus, the caller
- /// may wish to use some other regex engine.
+ /// When a minimum is set, then a lazy DFA search will *possibly* "give
+ /// up" after the minimum number of cache clearings has occurred. This is
+ /// typically useful in scenarios where callers want to detect whether the
+ /// lazy DFA search is "efficient" or not. If the cache is cleared too many
+ /// times, this is a good indicator that it is not efficient, and thus, the
+ /// caller may wish to use some other regex engine.
///
/// Note that the number of times a cache is cleared is a property of
/// the cache itself. Thus, if a cache is used in a subsequent search
- /// with a similarly configured lazy DFA, then it would cause the
- /// search to "give up" if the cache needed to be cleared. The cache
- /// clear count can only be reset to `0` via [`DFA::reset_cache`] (or
+ /// with a similarly configured lazy DFA, then it could cause the
+ /// search to "give up" if the cache needed to be cleared, depending
+ /// on its internal count and configured minimum. The cache clear
+ /// count can only be reset to `0` via [`DFA::reset_cache`] (or
/// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if
/// you're using the `Regex` API).
///
/// By default, no minimum is configured. Thus, a lazy DFA search will
- /// never give up due to cache clearings.
+ /// never give up due to cache clearings. If you do set this option, you
+ /// might consider also setting [`Config::minimum_bytes_per_state`] in
+ /// order for the lazy DFA to take efficiency into account before giving
+ /// up.
///
/// # Example
///
@@ -3279,13 +3558,11 @@ impl Config {
/// in a search that returns an error.
///
/// It is important to note that the precise mechanics of how and when
- /// a cache gets cleared is an implementation detail. Thus, the asserts
- /// in the tests below with respect to the particular offsets at which a
- /// search gave up should be viewed strictly as a demonstration. They are
- /// not part of any API guarantees offered by this crate.
+ /// a cache gets cleared is an implementation detail.
///
/// ```
- /// use regex_automata::{hybrid::dfa::DFA, MatchError};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::dfa::DFA, Input, MatchError, MatchErrorKind};
///
/// // This is a carefully chosen regex. The idea is to pick one
/// // that requires some decent number of states (hence the bounded
@@ -3309,37 +3586,42 @@ impl Config {
/// .build(pattern)?;
/// let mut cache = dfa.create_cache();
///
+ /// // Our search will give up before reaching the end!
/// let haystack = "a".repeat(101).into_bytes();
- /// assert_eq!(
- /// dfa.find_leftmost_fwd(&mut cache, &haystack),
- /// Err(MatchError::GaveUp { offset: 25 }),
- /// );
+ /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
+ /// assert!(matches!(
+ /// *result.unwrap_err().kind(),
+ /// MatchErrorKind::GaveUp { .. },
+ /// ));
///
/// // Now that we know the cache is full, if we search a haystack that we
/// // know will require creating at least one new state, it should not
- /// // be able to make any progress.
+ /// // be able to make much progress.
/// let haystack = "β".repeat(101).into_bytes();
- /// assert_eq!(
- /// dfa.find_leftmost_fwd(&mut cache, &haystack),
- /// Err(MatchError::GaveUp { offset: 0 }),
- /// );
+ /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
+ /// assert!(matches!(
+ /// *result.unwrap_err().kind(),
+ /// MatchErrorKind::GaveUp { .. },
+ /// ));
///
/// // If we reset the cache, then we should be able to create more states
/// // and make more progress with searching for betas.
/// cache.reset(&dfa);
/// let haystack = "β".repeat(101).into_bytes();
- /// assert_eq!(
- /// dfa.find_earliest_fwd(&mut cache, &haystack),
- /// Err(MatchError::GaveUp { offset: 26 }),
- /// );
+ /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
+ /// assert!(matches!(
+ /// *result.unwrap_err().kind(),
+ /// MatchErrorKind::GaveUp { .. },
+ /// ));
///
/// // ... switching back to ASCII still makes progress since it just needs
/// // to set transitions on existing states!
/// let haystack = "a".repeat(101).into_bytes();
- /// assert_eq!(
- /// dfa.find_earliest_fwd(&mut cache, &haystack),
- /// Err(MatchError::GaveUp { offset: 13 }),
- /// );
+ /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
+ /// assert!(matches!(
+ /// *result.unwrap_err().kind(),
+ /// MatchErrorKind::GaveUp { .. },
+ /// ));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
@@ -3348,9 +3630,48 @@ impl Config {
self
}
- /// Returns whether this configuration has enabled anchored searches.
- pub fn get_anchored(&self) -> bool {
- self.anchored.unwrap_or(false)
+ /// Configure a lazy DFA search to quit only when its efficiency drops
+ /// below the given minimum.
+ ///
+ /// The efficiency of the cache is determined by the number of bytes of
+ /// haystack searched per DFA state compiled. For example, if the efficiency
+ /// is 2, then it means the lazy DFA is creating a new DFA state after
+ /// searching approximately 2 bytes in a haystack. Generally speaking, 2
+ /// is quite bad and it's likely that even a slower regex engine like the
+ /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) would be faster.
+ ///
+ /// This has no effect if [`Config::minimum_cache_clear_count`] is not set.
+ /// Namely, this option only kicks in when the cache has been cleared more
+ /// than the minimum number. If no minimum is set, then the cache is simply
+ /// cleared whenever it fills up and it is impossible for the lazy DFA to
+ /// quit due to ineffective use of the cache.
+ ///
+ /// In general, if one is setting [`Config::minimum_cache_clear_count`],
+ /// then one should probably also set this knob as well. The reason is
+ /// that the absolute number of times the cache is cleared is generally
+ /// not a great predictor of efficiency. For example, if a new DFA state
+ /// is created for every 1,000 bytes searched, then it wouldn't be hard
+ /// for the cache to get cleared more than the configured minimum number
+ /// of times and then cause the
+ /// lazy DFA to quit. But a new DFA state every 1,000 bytes is likely quite
+ /// good from a performance perspective, and it's likely that the lazy
+ /// DFA should continue searching, even if it requires clearing the cache
+ /// occasionally.
+ ///
+ /// Finally, note that if you're implementing your own lazy DFA search
+ /// routine and also want this efficiency check to work correctly, then
+ /// you'll need to use the following routines to record search progress:
+ ///
+ /// * Call [`Cache::search_start`] at the beginning of every search.
+ /// * Call [`Cache::search_update`] whenever [`DFA::next_state`] is
+ /// called.
+ /// * Call [`Cache::search_finish`] before completing a search. (It is
+ /// not strictly necessary to call this when an error is returned, as
+ /// `Cache::search_start` will automatically finish the previous search
+ /// for you. But calling it where possible before returning helps improve
+ /// the accuracy of how many bytes have actually been searched.)
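+ ///
+ /// # Example
+ ///
+ /// A sketch of pairing this knob with
+ /// [`Config::minimum_cache_clear_count`]. The particular values are
+ /// illustrative and not a recommendation:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config()
+ /// .minimum_cache_clear_count(Some(5))
+ /// .minimum_bytes_per_state(Some(10)))
+ /// .build(r"\w+")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```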
+ pub fn minimum_bytes_per_state(mut self, min: Option<usize>) -> Config {
+ self.minimum_bytes_per_state = Some(min);
+ self
}
/// Returns the match semantics set in this configuration.
@@ -3358,6 +3679,11 @@ impl Config {
self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
}
+ /// Returns the prefilter set in this configuration, if one at all.
+ pub fn get_prefilter(&self) -> Option<&Prefilter> {
+ self.pre.as_ref().unwrap_or(&None).as_ref()
+ }
+
/// Returns whether this configuration has enabled anchored starting states
/// for every pattern in the DFA.
pub fn get_starts_for_each_pattern(&self) -> bool {
@@ -3378,14 +3704,23 @@ impl Config {
self.unicode_word_boundary.unwrap_or(false)
}
- /// Returns whether this configuration will instruct the DFA to enter a
- /// quit state whenever the given byte is seen during a search. When at
+ /// Returns whether this configuration will instruct the lazy DFA to enter
+ /// a quit state whenever the given byte is seen during a search. When at
/// least one byte has this enabled, it is possible for a search to return
/// an error.
pub fn get_quit(&self, byte: u8) -> bool {
self.quitset.map_or(false, |q| q.contains(byte))
}
+ /// Returns whether this configuration will instruct the lazy DFA to
+ /// "specialize" start states. When enabled, the lazy DFA will tag start
+ /// states so that search routines using the lazy DFA can detect when
+ /// it's in a start state and do some kind of optimization (like run a
+ /// prefilter).
+ pub fn get_specialize_start_states(&self) -> bool {
+ self.specialize_start_states.unwrap_or(false)
+ }
+
/// Returns the cache capacity set on this configuration.
pub fn get_cache_capacity(&self) -> usize {
self.cache_capacity.unwrap_or(2 * (1 << 20))
@@ -3404,6 +3739,14 @@ impl Config {
self.minimum_cache_clear_count.unwrap_or(None)
}
+ /// Returns, if set, the minimum number of bytes per state that need to be
+ /// processed in order for the lazy DFA to keep going. If the observed
+ /// number of bytes searched per state falls below this number (and the
+ /// cache has been cleared at least the minimum number of times), then the
+ /// lazy DFA will return a "gave up" error.
+ pub fn get_minimum_bytes_per_state(&self) -> Option<usize> {
+ self.minimum_bytes_per_state.unwrap_or(None)
+ }
+
/// Returns the minimum lazy DFA cache capacity required for the given NFA.
///
/// The cache capacity required for a particular NFA may change without
@@ -3449,6 +3792,11 @@ impl Config {
// It is important to distinguish any "quit" bytes from all other
// bytes. Otherwise, a non-quit byte may end up in the same class
// as a quit byte, and thus cause the DFA to stop when it shouldn't.
+ //
+ // Test case:
+ //
+ // regex-cli find hybrid regex -w @conn.json.1000x.log \
+ // '^#' '\b10\.55\.182\.100\b'
if !quit.is_empty() {
set.add_set(&quit);
}
@@ -3466,7 +3814,7 @@ impl Config {
nfa: &thompson::NFA,
) -> Result<ByteSet, BuildError> {
let mut quit = self.quitset.unwrap_or(ByteSet::empty());
- if nfa.has_word_boundary_unicode() {
+ if nfa.look_set_any().contains_word_unicode() {
if self.get_unicode_word_boundary() {
for b in 0x80..=0xFF {
quit.add(b);
@@ -3491,10 +3839,10 @@ impl Config {
/// always used. If an option in `o` is not set, then the corresponding
/// option in `self` is used. If it's not set in `self` either, then it
/// remains not set.
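+ ///
+ /// For example (illustrative), if `self` sets only `cache_capacity` and
+ /// `o` sets only `match_kind`, then the merged config carries both: `o`'s
+ /// `match_kind` and `self`'s `cache_capacity`.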
- fn overwrite(self, o: Config) -> Config {
+ fn overwrite(&self, o: Config) -> Config {
Config {
- anchored: o.anchored.or(self.anchored),
match_kind: o.match_kind.or(self.match_kind),
+ pre: o.pre.or_else(|| self.pre.clone()),
starts_for_each_pattern: o
.starts_for_each_pattern
.or(self.starts_for_each_pattern),
@@ -3503,6 +3851,9 @@ impl Config {
.unicode_word_boundary
.or(self.unicode_word_boundary),
quitset: o.quitset.or(self.quitset),
+ specialize_start_states: o
+ .specialize_start_states
+ .or(self.specialize_start_states),
cache_capacity: o.cache_capacity.or(self.cache_capacity),
skip_cache_capacity_check: o
.skip_cache_capacity_check
@@ -3510,6 +3861,9 @@ impl Config {
minimum_cache_clear_count: o
.minimum_cache_clear_count
.or(self.minimum_cache_clear_count),
+ minimum_bytes_per_state: o
+ .minimum_bytes_per_state
+ .or(self.minimum_bytes_per_state),
}
}
}
@@ -3556,27 +3910,25 @@ impl Config {
/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
/// * The pattern itself is permitted to match invalid UTF-8. For example,
/// things like `[^a]` that match any byte except for `a` are permitted.
-/// * Unanchored patterns can search through invalid UTF-8. That is, for
-/// unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of
-/// `(?s:.)*?`.
///
/// ```
/// use regex_automata::{
/// hybrid::dfa::DFA,
/// nfa::thompson,
-/// HalfMatch, SyntaxConfig,
+/// util::syntax,
+/// HalfMatch, Input,
/// };
///
/// let dfa = DFA::builder()
/// .configure(DFA::config().cache_capacity(5_000))
-/// .syntax(SyntaxConfig::new().unicode(false).utf8(false))
/// .thompson(thompson::Config::new().utf8(false))
+/// .syntax(syntax::Config::new().unicode(false).utf8(false))
/// .build(r"foo[^b]ar.*")?;
/// let mut cache = dfa.create_cache();
///
/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
/// let expected = Some(HalfMatch::must(0, 10));
-/// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
+/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -3584,7 +3936,8 @@ impl Config {
#[derive(Clone, Debug)]
pub struct Builder {
config: Config,
- thompson: thompson::Builder,
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler,
}
impl Builder {
@@ -3592,7 +3945,8 @@ impl Builder {
pub fn new() -> Builder {
Builder {
config: Config::default(),
- thompson: thompson::Builder::new(),
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler::new(),
}
}
@@ -3600,6 +3954,7 @@ impl Builder {
///
/// If there was a problem parsing or compiling the pattern, then an error
/// is returned.
+ #[cfg(feature = "syntax")]
pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
self.build_many(&[pattern])
}
@@ -3608,22 +3963,31 @@ impl Builder {
///
/// When matches are returned, the pattern ID corresponds to the index of
/// the pattern in the slice given.
+ #[cfg(feature = "syntax")]
pub fn build_many<P: AsRef<str>>(
&self,
patterns: &[P],
) -> Result<DFA, BuildError> {
- let nfa =
- self.thompson.build_many(patterns).map_err(BuildError::nfa)?;
- self.build_from_nfa(Arc::new(nfa))
+ let nfa = self
+ .thompson
+ .clone()
+ // We can always forcefully disable captures because DFAs do not
+ // support them.
+ .configure(
+ thompson::Config::new()
+ .which_captures(thompson::WhichCaptures::None),
+ )
+ .build_many(patterns)
+ .map_err(BuildError::nfa)?;
+ self.build_from_nfa(nfa)
}
/// Build a DFA from the given NFA.
///
- /// Note that this requires an `Arc<thompson::NFA>` instead of a
- /// `&thompson::NFA` because the lazy DFA builds itself from the NFA at
- /// search time. This means that the lazy DFA must hold on to its source
- /// NFA for the entirety of its lifetime. An `Arc` is used so that callers
- /// aren't forced to clone the NFA if it is needed elsewhere.
+ /// Note that this requires owning a `thompson::NFA`. While this may force
+ /// you to clone the NFA, such a clone is not a deep clone. Namely, NFAs
+ /// are defined internally to support shared ownership such that cloning is
+ /// very cheap.
///
/// # Example
///
@@ -3631,26 +3995,29 @@ impl Builder {
/// in hand.
///
/// ```
- /// use std::sync::Arc;
- /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch};
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// nfa::thompson,
+ /// HalfMatch, Input,
+ /// };
///
- /// let haystack = "foo123bar".as_bytes();
+ /// let haystack = "foo123bar";
///
/// // This shows how to set non-default options for building an NFA.
- /// let nfa = thompson::Builder::new()
- /// .configure(thompson::Config::new().shrink(false))
+ /// let nfa = thompson::Compiler::new()
+ /// .configure(thompson::Config::new().shrink(true))
/// .build(r"[0-9]+")?;
- /// let dfa = DFA::builder().build_from_nfa(Arc::new(nfa))?;
+ /// let dfa = DFA::builder().build_from_nfa(nfa)?;
/// let mut cache = dfa.create_cache();
/// let expected = Some(HalfMatch::must(0, 6));
- /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
+ /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn build_from_nfa(
&self,
- nfa: Arc<thompson::NFA>,
+ nfa: thompson::NFA,
) -> Result<DFA, BuildError> {
let quitset = self.config.quit_set_from_nfa(&nfa)?;
let classes = self.config.byte_classes_from_nfa(&nfa, &quitset);
@@ -3675,12 +4042,11 @@ impl Builder {
// then we simply force the cache capacity to its minimum amount
// and mush on.
if self.config.get_skip_cache_capacity_check() {
- trace!(
+ debug!(
"given capacity ({}) is too small, \
since skip_cache_capacity_check is enabled, \
setting cache capacity to minimum ({})",
- cache_capacity,
- min_cache,
+ cache_capacity, min_cache,
);
cache_capacity = min_cache;
} else {
@@ -3694,22 +4060,19 @@ impl Builder {
// of states in our state ID space. This is unlikely to trigger in
// >=32-bit systems, but 16-bit systems have a pretty small state ID
// space since a number of bits are used up as sentinels.
- if let Err(err) = minimum_lazy_state_id(&nfa, &classes) {
+ if let Err(err) = minimum_lazy_state_id(&classes) {
return Err(BuildError::insufficient_state_id_capacity(err));
}
let stride2 = classes.stride2();
+ let start_map = StartByteMap::new(nfa.look_matcher());
Ok(DFA {
+ config: self.config.clone(),
nfa,
stride2,
+ start_map,
classes,
quitset,
- anchored: self.config.get_anchored(),
- match_kind: self.config.get_match_kind(),
- starts_for_each_pattern: self.config.get_starts_for_each_pattern(),
cache_capacity,
- minimum_cache_clear_count: self
- .config
- .get_minimum_cache_clear_count(),
})
}
@@ -3720,16 +4083,17 @@ impl Builder {
}
/// Set the syntax configuration for this builder using
- /// [`SyntaxConfig`](crate::SyntaxConfig).
+ /// [`syntax::Config`](crate::util::syntax::Config).
///
/// This permits setting things like case insensitivity, Unicode and multi
/// line mode.
///
/// These settings only apply when constructing a lazy DFA directly from a
/// pattern.
+ #[cfg(feature = "syntax")]
pub fn syntax(
&mut self,
- config: crate::util::syntax::SyntaxConfig,
+ config: crate::util::syntax::Config,
) -> &mut Builder {
self.thompson.syntax(config);
self
@@ -3744,20 +4108,144 @@ impl Builder {
///
/// These settings only apply when constructing a DFA directly from a
/// pattern.
+ #[cfg(feature = "syntax")]
pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
self.thompson.configure(config);
self
}
}
+/// Represents the current state of an overlapping search.
+///
+/// This is used for overlapping searches since they need to know something
+/// about the previous search. For example, when multiple patterns match at the
+/// same position, this state tracks the last reported pattern so that the next
+/// search knows whether to report another matching pattern or continue with
+/// the search at the next position. Additionally, it also tracks which state
+/// the last search call terminated in.
+///
+/// This type provides few introspection capabilities. All a caller can do is
+/// construct it, pass it around to permit search routines to use it to track
+/// state, and ask whether a match has been found.
+///
+/// Callers should always provide a fresh state constructed via
+/// [`OverlappingState::start`] when starting a new search. Reusing state from
+/// a previous search may result in incorrect results.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct OverlappingState {
+ /// The match reported by the most recent overlapping search to use this
+ /// state.
+ ///
+ /// If a search does not find any matches, then it is expected to clear
+ /// this value.
+ pub(crate) mat: Option<HalfMatch>,
+    /// The state ID of the state the search was in when the call
+    /// terminated. When this is a match state, `mat` must be set to a
+    /// non-None value.
+ ///
+ /// A `None` value indicates the start state of the corresponding
+ /// automaton. We cannot use the actual ID, since any one automaton may
+ /// have many start states, and which one is in use depends on several
+ /// search-time factors.
+ pub(crate) id: Option<LazyStateID>,
+ /// The position of the search.
+ ///
+ /// When `id` is None (i.e., we are starting a search), this is set to
+ /// the beginning of the search as given by the caller regardless of its
+ /// current value. Subsequent calls to an overlapping search pick up at
+ /// this offset.
+ pub(crate) at: usize,
+ /// The index into the matching patterns of the next match to report if the
+ /// current state is a match state. Note that this may be 1 greater than
+ /// the total number of matches to report for the current match state. (In
+ /// which case, no more matches should be reported at the current position
+ /// and the search should advance to the next position.)
+ pub(crate) next_match_index: Option<usize>,
+ /// This is set to true when a reverse overlapping search has entered its
+ /// EOI transitions.
+ ///
+ /// This isn't used in a forward search because it knows to stop once the
+ /// position exceeds the end of the search range. In a reverse search,
+ /// since we use unsigned offsets, we don't "know" once we've gone past
+ /// `0`. So the only way to detect it is with this extra flag. The reverse
+ /// overlapping search knows to terminate specifically after it has
+ /// reported all matches after following the EOI transition.
+ pub(crate) rev_eoi: bool,
+}
+
+impl OverlappingState {
+ /// Create a new overlapping state that begins at the start state of any
+ /// automaton.
+ pub fn start() -> OverlappingState {
+ OverlappingState {
+ mat: None,
+ id: None,
+ at: 0,
+ next_match_index: None,
+ rev_eoi: false,
+ }
+ }
+
+ /// Return the match result of the most recent search to execute with this
+ /// state.
+ ///
+    /// A search will clear this result automatically, such that if no
+ /// match is found, this will correctly report `None`.
+ pub fn get_match(&self) -> Option<HalfMatch> {
+ self.mat
+ }
+}
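
A minimal end-to-end sketch of the intended flow, not part of the patch itself: it assumes the lazy DFA's `try_search_overlapping_fwd` routine together with `MatchKind::All` semantics, which overlapping searches generally require.

```
use regex_automata::{
    hybrid::dfa::{OverlappingState, DFA},
    HalfMatch, Input, MatchKind,
};

let dfa = DFA::builder()
    .configure(DFA::config().match_kind(MatchKind::All))
    .build_many(&[r"\w+$", r"\S+$"])?;
let mut cache = dfa.create_cache();
let input = Input::new("@foo");

// Always begin an overlapping search with a fresh state.
let mut state = OverlappingState::start();
let mut got = vec![];
loop {
    dfa.try_search_overlapping_fwd(&mut cache, &input, &mut state)?;
    match state.get_match() {
        // The search clears the match once all matches are exhausted.
        None => break,
        Some(hm) => got.push(hm),
    }
}
// Both patterns match, ending at offset 4. Matches at the same position
// are reported in ascending pattern ID order.
assert_eq!(vec![HalfMatch::must(0, 4), HalfMatch::must(1, 4)], got);
# Ok::<(), Box<dyn std::error::Error>>(())
```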
+
+/// Runs the given overlapping `search` function (forwards or backwards) until
+/// a match is found whose offset does not split a codepoint.
+///
+/// This is *not* always correct to call. It should only be called when the
+/// underlying NFA has UTF-8 mode enabled *and* it can produce zero-width
+/// matches. Calling this when those conditions don't both hold might result
+/// in legitimate matches getting skipped.
+#[cold]
+#[inline(never)]
+fn skip_empty_utf8_splits_overlapping<F>(
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+ mut search: F,
+) -> Result<(), MatchError>
+where
+ F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>,
+{
+    // Note that this routine works for both forward and reverse searches
+    // even though there's no code here that distinguishes them. That's
+    // because overlapping searches drive themselves to completion via
+    // `OverlappingState`. So all we have to do is keep re-running the search
+    // until no matches are found.
+
+ let mut hm = match state.get_match() {
+ None => return Ok(()),
+ Some(hm) => hm,
+ };
+ if input.get_anchored().is_anchored() {
+ if !input.is_char_boundary(hm.offset()) {
+ state.mat = None;
+ }
+ return Ok(());
+ }
+ while !input.is_char_boundary(hm.offset()) {
+ search(input, state)?;
+ hm = match state.get_match() {
+ None => return Ok(()),
+ Some(hm) => hm,
+ };
+ }
+ Ok(())
+}
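
To illustrate the hazard this guards against at the user level: with UTF-8 mode enabled (the `thompson::Config::utf8` default), an empty match may never be reported at an offset that splits a multi-byte codepoint. A sketch, hedged on the default configuration:

```
use regex_automata::{hybrid::regex::Regex, Match};

let re = Regex::new(r"")?;
let mut cache = re.create_cache();

// '☃' occupies bytes 1..4, so no empty match is reported at offsets 2 or 3.
let haystack = "a☃z";
let got: Vec<Match> = re.find_iter(&mut cache, haystack).collect();
assert_eq!(got, vec![
    Match::must(0, 0..0),
    Match::must(0, 1..1),
    Match::must(0, 4..4),
    Match::must(0, 5..5),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```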
+
/// Based on the minimum number of states required for a useful lazy DFA cache,
/// this returns the minimum lazy state ID that must be representable.
///
-/// It's likely not plausible for this to impose constraints on 32-bit systems
-/// (or higher), but on 16-bit systems, the lazy state ID space is quite
-/// constrained and thus may be insufficient for bigger regexes.
+/// It's not likely for this to have any impact on 32-bit systems (or higher),
+/// but on 16-bit systems, the lazy state ID space is quite constrained and
+/// thus may be insufficient if our MIN_STATES value is too high.
fn minimum_lazy_state_id(
- nfa: &thompson::NFA,
classes: &ByteClasses,
) -> Result<LazyStateID, LazyStateIDError> {
let stride = 1 << classes.stride2();
@@ -3774,37 +4262,71 @@ fn minimum_lazy_state_id(
/// than what is required in practice. Computing the true minimum effectively
/// requires determinization, which is probably too much work to do for a
/// simple check like this.
+///
+/// One of the issues with this approach IMO is that it requires this estimate
+/// to be kept in sync with the calculation above of how much heap memory the
+/// DFA cache uses. If we get it wrong, it's possible for example for the
+/// minimum to be smaller than the computed heap memory, and thus, it may be
+/// the case that we can't add the required minimum number of states. That in
+/// turn will make lazy DFA panic because we assume that we can add at least a
+/// minimum number of states.
+///
+/// Another approach would be to always allow the minimum number of states to
+/// be added to the lazy DFA cache, even if it exceeds the configured cache
+/// limit. This does mean that the limit isn't really a limit in all cases,
+/// which is unfortunate. But it does at least guarantee that the lazy DFA can
+/// always make progress, even if it is slow. (This approach is very similar to
+/// enabling the 'skip_cache_capacity_check' config knob, except it wouldn't
+/// rely on cache size calculation. Instead, it would just always permit a
+/// minimum number of states to be added.)
fn minimum_cache_capacity(
nfa: &thompson::NFA,
classes: &ByteClasses,
starts_for_each_pattern: bool,
) -> usize {
const ID_SIZE: usize = size_of::<LazyStateID>();
- let stride = 1 << classes.stride2();
+ const STATE_SIZE: usize = size_of::<State>();
- let sparses = 2 * nfa.len() * NFAStateID::SIZE;
+ let stride = 1 << classes.stride2();
+ let states_len = nfa.states().len();
+ let sparses = 2 * states_len * NFAStateID::SIZE;
let trans = MIN_STATES * stride * ID_SIZE;
- let mut starts = Start::count() * ID_SIZE;
+ let mut starts = Start::len() * ID_SIZE;
if starts_for_each_pattern {
- starts += (Start::count() * nfa.pattern_len()) * ID_SIZE;
+ starts += (Start::len() * nfa.pattern_len()) * ID_SIZE;
}
- // Every `State` has three bytes for flags, 4 bytes (max) for the number
- // of patterns, followed by 32-bit encodings of patterns and then delta
+    // The min number of states HAS to be at least 5: we have 3 sentinel
+    // states, we need space for one more when we save a state after clearing
+    // the cache, and we need space for yet one more. Without that last slot,
+    // we get stuck in a loop where we try to add a 5th state, which gets
+    // rejected, which clears the cache, which adds back a saved state (4th
+    // total state) which then tries to add the 5th state again.
+ assert!(MIN_STATES >= 5, "minimum number of states has to be at least 5");
+ // The minimum number of non-sentinel states. We consider this separately
+ // because sentinel states are much smaller in that they contain no NFA
+ // states. Given our aggressive calculation here, it's worth being more
+ // precise with the number of states we need.
+ let non_sentinel = MIN_STATES.checked_sub(SENTINEL_STATES).unwrap();
+
+ // Every `State` has 5 bytes for flags, 4 bytes (max) for the number of
+ // patterns, followed by 32-bit encodings of patterns and then delta
// varint encodings of NFA state IDs. We use the worst case (which isn't
// technically possible) of 5 bytes for each NFA state ID.
//
// HOWEVER, three of the states needed by a lazy DFA are just the sentinel
// unknown, dead and quit states. Those states have a known size and it is
// small.
- assert!(MIN_STATES >= 3, "minimum number of states has to be at least 3");
let dead_state_size = State::dead().memory_usage();
- let max_state_size = 3 + 4 + (nfa.pattern_len() * 4) + (nfa.len() * 5);
- let states = (3 * (size_of::<State>() + dead_state_size))
- + ((MIN_STATES - 3) * (size_of::<State>() + max_state_size));
- let states_to_sid = states + (MIN_STATES * ID_SIZE);
- let stack = nfa.len() * NFAStateID::SIZE;
+ let max_state_size = 5 + 4 + (nfa.pattern_len() * 4) + (states_len * 5);
+ let states = (SENTINEL_STATES * (STATE_SIZE + dead_state_size))
+ + (non_sentinel * (STATE_SIZE + max_state_size));
+ // NOTE: We don't double count heap memory used by State for this map since
+ // we use reference counting to avoid doubling memory usage. (This tends to
+ // be where most memory is allocated in the cache.)
+ let states_to_sid = (MIN_STATES * STATE_SIZE) + (MIN_STATES * ID_SIZE);
+ let stack = states_len * NFAStateID::SIZE;
let scratch_state_builder = max_state_size;
trans
@@ -3815,3 +4337,45 @@ fn minimum_cache_capacity(
+ stack
+ scratch_state_builder
}
+
+#[cfg(all(test, feature = "syntax"))]
+mod tests {
+ use super::*;
+
+ // Tests that we handle heuristic Unicode word boundary support in reverse
+ // DFAs in the specific case of contextual searches.
+ //
+ // I wrote this test when I discovered a bug in how heuristic word
+ // boundaries were handled. Namely, that the starting state selection
+ // didn't consider the DFA's quit byte set when looking at the byte
+ // immediately before the start of the search (or immediately after the
+ // end of the search in the case of a reverse search). As a result, it was
+    // possible for '\b[0-9]+\b' to match within 'β123' because the trailing
+    // \xB2 byte in the 'β' codepoint would be treated as a non-word character.
+    // But of course, this search should trigger the DFA to quit, since there
+    // is a non-ASCII byte under consideration.
+ //
+ // Thus, I fixed 'start_state_{forward,reverse}' to check the quit byte set
+ // if it wasn't empty. The forward case is tested in the doc test for the
+ // Config::unicode_word_boundary API. We test the reverse case here, which
+ // is sufficiently niche that it doesn't really belong in a doc test.
+ #[test]
+ fn heuristic_unicode_reverse() {
+ let dfa = DFA::builder()
+ .configure(DFA::config().unicode_word_boundary(true))
+ .thompson(thompson::Config::new().reverse(true))
+ .build(r"\b[0-9]+\b")
+ .unwrap();
+ let mut cache = dfa.create_cache();
+
+ let input = Input::new("β123").range(2..);
+ let expected = MatchError::quit(0xB2, 1);
+ let got = dfa.try_search_rev(&mut cache, &input);
+ assert_eq!(Err(expected), got);
+
+ let input = Input::new("123β").range(..3);
+ let expected = MatchError::quit(0xCE, 3);
+ let got = dfa.try_search_rev(&mut cache, &input);
+ assert_eq!(Err(expected), got);
+ }
+}
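
For reference, a sketch of the forward analogue mentioned in the comment above, mirroring the `Config::unicode_word_boundary` doc test:

```
use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input, MatchError};

let dfa = DFA::builder()
    .configure(DFA::config().unicode_word_boundary(true))
    .build(r"\b[0-9]+\b")?;
let mut cache = dfa.create_cache();

// On all-ASCII haystacks, the heuristic behaves like true Unicode support.
let input = Input::new("foo 123 bar");
let expected = Some(HalfMatch::must(0, 7));
assert_eq!(expected, dfa.try_search_fwd(&mut cache, &input)?);

// But any non-ASCII byte forces the DFA into a quit state.
let input = Input::new("123β");
let expected = MatchError::quit(0xCE, 3);
assert_eq!(Err(expected), dfa.try_search_fwd(&mut cache, &input));
# Ok::<(), Box<dyn std::error::Error>>(())
```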
diff --git a/vendor/regex-automata/src/hybrid/error.rs b/vendor/regex-automata/src/hybrid/error.rs
index 715da39bd..604daf3c3 100644
--- a/vendor/regex-automata/src/hybrid/error.rs
+++ b/vendor/regex-automata/src/hybrid/error.rs
@@ -7,6 +7,16 @@ use crate::{hybrid::id::LazyStateIDError, nfa};
/// to build a lazy DFA without heuristic Unicode support but with an NFA that
/// contains a Unicode word boundary.)
///
+/// This error does not provide many introspection capabilities. There are
+/// generally only two things you can do with it:
+///
+/// * Obtain a human readable message via its `std::fmt::Display` impl.
+/// * Access an underlying
+/// [`nfa::thompson::BuildError`](crate::nfa::thompson::BuildError)
+/// type from its `source` method via the `std::error::Error` trait. That
+/// underlying error only occurs when using convenience routines for building
+/// a lazy DFA directly from a pattern string.
+///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
#[derive(Clone, Debug)]
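
A quick sketch of the second point, assuming the `std` feature (for `std::error::Error`) and the `syntax` feature (for building from a pattern string):

```
use std::error::Error;
use regex_automata::hybrid::dfa::DFA;

// A syntax error in the pattern surfaces as an NFA build error underneath.
let err = DFA::builder().build(r"(?P<unclosed").unwrap_err();
assert_eq!("error building NFA", err.to_string());
// The underlying nfa::thompson::BuildError is reachable via `source`.
assert!(err.source().is_some());
```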
@@ -16,18 +26,14 @@ pub struct BuildError {
#[derive(Clone, Debug)]
enum BuildErrorKind {
- NFA(nfa::thompson::Error),
+ NFA(nfa::thompson::BuildError),
InsufficientCacheCapacity { minimum: usize, given: usize },
InsufficientStateIDCapacity { err: LazyStateIDError },
Unsupported(&'static str),
}
impl BuildError {
- fn kind(&self) -> &BuildErrorKind {
- &self.kind
- }
-
- pub(crate) fn nfa(err: nfa::thompson::Error) -> BuildError {
+ pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError {
BuildError { kind: BuildErrorKind::NFA(err) }
}
@@ -60,19 +66,16 @@ impl BuildError {
#[cfg(feature = "std")]
impl std::error::Error for BuildError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
- match self.kind() {
+ match self.kind {
BuildErrorKind::NFA(ref err) => Some(err),
- BuildErrorKind::InsufficientCacheCapacity { .. } => None,
- // LazyStateIDError is an implementation detail, don't expose it.
- BuildErrorKind::InsufficientStateIDCapacity { .. } => None,
- BuildErrorKind::Unsupported(_) => None,
+ _ => None,
}
}
}
impl core::fmt::Display for BuildError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
- match self.kind() {
+ match self.kind {
BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
BuildErrorKind::InsufficientCacheCapacity { minimum, given } => {
write!(
@@ -103,7 +106,9 @@ impl core::fmt::Display for BuildError {
/// The default configuration of a lazy DFA in this crate is
/// set such that a `CacheError` will never occur. Instead,
/// callers must opt into this behavior with settings like
-/// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count).
+/// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count)
+/// and
+/// [`dfa::Config::minimum_bytes_per_state`](crate::hybrid::dfa::Config::minimum_bytes_per_state).
///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
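
A hedged sketch of opting in to that behavior; the threshold values below are arbitrary:

```
use regex_automata::hybrid::dfa::DFA;

// Permit a search to give up if the cache has been cleared at least twice
// and, on average, too few haystack bytes are processed per state created.
let dfa = DFA::builder()
    .configure(
        DFA::config()
            .minimum_cache_clear_count(Some(2))
            .minimum_bytes_per_state(Some(10)),
    )
    .build(r"(?i)foobar")?;
let mut cache = dfa.create_cache();
# Ok::<(), Box<dyn std::error::Error>>(())
```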
@@ -114,6 +119,10 @@ impl CacheError {
pub(crate) fn too_many_cache_clears() -> CacheError {
CacheError(())
}
+
+ pub(crate) fn bad_efficiency() -> CacheError {
+ CacheError(())
+ }
}
#[cfg(feature = "std")]
diff --git a/vendor/regex-automata/src/hybrid/id.rs b/vendor/regex-automata/src/hybrid/id.rs
index a6fcde52e..662e3c98f 100644
--- a/vendor/regex-automata/src/hybrid/id.rs
+++ b/vendor/regex-automata/src/hybrid/id.rs
@@ -1,4 +1,4 @@
-/// A state identifier especially tailored for lazy DFAs.
+/// A state identifier specifically tailored for lazy DFAs.
///
/// A lazy state ID logically represents a pointer to a DFA state. In practice,
/// by limiting the number of DFA states it can address, it reserves some
@@ -12,18 +12,28 @@
///
/// * **Unknown** - The state has not yet been computed. The
/// parameters used to get this state ID must be re-passed to
-/// [`DFA::next_state`](crate::hybrid::dfa::DFA), which will never return an
-/// unknown state ID.
+/// [`DFA::next_state`](crate::hybrid::dfa::DFA::next_state), which will never
+/// return an unknown state ID.
/// * **Dead** - A dead state only has transitions to itself. It indicates that
/// the search cannot do anything else and should stop with whatever result it
/// has.
/// * **Quit** - A quit state indicates that the automaton could not answer
/// whether a match exists or not. Correct search implementations must return a
-/// [`MatchError::Quit`](crate::MatchError::Quit).
-/// * **Start** - A start state indicates that the automaton will begin
-/// searching at a starting state. Branching on this isn't required for
-/// correctness, but a common optimization is to use this to more quickly look
-/// for a prefix.
+/// [`MatchError::quit`](crate::MatchError::quit) when a DFA enters a quit
+/// state.
+/// * **Start** - A start state is a state in which a search can begin.
+/// Lazy DFAs usually have more than one start state. Branching on
+/// this isn't required for correctness, but a common optimization is
+/// to run a prefilter when a search enters a start state. Note that
+/// start states are *not* tagged automatically, and one must enable the
+/// [`Config::specialize_start_states`](crate::hybrid::dfa::Config::specialize_start_states)
+/// setting for start states to be tagged. The reason for this is
+/// that a DFA search loop is usually written to execute a prefilter once it
+/// enters a start state. But if there is no prefilter, this handling can be
+/// quite disastrous as the DFA may ping-pong between the special handling code
+/// and a possible optimized hot path for handling untagged states. When start
+/// states aren't specialized, then they are untagged and remain in the hot
+/// path.
/// * **Match** - A match state indicates that a match has been found.
/// Depending on the semantics of your search implementation, it may either
/// continue until the end of the haystack or a dead state, or it might quit
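
Picking up the **Start** bullet above: enabling start state specialization is a single config knob. A sketch, assuming the `syntax` feature:

```
use regex_automata::{hybrid::dfa::DFA, Input};

let dfa = DFA::builder()
    .configure(DFA::config().specialize_start_states(true))
    .build(r"[a-z]+")?;
let mut cache = dfa.create_cache();

// With specialization enabled, the start state comes back tagged, so a
// search loop can branch on `is_start` to run a prefilter.
let sid = dfa.start_state_forward(&mut cache, &Input::new("abc"))?;
assert!(sid.is_tagged() && sid.is_start());
# Ok::<(), Box<dyn std::error::Error>>(())
```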
@@ -44,12 +54,12 @@
/// Notice also how a correct search implementation deals with
/// [`CacheError`](crate::hybrid::CacheError)s returned by some of
/// the lazy DFA routines. When a `CacheError` occurs, it returns
-/// [`MatchError::GaveUp`](crate::MatchError::GaveUp).
+/// [`MatchError::gave_up`](crate::MatchError::gave_up).
///
/// ```
/// use regex_automata::{
/// hybrid::dfa::{Cache, DFA},
-/// HalfMatch, MatchError, PatternID,
+/// HalfMatch, MatchError, Input,
/// };
///
/// fn find_leftmost_first(
@@ -62,8 +72,9 @@
/// // be match states (since DFAs in this crate delay matches by 1
/// // byte), so we don't need to check if the start state is a match.
/// let mut sid = dfa.start_state_forward(
-/// cache, None, haystack, 0, haystack.len(),
-/// ).map_err(|_| MatchError::GaveUp { offset: 0 })?;
+/// cache,
+/// &Input::new(haystack),
+/// )?;
/// let mut last_match = None;
/// // Walk all the bytes in the haystack. We can quit early if we see
/// // a dead or a quit state. The former means the automaton will
@@ -72,7 +83,7 @@
/// for (i, &b) in haystack.iter().enumerate() {
/// sid = dfa
/// .next_state(cache, sid, b)
-/// .map_err(|_| MatchError::GaveUp { offset: i })?;
+/// .map_err(|_| MatchError::gave_up(i))?;
/// if sid.is_tagged() {
/// if sid.is_match() {
/// last_match = Some(HalfMatch::new(
@@ -88,18 +99,21 @@
/// if last_match.is_some() {
/// return Ok(last_match);
/// }
-/// return Err(MatchError::Quit { byte: b, offset: i });
+/// return Err(MatchError::quit(b, i));
/// }
/// // Implementors may also want to check for start states and
/// // handle them differently for performance reasons. But it is
-/// // not necessary for correctness.
+/// // not necessary for correctness. Note that in order to check
+/// // for start states, you'll need to enable the
+/// // 'specialize_start_states' config knob, otherwise start
+/// // states will not be tagged.
/// }
/// }
/// // Matches are always delayed by 1 byte, so we must explicitly walk
/// // the special "EOI" transition at the end of the search.
/// sid = dfa
/// .next_eoi_state(cache, sid)
-/// .map_err(|_| MatchError::GaveUp { offset: haystack.len() })?;
+/// .map_err(|_| MatchError::gave_up(haystack.len()))?;
/// if sid.is_match() {
/// last_match = Some(HalfMatch::new(
/// dfa.match_pattern(cache, sid, 0),
@@ -175,7 +189,8 @@ impl LazyStateID {
#[inline]
pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> {
if id > LazyStateID::MAX {
- return Err(LazyStateIDError { attempted: id as u64 });
+ let attempted = u64::try_from(id).unwrap();
+ return Err(LazyStateIDError { attempted });
}
Ok(LazyStateID::new_unchecked(id))
}
@@ -187,20 +202,10 @@ impl LazyStateID {
/// sacrifice memory safety.
#[inline]
const fn new_unchecked(id: usize) -> LazyStateID {
+ // FIXME: Use as_u32() once const functions in traits are stable.
LazyStateID(id as u32)
}
- /// Return this lazy state ID as its raw value if and only if it is not
- /// tagged (and thus not an unknown, dead, quit, start or match state ID).
- #[inline]
- pub(crate) fn as_usize(&self) -> Option<usize> {
- if self.is_tagged() {
- None
- } else {
- Some(self.as_usize_unchecked())
- }
- }
-
/// Return this lazy state ID as an untagged `usize`.
///
/// If this lazy state ID is tagged, then the usize returned is the state
@@ -215,6 +220,7 @@ impl LazyStateID {
/// be tagged (and thus greater than LazyStateID::MAX).
#[inline]
pub(crate) const fn as_usize_unchecked(&self) -> usize {
+ // FIXME: Use as_usize() once const functions in traits are stable.
self.0 as usize
}
@@ -297,6 +303,11 @@ impl LazyStateID {
/// Return true if and only if this lazy state ID has been tagged as a
/// start state.
+ ///
+ /// Note that if
+ /// [`Config::specialize_start_states`](crate::hybrid::dfa::Config) is
+ /// disabled (which is the default), then this will always return false
+ /// since start states won't be tagged.
#[inline]
pub const fn is_start(&self) -> bool {
self.as_usize_unchecked() & LazyStateID::MASK_START > 0
@@ -341,75 +352,3 @@ impl core::fmt::Display for LazyStateIDError {
)
}
}
-
-/// Represents the current state of an overlapping search.
-///
-/// This is used for overlapping searches since they need to know something
-/// about the previous search. For example, when multiple patterns match at the
-/// same position, this state tracks the last reported pattern so that the next
-/// search knows whether to report another matching pattern or continue with
-/// the search at the next position. Additionally, it also tracks which state
-/// the last search call terminated in.
-///
-/// This type provides no introspection capabilities. The only thing a caller
-/// can do is construct it and pass it around to permit search routines to use
-/// it to track state.
-///
-/// Callers should always provide a fresh state constructed via
-/// [`OverlappingState::start`] when starting a new search. Reusing state from
-/// a previous search may result in incorrect results.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub struct OverlappingState {
- /// The state ID of the state at which the search was in when the call
- /// terminated. When this is a match state, `last_match` must be set to a
- /// non-None value.
- ///
- /// A `None` value indicates the start state of the corresponding
- /// automaton. We cannot use the actual ID, since any one automaton may
- /// have many start states, and which one is in use depends on several
- /// search-time factors.
- id: Option<LazyStateID>,
- /// Information associated with a match when `id` corresponds to a match
- /// state.
- last_match: Option<StateMatch>,
-}
-
-/// Internal state about the last match that occurred. This records both the
-/// offset of the match and the match index.
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-pub(crate) struct StateMatch {
- /// The index into the matching patterns for the current match state.
- pub(crate) match_index: usize,
- /// The offset in the haystack at which the match occurred. This is used
- /// when reporting multiple matches at the same offset. That is, when
- /// an overlapping search runs, the first thing it checks is whether it's
- /// already in a match state, and if so, whether there are more patterns
- /// to report as matches in that state. If so, it increments `match_index`
- /// and returns the pattern and this offset. Once `match_index` exceeds the
- /// number of matching patterns in the current state, the search continues.
- pub(crate) offset: usize,
-}
-
-impl OverlappingState {
- /// Create a new overlapping state that begins at the start state of any
- /// automaton.
- pub fn start() -> OverlappingState {
- OverlappingState { id: None, last_match: None }
- }
-
- pub(crate) fn id(&self) -> Option<LazyStateID> {
- self.id
- }
-
- pub(crate) fn set_id(&mut self, id: LazyStateID) {
- self.id = Some(id);
- }
-
- pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> {
- self.last_match.as_mut()
- }
-
- pub(crate) fn set_last_match(&mut self, last_match: StateMatch) {
- self.last_match = Some(last_match);
- }
-}
diff --git a/vendor/regex-automata/src/hybrid/mod.rs b/vendor/regex-automata/src/hybrid/mod.rs
index 4c8ca7ebe..44e67e129 100644
--- a/vendor/regex-automata/src/hybrid/mod.rs
+++ b/vendor/regex-automata/src/hybrid/mod.rs
@@ -1,5 +1,5 @@
/*!
-A module for building and searching with lazy determinstic finite automata
+A module for building and searching with lazy deterministic finite automata
(DFAs).
Like other modules in this crate, lazy DFAs support a rich regex syntax with
@@ -26,64 +26,38 @@ This example shows how to compile a regex using the default configuration
and then use it to find matches in a byte string:
```
-use regex_automata::{hybrid::regex::Regex, MultiMatch};
+use regex_automata::{hybrid::regex::Regex, Match};
let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
let mut cache = re.create_cache();
-let text = b"2018-12-24 2016-10-08";
-let matches: Vec<MultiMatch> =
- re.find_leftmost_iter(&mut cache, text).collect();
+let haystack = "2018-12-24 2016-10-08";
+let matches: Vec<Match> = re.find_iter(&mut cache, haystack).collect();
assert_eq!(matches, vec![
- MultiMatch::must(0, 0, 10),
- MultiMatch::must(0, 11, 21),
+ Match::must(0, 0..10),
+ Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
-# Example: searching with regex sets
+# Example: searching with multiple regexes
The lazy DFAs in this module all fully support searching with multiple regexes
simultaneously. You can use this support with standard leftmost-first style
searching to find non-overlapping matches:
```
-use regex_automata::{hybrid::regex::Regex, MultiMatch};
+# if cfg!(miri) { return Ok(()); } // miri takes too long
+use regex_automata::{hybrid::regex::Regex, Match};
let re = Regex::new_many(&[r"\w+", r"\S+"])?;
let mut cache = re.create_cache();
-let text = b"@foo bar";
-let matches: Vec<MultiMatch> =
- re.find_leftmost_iter(&mut cache, text).collect();
+let haystack = "@foo bar";
+let matches: Vec<Match> = re.find_iter(&mut cache, haystack).collect();
assert_eq!(matches, vec![
- MultiMatch::must(1, 0, 4),
- MultiMatch::must(0, 5, 8),
-]);
-# Ok::<(), Box<dyn std::error::Error>>(())
-```
-
-Or use overlapping style searches to find all possible occurrences:
-
-```
-use regex_automata::{hybrid::{dfa, regex::Regex}, MatchKind, MultiMatch};
-
-// N.B. For overlapping searches, we need the underlying lazy DFA to report all
-// possible matches.
-let re = Regex::builder()
- .dfa(dfa::Config::new().match_kind(MatchKind::All))
- .build_many(&[r"\w{3}", r"\S{3}"])?;
-let mut cache = re.create_cache();
-
-let text = b"@foo bar";
-let matches: Vec<MultiMatch> =
- re.find_overlapping_iter(&mut cache, text).collect();
-assert_eq!(matches, vec![
- MultiMatch::must(1, 0, 3),
- MultiMatch::must(0, 1, 4),
- MultiMatch::must(1, 1, 4),
- MultiMatch::must(0, 5, 8),
- MultiMatch::must(1, 5, 8),
+ Match::must(1, 0..4),
+ Match::must(0, 5..8),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
@@ -122,8 +96,9 @@ created for each byte seen, which would make searching quite a bit slower.
A fully compiled DFA never has to worry about searches being slower once
it's built. (Aside from, say, the transition table being so large that it
is subject to harsh CPU cache effects.) However, of course, building a full
-DFA can be quite time consuming and memory hungry. Particularly when it's
-so easy to build large DFAs when Unicode mode is enabled.
+DFA can be quite time consuming and memory hungry, particularly when large
+Unicode character classes are used, since they tend to translate into very
+large DFAs.
A lazy DFA strikes a nice balance _in practice_, particularly in the
presence of Unicode mode, by only building what is needed. It avoids the
@@ -142,7 +117,8 @@ There are two things that are not supported by the lazy DFAs in this module:
* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
of them) can only find the offsets of an entire match, but cannot resolve
the offsets of each capturing group. This is because DFAs do not have the
-expressive power necessary.
+expressive power necessary. Note that it is okay to build a lazy DFA from an
+NFA that contains capture groups. The capture groups will simply be ignored.
* Unicode word boundaries. These present particularly difficult challenges for
DFA construction and would result in an explosion in the number of states.
One can enable [`dfa::Config::unicode_word_boundary`] though, which provides
@@ -154,22 +130,11 @@ There are no plans to lift either of these limitations.
Note that these restrictions are identical to the restrictions on fully
compiled DFAs.
-
-# Support for `alloc`-only
-
-This crate comes with `alloc` and `std` features that are enabled by default.
-One can disable the `std` feature and still use the full API of a lazy DFA.
-(You should use `std` when possible, since it permits providing implementations
-of the `std::error::Error` trait, and does enable some minor internal
-optimizations.)
-
-This module does require at least the `alloc` feature though. It is not
-available in any capacity without `alloc`.
*/
pub use self::{
error::{BuildError, CacheError},
- id::{LazyStateID, OverlappingState},
+ id::LazyStateID,
};
pub mod dfa;
diff --git a/vendor/regex-automata/src/hybrid/regex.rs b/vendor/regex-automata/src/hybrid/regex.rs
index 7cc6b9064..75667daf9 100644
--- a/vendor/regex-automata/src/hybrid/regex.rs
+++ b/vendor/regex-automata/src/hybrid/regex.rs
@@ -1,10 +1,10 @@
/*!
A lazy DFA backed `Regex`.
-This module provides [`Regex`] using lazy DFA. A `Regex` implements convenience
-routines you might have come to expect, such as finding a match and iterating
-over all non-overlapping matches. This `Regex` type is limited in its
-capabilities to what a lazy DFA can provide. Therefore, APIs involving
+This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements
+convenience routines you might have come to expect, such as finding a match
+and iterating over all non-overlapping matches. This `Regex` type is limited
+in its capabilities to what a lazy DFA can provide. Therefore, APIs involving
capturing groups, for example, are not provided.
Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
@@ -14,20 +14,15 @@ find the start offset of a match.
See the [parent module](crate::hybrid) for examples.
*/
-use core::borrow::Borrow;
-
-use alloc::boxed::Box;
-
use crate::{
hybrid::{
dfa::{self, DFA},
error::BuildError,
- OverlappingState,
},
nfa::thompson,
util::{
- matchtypes::{MatchError, MatchKind, MultiMatch},
- prefilter::{self, Prefilter},
+ iter,
+ search::{Anchored, Input, Match, MatchError, MatchKind},
},
};
@@ -42,89 +37,20 @@ use crate::{
/// found by the forward DFA guarantees that the reverse DFA will also find
/// a match.
///
-/// A `Regex` can also have a prefilter set via the
-/// [`set_prefilter`](Regex::set_prefilter) method. By default, no prefilter is
-/// enabled.
-///
-/// # Earliest vs Leftmost vs Overlapping
-///
-/// The search routines exposed on a `Regex` reflect three different ways
-/// of searching:
-///
-/// * "earliest" means to stop as soon as a match has been detected.
-/// * "leftmost" means to continue matching until the underlying
-/// automaton cannot advance. This reflects "standard" searching you
-/// might be used to in other regex engines. e.g., This permits
-/// non-greedy and greedy searching to work as you would expect.
-/// * "overlapping" means to find all possible matches, even if they
-/// overlap.
-///
-/// Generally speaking, when doing an overlapping search, you'll want to
-/// build your regex lazy DFAs with [`MatchKind::All`] semantics. Using
-/// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is
-/// likely to lead to odd behavior since `LeftmostFirst` specifically omits
-/// some matches that can never be reported due to its semantics.
-///
-/// The following example shows the differences between how these different
-/// types of searches impact looking for matches of `[a-z]+` in the
-/// haystack `abc`.
-///
-/// ```
-/// use regex_automata::{hybrid::{dfa, regex}, MatchKind, MultiMatch};
-///
-/// let pattern = r"[a-z]+";
-/// let haystack = "abc".as_bytes();
-///
-/// // With leftmost-first semantics, we test "earliest" and "leftmost".
-/// let re = regex::Builder::new()
-/// .dfa(dfa::Config::new().match_kind(MatchKind::LeftmostFirst))
-/// .build(pattern)?;
-/// let mut cache = re.create_cache();
-///
-/// // "earliest" searching isn't impacted by greediness
-/// let mut it = re.find_earliest_iter(&mut cache, haystack);
-/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
-/// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
-/// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
-/// assert_eq!(None, it.next());
-///
-/// // "leftmost" searching supports greediness (and non-greediness)
-/// let mut it = re.find_leftmost_iter(&mut cache, haystack);
-/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
-/// assert_eq!(None, it.next());
-///
-/// // For overlapping, we want "all" match kind semantics.
-/// let re = regex::Builder::new()
-/// .dfa(dfa::Config::new().match_kind(MatchKind::All))
-/// .build(pattern)?;
-/// let mut cache = re.create_cache();
-///
-/// // In the overlapping search, we find all three possible matches
-/// // starting at the beginning of the haystack.
-/// let mut it = re.find_overlapping_iter(&mut cache, haystack);
-/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
-/// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next());
-/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
-/// assert_eq!(None, it.next());
-///
-/// # Ok::<(), Box<dyn std::error::Error>>(())
-/// ```
-///
/// # Fallibility
///
-/// In non-default configurations, the lazy DFAs generated in this module may
-/// return an error during a search. (Currently, the only way this happens is
-/// if quit bytes are added, Unicode word boundaries are heuristically enabled,
-/// or if the cache is configured to "give up" on a search if it has been
-/// cleared too many times. All of these are turned off by default, which means
-/// a search can never fail in the default configuration.) For convenience,
-/// the main search routines, like [`find_leftmost`](Regex::find_leftmost),
-/// will panic if an error occurs. However, if you need to use DFAs which may
-/// produce an error at search time, then there are fallible equivalents of
-/// all search routines. For example, for `find_leftmost`, its fallible analog
-/// is [`try_find_leftmost`](Regex::try_find_leftmost). The routines prefixed
-/// with `try_` return `Result<Option<MultiMatch>, MatchError>`, where as the
-/// infallible routines simply return `Option<MultiMatch>`.
+/// Most of the search routines defined on this type will _panic_ when the
+/// underlying search fails. This might happen because the DFA quit after
+/// seeing a quit byte, whether configured explicitly or via heuristic Unicode
+/// word boundary support, although neither is enabled by default. It might
+/// also fail if the underlying DFA determines it isn't making effective use of
+/// the cache (which also never happens by default). Or it might fail because
+/// an invalid `Input` configuration is given, for example, with an unsupported
+/// [`Anchored`] mode.
+///
+/// If you need to handle these error cases instead of allowing them to trigger
+/// a panic, then the lower level [`Regex::try_search`] provides a fallible API
+/// that never panics.
///
/// # Example
///
@@ -134,28 +60,26 @@ use crate::{
/// across a line boundary.
///
/// ```
-/// use regex_automata::{hybrid::{dfa, regex::Regex}, MatchError};
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError};
///
/// let re = Regex::builder()
/// .dfa(dfa::Config::new().quit(b'\n', true))
/// .build(r"foo\p{any}+bar")?;
/// let mut cache = re.create_cache();
///
-/// let haystack = "foo\nbar".as_bytes();
+/// let input = Input::new("foo\nbar");
/// // Normally this would produce a match, since \p{any} contains '\n'.
/// // But since we instructed the automaton to enter a quit state if a
/// // '\n' is observed, this produces a match error instead.
-/// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
-/// let got = re.try_find_leftmost(&mut cache, haystack).unwrap_err();
+/// let expected = MatchError::quit(b'\n', 3);
+/// let got = re.try_search(&mut cache, &input).unwrap_err();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Debug)]
pub struct Regex {
- /// An optional prefilter that is passed down to the lazy DFA search
- /// routines when present. By default, no prefilter is set.
- pre: Option<Box<dyn Prefilter>>,
/// The forward lazy DFA. This can only find the end of a match.
forward: DFA,
/// The reverse lazy DFA. This can only find the start of a match.
@@ -169,9 +93,6 @@ pub struct Regex {
/// we might wind up finding the "leftmost" starting position of a totally
/// different pattern!
reverse: DFA,
- /// Whether iterators on this type should advance by one codepoint or one
- /// byte when an empty match is seen.
- utf8: bool,
}
/// Convenience routines for regex and cache construction.
@@ -185,86 +106,49 @@ impl Regex {
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ /// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re = Regex::new("foo[0-9]+bar")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
- /// Some(MultiMatch::must(0, 3, 14)),
- /// re.find_leftmost(&mut cache, b"zzzfoo12345barzzz"),
+ /// Some(Match::must(0, 3..14)),
+ /// re.find(&mut cache, "zzzfoo12345barzzz"),
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
+ #[cfg(feature = "syntax")]
pub fn new(pattern: &str) -> Result<Regex, BuildError> {
Regex::builder().build(pattern)
}
- /// Like `new`, but parses multiple patterns into a single "regex set."
+ /// Like `new`, but parses multiple patterns into a single "multi regex."
/// This similarly uses the default regex configuration.
///
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ /// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
/// let mut cache = re.create_cache();
///
- /// let mut it = re.find_leftmost_iter(
- /// &mut cache,
- /// b"abc 1 foo 4567 0 quux",
- /// );
- /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
- /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
- /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
- /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
+ /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
+ /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
+ /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
+ /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
+ /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
+ /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
/// assert_eq!(None, it.next());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
+ #[cfg(feature = "syntax")]
pub fn new_many<P: AsRef<str>>(
patterns: &[P],
) -> Result<Regex, BuildError> {
Regex::builder().build_many(patterns)
}
- /// Return a default configuration for a `Regex`.
- ///
- /// This is a convenience routine to avoid needing to import the `Config`
- /// type when customizing the construction of a regex.
- ///
- /// # Example
- ///
- /// This example shows how to disable UTF-8 mode for `Regex` iteration.
- /// When UTF-8 mode is disabled, the position immediately following an
- /// empty match is where the next search begins, instead of the next
- /// position of a UTF-8 encoded codepoint.
- ///
- /// ```
- /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
- ///
- /// let re = Regex::builder()
- /// .configure(Regex::config().utf8(false))
- /// .build(r"")?;
- /// let mut cache = re.create_cache();
- ///
- /// let haystack = "a☃z".as_bytes();
- /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn config() -> Config {
- Config::new()
- }
-
/// Return a builder for configuring the construction of a `Regex`.
///
/// This is a convenience routine to avoid needing to import the
@@ -276,22 +160,20 @@ impl Regex {
/// everywhere.
///
/// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
- /// hybrid::regex::Regex,
- /// nfa::thompson,
- /// MultiMatch, SyntaxConfig,
+ /// hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
/// };
///
/// let re = Regex::builder()
- /// .configure(Regex::config().utf8(false))
- /// .syntax(SyntaxConfig::new().utf8(false))
+ /// .syntax(syntax::Config::new().utf8(false))
/// .thompson(thompson::Config::new().utf8(false))
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let mut cache = re.create_cache();
///
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
- /// let expected = Some(MultiMatch::must(0, 1, 9));
- /// let got = re.find_leftmost(&mut cache, haystack);
+ /// let expected = Some(Match::must(0, 1..9));
+ /// let got = re.find(&mut cache, haystack);
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -325,15 +207,16 @@ impl Regex {
/// This shows how to re-purpose a cache for use with a different `Regex`.
///
/// ```
- /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re1 = Regex::new(r"\w")?;
/// let re2 = Regex::new(r"\W")?;
///
/// let mut cache = re1.create_cache();
/// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 2)),
- /// re1.find_leftmost(&mut cache, "Δ".as_bytes()),
+ /// Some(Match::must(0, 0..2)),
+ /// re1.find(&mut cache, "Δ"),
/// );
///
/// // Using 'cache' with re2 is not allowed. It may result in panics or
@@ -344,8 +227,8 @@ impl Regex {
/// // allowed.
/// re2.reset_cache(&mut cache);
/// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 3)),
- /// re2.find_leftmost(&mut cache, "☃".as_bytes()),
+ /// Some(Match::must(0, 0..3)),
+ /// re2.find(&mut cache, "☃"),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -367,78 +250,49 @@ impl Regex {
///
/// # Panics
///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_is_match`](Regex::try_is_match).
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::hybrid::regex::Regex;
- ///
- /// let re = Regex::new("foo[0-9]+bar")?;
- /// let mut cache = re.create_cache();
+ /// This routine panics if the search could not complete. This can occur
+ /// in a number of circumstances:
///
- /// assert_eq!(true, re.is_match(&mut cache, b"foo12345bar"));
- /// assert_eq!(false, re.is_match(&mut cache, b"foobar"));
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn is_match(&self, cache: &mut Cache, haystack: &[u8]) -> bool {
- self.try_is_match(cache, haystack).unwrap()
- }
-
- /// Returns the first position at which a match is found.
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+    /// cache. This is not enabled by default, although it is typically a good
+    /// idea to enable it.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_earliest`](Regex::try_find_earliest).
+ /// Use [`Regex::try_search`] if you want to handle these error conditions.
///
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ /// use regex_automata::hybrid::regex::Regex;
///
- /// // Normally, the leftmost first match would greedily consume as many
- /// // decimal digits as it could. But a match is detected as soon as one
- /// // digit is seen.
- /// let re = Regex::new("foo[0-9]+")?;
+ /// let re = Regex::new("foo[0-9]+bar")?;
/// let mut cache = re.create_cache();
- /// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 4)),
- /// re.find_earliest(&mut cache, b"foo12345"),
- /// );
///
- /// // Normally, the end of the leftmost first match here would be 3,
- /// // but the "earliest" match semantics detect a match earlier.
- /// let re = Regex::new("abc|a")?;
- /// let mut cache = re.create_cache();
- /// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 1)),
- /// re.find_earliest(&mut cache, b"abc"),
- /// );
+ /// assert!(re.is_match(&mut cache, "foo12345bar"));
+ /// assert!(!re.is_match(&mut cache, "foobar"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn find_earliest(
+ #[inline]
+ pub fn is_match<'h, I: Into<Input<'h>>>(
&self,
cache: &mut Cache,
- haystack: &[u8],
- ) -> Option<MultiMatch> {
- self.try_find_earliest(cache, haystack).unwrap()
+ input: I,
+ ) -> bool {
+ // Not only can we do an "earliest" search, but we can avoid doing a
+ // reverse scan too.
+ self.forward()
+ .try_search_fwd(&mut cache.forward, &input.into().earliest(true))
+ .unwrap()
+ .is_some()
}
/// Returns the start and end offset of the leftmost match. If no match
@@ -446,25 +300,35 @@ impl Regex {
///
/// # Panics
///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
+ /// This routine panics if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+    /// cache. This is not enabled by default, although it is typically a good
+    /// idea to enable it.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
///
- /// The fallible version of this routine is
- /// [`try_find_leftmost`](Regex::try_find_leftmost).
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
+ ///
+ /// Use [`Regex::try_search`] if you want to handle these error conditions.
///
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ /// use regex_automata::{Match, hybrid::regex::Regex};
///
- /// // Greediness is applied appropriately when compared to find_earliest.
/// let re = Regex::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
- /// Some(MultiMatch::must(0, 3, 11)),
- /// re.find_leftmost(&mut cache, b"zzzfoo12345zzz"),
+ /// Some(Match::must(0, 3..11)),
+ /// re.find(&mut cache, "zzzfoo12345zzz"),
/// );
///
/// // Even though a match is found after reading the first byte (`a`),
@@ -473,892 +337,182 @@ impl Regex {
/// // parts.
/// let re = Regex::new("abc|a")?;
/// let mut cache = re.create_cache();
- /// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 3)),
- /// re.find_leftmost(&mut cache, b"abc"),
- /// );
+ /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn find_leftmost(
+ #[inline]
+ pub fn find<'h, I: Into<Input<'h>>>(
&self,
cache: &mut Cache,
- haystack: &[u8],
- ) -> Option<MultiMatch> {
- self.try_find_leftmost(cache, haystack).unwrap()
+ input: I,
+ ) -> Option<Match> {
+ self.try_search(cache, &input.into()).unwrap()
}
- /// Search for the first overlapping match in `haystack`.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// In particular, callers must preserve the automaton's search state from
- /// prior calls so that the implementation knows where the last match
- /// occurred and which pattern was reported.
- ///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_overlapping`](Regex::try_find_overlapping).
- ///
- /// # Example
- ///
- /// This example shows how to run an overlapping search with multiple
- /// regexes.
- ///
- /// ```
- /// use regex_automata::{
- /// hybrid::{dfa::DFA, regex::Regex, OverlappingState},
- /// MatchKind,
- /// MultiMatch,
- /// };
- ///
- /// let re = Regex::builder()
- /// .dfa(DFA::config().match_kind(MatchKind::All))
- /// .build_many(&[r"\w+$", r"\S+$"])?;
- /// let mut cache = re.create_cache();
- ///
- /// let haystack = "@foo".as_bytes();
- /// let mut state = OverlappingState::start();
- ///
- /// let expected = Some(MultiMatch::must(1, 0, 4));
- /// let got = re.find_overlapping(&mut cache, haystack, &mut state);
- /// assert_eq!(expected, got);
- ///
- /// // The first pattern also matches at the same position, so re-running
- /// // the search will yield another match. Notice also that the first
- /// // pattern is returned after the second. This is because the second
- /// // pattern begins its match before the first, is therefore an earlier
- /// // match and is thus reported first.
- /// let expected = Some(MultiMatch::must(0, 1, 4));
- /// let got = re.find_overlapping(&mut cache, haystack, &mut state);
- /// assert_eq!(expected, got);
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn find_overlapping(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- state: &mut OverlappingState,
- ) -> Option<MultiMatch> {
- self.try_find_overlapping(cache, haystack, state).unwrap()
- }
-
- /// Returns an iterator over all non-overlapping "earliest" matches.
- ///
- /// Match positions are reported as soon as a match is known to occur, even
- /// if the standard leftmost match would be longer.
+ /// Returns an iterator over all non-overlapping leftmost matches in the
+ /// given haystack. If no match exists, then the iterator yields no
+ /// elements.
///
/// # Panics
///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter).
- ///
- /// # Example
- ///
- /// This example shows how to run an "earliest" iterator.
- ///
- /// ```
- /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
- ///
- /// let re = Regex::new("[0-9]+")?;
- /// let mut cache = re.create_cache();
- /// let haystack = "123".as_bytes();
- ///
- /// // Normally, a standard leftmost iterator would return a single
- /// // match, but since "earliest" detects matches earlier, we get
- /// // three matches.
- /// let mut it = re.find_earliest_iter(&mut cache, haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn find_earliest_iter<'r, 'c, 't>(
- &'r self,
- cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> FindEarliestMatches<'r, 'c, 't> {
- FindEarliestMatches::new(self, cache, haystack)
- }
-
- /// Returns an iterator over all non-overlapping leftmost matches in the
- /// given bytes. If no match exists, then the iterator yields no elements.
+ /// This routine panics if the search could not complete. This can occur
+ /// in a number of circumstances:
///
- /// This corresponds to the "standard" regex search iterator.
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+ /// cache. This is not enabled by default, although it is typically a
+ /// good idea to enable it.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
///
- /// # Panics
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
+ /// The above conditions apply to the returned iterator as well. For
+ /// example, if the lazy DFA gives up or quits during a search using this
+ /// method, then a panic will occur during iteration.
///
- /// The fallible version of this routine is
- /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter).
+ /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher)
+ /// if you want to handle these error conditions; the second example below
+ /// sketches that approach.
///
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ /// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re = Regex::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
///
- /// let text = b"foo1 foo12 foo123";
- /// let matches: Vec<MultiMatch> = re
- /// .find_leftmost_iter(&mut cache, text)
- /// .collect();
+ /// let text = "foo1 foo12 foo123";
+ /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
/// assert_eq!(matches, vec![
- /// MultiMatch::must(0, 0, 4),
- /// MultiMatch::must(0, 5, 10),
- /// MultiMatch::must(0, 11, 17),
+ /// Match::must(0, 0..4),
+ /// Match::must(0, 5..10),
+ /// Match::must(0, 11..17),
/// ]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
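+ ///
+ /// # Example: fallible iteration
+ ///
+ /// This is a sketch of how the same iteration can be driven fallibly by
+ /// pairing [`Regex::try_search`] with a `util::iter::Searcher`, so that
+ /// quit or give-up conditions surface as errors instead of panics. (It
+ /// assumes `Searcher::try_advance` is the fallible analogue of the
+ /// `advance` method used by [`FindMatches`].)
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, util::iter, Input, Match};
+ ///
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ /// let mut it = iter::Searcher::new(Input::new("foo1 foo12"));
+ ///
+ /// let mut matches = vec![];
+ /// while let Some(m) = it.try_advance(|input| re.try_search(&mut cache, input))? {
+ ///     matches.push(m);
+ /// }
+ /// assert_eq!(matches, vec![Match::must(0, 0..4), Match::must(0, 5..10)]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```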
- pub fn find_leftmost_iter<'r, 'c, 't>(
- &'r self,
- cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> FindLeftmostMatches<'r, 'c, 't> {
- FindLeftmostMatches::new(self, cache, haystack)
- }
-
- /// Returns an iterator over all overlapping matches in the given haystack.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// The iterator takes care of handling the overlapping state that must be
- /// threaded through every search.
- ///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter).
- ///
- /// # Example
- ///
- /// This example shows how to run an overlapping search with multiple
- /// regexes.
- ///
- /// ```
- /// use regex_automata::{
- /// hybrid::{dfa::DFA, regex::Regex},
- /// MatchKind,
- /// MultiMatch,
- /// };
- ///
- /// let re = Regex::builder()
- /// .dfa(DFA::config().match_kind(MatchKind::All))
- /// .build_many(&[r"\w+$", r"\S+$"])?;
- /// let mut cache = re.create_cache();
- /// let haystack = "@foo".as_bytes();
- ///
- /// let mut it = re.find_overlapping_iter(&mut cache, haystack);
- /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn find_overlapping_iter<'r, 'c, 't>(
- &'r self,
- cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> FindOverlappingMatches<'r, 'c, 't> {
- FindOverlappingMatches::new(self, cache, haystack)
- }
-}
-
-/// Lower level infallible search routines that permit controlling where
-/// the search starts and ends in a particular sequence. This is useful for
-/// executing searches that need to take surrounding context into account. This
-/// is required for correctly implementing iteration because of look-around
-/// operators (`^`, `$`, `\b`).
-impl Regex {
- /// Returns true if and only if this regex matches the given haystack.
- ///
- /// This routine may short circuit if it knows that scanning future input
- /// will never lead to a different result. In particular, if the underlying
- /// DFA enters a match state or a dead state, then this routine will return
- /// `true` or `false`, respectively, without inspecting any future input.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_is_match_at`](Regex::try_is_match_at).
- pub fn is_match_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> bool {
- self.try_is_match_at(cache, haystack, start, end).unwrap()
- }
-
- /// Returns the first position at which a match is found.
- ///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
- ///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_earliest_at`](Regex::try_find_earliest_at).
- pub fn find_earliest_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Option<MultiMatch> {
- self.try_find_earliest_at(cache, haystack, start, end).unwrap()
- }
-
- /// Returns the same as `find_leftmost`, but starts the search at the given
- /// offset.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches within the
- /// same haystack, which cannot be done correctly by simply providing a
- /// subslice of `haystack`.
- ///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at).
- pub fn find_leftmost_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Option<MultiMatch> {
- self.try_find_leftmost_at(cache, haystack, start, end).unwrap()
- }
-
- /// Search for the first overlapping match within a given range of
- /// `haystack`.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// In particular, callers must preserve the automaton's search state from
- /// prior calls so that the implementation knows where the last match
- /// occurred and which pattern was reported.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
- ///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at).
- pub fn find_overlapping_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- state: &mut OverlappingState,
- ) -> Option<MultiMatch> {
- self.try_find_overlapping_at(cache, haystack, start, end, state)
- .unwrap()
- }
-}
-
-/// Fallible search routines. These may return an error when the underlying
-/// lazy DFAs have been configured in a way that permits them to fail during a
-/// search.
-///
-/// Errors during search only occur when the lazy DFA has been explicitly
-/// configured to do so, usually by specifying one or more "quit" bytes or by
-/// heuristically enabling Unicode word boundaries.
-///
-/// Errors will never be returned using the default configuration. So these
-/// fallible routines are only needed for particular configurations.
-impl Regex {
- /// Returns true if and only if this regex matches the given haystack.
- ///
- /// This routine may short circuit if it knows that scanning future input
- /// will never lead to a different result. In particular, if the underlying
- /// DFA enters a match state or a dead state, then this routine will return
- /// `true` or `false`, respectively, without inspecting any future input.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`is_match`](Regex::is_match).
- pub fn try_is_match(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- ) -> Result<bool, MatchError> {
- self.try_is_match_at(cache, haystack, 0, haystack.len())
- }
-
- /// Returns the first position at which a match is found.
- ///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_earliest`](Regex::find_earliest).
- pub fn try_find_earliest(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_earliest_at(cache, haystack, 0, haystack.len())
- }
-
- /// Returns the start and end offset of the leftmost match. If no match
- /// exists, then `None` is returned.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_leftmost`](Regex::find_leftmost).
- pub fn try_find_leftmost(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_leftmost_at(cache, haystack, 0, haystack.len())
- }
-
- /// Search for the first overlapping match in `haystack`.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// In particular, callers must preserve the automaton's search state from
- /// prior calls so that the implementation knows where the last match
- /// occurred and which pattern was reported.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_overlapping`](Regex::find_overlapping).
- pub fn try_find_overlapping(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- state: &mut OverlappingState,
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_overlapping_at(cache, haystack, 0, haystack.len(), state)
- }
-
- /// Returns an iterator over all non-overlapping "earliest" matches.
- ///
- /// Match positions are reported as soon as a match is known to occur, even
- /// if the standard leftmost match would be longer.
- ///
- /// # Errors
- ///
- /// This iterator only yields errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_earliest_iter`](Regex::find_earliest_iter).
- pub fn try_find_earliest_iter<'r, 'c, 't>(
- &'r self,
- cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> TryFindEarliestMatches<'r, 'c, 't> {
- TryFindEarliestMatches::new(self, cache, haystack)
- }
-
- /// Returns an iterator over all non-overlapping leftmost matches in the
- /// given bytes. If no match exists, then the iterator yields no elements.
- ///
- /// This corresponds to the "standard" regex search iterator.
- ///
- /// # Errors
- ///
- /// This iterator only yields errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_leftmost_iter`](Regex::find_leftmost_iter).
- pub fn try_find_leftmost_iter<'r, 'c, 't>(
+ #[inline]
+ pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
&'r self,
cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> TryFindLeftmostMatches<'r, 'c, 't> {
- TryFindLeftmostMatches::new(self, cache, haystack)
- }
-
- /// Returns an iterator over all overlapping matches in the given haystack.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// The iterator takes care of handling the overlapping state that must be
- /// threaded through every search.
- ///
- /// # Errors
- ///
- /// This iterator only yields errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_overlapping_iter`](Regex::find_overlapping_iter).
- pub fn try_find_overlapping_iter<'r, 'c, 't>(
- &'r self,
- cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> TryFindOverlappingMatches<'r, 'c, 't> {
- TryFindOverlappingMatches::new(self, cache, haystack)
+ input: I,
+ ) -> FindMatches<'r, 'c, 'h> {
+ let it = iter::Searcher::new(input.into());
+ FindMatches { re: self, cache, it }
}
}
-/// Lower level fallible search routines that permit controlling where the
-/// search starts and ends in a particular sequence.
+/// Lower level "search" primitives that accept a `&Input` for cheap reuse
+/// and return an error if one occurs instead of panicking.
impl Regex {
- /// Returns true if and only if this regex matches the given haystack.
- ///
- /// This routine may short circuit if it knows that scanning future input
- /// will never lead to a different result. In particular, if the underlying
- /// DFA enters a match state or a dead state, then this routine will return
- /// `true` or `false`, respectively, without inspecting any future input.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`is_match_at`](Regex::is_match_at).
- pub fn try_is_match_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<bool, MatchError> {
- self.forward()
- .find_leftmost_fwd_at(
- &mut cache.forward,
- self.scanner().as_mut(),
- None,
- haystack,
- start,
- end,
- )
- .map(|x| x.is_some())
- }
-
- /// Returns the first position at which a match is found.
- ///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_earliest_at`](Regex::find_earliest_at).
- pub fn try_find_earliest_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_earliest_at_imp(
- self.scanner().as_mut(),
- cache,
- haystack,
- start,
- end,
- )
- }
-
/// Returns the start and end offset of the leftmost match. If no match
/// exists, then `None` is returned.
///
- /// # Searching a substring of the haystack
+ /// This is like [`Regex::find`] but with two differences:
///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
+ /// 1. It is not generic over `Into<Input>` and instead accepts a
+ /// `&Input`. This permits reusing the same `Input` for multiple searches
+ /// without needing to create a new one. This _may_ help with latency (see
+ /// the example below).
+ /// 2. It returns an error if the search could not complete, whereas
+ /// [`Regex::find`] will panic.
///
/// # Errors
///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+ /// cache. This is not enabled by default, although it is typically a
+ /// good idea to enable it.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
/// exists or not.
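+ ///
+ /// # Example
+ ///
+ /// This sketch shows both motivations for `try_search`: an `Input` value
+ /// is created once and passed by reference, and a lazy DFA configured
+ /// with a quit byte reports an error instead of panicking. (It assumes
+ /// the hybrid DFA's `quit` config option behaves like its dense
+ /// counterpart.)
+ ///
+ /// ```
+ /// use regex_automata::{
+ ///     hybrid::{dfa::DFA, regex::Regex},
+ ///     Input, Match, MatchError,
+ /// };
+ ///
+ /// let re = Regex::builder()
+ ///     .dfa(DFA::config().quit(b'\n', true))
+ ///     .build(r"foo\p{any}+bar")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let input = Input::new("foofoofoobar");
+ /// assert_eq!(
+ ///     Ok(Some(Match::must(0, 0..12))),
+ ///     re.try_search(&mut cache, &input),
+ /// );
+ ///
+ /// // A quit byte in the haystack makes the search return an error.
+ /// let input = Input::new("foo\nbar");
+ /// assert_eq!(
+ ///     Err(MatchError::quit(b'\n', 3)),
+ ///     re.try_search(&mut cache, &input),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```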
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_leftmost_at`](Regex::find_leftmost_at).
- pub fn try_find_leftmost_at(
+ #[inline]
+ pub fn try_search(
&self,
cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_leftmost_at_imp(
- self.scanner().as_mut(),
- cache,
- haystack,
- start,
- end,
- )
- }
-
- /// Search for the first overlapping match within a given range of
- /// `haystack`.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// In particular, callers must preserve the automaton's search state from
- /// prior calls so that the implementation knows where the last match
- /// occurred and which pattern was reported.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_overlapping_at`](Regex::find_overlapping_at).
- pub fn try_find_overlapping_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- state: &mut OverlappingState,
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_overlapping_at_imp(
- self.scanner().as_mut(),
- cache,
- haystack,
- start,
- end,
- state,
- )
- }
-}
-
-impl Regex {
- #[inline(always)]
- fn try_find_earliest_at_imp(
- &self,
- pre: Option<&mut prefilter::Scanner>,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<MultiMatch>, MatchError> {
- let (fdfa, rdfa) = (self.forward(), self.reverse());
+ input: &Input<'_>,
+ ) -> Result<Option<Match>, MatchError> {
let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
- let end = match fdfa
- .find_earliest_fwd_at(fcache, pre, None, haystack, start, end)?
- {
+ let end = match self.forward().try_search_fwd(fcache, input)? {
None => return Ok(None),
Some(end) => end,
};
- // N.B. The only time we need to tell the reverse searcher the pattern
- // to match is in the overlapping case, since it's ambiguous. In the
- // earliest case, I have tentatively convinced myself that it isn't
- // necessary and the reverse search will always find the same pattern
- // to match as the forward search. But I lack a rigorous proof. Why not
- // just provide the pattern anyway? Well, if it is needed, then leaving
- // it out gives us a chance to find a witness.
- let start = rdfa
- .find_earliest_rev_at(rcache, None, haystack, start, end.offset())?
- .expect("reverse search must match if forward search does");
- assert_eq!(
- start.pattern(),
- end.pattern(),
- "forward and reverse search must match same pattern",
- );
- assert!(start.offset() <= end.offset());
- Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
- }
-
- #[inline(always)]
- fn try_find_leftmost_at_imp(
- &self,
- pre: Option<&mut prefilter::Scanner>,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<MultiMatch>, MatchError> {
- let (fdfa, rdfa) = (self.forward(), self.reverse());
- let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
- let end = match fdfa
- .find_leftmost_fwd_at(fcache, pre, None, haystack, start, end)?
- {
- None => return Ok(None),
- Some(end) => end,
- };
- // N.B. The only time we need to tell the reverse searcher the pattern
- // to match is in the overlapping case, since it's ambiguous. In the
- // leftmost case, I have tentatively convinced myself that it isn't
- // necessary and the reverse search will always find the same pattern
- // to match as the forward search. But I lack a rigorous proof. Why not
- // just provide the pattern anyway? Well, if it is needed, then leaving
- // it out gives us a chance to find a witness.
- let start = rdfa
- .find_leftmost_rev_at(rcache, None, haystack, start, end.offset())?
+ // This special cases an empty match at the beginning of the search. If
+ // our end matches our start, then since a reverse DFA can't match past
+ // the start, it must follow that our starting position is also our end
+ // position. So short circuit and skip the reverse search.
+ if input.start() == end.offset() {
+ return Ok(Some(Match::new(
+ end.pattern(),
+ end.offset()..end.offset(),
+ )));
+ }
+ // We can also skip the reverse search if we know our search was
+ // anchored. This occurs either when the input config is anchored or
+ // when we know the regex itself is anchored. In this case, we know the
+ // start of the match, if one is found, must be the start of the
+ // search.
+ if self.is_anchored(input) {
+ return Ok(Some(Match::new(
+ end.pattern(),
+ input.start()..end.offset(),
+ )));
+ }
+ // N.B. I have tentatively convinced myself that it isn't necessary
+ // to specify the specific pattern for the reverse search since the
+ // reverse search will always find the same pattern to match as the
+ // forward search. But I lack a rigorous proof. Why not just provide
+ // the pattern anyway? Well, if it is needed, then leaving it out
+ // gives us a chance to find a witness. (Also, if we don't need to
+ // specify the pattern, then we don't need to build the reverse DFA
+ // with 'starts_for_each_pattern' enabled. It doesn't matter too much
+ // for the lazy DFA, but does make the overall DFA bigger.)
+ //
+ // We also need to be careful to disable 'earliest' for the reverse
+ // search, since it could be enabled for the forward search. In the
+ // reverse case, to satisfy "leftmost" criteria, we need to match as
+ // much as we can. We also need to be careful to make the search
+ // anchored. We don't want the reverse search to report any matches
+ // other than the one beginning at the end of our forward search.
+ let revsearch = input
+ .clone()
+ .span(input.start()..end.offset())
+ .anchored(Anchored::Yes)
+ .earliest(false);
+ let start = self
+ .reverse()
+ .try_search_rev(rcache, &revsearch)?
.expect("reverse search must match if forward search does");
- assert_eq!(
+ debug_assert_eq!(
start.pattern(),
end.pattern(),
"forward and reverse search must match same pattern",
);
- assert!(start.offset() <= end.offset());
- Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+ debug_assert!(start.offset() <= end.offset());
+ Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
}
- #[inline(always)]
- fn try_find_overlapping_at_imp(
- &self,
- pre: Option<&mut prefilter::Scanner>,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- state: &mut OverlappingState,
- ) -> Result<Option<MultiMatch>, MatchError> {
- let (fdfa, rdfa) = (self.forward(), self.reverse());
- let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
- let end = match fdfa.find_overlapping_fwd_at(
- fcache, pre, None, haystack, start, end, state,
- )? {
- None => return Ok(None),
- Some(end) => end,
- };
- // Unlike the leftmost cases, the reverse overlapping search may match
- // a different pattern than the forward search. See test failures when
- // using `None` instead of `Some(end.pattern())` below. Thus, we must
- // run our reverse search using the pattern that matched in the forward
- // direction.
- let start = rdfa
- .find_leftmost_rev_at(
- rcache,
- Some(end.pattern()),
- haystack,
- 0,
- end.offset(),
- )?
- .expect("reverse search must match if forward search does");
- assert_eq!(
- start.pattern(),
- end.pattern(),
- "forward and reverse search must match same pattern",
- );
- assert!(start.offset() <= end.offset());
- Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+ /// Returns true if either the given input specifies an anchored search
+ /// or if the underlying NFA is always anchored.
+ fn is_anchored(&self, input: &Input<'_>) -> bool {
+ match input.get_anchored() {
+ Anchored::No => {
+ self.forward().get_nfa().is_always_start_anchored()
+ }
+ Anchored::Yes | Anchored::Pattern(_) => true,
+ }
}
}
@@ -1386,360 +540,45 @@ impl Regex {
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::hybrid::regex::Regex;
///
/// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
- /// assert_eq!(3, re.pattern_count());
+ /// assert_eq!(3, re.pattern_len());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn pattern_count(&self) -> usize {
- assert_eq!(
- self.forward().pattern_count(),
- self.reverse().pattern_count()
- );
- self.forward().pattern_count()
- }
-
- /// Convenience function for returning this regex's prefilter as a trait
- /// object.
- ///
- /// If this regex doesn't have a prefilter, then `None` is returned.
- pub fn prefilter(&self) -> Option<&dyn Prefilter> {
- self.pre.as_ref().map(|x| &**x)
- }
-
- /// Attach the given prefilter to this regex.
- pub fn set_prefilter(&mut self, pre: Option<Box<dyn Prefilter>>) {
- self.pre = pre;
- }
-
- /// Convenience function for returning a prefilter scanner.
- fn scanner(&self) -> Option<prefilter::Scanner> {
- self.prefilter().map(prefilter::Scanner::new)
+ pub fn pattern_len(&self) -> usize {
+ assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
+ self.forward().pattern_len()
}
}
-/// An iterator over all non-overlapping earliest matches for a particular
-/// infallible search.
+/// An iterator over all non-overlapping matches for an infallible search.
///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found. If the underlying search returns an error, then this panics.
+/// The iterator yields a [`Match`] value until no more matches could be found.
+/// If the underlying regex engine returns an error, then a panic occurs.
///
-/// The lifetime variables are as follows:
+/// The lifetime parameters are as follows:
///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Debug)]
-pub struct FindEarliestMatches<'r, 'c, 't>(TryFindEarliestMatches<'r, 'c, 't>);
-
-impl<'r, 'c, 't> FindEarliestMatches<'r, 'c, 't> {
- fn new(
- re: &'r Regex,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> FindEarliestMatches<'r, 'c, 't> {
- FindEarliestMatches(TryFindEarliestMatches::new(re, cache, text))
- }
-}
-
-impl<'r, 'c, 't> Iterator for FindEarliestMatches<'r, 'c, 't> {
- type Item = MultiMatch;
-
- fn next(&mut self) -> Option<MultiMatch> {
- next_unwrap(self.0.next())
- }
-}
-
-/// An iterator over all non-overlapping leftmost matches for a particular
-/// infallible search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found. If the underlying search returns an error, then this panics.
-///
-/// The lifetime variables are as follows:
+/// * `'r` represents the lifetime of the regex object.
+/// * `'c` represents the lifetime of the regex cache.
+/// * `'h` represents the lifetime of the haystack being searched.
///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
+/// This iterator can be created with the [`Regex::find_iter`] method.
#[derive(Debug)]
-pub struct FindLeftmostMatches<'r, 'c, 't>(TryFindLeftmostMatches<'r, 'c, 't>);
-
-impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> {
- fn new(
- re: &'r Regex,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> FindLeftmostMatches<'r, 'c, 't> {
- FindLeftmostMatches(TryFindLeftmostMatches::new(re, cache, text))
- }
-}
-
-impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> {
- type Item = MultiMatch;
-
- fn next(&mut self) -> Option<MultiMatch> {
- next_unwrap(self.0.next())
- }
-}
-
-/// An iterator over all overlapping matches for a particular infallible
-/// search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found. If the underlying search returns an error, then this panics.
-///
-/// The lifetime variables are as follows:
-///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Debug)]
-pub struct FindOverlappingMatches<'r, 'c, 't>(
- TryFindOverlappingMatches<'r, 'c, 't>,
-);
-
-impl<'r, 'c, 't> FindOverlappingMatches<'r, 'c, 't> {
- fn new(
- re: &'r Regex,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> FindOverlappingMatches<'r, 'c, 't> {
- FindOverlappingMatches(TryFindOverlappingMatches::new(re, cache, text))
- }
-}
-
-impl<'r, 'c, 't> Iterator for FindOverlappingMatches<'r, 'c, 't> {
- type Item = MultiMatch;
-
- fn next(&mut self) -> Option<MultiMatch> {
- next_unwrap(self.0.next())
- }
-}
-
-/// An iterator over all non-overlapping earliest matches for a particular
-/// fallible search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found.
-///
-/// The lifetime variables are as follows:
-///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Debug)]
-pub struct TryFindEarliestMatches<'r, 'c, 't> {
- re: &'r Regex,
- cache: &'c mut Cache,
- scanner: Option<prefilter::Scanner<'r>>,
- text: &'t [u8],
- last_end: usize,
- last_match: Option<usize>,
-}
-
-impl<'r, 'c, 't> TryFindEarliestMatches<'r, 'c, 't> {
- fn new(
- re: &'r Regex,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> TryFindEarliestMatches<'r, 'c, 't> {
- let scanner = re.scanner();
- TryFindEarliestMatches {
- re,
- cache,
- scanner,
- text,
- last_end: 0,
- last_match: None,
- }
- }
-}
-
-impl<'r, 'c, 't> Iterator for TryFindEarliestMatches<'r, 'c, 't> {
- type Item = Result<MultiMatch, MatchError>;
-
- fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
- if self.last_end > self.text.len() {
- return None;
- }
- let result = self.re.try_find_earliest_at_imp(
- self.scanner.as_mut(),
- self.cache,
- self.text,
- self.last_end,
- self.text.len(),
- );
- let m = match result {
- Err(err) => return Some(Err(err)),
- Ok(None) => return None,
- Ok(Some(m)) => m,
- };
- if m.is_empty() {
- // This is an empty match. To ensure we make progress, start
- // the next search at the smallest possible starting position
- // of the next match following this one.
- self.last_end = if self.re.utf8 {
- crate::util::next_utf8(self.text, m.end())
- } else {
- m.end() + 1
- };
- // Don't accept empty matches immediately following a match.
- // Just move on to the next match.
- if Some(m.end()) == self.last_match {
- return self.next();
- }
- } else {
- self.last_end = m.end();
- }
- self.last_match = Some(m.end());
- Some(Ok(m))
- }
-}
-
-/// An iterator over all non-overlapping leftmost matches for a particular
-/// fallible search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found.
-///
-/// The lifetime variables are as follows:
-///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Debug)]
-pub struct TryFindLeftmostMatches<'r, 'c, 't> {
- re: &'r Regex,
- cache: &'c mut Cache,
- scanner: Option<prefilter::Scanner<'r>>,
- text: &'t [u8],
- last_end: usize,
- last_match: Option<usize>,
-}
-
-impl<'r, 'c, 't> TryFindLeftmostMatches<'r, 'c, 't> {
- fn new(
- re: &'r Regex,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> TryFindLeftmostMatches<'r, 'c, 't> {
- let scanner = re.scanner();
- TryFindLeftmostMatches {
- re,
- cache,
- scanner,
- text,
- last_end: 0,
- last_match: None,
- }
- }
-}
-
-impl<'r, 'c, 't> Iterator for TryFindLeftmostMatches<'r, 'c, 't> {
- type Item = Result<MultiMatch, MatchError>;
-
- fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
- if self.last_end > self.text.len() {
- return None;
- }
- let result = self.re.try_find_leftmost_at_imp(
- self.scanner.as_mut(),
- self.cache,
- self.text,
- self.last_end,
- self.text.len(),
- );
- let m = match result {
- Err(err) => return Some(Err(err)),
- Ok(None) => return None,
- Ok(Some(m)) => m,
- };
- if m.is_empty() {
- // This is an empty match. To ensure we make progress, start
- // the next search at the smallest possible starting position
- // of the next match following this one.
- self.last_end = if self.re.utf8 {
- crate::util::next_utf8(self.text, m.end())
- } else {
- m.end() + 1
- };
- // Don't accept empty matches immediately following a match.
- // Just move on to the next match.
- if Some(m.end()) == self.last_match {
- return self.next();
- }
- } else {
- self.last_end = m.end();
- }
- self.last_match = Some(m.end());
- Some(Ok(m))
- }
-}
-
-/// An iterator over all overlapping matches for a particular fallible search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found.
-///
-/// The lifetime variables are as follows:
-///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Debug)]
-pub struct TryFindOverlappingMatches<'r, 'c, 't> {
+pub struct FindMatches<'r, 'c, 'h> {
re: &'r Regex,
cache: &'c mut Cache,
- scanner: Option<prefilter::Scanner<'r>>,
- text: &'t [u8],
- last_end: usize,
- state: OverlappingState,
+ it: iter::Searcher<'h>,
}
-impl<'r, 'c, 't> TryFindOverlappingMatches<'r, 'c, 't> {
- fn new(
- re: &'r Regex,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> TryFindOverlappingMatches<'r, 'c, 't> {
- let scanner = re.scanner();
- TryFindOverlappingMatches {
- re,
- cache,
- scanner,
- text,
- last_end: 0,
- state: OverlappingState::start(),
- }
- }
-}
+impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
+ type Item = Match;
-impl<'r, 'c, 't> Iterator for TryFindOverlappingMatches<'r, 'c, 't> {
- type Item = Result<MultiMatch, MatchError>;
-
- fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
- if self.last_end > self.text.len() {
- return None;
- }
- let result = self.re.try_find_overlapping_at_imp(
- self.scanner.as_mut(),
- self.cache,
- self.text,
- self.last_end,
- self.text.len(),
- &mut self.state,
- );
- let m = match result {
- Err(err) => return Some(Err(err)),
- Ok(None) => return None,
- Ok(Some(m)) => m,
- };
- // Unlike the non-overlapping case, we're OK with empty matches at this
- // level. In particular, the overlapping search algorithm is itself
- // responsible for ensuring that progress is always made.
- self.last_end = m.end();
- Some(Ok(m))
+ #[inline]
+ fn next(&mut self) -> Option<Match> {
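+ // Destructure through `*self` so that `cache` and `it` get disjoint
+ // borrows: the closure below needs `cache` mutably at the same time
+ // that `advance` borrows the searcher mutably.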
+ let FindMatches { re, ref mut cache, ref mut it } = *self;
+ it.advance(|input| re.try_search(cache, input))
}
}
@@ -1791,15 +630,16 @@ impl Cache {
/// This shows how to re-purpose a cache for use with a different `Regex`.
///
/// ```
- /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re1 = Regex::new(r"\w")?;
/// let re2 = Regex::new(r"\W")?;
///
/// let mut cache = re1.create_cache();
/// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 2)),
- /// re1.find_leftmost(&mut cache, "Δ".as_bytes()),
+ /// Some(Match::must(0, 0..2)),
+ /// re1.find(&mut cache, "Δ"),
/// );
///
/// // Using 'cache' with re2 is not allowed. It may result in panics or
@@ -1810,8 +650,8 @@ impl Cache {
/// // allowed.
/// cache.reset(&re2);
/// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 3)),
- /// re2.find_leftmost(&mut cache, "☃".as_bytes()),
+ /// Some(Match::must(0, 0..3)),
+ /// re2.find(&mut cache, "☃"),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -1821,13 +661,30 @@ impl Cache {
self.reverse.reset(re.reverse());
}
- /// Returns the heap memory usage, in bytes, as a sum of the forward and
- /// reverse lazy DFA caches.
+ /// Return a reference to the forward cache.
+ pub fn forward(&mut self) -> &dfa::Cache {
+ &self.forward
+ }
+
+ /// Return a reference to the reverse cache.
+ pub fn reverse(&mut self) -> &dfa::Cache {
+ &self.reverse
+ }
+
+ /// Return a mutable reference to the forward cache.
///
- /// This does **not** include the stack size used up by this cache. To
- /// compute that, use `std::mem::size_of::<Cache>()`.
- pub fn memory_usage(&self) -> usize {
- self.forward.memory_usage() + self.reverse.memory_usage()
+ /// If you need mutable references to both the forward and reverse caches,
+ /// then use [`Cache::as_parts_mut`].
+ pub fn forward_mut(&mut self) -> &mut dfa::Cache {
+ &mut self.forward
+ }
+
+ /// Return a mutable reference to the reverse cache.
+ ///
+ /// If you need mutable references to both the forward and reverse caches,
+ /// then use [`Cache::as_parts_mut`].
+ pub fn reverse_mut(&mut self) -> &mut dfa::Cache {
+ &mut self.reverse
}
/// Return references to the forward and reverse caches, respectively.
@@ -1840,111 +697,14 @@ impl Cache {
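+ ///
+ /// # Example
+ ///
+ /// A sketch of driving the forward lazy DFA directly with its half of the
+ /// cache, via the same `try_search_fwd` call that `Regex` itself uses:
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, Input};
+ ///
+ /// let re = Regex::new("[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ /// let (fcache, _rcache) = cache.as_parts_mut();
+ ///
+ /// // The forward DFA reports only the end offset of the leftmost match.
+ /// let half = re.forward().try_search_fwd(fcache, &Input::new("abc123"))?;
+ /// assert_eq!(Some(6), half.map(|hm| hm.offset()));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```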
pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) {
(&mut self.forward, &mut self.reverse)
}
-}
-/// The configuration used for compiling a hybrid NFA/DFA regex.
-///
-/// A regex configuration is a simple data object that is typically used with
-/// [`Builder::configure`].
-#[derive(Clone, Copy, Debug, Default)]
-pub struct Config {
- utf8: Option<bool>,
-}
-
-impl Config {
- /// Return a new default regex compiler configuration.
- pub fn new() -> Config {
- Config::default()
- }
-
- /// Whether to enable UTF-8 mode or not.
- ///
- /// When UTF-8 mode is enabled (the default) and an empty match is seen,
- /// the iterators on [`Regex`] will always start the next search at the
- /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8
- /// mode is disabled, such searches are begun at the next byte offset.
- ///
- /// If this mode is enabled and invalid UTF-8 is given to search, then
- /// behavior is unspecified.
- ///
- /// Generally speaking, one should enable this when
- /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8)
- /// and
- /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
- /// are enabled, and disable it otherwise.
- ///
- /// # Example
- ///
- /// This example demonstrates the differences between when this option is
- /// enabled and disabled. The differences only arise when the regex can
- /// return matches of length zero.
- ///
- /// In this first snippet, we show the results when UTF-8 mode is disabled.
- ///
- /// ```
- /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
- ///
- /// let re = Regex::builder()
- /// .configure(Regex::config().utf8(false))
- /// .build(r"")?;
- /// let mut cache = re.create_cache();
- ///
- /// let haystack = "a☃z".as_bytes();
- /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- ///
- /// And in this snippet, we execute the same search on the same haystack,
- /// but with UTF-8 mode enabled. Notice that byte offsets that would
- /// otherwise split the encoding of `☃` are not returned.
- ///
- /// ```
- /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
- ///
- /// let re = Regex::builder()
- /// .configure(Regex::config().utf8(true))
- /// .build(r"")?;
- /// let mut cache = re.create_cache();
- ///
- /// let haystack = "a☃z".as_bytes();
- /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
- /// assert_eq!(None, it.next());
+ /// Returns the heap memory usage, in bytes, as a sum of the forward and
+ /// reverse lazy DFA caches.
///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn utf8(mut self, yes: bool) -> Config {
- self.utf8 = Some(yes);
- self
- }
-
- /// Returns true if and only if this configuration has UTF-8 mode enabled.
- ///
- /// When UTF-8 mode is enabled and an empty match is seen, the iterators on
- /// [`Regex`] will always start the next search at the next UTF-8 encoded
- /// codepoint. When UTF-8 mode is disabled, such searches are begun at the
- /// next byte offset.
- pub fn get_utf8(&self) -> bool {
- self.utf8.unwrap_or(true)
- }
-
- /// Overwrite the default configuration such that the options in `o` are
- /// always used. If an option in `o` is not set, then the corresponding
- /// option in `self` is used. If it's not set in `self` either, then it
- /// remains not set.
- pub(crate) fn overwrite(self, o: Config) -> Config {
- Config { utf8: o.utf8.or(self.utf8) }
+ /// This does **not** include the stack size used up by this cache. To
+ /// compute that, use `std::mem::size_of::<Cache>()`.
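+ ///
+ /// # Example
+ ///
+ /// A small sketch of computing the full footprint under that definition:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::regex::{Cache, Regex};
+ ///
+ /// let re = Regex::new(r"\w+")?;
+ /// let cache = re.create_cache();
+ /// // Heap usage plus the size of the `Cache` value itself.
+ /// let total = cache.memory_usage() + std::mem::size_of::<Cache>();
+ /// assert!(total >= std::mem::size_of::<Cache>());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```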
+ pub fn memory_usage(&self) -> usize {
+ self.forward.memory_usage() + self.reverse.memory_usage()
}
}
@@ -1955,17 +715,15 @@ impl Config {
/// itself. This builder is different from a general purpose regex builder
/// in that it permits fine grain configuration of the construction process.
/// The trade off for this is complexity, and the possibility of setting a
-/// configuration that might not make sense. For example, there are three
+/// configuration that might not make sense. For example, there are two
/// different UTF-8 modes:
///
-/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the
-/// pattern itself can contain sub-expressions that match invalid UTF-8.
-/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
-/// controls whether the implicit unanchored prefix added to the NFA can
-/// match through invalid UTF-8 or not.
-/// * [`Config::utf8`] controls how the regex iterators themselves advance
-/// the starting position of the next search when a match with zero length is
-/// found.
+/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
+/// whether the pattern itself can contain sub-expressions that match invalid
+/// UTF-8.
+/// * [`thompson::Config::utf8`] controls how the regex iterators themselves
+/// advance the starting position of the next search when a match with zero
+/// length is found.
///
/// Generally speaking, callers will want to either enable all of these or
/// disable all of these.
@@ -1979,61 +737,54 @@ impl Config {
///
/// # Example
///
-/// This example shows how to disable UTF-8 mode in the syntax, the NFA and
-/// the regex itself. This is generally what you want for matching on
-/// arbitrary bytes.
+/// This example shows how to disable UTF-8 mode in both the syntax and the
+/// NFA. This is generally what you want for matching on arbitrary bytes.
///
/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
-/// hybrid::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig
+/// hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
/// };
///
/// let re = Regex::builder()
-/// .configure(Regex::config().utf8(false))
-/// .syntax(SyntaxConfig::new().utf8(false))
+/// .syntax(syntax::Config::new().utf8(false))
/// .thompson(thompson::Config::new().utf8(false))
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let mut cache = re.create_cache();
///
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
-/// let expected = Some(MultiMatch::must(0, 1, 9));
-/// let got = re.find_leftmost(&mut cache, haystack);
+/// let expected = Some(Match::must(0, 1..9));
+/// let got = re.find(&mut cache, haystack);
/// assert_eq!(expected, got);
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
/// // but the subsequent `.*` does not! Disabling UTF-8
-/// // on the syntax permits this. Notice also that the
-/// // search was unanchored and skipped over invalid UTF-8.
-/// // Disabling UTF-8 on the Thompson NFA permits this.
-/// //
-/// // N.B. This example does not show the impact of
-/// // disabling UTF-8 mode on Config, since that
-/// // only impacts regexes that can produce matches of
-/// // length 0.
+/// // on the syntax permits this.
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct Builder {
- config: Config,
dfa: dfa::Builder,
}
impl Builder {
/// Create a new regex builder with the default configuration.
pub fn new() -> Builder {
- Builder { config: Config::default(), dfa: DFA::builder() }
+ Builder { dfa: DFA::builder() }
}
/// Build a regex from the given pattern.
///
/// If there was a problem parsing or compiling the pattern, then an error
/// is returned.
+ #[cfg(feature = "syntax")]
pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
self.build_many(&[pattern])
}
/// Build a regex from the given patterns.
+ #[cfg(feature = "syntax")]
pub fn build_many<P: AsRef<str>>(
&self,
patterns: &[P],
@@ -2044,9 +795,9 @@ impl Builder {
.clone()
.configure(
DFA::config()
- .anchored(true)
- .match_kind(MatchKind::All)
- .starts_for_each_pattern(true),
+ .prefilter(None)
+ .specialize_start_states(false)
+ .match_kind(MatchKind::All),
)
.thompson(thompson::Config::new().reverse(true))
.build_many(patterns)?;
@@ -2054,28 +805,62 @@ impl Builder {
}
/// Build a regex from its component forward and reverse hybrid NFA/DFAs.
- fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {
- // The congruous method on DFA-backed regexes is exposed, but it's
- // not clear this builder is useful here since lazy DFAs can't be
- // serialized and there is only one type of them.
- let utf8 = self.config.get_utf8();
- Regex { pre: None, forward, reverse, utf8 }
- }
-
- /// Apply the given regex configuration options to this builder.
- pub fn configure(&mut self, config: Config) -> &mut Builder {
- self.config = self.config.overwrite(config);
- self
+ ///
+ /// This is useful when you've built a forward and reverse lazy DFA
+ /// separately, and want to combine them into a single regex. Once built,
+ /// the individual DFAs given can still be accessed via [`Regex::forward`]
+ /// and [`Regex::reverse`].
+ ///
+ /// It is important that the reverse lazy DFA be compiled under the
+ /// following conditions:
+ ///
+ /// * It should use [`MatchKind::All`] semantics.
+ /// * It should match in reverse.
+ /// * Otherwise, its configuration should match the forward DFA.
+ ///
+ /// If these conditions aren't satisfied, then the behavior of searches is
+ /// unspecified.
+ ///
+ /// Note that when using this constructor, no configuration is applied.
+ /// Since this routine provides the DFAs to the builder, there is no
+ /// opportunity to apply other configuration options.
+ ///
+ /// # Example
+ ///
+ /// This shows how to build individual lazy forward and reverse DFAs, and
+ /// then combine them into a single `Regex`.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::{dfa::DFA, regex::Regex},
+ /// nfa::thompson,
+ /// MatchKind,
+ /// };
+ ///
+ /// let fwd = DFA::new(r"foo[0-9]+")?;
+ /// let rev = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build(r"foo[0-9]+")?;
+ ///
+ /// let re = Regex::builder().build_from_dfas(fwd, rev);
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(true, re.is_match(&mut cache, "foo123"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {
+ Regex { forward, reverse }
}
/// Set the syntax configuration for this builder using
- /// [`SyntaxConfig`](crate::SyntaxConfig).
+ /// [`syntax::Config`](crate::util::syntax::Config).
///
/// This permits setting things like case insensitivity, Unicode and multi
/// line mode.
+ #[cfg(feature = "syntax")]
pub fn syntax(
&mut self,
- config: crate::util::syntax::SyntaxConfig,
+ config: crate::util::syntax::Config,
) -> &mut Builder {
self.dfa.syntax(config);
self
@@ -2086,6 +871,7 @@ impl Builder {
///
/// This permits setting things like whether additional time should be
/// spent shrinking the size of the NFA.
+ #[cfg(feature = "syntax")]
pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
self.dfa.thompson(config);
self
@@ -2107,18 +893,3 @@ impl Default for Builder {
Builder::new()
}
}
-
-#[inline(always)]
-fn next_unwrap(
- item: Option<Result<MultiMatch, MatchError>>,
-) -> Option<MultiMatch> {
- match item {
- None => None,
- Some(Ok(m)) => Some(m),
- Some(Err(err)) => panic!(
- "unexpected regex search error: {}\n\
- to handle search errors, use try_ methods",
- err,
- ),
- }
-}
diff --git a/vendor/regex-automata/src/hybrid/search.rs b/vendor/regex-automata/src/hybrid/search.rs
index 92760cee2..f23283685 100644
--- a/vendor/regex-automata/src/hybrid/search.rs
+++ b/vendor/regex-automata/src/hybrid/search.rs
@@ -1,663 +1,802 @@
use crate::{
hybrid::{
- dfa::{Cache, DFA},
- id::{LazyStateID, OverlappingState, StateMatch},
+ dfa::{Cache, OverlappingState, DFA},
+ id::LazyStateID,
},
- nfa::thompson,
util::{
- id::PatternID,
- matchtypes::{HalfMatch, MatchError},
- prefilter, MATCH_OFFSET,
+ prefilter::Prefilter,
+ search::{HalfMatch, Input, MatchError, Span},
},
};
#[inline(never)]
-pub(crate) fn find_earliest_fwd(
- pre: Option<&mut prefilter::Scanner>,
+pub(crate) fn find_fwd(
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- // Searching with a pattern ID is always anchored, so we should never use
- // a prefilter.
- if pre.is_some() && pattern_id.is_none() {
- find_fwd(pre, true, dfa, cache, pattern_id, bytes, start, end)
- } else {
- find_fwd(None, true, dfa, cache, pattern_id, bytes, start, end)
+ if input.is_done() {
+ return Ok(None);
}
-}
-
-#[inline(never)]
-pub(crate) fn find_leftmost_fwd(
- pre: Option<&mut prefilter::Scanner>,
- dfa: &DFA,
- cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
-) -> Result<Option<HalfMatch>, MatchError> {
- // Searching with a pattern ID is always anchored, so we should never use
- // a prefilter.
- if pre.is_some() && pattern_id.is_none() {
- find_fwd(pre, false, dfa, cache, pattern_id, bytes, start, end)
+ let pre = if input.get_anchored().is_anchored() {
+ None
+ } else {
+ dfa.get_config().get_prefilter()
+ };
+ // So what we do here is specialize four different versions of 'find_fwd':
+ // one for each of the combinations for 'has prefilter' and 'is earliest
+ // search'. The reason for doing this is that both of these things require
+ // branches and special handling in some code that can be very hot,
+ // and shaving off as much as we can when we don't need it tends to be
+ // beneficial in ad hoc benchmarks. To see these differences, you often
+ // need a query with a high match count. In other words, specializing these
+ // four routines *tends* to help latency more than throughput.
+ if pre.is_some() {
+ if input.get_earliest() {
+ find_fwd_imp(dfa, cache, input, pre, true)
+ } else {
+ find_fwd_imp(dfa, cache, input, pre, false)
+ }
} else {
- find_fwd(None, false, dfa, cache, pattern_id, bytes, start, end)
+ if input.get_earliest() {
+ find_fwd_imp(dfa, cache, input, None, true)
+ } else {
+ find_fwd_imp(dfa, cache, input, None, false)
+ }
}
}
-#[inline(always)]
-fn find_fwd(
- mut pre: Option<&mut prefilter::Scanner>,
- earliest: bool,
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn find_fwd_imp(
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- haystack: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
+ pre: Option<&'_ Prefilter>,
+ earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
- assert!(start <= end);
- assert!(start <= haystack.len());
- assert!(end <= haystack.len());
-
- // Why do this? This lets 'bytes[at]' work without bounds checks below.
- // It seems the assert on 'end <= haystack.len()' above is otherwise
- // not enough. Why not just make 'bytes' scoped this way anyway? Well,
- // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
- // for resolving look-ahead.
- let bytes = &haystack[..end];
+ // See 'prefilter_restart' docs for explanation.
+ let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty();
+ let mut mat = None;
+ let mut sid = init_fwd(dfa, cache, input)?;
+ let mut at = input.start();
+ // This could just be a closure, but then I think it would be unsound
+ // because it would need to be safe to invoke. This way, the lack of safety
+ // is clearer in the code below.
+ macro_rules! next_unchecked {
+ ($sid:expr, $at:expr) => {{
+ let byte = *input.haystack().get_unchecked($at);
+ dfa.next_state_untagged_unchecked(cache, $sid, byte)
+ }};
+ }
- let mut sid = init_fwd(dfa, cache, pattern_id, haystack, start, end)?;
- let mut last_match = None;
- let mut at = start;
- if let Some(ref mut pre) = pre {
- // If a prefilter doesn't report false positives, then we don't need to
- // touch the DFA at all. However, since all matches include the pattern
- // ID, and the prefilter infrastructure doesn't report pattern IDs, we
- // limit this optimization to cases where there is exactly one pattern.
- // In that case, any match must be the 0th pattern.
- if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
- return Ok(pre.next_candidate(bytes, at).into_option().map(
- |offset| HalfMatch { pattern: PatternID::ZERO, offset },
- ));
- } else if pre.is_effective(at) {
- match pre.next_candidate(bytes, at).into_option() {
- None => return Ok(None),
- Some(i) => {
- at = i;
+ if let Some(ref pre) = pre {
+ let span = Span::from(at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => return Ok(mat),
+ Some(ref span) => {
+ at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(dfa, cache, &input, at)?;
}
}
}
}
- while at < end {
+ cache.search_start(at);
+ while at < input.end() {
if sid.is_tagged() {
+ cache.search_update(at);
sid = dfa
- .next_state(cache, sid, bytes[at])
+ .next_state(cache, sid, input.haystack()[at])
.map_err(|_| gave_up(at))?;
- at += 1;
} else {
// SAFETY: There are two safety invariants we need to uphold
- // here in the loop below: that 'sid' is a valid state ID for
- // this DFA, and that 'at' is a valid index into 'bytes'. For
- // the former, we rely on the invariant that next_state* and
- // start_state_forward always returns a valid state ID (given a
- // valid state ID in the former case), and that we are only at this
- // place in the code if 'sid' is untagged. Moreover, every call to
- // next_state_untagged_unchecked below is guarded by a check that
- // sid is untagged. For the latter safety invariant, we always
- // guard unchecked access with a check that 'at' is less than
- // 'end', where 'end == bytes.len()'.
+ // here in the loops below: that 'sid' and 'prev_sid' are valid
+ // state IDs for this DFA, and that 'at' is a valid index into
+ // 'haystack'. For the former, we rely on the invariant that
+ // next_state* and start_state_forward always returns a valid state
+ // ID (given a valid state ID in the former case), and that we are
+ // only at this place in the code if 'sid' is untagged. Moreover,
+ // every call to next_state_untagged_unchecked below is guarded by
+ // a check that sid is untagged. For the latter safety invariant,
+ // we always guard unchecked access with a check that 'at' is less
+ // than 'end', where 'end <= haystack.len()'. In the unrolled loop
+ // below, we ensure that 'at' is always in bounds.
+ //
+ // PERF: For justification of omitting bounds checks, it gives us a
+ // ~10% bump in search time. This was used for a benchmark:
+ //
+ // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb
+ //
+ // PERF: For justification for the loop unrolling, we use a few
+ // different tests:
//
- // For justification, this gives us a ~10% bump in search time.
- // This was used for a benchmark:
+ // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb
+ // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb
+ // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb
//
- // regex-cli find hybrid regex @/some/big/file '(?m)^.+$' -UBb
+ // And there are three different configurations:
//
- // With bounds checked: ~881.4ms. Without: ~775ms. For input, I
- // used OpenSubtitles2018.raw.sample.medium.en.
+ // nounroll: this entire 'else' block vanishes and we just
+ // always use 'dfa.next_state(..)'.
+ // unroll1: just the outer loop below
+ // unroll2: just the inner loop below
+ // unroll3: both the outer and inner loops below
+ //
+ // This results in a matrix of timings for each of the above
+ // regexes with each of the above unrolling configurations:
+ //
+ // '\w{50}' '(?m)^.+$' 'ZQZQZQZQ'
+ // nounroll 1.51s 2.34s 1.51s
+ // unroll1 1.53s 2.32s 1.56s
+ // unroll2 2.22s 1.50s 0.61s
+ // unroll3 1.67s 1.45s 0.61s
+ //
+ // Ideally we'd be able to find a configuration that yields the
+ // best time for all regexes, but alas we settle for unroll3 that
+ // gives us *almost* the best for '\w{50}' and the best for the
+ // other two regexes.
+ //
+ // So what exactly is going on here? The first unrolling (grouping
+ // together runs of untagged transitions) specifically targets
+ // our choice of representation. The second unrolling (grouping
+ // together runs of self-transitions) specifically targets a common
+ // DFA topology. Let's dig in a little bit by looking at our
+ // regexes:
+ //
+ // '\w{50}': This regex spends a lot of time outside of the DFA's
+ // start state matching some part of the '\w' repetition. This
+ // means that it's a bit of a worst case for loop unrolling that
+ // targets self-transitions since the self-transitions in '\w{50}'
+ // are not particularly active for this haystack. However, the
+ // first unrolling (grouping together untagged transitions)
+ // does apply quite well here since very few transitions hit
+ // match/dead/quit/unknown states. It is however worth mentioning
+ // that if start states are configured to be tagged (which you
+ // typically want to do if you have a prefilter), then this regex
+ // actually slows way down because it is constantly ping-ponging
+ // out of the unrolled loop and into the handling of a tagged start
+ // state below. But when start states aren't tagged, the unrolled
+ // loop stays hot. (This is why it's imperative that start state
+ // tagging be disabled when there isn't a prefilter!)
+ //
+ // '(?m)^.+$': There are two important aspects of this regex: 1)
+ // on this haystack, its match count is very high, much higher
+ // than the other two regexes and 2) it spends the vast majority
+ // of its time matching '.+'. Since Unicode mode is disabled,
+ // this corresponds to repeatedly following self transitions for
+ // the vast majority of the input. This does benefit from the
+ // untagged unrolling since most of the transitions will be to
+ // untagged states, but the untagged unrolling does more work than
+ // what is actually required. Namely, it has to keep track of the
+ // previous and next state IDs, which I guess requires a bit more
+ // shuffling. This is supported by the fact that nounroll+unroll1
+ // are both slower than unroll2+unroll3, where the latter has a
+ // loop unrolling that specifically targets self-transitions.
+ //
+ // 'ZQZQZQZQ': This one is very similar to '(?m)^.+$' because it
+ // spends the vast majority of its time in self-transitions for
+ // the (implicit) unanchored prefix. The main difference with
+ // '(?m)^.+$' is that it has a much lower match count. So there
+ // isn't much time spent in the overhead of reporting matches. This
+ // is the primary explainer in the perf difference here. We include
+ // this regex and the former to make sure we have comparison points
+ // with high and low match counts.
+ //
+ // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'.
+ //
+ // NOTE: In a follow-up, it turns out that the "inner" loop
+ // mentioned above was a pretty big pessimization in some other
+ // cases. Namely, it resulted in too much ping-ponging into and out
+ // of the loop, which resulted in nearly ~2x regressions in search
+ // time when compared to the original lazy DFA in the regex crate.
+ // So I've removed the second loop unrolling that targets the
+ // self-transition case.
let mut prev_sid = sid;
- while at < end {
- prev_sid = sid;
- sid = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
+ while at < input.end() {
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if prev_sid.is_tagged() || at + 3 >= input.end() {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
at += 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
if sid.is_tagged() {
break;
}
- // SAFETY: we make four unguarded accesses to 'bytes[at]'
- // below, and each are safe because we know that 'at + 4' is
- // in bounds. Moreover, while we don't check whether 'sid' is
- // untagged directly, we know it is because of the check above.
- // And the unrolled loop below quits when the next state is not
- // equal to the previous state.
- //
- // PERF: For justification for eliminating bounds checks,
- // see above. For justification for the unrolling, we use
- // two tests. The one above with regex '(?m)^.+$', and also
- // '(?m)^.{40}$'. The former is kinda the best case for
- // unrolling, and gives a 1.67 boost primarily because the DFA
- // spends most of its time munching through the input in the
- // same state. But the latter pattern rarely spends time in the
- // same state through subsequent transitions, so unrolling is
- // pretty much always ineffective in that it craps out on the
- // first 'sid != next' check below. However, without unrolling,
- // search is only 1.03 times faster than with unrolling on the
- // latter pattern, which we deem to be an acceptable loss in
- // favor of optimizing the more common case of having a "hot"
- // state somewhere in the DFA.
- while at + 4 < end {
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at += 1;
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at += 1;
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at += 1;
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at += 1;
+ at += 1;
+
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if prev_sid.is_tagged() {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
}
+ at += 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if sid.is_tagged() {
+ break;
+ }
+ at += 1;
}
+ // If we quit out of the code above with an unknown state ID at
+ // any point, then we need to re-compute that transition using
+ // 'next_state', which will do NFA powerset construction for us.
if sid.is_unknown() {
+ cache.search_update(at);
sid = dfa
- .next_state(cache, prev_sid, bytes[at - 1])
- .map_err(|_| gave_up(at - 1))?;
+ .next_state(cache, prev_sid, input.haystack()[at])
+ .map_err(|_| gave_up(at))?;
}
}
if sid.is_tagged() {
if sid.is_start() {
- if let Some(ref mut pre) = pre {
- if pre.is_effective(at) {
- match pre.next_candidate(bytes, at).into_option() {
- None => return Ok(None),
- Some(i) => {
- at = i;
+ if let Some(ref pre) = pre {
+ let span = Span::from(at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => {
+ cache.search_finish(span.end);
+ return Ok(mat);
+ }
+ Some(ref span) => {
+ // We want to skip any update to 'at' below
+ // at the end of this iteration and just
+ // jump immediately back to the next state
+ // transition at the leading position of the
+ // candidate match.
+ //
+ // ... but only if we actually made progress
+ // with our prefilter, otherwise if the start
+ // state has a self-loop, we can get stuck.
+ if span.start > at {
+ at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(
+ dfa, cache, &input, at,
+ )?;
+ }
+ continue;
}
}
}
}
} else if sid.is_match() {
- last_match = Some(HalfMatch {
- pattern: dfa.match_pattern(cache, sid, 0),
- offset: at - MATCH_OFFSET,
- });
+ let pattern = dfa.match_pattern(cache, sid, 0);
+ // Since slice ranges are inclusive at the beginning and
+ // exclusive at the end, and since forward searches report
+ // the end, we can return 'at' as-is. This only works because
+ // matches are delayed by 1 byte. So by the time we observe a
+ // match, 'at' has already been set to 1 byte past the actual
+ // match location, which is precisely the exclusive ending
+ // bound of the match.
+ mat = Some(HalfMatch::new(pattern, at));
if earliest {
- return Ok(last_match);
+ cache.search_finish(at);
+ return Ok(mat);
}
} else if sid.is_dead() {
- return Ok(last_match);
+ cache.search_finish(at);
+ return Ok(mat);
} else if sid.is_quit() {
- if last_match.is_some() {
- return Ok(last_match);
- }
- let offset = at - 1;
- return Err(MatchError::Quit { byte: bytes[offset], offset });
+ cache.search_finish(at);
+ return Err(MatchError::quit(input.haystack()[at], at));
} else {
debug_assert!(sid.is_unknown());
unreachable!("sid being unknown is a bug");
}
}
+ at += 1;
}
- // We are careful to use 'haystack' here, which contains the full context
- // that we might want to inspect.
- Ok(eoi_fwd(dfa, cache, haystack, end, &mut sid)?.or(last_match))
+ eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?;
+ cache.search_finish(input.end());
+ Ok(mat)
}
#[inline(never)]
-pub(crate) fn find_earliest_rev(
+pub(crate) fn find_rev(
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
- find_rev(true, dfa, cache, pattern_id, bytes, start, end)
+ if input.is_done() {
+ return Ok(None);
+ }
+ if input.get_earliest() {
+ find_rev_imp(dfa, cache, input, true)
+ } else {
+ find_rev_imp(dfa, cache, input, false)
+ }
}
-#[inline(never)]
-pub(crate) fn find_leftmost_rev(
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn find_rev_imp(
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
-) -> Result<Option<HalfMatch>, MatchError> {
- find_rev(false, dfa, cache, pattern_id, bytes, start, end)
-}
-
-#[inline(always)]
-fn find_rev(
+ input: &Input<'_>,
earliest: bool,
- dfa: &DFA,
- cache: &mut Cache,
- pattern_id: Option<PatternID>,
- haystack: &[u8],
- start: usize,
- end: usize,
) -> Result<Option<HalfMatch>, MatchError> {
- assert!(start <= end);
- assert!(start <= haystack.len());
- assert!(end <= haystack.len());
-
- // Why do this? This lets 'bytes[at]' work without bounds checks below.
- // It seems the assert on 'end <= haystack.len()' above is otherwise
- // not enough. Why not just make 'bytes' scoped this way anyway? Well,
- // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
- // for resolving look-ahead.
- let bytes = &haystack[start..];
+ let mut mat = None;
+ let mut sid = init_rev(dfa, cache, input)?;
+ // In reverse search, the loop below can't handle the case of searching an
+ // empty slice. Ideally we could write something congruent to the forward
+ // search, i.e., 'while at >= start', but 'start' might be 0. Since we use
+ // an unsigned offset, 'at >= 0' is trivially always true. We could avoid
+ // this extra case handling by using a signed offset, but Rust makes it
+ // annoying to do. So... We just handle the empty case separately.
+ if input.start() == input.end() {
+ eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
+ return Ok(mat);
+ }
- let mut sid = init_rev(dfa, cache, pattern_id, haystack, start, end)?;
- let mut last_match = None;
- let mut at = end - start;
- while at > 0 {
+ let mut at = input.end() - 1;
+ macro_rules! next_unchecked {
+ ($sid:expr, $at:expr) => {{
+ let byte = *input.haystack().get_unchecked($at);
+ dfa.next_state_untagged_unchecked(cache, $sid, byte)
+ }};
+ }
+ cache.search_start(at);
+ loop {
if sid.is_tagged() {
- at -= 1;
+ cache.search_update(at);
sid = dfa
- .next_state(cache, sid, bytes[at])
+ .next_state(cache, sid, input.haystack()[at])
.map_err(|_| gave_up(at))?;
} else {
- // SAFETY: See comments in 'find_fwd' for both a safety argument
- // and a justification from a performance perspective as to 1) why
- // we elide bounds checks and 2) why we do a specialized version of
- // unrolling below.
+ // SAFETY: See comments in 'find_fwd' for a safety argument.
+ //
+ // PERF: The comments in 'find_fwd' also provide a justification
+ // from a performance perspective as to 1) why we elide bounds
+ // checks and 2) why we do a specialized version of unrolling
+ // below. The reverse search does have a slightly different
+ // consideration in that most reverse searches tend to be
+ // anchored and on shorter haystacks. However, this still makes a
+ // difference. Take this command for example:
+ //
+ // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb
+ //
+ // (Notice that we use 'find hybrid regex', not 'find hybrid dfa'
+ // like in the justification for the forward direction. The 'regex'
+ // sub-command will find start-of-match and thus run the reverse
+ // direction.)
+ //
+ // Without unrolling below, the above command takes around 3.76s.
+ // But with the unrolling below, we get down to 2.55s. If we keep
+ // the unrolling but add in bounds checks, then we get 2.86s.
+ //
+ // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'.
let mut prev_sid = sid;
- while at > 0 && !sid.is_tagged() {
- prev_sid = sid;
+ while at >= input.start() {
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if prev_sid.is_tagged()
+ || at <= input.start().saturating_add(3)
+ {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
at -= 1;
- while at > 3 {
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at -= 1;
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at -= 1;
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at -= 1;
- let next = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
- if sid != next {
- break;
- }
- at -= 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if sid.is_tagged() {
+ break;
+ }
+ at -= 1;
+
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if prev_sid.is_tagged() {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at -= 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if sid.is_tagged() {
+ break;
}
- sid = unsafe {
- dfa.next_state_untagged_unchecked(
- cache,
- sid,
- *bytes.get_unchecked(at),
- )
- };
+ at -= 1;
}
+ // If we quit out of the code above with an unknown state ID at
+ // any point, then we need to re-compute that transition using
+ // 'next_state', which will do NFA powerset construction for us.
if sid.is_unknown() {
+ cache.search_update(at);
sid = dfa
- .next_state(cache, prev_sid, bytes[at])
+ .next_state(cache, prev_sid, input.haystack()[at])
.map_err(|_| gave_up(at))?;
}
}
if sid.is_tagged() {
if sid.is_start() {
- continue;
+ // do nothing
} else if sid.is_match() {
- last_match = Some(HalfMatch {
- pattern: dfa.match_pattern(cache, sid, 0),
- offset: start + at + MATCH_OFFSET,
- });
+ let pattern = dfa.match_pattern(cache, sid, 0);
+ // Since reverse searches report the beginning of a match
+ // and the beginning is inclusive (not exclusive like the
+ // end of a match), we add 1 to make it inclusive.
+ mat = Some(HalfMatch::new(pattern, at + 1));
if earliest {
- return Ok(last_match);
+ cache.search_finish(at);
+ return Ok(mat);
}
} else if sid.is_dead() {
- return Ok(last_match);
+ cache.search_finish(at);
+ return Ok(mat);
+ } else if sid.is_quit() {
+ cache.search_finish(at);
+ return Err(MatchError::quit(input.haystack()[at], at));
} else {
- debug_assert!(sid.is_quit());
- if last_match.is_some() {
- return Ok(last_match);
- }
- return Err(MatchError::Quit { byte: bytes[at], offset: at });
+ debug_assert!(sid.is_unknown());
+ unreachable!("sid being unknown is a bug");
}
}
+ if at == input.start() {
+ break;
+ }
+ at -= 1;
}
- Ok(eoi_rev(dfa, cache, haystack, start, sid)?.or(last_match))
+ cache.search_finish(input.start());
+ eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
+ Ok(mat)
}
#[inline(never)]
pub(crate) fn find_overlapping_fwd(
- pre: Option<&mut prefilter::Scanner>,
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
- caller_state: &mut OverlappingState,
-) -> Result<Option<HalfMatch>, MatchError> {
- // Searching with a pattern ID is always anchored, so we should only ever
- // use a prefilter when no pattern ID is given.
- if pre.is_some() && pattern_id.is_none() {
- find_overlapping_fwd_imp(
- pre,
- dfa,
- cache,
- pattern_id,
- bytes,
- start,
- end,
- caller_state,
- )
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ state.mat = None;
+ if input.is_done() {
+ return Ok(());
+ }
+ let pre = if input.get_anchored().is_anchored() {
+ None
} else {
- find_overlapping_fwd_imp(
- None,
- dfa,
- cache,
- pattern_id,
- bytes,
- start,
- end,
- caller_state,
- )
+ dfa.get_config().get_prefilter()
+ };
+ if pre.is_some() {
+ find_overlapping_fwd_imp(dfa, cache, input, pre, state)
+ } else {
+ find_overlapping_fwd_imp(dfa, cache, input, None, state)
}
}
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_overlapping_fwd_imp(
- mut pre: Option<&mut prefilter::Scanner>,
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- mut start: usize,
- end: usize,
- caller_state: &mut OverlappingState,
-) -> Result<Option<HalfMatch>, MatchError> {
- assert!(start <= end);
- assert!(start <= bytes.len());
- assert!(end <= bytes.len());
-
- let mut sid = match caller_state.id() {
- None => init_fwd(dfa, cache, pattern_id, bytes, start, end)?,
+ input: &Input<'_>,
+ pre: Option<&'_ Prefilter>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ // See 'prefilter_restart' docs for explanation.
+ let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty();
+ let mut sid = match state.id {
+ None => {
+ state.at = input.start();
+ init_fwd(dfa, cache, input)?
+ }
Some(sid) => {
- if let Some(last) = caller_state.last_match() {
- let match_count = dfa.match_count(cache, sid);
- if last.match_index < match_count {
- let m = HalfMatch {
- pattern: dfa.match_pattern(
- cache,
- sid,
- last.match_index,
- ),
- offset: last.offset,
- };
- last.match_index += 1;
- return Ok(Some(m));
+ if let Some(match_index) = state.next_match_index {
+ let match_len = dfa.match_len(cache, sid);
+ if match_index < match_len {
+ state.next_match_index = Some(match_index + 1);
+ let pattern = dfa.match_pattern(cache, sid, match_index);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
}
}
-
- // This is a subtle but critical detail. If the caller provides a
- // non-None state ID, then it must be the case that the state ID
- // corresponds to one set by this function. The state ID therefore
- // corresponds to a match state, a dead state or some other state.
- // However, "some other" state _only_ occurs when the input has
- // been exhausted because the only way to stop before then is to
- // see a match or a dead/quit state.
- //
- // If the input is exhausted or if it's a dead state, then
- // incrementing the starting position has no relevance on
- // correctness, since the loop below will either not execute
- // at all or will immediately stop due to being in a dead state.
- // (Once in a dead state it is impossible to leave it.)
- //
- // Therefore, the only case we need to consider is when
- // caller_state is a match state. In this case, since our machines
- // support the ability to delay a match by a certain number of
- // bytes (to support look-around), it follows that we actually
- // consumed that many additional bytes on our previous search. When
- // the caller resumes their search to find subsequent matches, they
- // will use the ending location from the previous match as the next
- // starting point, which is `match_offset` bytes PRIOR to where
- // we scanned to on the previous search. Therefore, we need to
- // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
- //
- // Incidentally, since MATCH_OFFSET is non-zero, this also makes
- // dealing with empty matches convenient. Namely, callers needn't
- // special case them when implementing an iterator. Instead, this
- // ensures that forward progress is always made.
- start += MATCH_OFFSET;
+ // Once we've reported all matches at a given position, we need to
+ // advance the search to the next position.
+ state.at += 1;
+ if state.at > input.end() {
+ return Ok(());
+ }
sid
}
};
- let mut at = start;
- while at < end {
- let byte = bytes[at];
- sid = dfa.next_state(cache, sid, byte).map_err(|_| gave_up(at))?;
- at += 1;
+ // NOTE: We don't optimize the crap out of this routine primarily because
+ // it seems like most overlapping searches will have higher match counts,
+ // and thus, throughput is perhaps not as important. But if you have a use
+ // case for something faster, feel free to file an issue.
+ cache.search_start(state.at);
+ while state.at < input.end() {
+ sid = dfa
+ .next_state(cache, sid, input.haystack()[state.at])
+ .map_err(|_| gave_up(state.at))?;
if sid.is_tagged() {
- caller_state.set_id(sid);
+ state.id = Some(sid);
if sid.is_start() {
- if let Some(ref mut pre) = pre {
- if pre.is_effective(at) {
- match pre.next_candidate(bytes, at).into_option() {
- None => return Ok(None),
- Some(i) => {
- at = i;
+ if let Some(ref pre) = pre {
+ let span = Span::from(state.at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => return Ok(()),
+ Some(ref span) => {
+ if span.start > state.at {
+ state.at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(
+ dfa, cache, &input, state.at,
+ )?;
+ }
+ continue;
}
}
}
}
} else if sid.is_match() {
- let offset = at - MATCH_OFFSET;
- caller_state
- .set_last_match(StateMatch { match_index: 1, offset });
- return Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(cache, sid, 0),
- offset,
- }));
+ state.next_match_index = Some(1);
+ let pattern = dfa.match_pattern(cache, sid, 0);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ cache.search_finish(state.at);
+ return Ok(());
} else if sid.is_dead() {
- return Ok(None);
+ cache.search_finish(state.at);
+ return Ok(());
+ } else if sid.is_quit() {
+ cache.search_finish(state.at);
+ return Err(MatchError::quit(
+ input.haystack()[state.at],
+ state.at,
+ ));
} else {
- debug_assert!(sid.is_quit());
- return Err(MatchError::Quit { byte, offset: at - 1 });
+ debug_assert!(sid.is_unknown());
+ unreachable!("sid being unknown is a bug");
}
}
+ state.at += 1;
+ cache.search_update(state.at);
}
- let result = eoi_fwd(dfa, cache, bytes, end, &mut sid);
- caller_state.set_id(sid);
- if let Ok(Some(ref last_match)) = result {
- caller_state.set_last_match(StateMatch {
- // '1' is always correct here since if we get to this point, this
- // always corresponds to the first (index '0') match discovered at
- // this position. So the next match to report at this position (if
- // it exists) is at index '1'.
- match_index: 1,
- offset: last_match.offset(),
- });
+ let result = eoi_fwd(dfa, cache, input, &mut sid, &mut state.mat);
+ state.id = Some(sid);
+ if state.mat.is_some() {
+ // '1' is always correct here since if we get to this point, this
+ // always corresponds to the first (index '0') match discovered at
+ // this position. So the next match to report at this position (if
+ // it exists) is at index '1'.
+ state.next_match_index = Some(1);
}
+ cache.search_finish(input.end());
result
}
-#[inline(always)]
+#[inline(never)]
+pub(crate) fn find_overlapping_rev(
+ dfa: &DFA,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ state.mat = None;
+ if input.is_done() {
+ return Ok(());
+ }
+ let mut sid = match state.id {
+ None => {
+ let sid = init_rev(dfa, cache, input)?;
+ state.id = Some(sid);
+ if input.start() == input.end() {
+ state.rev_eoi = true;
+ } else {
+ state.at = input.end() - 1;
+ }
+ sid
+ }
+ Some(sid) => {
+ if let Some(match_index) = state.next_match_index {
+ let match_len = dfa.match_len(cache, sid);
+ if match_index < match_len {
+ state.next_match_index = Some(match_index + 1);
+ let pattern = dfa.match_pattern(cache, sid, match_index);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
+ }
+ }
+ // Once we've reported all matches at a given position, we need
+ // to advance the search to the next position. However, if we've
+ // already followed the EOI transition, then we know we're done
+ // with the search and there cannot be any more matches to report.
+ if state.rev_eoi {
+ return Ok(());
+ } else if state.at == input.start() {
+ // At this point, we should follow the EOI transition. This
+ // will cause us to skip the main loop below and fall through
+ // to the final 'eoi_rev' transition.
+ state.rev_eoi = true;
+ } else {
+ // We haven't hit the end of the search yet, so move on.
+ state.at -= 1;
+ }
+ sid
+ }
+ };
+ cache.search_start(state.at);
+ while !state.rev_eoi {
+ sid = dfa
+ .next_state(cache, sid, input.haystack()[state.at])
+ .map_err(|_| gave_up(state.at))?;
+ if sid.is_tagged() {
+ state.id = Some(sid);
+ if sid.is_start() {
+ // do nothing
+ } else if sid.is_match() {
+ state.next_match_index = Some(1);
+ let pattern = dfa.match_pattern(cache, sid, 0);
+ state.mat = Some(HalfMatch::new(pattern, state.at + 1));
+ cache.search_finish(state.at);
+ return Ok(());
+ } else if sid.is_dead() {
+ cache.search_finish(state.at);
+ return Ok(());
+ } else if sid.is_quit() {
+ cache.search_finish(state.at);
+ return Err(MatchError::quit(
+ input.haystack()[state.at],
+ state.at,
+ ));
+ } else {
+ debug_assert!(sid.is_unknown());
+ unreachable!("sid being unknown is a bug");
+ }
+ }
+ if state.at == input.start() {
+ break;
+ }
+ state.at -= 1;
+ cache.search_update(state.at);
+ }
+
+ let result = eoi_rev(dfa, cache, input, &mut sid, &mut state.mat);
+ state.rev_eoi = true;
+ state.id = Some(sid);
+ if state.mat.is_some() {
+ // '1' is always correct here since if we get to this point, this
+ // always corresponds to the first (index '0') match discovered at
+ // this position. So the next match to report at this position (if
+ // it exists) is at index '1'.
+ state.next_match_index = Some(1);
+ }
+ cache.search_finish(input.start());
+ result
+}
+
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_fwd(
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<LazyStateID, MatchError> {
- let sid = dfa
- .start_state_forward(cache, pattern_id, bytes, start, end)
- .map_err(|_| gave_up(start))?;
+ let sid = dfa.start_state_forward(cache, input)?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
- assert!(!sid.is_match());
+ debug_assert!(!sid.is_match());
Ok(sid)
}
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_rev(
dfa: &DFA,
cache: &mut Cache,
- pattern_id: Option<PatternID>,
- bytes: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
) -> Result<LazyStateID, MatchError> {
- let sid = dfa
- .start_state_reverse(cache, pattern_id, bytes, start, end)
- .map_err(|_| gave_up(end))?;
+ let sid = dfa.start_state_reverse(cache, input)?;
// Start states can never be match states, since all matches are delayed
// by 1 byte.
- assert!(!sid.is_match());
+ debug_assert!(!sid.is_match());
Ok(sid)
}
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_fwd(
dfa: &DFA,
cache: &mut Cache,
- bytes: &[u8],
- end: usize,
+ input: &Input<'_>,
sid: &mut LazyStateID,
-) -> Result<Option<HalfMatch>, MatchError> {
- match bytes.get(end) {
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ let sp = input.get_span();
+ match input.haystack().get(sp.end) {
Some(&b) => {
- *sid = dfa.next_state(cache, *sid, b).map_err(|_| gave_up(end))?;
+ *sid =
+ dfa.next_state(cache, *sid, b).map_err(|_| gave_up(sp.end))?;
if sid.is_match() {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(cache, *sid, 0),
- offset: end,
- }))
- } else {
- Ok(None)
+ let pattern = dfa.match_pattern(cache, *sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.end));
+ } else if sid.is_quit() {
+ return Err(MatchError::quit(b, sp.end));
}
}
None => {
*sid = dfa
.next_eoi_state(cache, *sid)
- .map_err(|_| gave_up(bytes.len()))?;
+ .map_err(|_| gave_up(input.haystack().len()))?;
if sid.is_match() {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(cache, *sid, 0),
- offset: bytes.len(),
- }))
- } else {
- Ok(None)
+ let pattern = dfa.match_pattern(cache, *sid, 0);
+ *mat = Some(HalfMatch::new(pattern, input.haystack().len()));
}
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!sid.is_quit());
}
}
+ Ok(())
}
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_rev(
dfa: &DFA,
cache: &mut Cache,
- bytes: &[u8],
- start: usize,
- state: LazyStateID,
-) -> Result<Option<HalfMatch>, MatchError> {
- if start > 0 {
- let sid = dfa
- .next_state(cache, state, bytes[start - 1])
- .map_err(|_| gave_up(start))?;
+ input: &Input<'_>,
+ sid: &mut LazyStateID,
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ let sp = input.get_span();
+ if sp.start > 0 {
+ let byte = input.haystack()[sp.start - 1];
+ *sid = dfa
+ .next_state(cache, *sid, byte)
+ .map_err(|_| gave_up(sp.start))?;
if sid.is_match() {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(cache, sid, 0),
- offset: start,
- }))
- } else {
- Ok(None)
+ let pattern = dfa.match_pattern(cache, *sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.start));
+ } else if sid.is_quit() {
+ return Err(MatchError::quit(byte, sp.start - 1));
}
} else {
- let sid =
- dfa.next_eoi_state(cache, state).map_err(|_| gave_up(start))?;
+ *sid =
+ dfa.next_eoi_state(cache, *sid).map_err(|_| gave_up(sp.start))?;
if sid.is_match() {
- Ok(Some(HalfMatch {
- pattern: dfa.match_pattern(cache, sid, 0),
- offset: 0,
- }))
- } else {
- Ok(None)
+ let pattern = dfa.match_pattern(cache, *sid, 0);
+ *mat = Some(HalfMatch::new(pattern, 0));
}
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!sid.is_quit());
}
+ Ok(())
+}
+
+/// Re-compute the starting state that a DFA should be in after finding a
+/// prefilter candidate match at the position `at`.
+///
+/// It is always correct to call this, but not always necessary. Namely,
+/// whenever the DFA has a universal start state, the DFA can remain in the
+/// start state that it was in when it ran the prefilter. Why? Because in that
+/// case, there is only one start state.
+///
+/// When does a DFA have a universal start state? In precisely the cases where
+/// it has no look-around assertions in its prefix. So for example, `\bfoo`
+/// does not have a universal start state because the start state depends on
+/// whether the byte immediately before the start position is a word byte or
+/// not. However, `foo\b` does have a universal start state because the word
+/// boundary does not appear in the pattern's prefix.
+///
+/// So... most cases don't need this, but when a pattern doesn't have a
+/// universal start state, then after a prefilter candidate has been found, the
+/// current state *must* be re-litigated as if computing the start state at the
+/// beginning of the search because it might change. That is, not all start
+/// states are created equal.
+///
+/// Why avoid it? Because while it's not super expensive, it isn't a trivial
+/// operation to compute the start state. It is much better to avoid it and
+/// just stay in the current state if you know it to be correct.
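+///
+/// As a sketch, every caller in this module follows the same pattern (this
+/// mirrors the code in the search routines above):
+///
+/// ```ignore
+/// // Computed once, up front: an empty look-around set in the NFA's
+/// // prefix means there is only one start state.
+/// let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty();
+/// // ... later, after a prefilter candidate match is found at 'at' ...
+/// if !universal_start {
+///     sid = prefilter_restart(dfa, cache, &input, at)?;
+/// }
+/// ```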
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn prefilter_restart(
+ dfa: &DFA,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ at: usize,
+) -> Result<LazyStateID, MatchError> {
+ let mut input = input.clone();
+ input.set_start(at);
+ init_fwd(dfa, cache, &input)
}
/// A convenience routine for constructing a "gave up" match error.
-#[inline(always)]
+#[cfg_attr(feature = "perf-inline", inline(always))]
fn gave_up(offset: usize) -> MatchError {
- MatchError::GaveUp { offset }
+ MatchError::gave_up(offset)
}
diff --git a/vendor/regex-automata/src/lib.rs b/vendor/regex-automata/src/lib.rs
index d9d7ada48..62260a5ae 100644
--- a/vendor/regex-automata/src/lib.rs
+++ b/vendor/regex-automata/src/lib.rs
@@ -1,47 +1,648 @@
/*!
-This crate provides an "expert" API for executing regular expressions using
-finite automata.
-
-**WARNING**: This `0.2` release of `regex-automata` was published
-before it was ready to unblock work elsewhere that needed some
-of the new APIs in this release. At the time of writing, it is
-strongly preferred that you continue using the
-[`regex-automata 0.1`](https://docs.rs/regex-automata/0.1/regex_automata/)
-release. Since this release represents an unfinished state, please do not
-create issues for this release unless it's for a critical bug.
+This crate exposes a variety of regex engines used by the `regex` crate.
+It provides a vast, sprawling and "expert" level API to each regex engine.
+The regex engines provided by this crate focus heavily on finite automata
+implementations and specifically guarantee worst case `O(m * n)` time
+complexity for all searches. (Where `m ~ len(regex)` and `n ~ len(haystack)`.)
+
+The primary goal of this crate is to serve as an implementation detail for the
+`regex` crate. A secondary goal is to make its internals available for use by
+others.
+
+# Table of contents
+
+* [Should I be using this crate?](#should-i-be-using-this-crate) gives some
+reasons for and against using this crate.
+* [Examples](#examples) provides a small selection of things you can do with
+this crate.
+* [Available regex engines](#available-regex-engines) provides a hyperlinked
+list of all regex engines in this crate.
+* [API themes](#api-themes) discusses common elements used throughout this
+crate.
+* [Crate features](#crate-features) documents the extensive list of Cargo
+features available.
+
+# Should I be using this crate?
+
+If you find yourself here because you just want to use regexes, then you should
+first check out whether the [`regex` crate](https://docs.rs/regex) meets
+your needs. It provides a streamlined and difficult-to-misuse API for regex
+searching.
+
+If you're here because there is something specific you want to do that can't
+be easily done with `regex` crate, then you are perhaps in the right place.
+It's most likely that the first stop you'll want to make is to explore the
+[`meta` regex APIs](meta). Namely, the `regex` crate is just a light wrapper
+over a [`meta::Regex`], so its API will probably be the easiest to transition
+to. In contrast to the `regex` crate, the `meta::Regex` API supports more
+search parameters and does multi-pattern searches. However, it isn't quite as
+ergonomic.
+
+Otherwise, the following is an inexhaustive list of reasons to use this crate:
+
+* You want to analyze or use a [Thompson `NFA`](nfa::thompson::NFA) directly.
+* You want more powerful multi-pattern search than what is provided by
+`RegexSet` in the `regex` crate. All regex engines in this crate support
+multi-pattern searches.
+* You want to use one of the `regex` crate's internal engines directly because
+of some interesting configuration that isn't possible via the `regex` crate.
+For example, a [lazy DFA's configuration](hybrid::dfa::Config) exposes a
+dizzying number of options for controlling its execution.
+* You want to use the lower level search APIs. For example, both the [lazy
+DFA](hybrid::dfa) and [fully compiled DFAs](dfa) support searching by exploring
+the automaton one state at a time. This might be useful, for example, for
+stream searches or searches of strings stored in non-contiguous memory.
+* You want to build a fully compiled DFA and then [use zero-copy
+deserialization](dfa::dense::DFA::from_bytes) to load it into memory and use
+it for searching. This use case is supported in core-only no-std/no-alloc
+environments.
+* You want to run [anchored searches](Input::anchored) without using the `^`
+anchor in your regex pattern.
+* You need to work around contention issues with
+sharing a regex across multiple threads. The
+[`meta::Regex::search_with`](meta::Regex::search_with) API permits bypassing
+any kind of synchronization at all by requiring the caller to provide the
+mutable scratch space needed during a search.
+* You want to build your own regex engine on top of the `regex` crate's
+infrastructure.
+
+# Examples
+
+This section tries to identify a few interesting things you can do with this
+crate and demonstrates them.
+
+### Multi-pattern searches with capture groups
+
+One of the more frustrating limitations of `RegexSet` in the `regex` crate
+(at the time of writing) is that it doesn't report match positions. With this
+crate, multi-pattern support was intentionally designed in from the beginning,
+which means it works in all regex engines and even for capture groups as well.
+
+This example shows how to search for matches of multiple regexes, where each
+regex uses the same capture group names to parse different key-value formats.
+
+```
+use regex_automata::{meta::Regex, PatternID};
+
+let re = Regex::new_many(&[
+ r#"(?m)^(?<key>[[:word:]]+)=(?<val>[[:word:]]+)$"#,
+ r#"(?m)^(?<key>[[:word:]]+)="(?<val>[^"]+)"$"#,
+ r#"(?m)^(?<key>[[:word:]]+)='(?<val>[^']+)'$"#,
+ r#"(?m)^(?<key>[[:word:]]+):\s*(?<val>[[:word:]]+)$"#,
+])?;
+let hay = r#"
+best_album="Blow Your Face Out"
+best_quote='"then as it was, then again it will be"'
+best_year=1973
+best_simpsons_episode: HOMR
+"#;
+let mut kvs = vec![];
+for caps in re.captures_iter(hay) {
+ // N.B. One could use capture indices '1' and '2' here
+ // as well. Capture indices are local to each pattern.
+ // (Just like names are.)
+ let key = &hay[caps.get_group_by_name("key").unwrap()];
+ let val = &hay[caps.get_group_by_name("val").unwrap()];
+ kvs.push((key, val));
+}
+assert_eq!(kvs, vec![
+ ("best_album", "Blow Your Face Out"),
+ ("best_quote", "\"then as it was, then again it will be\""),
+ ("best_year", "1973"),
+ ("best_simpsons_episode", "HOMR"),
+]);
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+### Build a full DFA and walk it manually
+
+One of the regex engines in this crate is a fully compiled DFA. It takes worst
+case exponential time to build, but once built, it can be easily explored and
+used for searches. Here's a simple example that uses its lower level APIs to
+implement a simple anchored search by hand.
+
+```
+use regex_automata::{dfa::{Automaton, dense}, Input};
+
+let dfa = dense::DFA::new(r"(?-u)\b[A-Z]\w+z\b")?;
+let haystack = "Quartz";
+
+// The start state is determined by inspecting the position and the
+// initial bytes of the haystack.
+let mut state = dfa.start_state_forward(&Input::new(haystack))?;
+// Walk all the bytes in the haystack.
+for &b in haystack.as_bytes().iter() {
+ state = dfa.next_state(state, b);
+}
+// DFAs in this crate require an explicit
+// end-of-input transition if a search reaches
+// the end of a haystack.
+state = dfa.next_eoi_state(state);
+assert!(dfa.is_match_state(state));
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+Or do the same with a lazy DFA that avoids exponential worst case compile time,
+but requires mutable scratch space to lazily build the DFA during the search.
+
+```
+use regex_automata::{hybrid::dfa::DFA, Input};
+
+let dfa = DFA::new(r"(?-u)\b[A-Z]\w+z\b")?;
+let mut cache = dfa.create_cache();
+let hay = "Quartz";
+
+// The start state is determined by inspecting the position and the
+// initial bytes of the haystack.
+let mut state = dfa.start_state_forward(&mut cache, &Input::new(hay))?;
+// Walk all the bytes in the haystack.
+for &b in hay.as_bytes().iter() {
+ state = dfa.next_state(&mut cache, state, b)?;
+}
+// DFAs in this crate require an explicit
+// end-of-input transition if a search reaches
+// the end of a haystack.
+state = dfa.next_eoi_state(&mut cache, state)?;
+assert!(state.is_match());
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+### Find all overlapping matches
+
+This example shows how to build a DFA and use it to find all possible matches,
+including overlapping matches. A similar example will work with a lazy DFA as
+well. This also works with multiple patterns and will report all matches at the
+same position where multiple patterns match.
+
+```
+use regex_automata::{
+ dfa::{dense, Automaton, OverlappingState},
+ Input, MatchKind,
+};
+
+let dfa = dense::DFA::builder()
+ .configure(dense::DFA::config().match_kind(MatchKind::All))
+ .build(r"(?-u)\w{3,}")?;
+let input = Input::new("homer marge bart lisa maggie");
+let mut state = OverlappingState::start();
+
+let mut matches = vec![];
+while let Some(hm) = {
+ dfa.try_search_overlapping_fwd(&input, &mut state)?;
+ state.get_match()
+} {
+ matches.push(hm.offset());
+}
+assert_eq!(matches, vec![
+ 3, 4, 5, // hom, home, homer
+ 9, 10, 11, // mar, marg, marge
+ 15, 16, // bar, bart
+ 20, 21, // lis, lisa
+ 25, 26, 27, 28, // mag, magg, maggi, maggie
+]);
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Available regex engines
+
+The following is a complete list of all regex engines provided by this crate,
+along with a very brief description of it and why you might want to use it.
+
+* [`dfa::regex::Regex`] is a regex engine that works on top of either
+[dense](dfa::dense) or [sparse](dfa::sparse) fully compiled DFAs. You might
+use a DFA if you need the fastest possible regex engine in this crate and can
+afford the exorbitant memory usage usually required by DFAs. Low level APIs on
+fully compiled DFAs are provided by the [`Automaton` trait](dfa::Automaton).
+Fully compiled dense DFAs can handle all regexes except for searching a regex
+with a Unicode word boundary on non-ASCII haystacks. A fully compiled DFA based
+regex can only report the start and end of each match.
+* [`hybrid::regex::Regex`] is a regex engine that works on top of a lazily
+built DFA. Its performance profile is very similar to that of fully compiled
+DFAs, but can be slower in some pathological cases. Fully compiled DFAs are
+also amenable to more optimizations, such as state acceleration, that aren't
+available in a lazy DFA. You might use this lazy DFA if you can't abide the
+worst case exponential compile time of a full DFA, but still want the DFA
+search performance in the vast majority of cases. A lazy DFA based regex can
+only report the start and end of each match.
+* [`dfa::onepass::DFA`] is a regex engine that is implemented as a DFA, but
+can report the matches of each capture group in addition to the start and end
+of each match. The catch is that it only works on a somewhat small subset of
+regexes known as "one-pass." You'll want to use this for cases when you need
+capture group matches and the regex is one-pass since it is likely to be faster
+than any alternative. A one-pass DFA can handle all types of regexes, but does
+have some reasonable limits on the number of capture groups it can handle.
+* [`nfa::thompson::backtrack::BoundedBacktracker`] is a regex engine that uses
+backtracking, but keeps track of the work it has done to avoid catastrophic
+backtracking. Like the one-pass DFA, it provides the matches of each capture
+group. It retains the `O(m * n)` worst case time bound. This tends to be slower
+than the one-pass DFA regex engine, but faster than the PikeVM. It can handle
+all types of regexes, but usually only works well with small haystacks and
+small regexes due to the memory required to avoid redoing work.
+* [`nfa::thompson::pikevm::PikeVM`] is a regex engine that can handle all
+regexes, of all sizes, and provides capture group matches. It tends to be a
+tool of last resort because it is also usually the slowest regex engine.
+* [`meta::Regex`] is the meta regex engine that combines *all* of the above
+engines into one. The reason for this is that each of the engines above has
+its own caveats, such as "only handles a subset of regexes" or "is generally
+slow." The meta regex engine accounts for all of these caveats and composes
+the engines in a way that attempts to mitigate each engine's weaknesses while
+emphasizing its strengths. For example, it will attempt to run a lazy DFA even
+if it might fail, in which case it restarts the search with a likely slower
+but more capable regex engine. The meta regex engine is what you should
+default to. Use one of the above engines directly only if you have a specific
+reason to. (A brief sketch of using an engine directly follows this list.)
+
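+All of these engines expose a broadly similar search API. As a minimal sketch,
+here is what using one of them (the PikeVM) directly looks like:
+
+```
+use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+
+let re = PikeVM::new(r"[a-z]+")?;
+// Like most engines in this crate, the PikeVM needs mutable scratch space.
+let mut cache = re.create_cache();
+let mut it = re.find_iter(&mut cache, "abc 123 xyz");
+assert_eq!(Some(Match::must(0, 0..3)), it.next());
+assert_eq!(Some(Match::must(0, 8..11)), it.next());
+assert_eq!(None, it.next());
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+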
+# API themes
+
+While each regex engine has its own APIs and configuration options, there are
+some general themes followed by all of them.
+
+### The `Input` abstraction
+
+Most search routines in this crate accept anything that implements
+`Into<Input>`. Both `&str` and `&[u8]` haystacks satisfy this constraint, which
+means that things like `engine.search("foo")` will work as you would expect.
+
+By virtue of accepting an `Into<Input>` though, callers can provide more than
+just a haystack. Indeed, the [`Input`] type has more details, but briefly,
+callers can use it to configure various aspects of the search:
+
+* The span of the haystack to search via [`Input::span`] or [`Input::range`],
+which might be a substring of the haystack.
+* Whether to run an anchored search or not via [`Input::anchored`]. This
+permits one to require matches to start at the same offset that the search
+started.
+* Whether to ask the regex engine to stop as soon as a match is seen via
+[`Input::earliest`]. This can be used to find the offset of a match as soon
+as it is known without waiting for the full leftmost-first match to be found.
+This can also be used to avoid the worst case `O(m * n^2)` time complexity
+of iteration.
+
+Some lower level search routines accept an `&Input` for performance reasons,
+in which case `&Input::new("haystack")` can be used for a simple search.
+
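+For example, here is a short sketch that configures a few of these knobs on an
+`Input` and searches with the meta regex engine:
+
+```
+use regex_automata::{meta::Regex, Anchored, Input, Match};
+
+let re = Regex::new(r"[0-9]+")?;
+// Search only the span 4..7 of the haystack, and require any match to start
+// exactly where the search starts.
+let input = Input::new("abc 123 xyz").range(4..7).anchored(Anchored::Yes);
+assert_eq!(Some(Match::must(0, 4..7)), re.find(input));
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+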
+### Error reporting
+
+Most, but not all, regex engines in this crate can fail to execute a search.
+When a search fails, callers cannot determine whether or not a match exists.
+That is, the result is indeterminate.
+
+Search failure, in all cases in this crate, is represented by a [`MatchError`].
+Routines that can fail start with the `try_` prefix in their name. For example,
+[`hybrid::regex::Regex::try_search`] can fail for a number of reasons.
+Conversely, routines that either can't fail or can panic on failure lack the
+`try_` prefix. For example, [`hybrid::regex::Regex::find`] will panic in
+cases where [`hybrid::regex::Regex::try_search`] would return an error, and
+[`meta::Regex::find`] will never panic. Therefore, callers need to pay close
+attention to the panicking conditions in the documentation.
+
+In most cases, the reasons that a search fails are either predictable or
+configurable, albeit at some additional cost.
+
+An example of predictable failure is
+[`BoundedBacktracker::try_search`](nfa::thompson::backtrack::BoundedBacktracker::try_search).
+Namely, it fails whenever the product of the haystack length, the regex size
+and some constant exceeds the
+[configured visited capacity](nfa::thompson::backtrack::Config::visited_capacity).
+Callers can predict the failure in terms of haystack length via the
+[`BoundedBacktracker::max_haystack_len`](nfa::thompson::backtrack::BoundedBacktracker::max_haystack_len)
+method. While this form of failure is technically avoidable by increasing the
+visited capacity, doing so for all inputs is rarely feasible because the
+memory required for larger haystacks becomes prohibitively large. So in
+practice, if you use the bounded backtracker, you really do have to deal
+with the possibility of failure.
+
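+As a sketch, assuming the default visited capacity, searching a haystack
+longer than the maximum supported length fails instead of silently doing
+something wrong:
+
+```
+use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
+
+let re = BoundedBacktracker::new(r"\w+")?;
+let mut cache = re.create_cache();
+// One byte beyond the maximum supported haystack length.
+let hay = "a".repeat(re.max_haystack_len() + 1);
+assert!(re.try_is_match(&mut cache, &hay).is_err());
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+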
+An example of configurable failure happens when one enables heuristic support
+for Unicode word boundaries in a DFA. Namely, since the DFAs in this crate
+(except for the one-pass DFA) do not support Unicode word boundaries on
+non-ASCII haystacks, building a DFA from an NFA that contains a Unicode word
+boundary will itself fail. However, one can configure DFAs to still be built in
+this case by
+[configuring heuristic support for Unicode word boundaries](hybrid::dfa::Config::unicode_word_boundary).
+If the NFA the DFA is built from contains a Unicode word boundary, then the
+DFA will still be built, but special transitions will be added to every state
+that cause the DFA to fail if any non-ASCII byte is seen. This failure happens
+at search time, and the caller must explicitly opt into it.
+
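+Here is a sketch of that heuristic in action with a lazy DFA regex:
+
+```
+use regex_automata::{hybrid::{dfa::DFA, regex::Regex}, Input};
+
+let re = Regex::builder()
+    .dfa(DFA::config().unicode_word_boundary(true))
+    .build(r"\b[0-9]+\b")?;
+let mut cache = re.create_cache();
+// An all-ASCII haystack works fine...
+assert!(re.try_search(&mut cache, &Input::new("foo 123 bar"))?.is_some());
+// ...but observing a non-ASCII byte makes the search quit with an error.
+assert!(re.try_search(&mut cache, &Input::new("snowman ☃ 123")).is_err());
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+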
+There are other ways for regex engines to fail in this crate, but the above
+two should represent the general theme of failures one can find. Dealing
+with these failures is, in part, one of the responsibilities of the [meta regex
+engine](meta). Notice, for example, that the meta regex engine exposes an API
+that never returns an error nor panics. It carefully manages all of the ways
+in which the regex engines can fail and either avoids the predictable ones
+entirely (e.g., the bounded backtracker) or reacts to configured failures by
+falling back to a different engine (e.g., the lazy DFA quitting because it saw
+a non-ASCII byte).
+
+### Configuration and Builders
+
+Most of the regex engines in this crate come with two types to facilitate
+building the regex engine: a `Config` and a `Builder`. A `Config` is usually
+specific to that particular regex engine, but other objects such as parsing and
+NFA compilation have `Config` types too. A `Builder` is the thing responsible
+for taking inputs (either pattern strings or already-parsed patterns or even
+NFAs directly) and turning them into an actual regex engine that can be used
+for searching.
+
+The main reason why building a regex engine is a bit complicated is because
+of the desire to permit composition with decoupled components. For example,
+you might want to [manually construct a Thompson NFA](nfa::thompson::Builder)
+and then build a regex engine from it without ever using a regex parser
+at all. On the other hand, you might also want to build a regex engine directly
+from the concrete syntax. This demonstrates why regex engine construction is
+so flexible: it needs to support not just convenient construction, but also
+construction from parts built elsewhere.
+
+This is also in turn why there are many different `Config` structs in this
+crate. Let's look more closely at an example: [`hybrid::regex::Builder`]. It
+accepts three different `Config` types for configuring construction of a lazy
+DFA regex:
+
+* [`hybrid::regex::Builder::syntax`] accepts a
+[`util::syntax::Config`] for configuring the options found in the
+[`regex-syntax`](regex_syntax) crate. For example, whether to match
+case insensitively.
+* [`hybrid::regex::Builder::thompson`] accepts a [`nfa::thompson::Config`] for
+configuring construction of a [Thompson NFA](nfa::thompson::NFA). For example,
+whether to build an NFA that matches the reverse language described by the
+regex.
+* [`hybrid::regex::Builder::dfa`] accepts a [`hybrid::dfa::Config`] for
+configuring construction of the pair of underlying lazy DFAs that make up the
+lazy DFA regex engine. For example, changing the capacity of the cache used to
+store the transition table.
+
+The lazy DFA regex engine uses all three of those configuration objects for
+methods like [`hybrid::regex::Builder::build`], which accepts a pattern
+string containing the concrete syntax of your regex. It uses the syntax
+configuration to parse the pattern into an AST and translate it into an HIR,
+the NFA configuration to compile the HIR into an NFA, and finally the DFA
+configuration to lazily determinize the NFA into a DFA.
+
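+For instance, here is a sketch that touches all three configurations when
+building a lazy DFA regex:
+
+```
+use regex_automata::{
+    hybrid::{dfa::DFA, regex::Regex},
+    nfa::thompson,
+    util::syntax,
+    Match,
+};
+
+let re = Regex::builder()
+    // Parse the pattern case insensitively.
+    .syntax(syntax::Config::new().case_insensitive(true))
+    // Skip the extra NFA shrinking work to favor faster construction.
+    .thompson(thompson::Config::new().shrink(false))
+    // Give the lazy DFA's transition table cache 16KB.
+    .dfa(DFA::config().cache_capacity(16 * (1 << 10)))
+    .build(r"foo")?;
+let mut cache = re.create_cache();
+assert_eq!(Some(Match::must(0, 4..7)), re.find(&mut cache, "abc FoO xyz"));
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+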
+Notice though that the builder also has a
+[`hybrid::regex::Builder::build_from_dfas`] constructor. This permits callers
+to build the underlying pair of lazy DFAs themselves (one for the forward
+searching to find the end of a match and one for the reverse searching to find
+the start of a match), and then build the regex engine from them. The lazy
+DFAs, in turn, have their own builder that permits [construction directly from
+a Thompson NFA](hybrid::dfa::Builder::build_from_nfa). Continuing down the
+rabbit hole, a Thompson NFA has its own compiler that permits [construction
+directly from an HIR](nfa::thompson::Compiler::build_from_hir). The lazy DFA
+regex engine builder lets you follow this rabbit hole all the way down, but
+also provides convenience routines that do it for you when you don't need
+precise control over every component.
+
+The [meta regex engine](meta) is a good example of something that utilizes the
+full flexibility of these builders. It often needs not only precise control
+over each component, but also shares them across multiple regex engines.
+(Most sharing is done by internal reference accounting. For example, an
+[`NFA`](nfa::thompson::NFA) is reference counted internally which makes cloning
+cheap.)
+
+### Size limits
+
+Unlike the `regex` crate, the `regex-automata` crate specifically does not
+enable any size limits by default. That means users of this crate need to
+be quite careful when using untrusted patterns. Namely, because bounded
+repetitions can grow exponentially by stacking them, it is possible to build a
+very large internal regex object from just a small pattern string. For example,
+the NFA built from the pattern `a{10}{10}{10}{10}{10}{10}{10}` is over 240MB.
+
+There are multiple size limit options in this crate. If one or more size limits
+are relevant for the object you're building, they will be configurable via
+methods on a corresponding `Config` type.
+
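+For example, here is a sketch of using the Thompson NFA compiler's size limit
+to reject the pattern above during construction:
+
+```
+use regex_automata::nfa::thompson::NFA;
+
+// With a 1MB size limit configured, construction fails with an error
+// instead of building a gigantic NFA.
+let result = NFA::compiler()
+    .configure(NFA::config().nfa_size_limit(Some(1 << 20)))
+    .build(r"a{10}{10}{10}{10}{10}{10}{10}");
+assert!(result.is_err());
+```
+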
+# Crate features
+
+This crate has a dizzying number of features. The main idea is to be able to
+control how much stuff you pull in for your specific use case, since the full
+crate is quite large and can dramatically increase compile times and binary
+size.
+
+The most barebones but useful configuration is to disable all default features
+and enable only `dfa-search`. This will bring in just the DFA deserialization
+and search routines without any dependency on `std` or `alloc`. This does
+require generating and serializing a DFA, and then storing it somewhere, but
+it permits regex searches in freestanding or embedded environments.
+
+Because there are so many features, they are split into a few groups.
+
+The default set of features is: `std`, `syntax`, `perf`, `unicode`, `meta`,
+`nfa`, `dfa` and `hybrid`. Basically, the default is to enable everything
+except for development related features like `logging`.
+
+### Ecosystem features
+
+* **std** - Enables use of the standard library. In terms of APIs, this usually
+just means that error types implement the `std::error::Error` trait. Otherwise,
+`std` sometimes enables the code to be faster, for example, using a `HashMap`
+instead of a `BTreeMap`. (The `std` feature matters more for dependencies like
+`aho-corasick` and `memchr`, where `std` is required to enable certain classes
+of SIMD optimizations.) Enabling `std` automatically enables `alloc`.
+* **alloc** - Enables use of the `alloc` library. This is required for most
+APIs in this crate. The main exception is deserializing and searching with
+fully compiled DFAs.
+* **logging** - Adds a dependency on the `log` crate and makes this crate emit
+log messages of varying degrees of utility. The log messages are especially
+useful in trying to understand what the meta regex engine is doing.
+
+### Performance features
+
+* **perf** - Enables all of the below features.
+* **perf-inline** - When enabled, `inline(always)` is used in (many) strategic
+locations to help performance at the expense of longer compile times and
+increased binary size.
+* **perf-literal** - Enables all literal related optimizations.
+ * **perf-literal-substring** - Enables all single substring literal
+ optimizations. This includes adding a dependency on the `memchr` crate.
+ * **perf-literal-multisubstring** - Enables all multiple substring literal
+ optimizations. This includes adding a dependency on the `aho-corasick`
+ crate.
+
+### Unicode features
+
+* **unicode** -
+ Enables all Unicode features. This feature is enabled by default, and will
+ always cover all Unicode features, even if more are added in the future.
+* **unicode-age** -
+ Provide the data for the
+ [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age).
+ This makes it possible to use classes like `\p{Age:6.0}` to refer to all
+ codepoints first introduced in Unicode 6.0.
+* **unicode-bool** -
+ Provide the data for numerous Unicode boolean properties. The full list
+ is not included here, but contains properties like `Alphabetic`, `Emoji`,
+ `Lowercase`, `Math`, `Uppercase` and `White_Space`.
+* **unicode-case** -
+ Provide the data for case insensitive matching using
+ [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
+* **unicode-gencat** -
+ Provide the data for
+ [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
+ This includes, but is not limited to, `Decimal_Number`, `Letter`,
+ `Math_Symbol`, `Number` and `Punctuation`.
+* **unicode-perl** -
+ Provide the data for supporting the Unicode-aware Perl character classes,
+ corresponding to `\w`, `\s` and `\d`. This is also necessary for using
+ Unicode-aware word boundary assertions. Note that if this feature is
+ disabled, the `\s` and `\d` character classes are still available if the
+ `unicode-bool` and `unicode-gencat` features are enabled, respectively.
+* **unicode-script** -
+ Provide the data for
+ [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/).
+ This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`,
+ `Latin` and `Thai`.
+* **unicode-segment** -
+ Provide the data for the properties used to implement the
+ [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/).
+ This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and
+ `\p{sb=ATerm}`.
+* **unicode-word-boundary** -
+ Enables support for Unicode word boundaries, i.e., `\b`, in regexes. When
+ this and `unicode-perl` are enabled, then data tables from `regex-syntax` are
+ used to implement Unicode word boundaries. However, if `regex-syntax` isn't
+ enabled as a dependency then one can still enable this feature. It will
+ cause `regex-automata` to bundle its own data table that would otherwise be
+ redundant with `regex-syntax`'s table.
+
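+As a brief sketch, with the `unicode-script` feature enabled (as it is by
+default via `unicode`), script classes just work:
+
+```
+use regex_automata::meta::Regex;
+
+let re = Regex::new(r"^\p{Greek}+$")?;
+assert!(re.is_match("αβγδ"));
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+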
+### Regex engine features
+
+* **syntax** - Enables a dependency on `regex-syntax`. This makes APIs
+for building regex engines from pattern strings available. Without the
+`regex-syntax` dependency, the only way to build a regex engine is generally
+to deserialize a previously built DFA or to hand assemble an NFA using its
+[builder API](nfa::thompson::Builder). Once you have an NFA, you can build any
+of the regex engines in this crate. The `syntax` feature also enables `alloc`.
+* **meta** - Enables the meta regex engine. This also enables the `syntax` and
+`nfa-pikevm` features, as both are the minimal requirements needed. The meta
+regex engine benefits from enabling any of the other regex engines and will
+use them automatically when appropriate.
+* **nfa** - Enables all NFA related features below.
+ * **nfa-thompson** - Enables the Thompson NFA APIs. This enables `alloc`.
+ * **nfa-pikevm** - Enables the PikeVM regex engine. This enables
+ `nfa-thompson`.
+ * **nfa-backtrack** - Enables the bounded backtracker regex engine. This
+ enables `nfa-thompson`.
+* **dfa** - Enables all DFA related features below.
+ * **dfa-build** - Enables APIs for determinizing DFAs from NFAs. This
+ enables `nfa-thompson` and `dfa-search`.
+ * **dfa-search** - Enables APIs for searching with DFAs.
+ * **dfa-onepass** - Enables the one-pass DFA API. This enables
+ `nfa-thompson`.
+* **hybrid** - Enables the hybrid NFA/DFA or "lazy DFA" regex engine. This
+enables `alloc` and `nfa-thompson`.
+
*/
-#![allow(warnings)]
-// #![deny(missing_docs)]
-#![cfg_attr(not(feature = "std"), no_std)]
+// We are no_std.
+#![no_std]
+// All APIs need docs!
+#![deny(missing_docs)]
+// Some intra-doc links are broken when certain features are disabled, so we
+// only bleat about it when most (all?) features are enabled. But when we do,
+// we block the build. Links need to work.
+#![cfg_attr(
+ all(
+ feature = "std",
+ feature = "nfa",
+ feature = "dfa",
+ feature = "hybrid"
+ ),
+ deny(rustdoc::broken_intra_doc_links)
+)]
+// Broken rustdoc links are very easy to come by when you start disabling
+// features. Namely, features tend to change imports, and imports change what's
+// available to link to.
+//
+// Basically, we just don't support rustdoc for anything other than the maximal
+// feature configuration. Other configurations will work, they just won't be
+// perfect.
+//
+// So here, we specifically allow them so we don't even get warned about them.
+#![cfg_attr(
+ not(all(
+ feature = "std",
+ feature = "nfa",
+ feature = "dfa",
+ feature = "hybrid"
+ )),
+ allow(rustdoc::broken_intra_doc_links)
+)]
+// Kinda similar, but eliminating all of the dead code and unused import
+// warnings for every feature combo is a fool's errand. Instead, we just
+// suppress those, but still let them through in a common configuration when we
+// build most of everything.
+//
+// This does actually suggest that when features are disabled, we are actually
+// compiling more code than we need to be. And this is perhaps not so great
+// because disabling features is usually done in order to reduce compile times
+// by reducing the amount of code one compiles... However, usually, most of the
+// time this dead code is a relatively small amount from the 'util' module.
+// But... I confess... There isn't a ton of visibility on this.
+//
+// I'm happy to try to address this in a different way, but "let's annotate
+// every function in 'util' with some non-local combination of features" just
+// cannot be the way forward.
+#![cfg_attr(
+ not(all(
+ feature = "std",
+ feature = "nfa",
+ feature = "dfa",
+ feature = "hybrid",
+ feature = "perf-literal-substring",
+ feature = "perf-literal-multisubstring",
+ )),
+ allow(dead_code, unused_imports, unused_variables)
+)]
+// We generally want all types to impl Debug.
+#![warn(missing_debug_implementations)]
+// No clue why this thing is still unstable because it's pretty amazing. This
+// adds Cargo feature annotations to items in the rustdoc output, which is
+// sadly hugely beneficial for this crate due to the number of features.
+#![cfg_attr(docsrs, feature(doc_auto_cfg))]
+// I have literally never tested this crate on 16-bit, so it is quite
+// suspicious to advertise support for it. But... the regex crate, at time
+// of writing, at least claims to support it by not doing any conditional
+// compilation based on the target pointer width. So I guess I remain
+// consistent with that here.
+//
+// If you are here because you're on a 16-bit system and you were somehow using
+// the regex crate previously, please file an issue. Please be prepared to
+// provide some kind of reproduction or carve out some path to getting 16-bit
+// working in CI. (Via qemu?)
#[cfg(not(any(
target_pointer_width = "16",
target_pointer_width = "32",
target_pointer_width = "64"
)))]
-compile_error!("regex-automata currently not supported on non-{16,32,64}");
+compile_error!("not supported on non-{16,32,64}, please file an issue");
+
+#[cfg(any(test, feature = "std"))]
+extern crate std;
#[cfg(feature = "alloc")]
extern crate alloc;
+#[cfg(doctest)]
+doc_comment::doctest!("../README.md");
+
#[doc(inline)]
-pub use crate::util::id::PatternID;
-#[cfg(feature = "alloc")]
-pub use crate::util::syntax::SyntaxConfig;
-pub use crate::util::{
- bytes::{DeserializeError, SerializeError},
- matchtypes::{HalfMatch, Match, MatchError, MatchKind, MultiMatch},
-};
+pub use crate::util::primitives::PatternID;
+pub use crate::util::search::*;
#[macro_use]
mod macros;
+#[cfg(any(feature = "dfa-search", feature = "dfa-onepass"))]
pub mod dfa;
-#[cfg(feature = "alloc")]
+#[cfg(feature = "hybrid")]
pub mod hybrid;
-#[doc(hidden)]
-#[cfg(feature = "alloc")]
+#[cfg(feature = "meta")]
+pub mod meta;
+#[cfg(feature = "nfa-thompson")]
pub mod nfa;
-#[doc(hidden)]
pub mod util;
diff --git a/vendor/regex-automata/src/macros.rs b/vendor/regex-automata/src/macros.rs
index 649ba17c5..31b4ca381 100644
--- a/vendor/regex-automata/src/macros.rs
+++ b/vendor/regex-automata/src/macros.rs
@@ -1,20 +1,6 @@
-/// A simple macro for defining bitfield accessors/mutators.
-#[cfg(feature = "alloc")]
-macro_rules! define_bool {
- ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
- fn $is_fn_name(&self) -> bool {
- self.bools & (0b1 << $bit) > 0
- }
-
- fn $set_fn_name(&mut self, yes: bool) {
- if yes {
- self.bools |= 1 << $bit;
- } else {
- self.bools &= !(1 << $bit);
- }
- }
- };
-}
+// Some feature combinations result in some of these macros never being used.
+// Which is fine. Just squash the warnings.
+#![allow(unused_macros)]
macro_rules! log {
($($tt:tt)*) => {
@@ -25,6 +11,10 @@ macro_rules! log {
}
}
+macro_rules! debug {
+ ($($tt:tt)*) => { log!(log::debug!($($tt)*)) }
+}
+
macro_rules! trace {
($($tt:tt)*) => { log!(log::trace!($($tt)*)) }
}
diff --git a/vendor/regex-automata/src/meta/error.rs b/vendor/regex-automata/src/meta/error.rs
new file mode 100644
index 000000000..ea9a3160e
--- /dev/null
+++ b/vendor/regex-automata/src/meta/error.rs
@@ -0,0 +1,241 @@
+use regex_syntax::{ast, hir};
+
+use crate::{nfa, util::search::MatchError, PatternID};
+
+/// An error that occurs when construction of a `Regex` fails.
+///
+/// A build error is generally a result of one of two possible failure
+/// modes. First is a parse or syntax error in the concrete syntax of a
+/// pattern. Second is that the construction of the underlying regex matcher
+/// fails, usually because it gets too big with respect to limits like
+/// [`Config::nfa_size_limit`](crate::meta::Config::nfa_size_limit).
+///
+/// This error provides very little introspection capabilities. You can:
+///
+/// * Ask for the [`PatternID`] of the pattern that caused an error, if one
+/// is available. This is available for things like syntax errors, but not for
+/// cases where build limits are exceeded.
+/// * Ask for the underlying syntax error, but only if the error is a syntax
+/// error.
+/// * Ask for a human readable message corresponding to the underlying error.
+/// * The `BuildError::source` method (from the `std::error::Error`
+/// trait implementation) may be used to query for an underlying error if one
+/// exists. There are no API guarantees about which error is returned.
+///
+/// When the `std` feature is enabled, this implements `std::error::Error`.
+#[derive(Clone, Debug)]
+pub struct BuildError {
+ kind: BuildErrorKind,
+}
+
+#[derive(Clone, Debug)]
+enum BuildErrorKind {
+ Syntax { pid: PatternID, err: regex_syntax::Error },
+ NFA(nfa::thompson::BuildError),
+}
+
+impl BuildError {
+ /// If it is known which pattern ID caused this build error to occur, then
+ /// this method returns it.
+ ///
+ /// Some errors are not associated with a particular pattern. However, any
+ /// errors that occur as part of parsing a pattern are guaranteed to be
+ /// associated with a pattern ID.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, PatternID};
+ ///
+ /// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err();
+ /// assert_eq!(Some(PatternID::must(2)), err.pattern());
+ /// ```
+ pub fn pattern(&self) -> Option<PatternID> {
+ match self.kind {
+ BuildErrorKind::Syntax { pid, .. } => Some(pid),
+ _ => None,
+ }
+ }
+
+ /// If this error occurred because the regex exceeded the configured size
+ /// limit before being built, then this returns the configured size limit.
+ ///
+ /// The limit returned is what was configured, and corresponds to the
+ /// maximum amount of heap usage in bytes.
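+ ///
+ /// # Example
+ ///
+ /// This is a sketch: it assumes a tiny `nfa_size_limit` that the pattern
+ /// below easily exceeds.
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let err = Regex::builder()
+ ///     .configure(Regex::config().nfa_size_limit(Some(100)))
+ ///     .build(r"\w+")
+ ///     .unwrap_err();
+ /// assert_eq!(Some(100), err.size_limit());
+ /// ```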
+ pub fn size_limit(&self) -> Option<usize> {
+ match self.kind {
+ BuildErrorKind::NFA(ref err) => err.size_limit(),
+ _ => None,
+ }
+ }
+
+ /// If this error corresponds to a syntax error, then a reference to it is
+ /// returned by this method.
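+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// // \p{Foo} is not a valid Unicode property name.
+ /// let err = Regex::new(r"\p{Foo}").unwrap_err();
+ /// assert!(err.syntax_error().is_some());
+ /// ```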
+ pub fn syntax_error(&self) -> Option<&regex_syntax::Error> {
+ match self.kind {
+ BuildErrorKind::Syntax { ref err, .. } => Some(err),
+ _ => None,
+ }
+ }
+
+ pub(crate) fn ast(pid: PatternID, err: ast::Error) -> BuildError {
+ let err = regex_syntax::Error::from(err);
+ BuildError { kind: BuildErrorKind::Syntax { pid, err } }
+ }
+
+ pub(crate) fn hir(pid: PatternID, err: hir::Error) -> BuildError {
+ let err = regex_syntax::Error::from(err);
+ BuildError { kind: BuildErrorKind::Syntax { pid, err } }
+ }
+
+ pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError {
+ BuildError { kind: BuildErrorKind::NFA(err) }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for BuildError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ match self.kind {
+ BuildErrorKind::Syntax { ref err, .. } => Some(err),
+ BuildErrorKind::NFA(ref err) => Some(err),
+ }
+ }
+}
+
+impl core::fmt::Display for BuildError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match self.kind {
+ BuildErrorKind::Syntax { pid, .. } => {
+ write!(f, "error parsing pattern {}", pid.as_usize())
+ }
+ BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
+ }
+ }
+}
+
+/// An error that occurs when a search should be retried.
+///
+/// This retry error distinguishes between two different failure modes.
+///
+/// The first is one where potential quadratic behavior has been detected.
+/// In this case, whatever optimization that led to this behavior should be
+/// stopped, and the next best strategy should be used.
+///
+/// The second indicates that the underlying regex engine has failed for some
+/// reason. This usually occurs because either a lazy DFA's cache has become
+/// ineffective or because a non-ASCII byte has been seen *and* a Unicode word
+/// boundary was used in one of the patterns. In this failure case, a different
+/// regex engine that won't fail in these ways (PikeVM, backtracker or the
+/// one-pass DFA) should be used.
+///
+/// This is an internal error only and should never bleed into the public
+/// API.
+#[derive(Debug)]
+pub(crate) enum RetryError {
+ Quadratic(RetryQuadraticError),
+ Fail(RetryFailError),
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for RetryError {}
+
+impl core::fmt::Display for RetryError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match *self {
+ RetryError::Quadratic(ref err) => err.fmt(f),
+ RetryError::Fail(ref err) => err.fmt(f),
+ }
+ }
+}
+
+impl From<MatchError> for RetryError {
+ fn from(merr: MatchError) -> RetryError {
+ RetryError::Fail(RetryFailError::from(merr))
+ }
+}
+
+/// An error that occurs when potential quadratic behavior has been detected
+/// when applying either the "reverse suffix" or "reverse inner" optimizations.
+///
+/// When this error occurs, callers should abandon the "reverse" optimization
+/// and use a normal forward search.
+#[derive(Debug)]
+pub(crate) struct RetryQuadraticError(());
+
+impl RetryQuadraticError {
+ pub(crate) fn new() -> RetryQuadraticError {
+ RetryQuadraticError(())
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for RetryQuadraticError {}
+
+impl core::fmt::Display for RetryQuadraticError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ write!(f, "regex engine gave up to avoid quadratic behavior")
+ }
+}
+
+impl From<RetryQuadraticError> for RetryError {
+ fn from(err: RetryQuadraticError) -> RetryError {
+ RetryError::Quadratic(err)
+ }
+}
+
+/// An error that occurs when a regex engine "gives up" for some reason before
+/// finishing a search. Usually this occurs because of heuristic Unicode word
+/// boundary support or because of ineffective cache usage in the lazy DFA.
+///
+/// When this error occurs, callers should retry the regex search with a
+/// different regex engine.
+///
+/// Note that this has convenient `From` impls that will automatically
+/// convert a `MatchError` into this error. This works because the meta
+/// regex engine internals guarantee that errors like `HaystackTooLong` and
+/// `UnsupportedAnchored` will never occur. The only errors left are `Quit` and
+/// `GaveUp`, which both correspond to this "failure" error.
+#[derive(Debug)]
+pub(crate) struct RetryFailError {
+ offset: usize,
+}
+
+impl RetryFailError {
+ pub(crate) fn from_offset(offset: usize) -> RetryFailError {
+ RetryFailError { offset }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for RetryFailError {}
+
+impl core::fmt::Display for RetryFailError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ write!(f, "regex engine failed at offset {:?}", self.offset)
+ }
+}
+
+impl From<RetryFailError> for RetryError {
+ fn from(err: RetryFailError) -> RetryError {
+ RetryError::Fail(err)
+ }
+}
+
+impl From<MatchError> for RetryFailError {
+ fn from(merr: MatchError) -> RetryFailError {
+ use crate::util::search::MatchErrorKind::*;
+
+ match *merr.kind() {
+ Quit { offset, .. } => RetryFailError::from_offset(offset),
+ GaveUp { offset } => RetryFailError::from_offset(offset),
+ // These can never occur because we avoid them by construction
+ // or with higher level control flow logic. For example, the
+ // backtracker's wrapper will never hand out a backtracker engine
+ // when the haystack would be too long.
+ HaystackTooLong { .. } | UnsupportedAnchored { .. } => {
+ unreachable!("found impossible error in meta engine: {}", merr)
+ }
+ }
+ }
+}
diff --git a/vendor/regex-automata/src/meta/limited.rs b/vendor/regex-automata/src/meta/limited.rs
new file mode 100644
index 000000000..192a2625e
--- /dev/null
+++ b/vendor/regex-automata/src/meta/limited.rs
@@ -0,0 +1,267 @@
+/*!
+This module defines two bespoke reverse DFA searching routines. (One for the
+lazy DFA and one for the fully compiled DFA.) These routines differ from the
+usual ones by permitting the caller to specify a minimum starting position.
+That is, the search will begin at `input.end()` and will usually stop at
+`input.start()`, unless `min_start > input.start()`, in which case, the search
+will stop at `min_start`.
+
+In other words, this lets you say, "no, the search must not extend past this
+point, even if it's within the bounds of the given `Input`." And if the search
+*does* want to go past that point, it stops and returns a "may be quadratic"
+error, which indicates that the caller should retry using some other technique.
+
+These routines specifically exist to protect against quadratic behavior when
+employing the "reverse suffix" and "reverse inner" optimizations. Without the
+backstop these routines provide, it is possible for parts of the haystack to
+get re-scanned over and over again. The backstop not only prevents this, but
+*tells you when it is happening* so that you can change the strategy.
+
+Why can't we just use the normal search routines? We could use the normal
+search routines and just set the start bound on the provided `Input` to our
+`min_start` position. The problem here is that it's impossible to distinguish
+between "no match because we reached the end of input" and "determined there
+was no match well before the end of input." The former case is what we care
+about with respect to quadratic behavior. The latter case is totally fine.
+
+Why don't we modify the normal search routines to report the position at which
+the search stops? I considered this, and I still wonder if it is indeed the
+right thing to do. However, I think the straight-forward thing to do there
+would be to complicate the return type signature of almost every search routine
+in this crate, which I really do not want to do. It therefore might make more
+sense to provide a richer way for search routines to report meta data, but that
+was beyond my bandwidth to work on at the time of writing.
+
+See the 'opt/reverse-inner' and 'opt/reverse-suffix' benchmarks in rebar for a
+real demonstration of how quadratic behavior is mitigated.
+*/
+
+use crate::{
+ meta::error::{RetryError, RetryQuadraticError},
+ HalfMatch, Input, MatchError,
+};
+
+#[cfg(feature = "dfa-build")]
+pub(crate) fn dfa_try_search_half_rev(
+ dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
+ input: &Input<'_>,
+ min_start: usize,
+) -> Result<Option<HalfMatch>, RetryError> {
+ use crate::dfa::Automaton;
+
+ let mut mat = None;
+ let mut sid = dfa.start_state_reverse(input)?;
+ if input.start() == input.end() {
+ dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?;
+ return Ok(mat);
+ }
+ let mut at = input.end() - 1;
+ loop {
+ sid = dfa.next_state(sid, input.haystack()[at]);
+ if dfa.is_special_state(sid) {
+ if dfa.is_match_state(sid) {
+ let pattern = dfa.match_pattern(sid, 0);
+ // Since reverse searches report the beginning of a
+ // match and the beginning is inclusive (not exclusive
+ // like the end of a match), we add 1 to make it
+ // inclusive.
+ mat = Some(HalfMatch::new(pattern, at + 1));
+ } else if dfa.is_dead_state(sid) {
+ return Ok(mat);
+ } else if dfa.is_quit_state(sid) {
+ if mat.is_some() {
+ return Ok(mat);
+ }
+ return Err(MatchError::quit(input.haystack()[at], at).into());
+ }
+ }
+ if at == input.start() {
+ break;
+ }
+ at -= 1;
+ if at < min_start {
+ trace!(
+ "reached position {} which is before the previous literal \
+ match, quitting to avoid quadratic behavior",
+ at,
+ );
+ return Err(RetryError::Quadratic(RetryQuadraticError::new()));
+ }
+ }
+ let was_dead = dfa.is_dead_state(sid);
+ dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?;
+ // If we reach the beginning of the search and we could otherwise still
+ // potentially keep matching if there was more to match, then we actually
+ // return an error to indicate giving up on this optimization. Why? Because
+ // we can't prove that the real match begins at where we would report it.
+ //
+ // This only happens when all of the following are true:
+ //
+ // 1) We reach the starting point of our search span.
+ // 2) The match we found is before the starting point.
+ // 3) The FSM reports we could possibly find a longer match.
+ //
+ // We need (1) because otherwise the search stopped before the starting
+ // point and there is no possible way to find a more leftmost position.
+ //
+ // We need (2) because if the match found has an offset equal to the minimum
+ // possible offset, then there is no possible more leftmost match.
+ //
+ // We need (3) because if the FSM couldn't continue anyway (i.e., it's in
+ // a dead state), then we know we couldn't find anything more leftmost
+ // than what we have. (We have to check the state we were in prior to the
+ // EOI transition since the EOI transition will usually bring us to a dead
+ // state by virtue of representing the end-of-input.)
+ if at == input.start()
+ && mat.map_or(false, |m| m.offset() > input.start())
+ && !was_dead
+ {
+ trace!(
+ "reached beginning of search at offset {} without hitting \
+ a dead state, quitting to avoid potential false positive match",
+ at,
+ );
+ return Err(RetryError::Quadratic(RetryQuadraticError::new()));
+ }
+ Ok(mat)
+}
+
+#[cfg(feature = "hybrid")]
+pub(crate) fn hybrid_try_search_half_rev(
+ dfa: &crate::hybrid::dfa::DFA,
+ cache: &mut crate::hybrid::dfa::Cache,
+ input: &Input<'_>,
+ min_start: usize,
+) -> Result<Option<HalfMatch>, RetryError> {
+ let mut mat = None;
+ let mut sid = dfa.start_state_reverse(cache, input)?;
+ if input.start() == input.end() {
+ hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
+ return Ok(mat);
+ }
+ let mut at = input.end() - 1;
+ loop {
+ sid = dfa
+ .next_state(cache, sid, input.haystack()[at])
+ .map_err(|_| MatchError::gave_up(at))?;
+ if sid.is_tagged() {
+ if sid.is_match() {
+ let pattern = dfa.match_pattern(cache, sid, 0);
+ // Since reverse searches report the beginning of a
+ // match and the beginning is inclusive (not exclusive
+ // like the end of a match), we add 1 to make it
+ // inclusive.
+ mat = Some(HalfMatch::new(pattern, at + 1));
+ } else if sid.is_dead() {
+ return Ok(mat);
+ } else if sid.is_quit() {
+ if mat.is_some() {
+ return Ok(mat);
+ }
+ return Err(MatchError::quit(input.haystack()[at], at).into());
+ }
+ }
+ if at == input.start() {
+ break;
+ }
+ at -= 1;
+ if at < min_start {
+ trace!(
+ "reached position {} which is before the previous literal \
+ match, quitting to avoid quadratic behavior",
+ at,
+ );
+ return Err(RetryError::Quadratic(RetryQuadraticError::new()));
+ }
+ }
+ let was_dead = sid.is_dead();
+ hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
+ // See the comments in the full DFA routine above for why we need this.
+ if at == input.start()
+ && mat.map_or(false, |m| m.offset() > input.start())
+ && !was_dead
+ {
+ trace!(
+ "reached beginning of search at offset {} without hitting \
+ a dead state, quitting to avoid potential false positive match",
+ at,
+ );
+ return Err(RetryError::Quadratic(RetryQuadraticError::new()));
+ }
+ Ok(mat)
+}
+
+#[cfg(feature = "dfa-build")]
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn dfa_eoi_rev(
+ dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
+ input: &Input<'_>,
+ sid: &mut crate::util::primitives::StateID,
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ use crate::dfa::Automaton;
+
+ let sp = input.get_span();
+ if sp.start > 0 {
+ let byte = input.haystack()[sp.start - 1];
+ *sid = dfa.next_state(*sid, byte);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.start));
+ } else if dfa.is_quit_state(*sid) {
+ if mat.is_some() {
+ return Ok(());
+ }
+ return Err(MatchError::quit(byte, sp.start - 1));
+ }
+ } else {
+ *sid = dfa.next_eoi_state(*sid);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, 0));
+ }
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!dfa.is_quit_state(*sid));
+ }
+ Ok(())
+}
+
+#[cfg(feature = "hybrid")]
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn hybrid_eoi_rev(
+ dfa: &crate::hybrid::dfa::DFA,
+ cache: &mut crate::hybrid::dfa::Cache,
+ input: &Input<'_>,
+ sid: &mut crate::hybrid::LazyStateID,
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ let sp = input.get_span();
+ if sp.start > 0 {
+ let byte = input.haystack()[sp.start - 1];
+ *sid = dfa
+ .next_state(cache, *sid, byte)
+ .map_err(|_| MatchError::gave_up(sp.start))?;
+ if sid.is_match() {
+ let pattern = dfa.match_pattern(cache, *sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.start));
+ } else if sid.is_quit() {
+ if mat.is_some() {
+ return Ok(());
+ }
+ return Err(MatchError::quit(byte, sp.start - 1));
+ }
+ } else {
+ *sid = dfa
+ .next_eoi_state(cache, *sid)
+ .map_err(|_| MatchError::gave_up(sp.start))?;
+ if sid.is_match() {
+ let pattern = dfa.match_pattern(cache, *sid, 0);
+ *mat = Some(HalfMatch::new(pattern, 0));
+ }
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!sid.is_quit());
+ }
+ Ok(())
+}
diff --git a/vendor/regex-automata/src/meta/literal.rs b/vendor/regex-automata/src/meta/literal.rs
new file mode 100644
index 000000000..a68b93b7a
--- /dev/null
+++ b/vendor/regex-automata/src/meta/literal.rs
@@ -0,0 +1,81 @@
+use alloc::{vec, vec::Vec};
+
+use regex_syntax::hir::Hir;
+
+use crate::{meta::regex::RegexInfo, util::search::MatchKind};
+
+/// Pull out an alternation of literals from the given sequence of HIR
+/// expressions.
+///
+/// There are numerous ways for this to fail. Generally, this only applies
+/// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there
+/// are "too few" alternates, in which case, the regex engine is likely faster.
+///
+/// And currently, this only returns something when 'hirs.len() == 1'.
+pub(crate) fn alternation_literals(
+ info: &RegexInfo,
+ hirs: &[&Hir],
+) -> Option<Vec<Vec<u8>>> {
+ use regex_syntax::hir::{HirKind, Literal};
+
+ // Might as well skip the work below if we know we can't build an
+ // Aho-Corasick searcher.
+ if !cfg!(feature = "perf-literal-multisubstring") {
+ return None;
+ }
+ // This is pretty hacky, but basically, if `is_alternation_literal` is
+ // true, then we can make several assumptions about the structure of our
+ // HIR. This is what justifies the `unreachable!` statements below.
+ if hirs.len() != 1
+ || !info.props()[0].look_set().is_empty()
+ || info.props()[0].explicit_captures_len() > 0
+ || !info.props()[0].is_alternation_literal()
+ || info.config().get_match_kind() != MatchKind::LeftmostFirst
+ {
+ return None;
+ }
+ let hir = &hirs[0];
+ let alts = match *hir.kind() {
+ HirKind::Alternation(ref alts) => alts,
+ _ => return None, // one literal isn't worth it
+ };
+
+ let mut lits = vec![];
+ for alt in alts {
+ let mut lit = vec![];
+ match *alt.kind() {
+ HirKind::Literal(Literal(ref bytes)) => {
+ lit.extend_from_slice(bytes)
+ }
+ HirKind::Concat(ref exprs) => {
+ for e in exprs {
+ match *e.kind() {
+ HirKind::Literal(Literal(ref bytes)) => {
+ lit.extend_from_slice(bytes);
+ }
+ _ => unreachable!("expected literal, got {:?}", e),
+ }
+ }
+ }
+ _ => unreachable!("expected literal or concat, got {:?}", alt),
+ }
+ lits.push(lit);
+ }
+ // Why do this? Well, when the number of literals is small, it's likely
+ // that we'll use the lazy DFA which is in turn likely to be faster than
+ // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have
+ // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use
+ // the latter because it is so hungry (in time and space), and the former
+ // is decently fast, but not as fast as a well oiled lazy DFA.
+ //
+ // However, once the number starts getting large, the lazy DFA is likely
+ // to start thrashing because of the modest default cache size. When
+ // exactly does this happen? Dunno. But at whatever point that is (we make
+ // a guess below based on ad hoc benchmarking), we'll want to cut over to
+ // Aho-Corasick, where even the contiguous NFA is likely to do much better.
+ if lits.len() < 3000 {
+ debug!("skipping Aho-Corasick because there are too few literals");
+ return None;
+ }
+ Some(lits)
+}
diff --git a/vendor/regex-automata/src/meta/mod.rs b/vendor/regex-automata/src/meta/mod.rs
new file mode 100644
index 000000000..01f430fcb
--- /dev/null
+++ b/vendor/regex-automata/src/meta/mod.rs
@@ -0,0 +1,62 @@
+/*!
+Provides a regex matcher that composes several other regex matchers
+automatically.
+
+This module is home to a meta [`Regex`], which provides a convenient high
+level API for executing regular expressions in linear time.
+
+# Comparison with the `regex` crate
+
+A meta `Regex` is the implementation used directly by the `regex` crate.
+Indeed, the `regex` crate API is essentially just a light wrapper over a meta
+`Regex`. This means that if you need the full flexibility offered by this
+API, then you should be able to switch to using this API directly without
+any changes in match semantics or syntax. However, there are some API level
+differences:
+
+* The `regex` crate API returns match objects that include references to the
+haystack itself, which in turn makes it easy to access the matching strings
+without having to slice the haystack yourself. In contrast, a meta `Regex`
+returns match objects that only have offsets in them.
+* At time of writing, a meta `Regex` doesn't have some of the convenience
+routines that the `regex` crate has, such as replacements. Note though that
+[`Captures::interpolate_string`](crate::util::captures::Captures::interpolate_string)
+will handle the replacement string interpolation for you.
+* A meta `Regex` supports the [`Input`](crate::Input) abstraction, which
+provides a way to configure a search in more ways than is supported by the
+`regex` crate. For example, [`Input::anchored`](crate::Input::anchored) can
+be used to run an anchored search, regardless of whether the pattern is itself
+anchored with a `^`.
+* A meta `Regex` supports multi-pattern searching everywhere.
+Indeed, every [`Match`](crate::Match) returned by the search APIs
+includes a [`PatternID`](crate::PatternID) indicating which pattern
+matched. In the single pattern case, all matches correspond to
+[`PatternID::ZERO`](crate::PatternID::ZERO). In contrast, the `regex` crate
+has distinct `Regex` and `RegexSet` APIs. The former only supports a single
+pattern, while the latter supports multiple patterns but cannot report the
+offsets of a match.
+* A meta `Regex` provides the explicit capability of bypassing its internal
+memory pool for automatically acquiring mutable scratch space required by its
+internal regex engines. Namely, a [`Cache`] can be explicitly provided to lower
+level routines such as [`Regex::search_with`].
+
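+Here is a brief sketch of the multi-pattern support mentioned above:
+
+```
+use regex_automata::{meta::Regex, Match};
+
+let re = Regex::new_many(&[r"[0-9]{4}", r"[a-z]+"])?;
+let mut it = re.find_iter("abc 2024");
+// Every match reports the pattern that produced it.
+assert_eq!(Some(Match::must(1, 0..3)), it.next());
+assert_eq!(Some(Match::must(0, 4..8)), it.next());
+assert_eq!(None, it.next());
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+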
+*/
+
+pub use self::{
+ error::BuildError,
+ regex::{
+ Builder, Cache, CapturesMatches, Config, FindMatches, Regex, Split,
+ SplitN,
+ },
+};
+
+mod error;
+#[cfg(any(feature = "dfa-build", feature = "hybrid"))]
+mod limited;
+mod literal;
+mod regex;
+mod reverse_inner;
+#[cfg(any(feature = "dfa-build", feature = "hybrid"))]
+mod stopat;
+mod strategy;
+mod wrappers;
diff --git a/vendor/regex-automata/src/meta/regex.rs b/vendor/regex-automata/src/meta/regex.rs
new file mode 100644
index 000000000..3a04b14d8
--- /dev/null
+++ b/vendor/regex-automata/src/meta/regex.rs
@@ -0,0 +1,3649 @@
+use core::{
+ borrow::Borrow,
+ panic::{RefUnwindSafe, UnwindSafe},
+};
+
+use alloc::{boxed::Box, sync::Arc, vec, vec::Vec};
+
+use regex_syntax::{
+ ast,
+ hir::{self, Hir},
+};
+
+use crate::{
+ meta::{
+ error::BuildError,
+ strategy::{self, Strategy},
+ wrappers,
+ },
+ nfa::thompson::WhichCaptures,
+ util::{
+ captures::{Captures, GroupInfo},
+ iter,
+ pool::{Pool, PoolGuard},
+ prefilter::Prefilter,
+ primitives::{NonMaxUsize, PatternID},
+ search::{HalfMatch, Input, Match, MatchKind, PatternSet, Span},
+ },
+};
+
+/// A type alias for our pool of meta::Cache that fixes the type parameters to
+/// what we use for the meta regex below.
+type CachePool = Pool<Cache, CachePoolFn>;
+
+/// Same as above, but for the guard returned by a pool.
+type CachePoolGuard<'a> = PoolGuard<'a, Cache, CachePoolFn>;
+
+/// The type of the closure we use to create new caches. We need to spell out
+/// all of the marker traits or else we risk leaking !MARKER impls.
+type CachePoolFn =
+ Box<dyn Fn() -> Cache + Send + Sync + UnwindSafe + RefUnwindSafe>;
+
+/// A regex matcher that works by composing several other regex matchers
+/// automatically.
+///
+/// In effect, a meta regex papers over a lot of the quirks or performance
+/// problems in each of the regex engines in this crate. Its goal is to provide
+/// an infallible and simple API that "just does the right thing" in the common
+/// case.
+///
+/// A meta regex is the implementation of a `Regex` in the `regex` crate.
+/// Indeed, the `regex` crate API is essentially just a light wrapper over
+/// this type. This includes the `regex` crate's `RegexSet` API!
+///
+/// # Composition
+///
+/// This is called a "meta" matcher precisely because it uses other regex
+/// matchers to provide a convenient high level regex API. Here are some
+/// examples of how other regex matchers are composed:
+///
+/// * When calling [`Regex::captures`], instead of immediately
+/// running a slower but more capable regex engine like the
+/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM), the meta regex engine
+/// will usually first look for the bounds of a match with a higher throughput
+/// regex engine like a [lazy DFA](crate::hybrid). Only when a match is found
+/// is a slower engine like `PikeVM` used to find the matching span for each
+/// capture group.
+/// * While higher throughput engines like the lazy DFA cannot handle
+/// Unicode word boundaries in general, they can still be used on pure ASCII
+/// haystacks by pretending that Unicode word boundaries are just plain ASCII
+/// word boundaries. However, if a haystack is not ASCII, the meta regex engine
+/// will automatically switch to a (possibly slower) regex engine that supports
+/// Unicode word boundaries in general.
+/// * In some cases where a regex pattern is just a simple literal or a small
+/// set of literals, an actual regex engine won't be used at all. Instead,
+/// substring or multi-substring search algorithms will be employed.
+///
+/// There are many other forms of composition happening too, but the above
+/// should give a general idea. In particular, it may perhaps be surprising
+/// that *multiple* regex engines might get executed for a single search. That
+/// is, the decision of what regex engine to use is not _just_ based on the
+/// pattern, but also based on the dynamic execution of the search itself.
+///
+/// The primary reason for this composition is performance. The fundamental
+/// tension is that the faster engines tend to be less capable, and the more
+/// capable engines tend to be slower.
+///
+/// Note that the forms of composition that are allowed are determined by
+/// compile time crate features and configuration. For example, if the `hybrid`
+/// feature isn't enabled, or if [`Config::hybrid`] has been disabled, then the
+/// meta regex engine will never use a lazy DFA.
+///
+/// # Synchronization and cloning
+///
+/// Most of the regex engines in this crate require some kind of mutable
+/// "scratch" space to read and write from while performing a search. Since
+/// a meta regex composes these regex engines, a meta regex also requires
+/// mutable scratch space. This scratch space is called a [`Cache`].
+///
+/// Most regex engines _also_ usually have a read-only component, typically
+/// a [Thompson `NFA`](crate::nfa::thompson::NFA).
+///
+/// In order to make the `Regex` API convenient, most of the routines hide
+/// the fact that a `Cache` is needed at all. To achieve this, a [memory
+/// pool](crate::util::pool::Pool) is used internally to retrieve `Cache`
+/// values in a thread safe way that also permits reuse. This in turn implies
+/// that every such search call requires some form of synchronization. Usually
+/// this synchronization is fast enough to not notice, but in some cases, it
+/// can be a bottleneck. This typically occurs when all of the following are
+/// true:
+///
+/// * The same `Regex` is shared across multiple threads simultaneously,
+/// usually via a [`util::lazy::Lazy`](crate::util::lazy::Lazy) or something
+/// similar from the `once_cell` or `lazy_static` crates.
+/// * The primary unit of work in each thread is a regex search.
+/// * Searches are run on very short haystacks.
+///
+/// This particular case can lead to high contention on the pool used by a
+/// `Regex` internally, which can in turn increase latency to a noticeable
+/// effect. This cost can be mitigated in one of the following ways:
+///
+/// * Use a distinct copy of a `Regex` in each thread, usually by cloning it.
+/// Cloning a `Regex` _does not_ do a deep copy of its read-only component.
+/// But it does lead to each `Regex` having its own memory pool, which in
+/// turn eliminates the problem of contention. In general, this technique should
+/// not result in any additional memory usage when compared to sharing the same
+/// `Regex` across multiple threads simultaneously.
+/// * Use lower level APIs, like [`Regex::search_with`], which permit passing
+/// a `Cache` explicitly. In this case, it is up to you to determine how best
+/// to provide a `Cache`. For example, you might put a `Cache` in thread-local
+/// storage if your use case allows for it.
+///
+/// Overall, this is an issue that happens rarely in practice, but it can
+/// happen.
+///
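+/// For instance, here is a sketch of the second mitigation: passing a `Cache`
+/// explicitly with [`Regex::search_with`]:
+///
+/// ```
+/// use regex_automata::{meta::Regex, Input, Match};
+///
+/// let re = Regex::new(r"[0-9]{4}")?;
+/// // Create the cache once and reuse it, bypassing the internal pool.
+/// let mut cache = re.create_cache();
+/// let input = Input::new("the year 2024");
+/// assert_eq!(Some(Match::must(0, 9..13)), re.search_with(&mut cache, &input));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///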
+/// # Warning: spin-locks may be used in alloc-only mode
+///
+/// When this crate is built without the `std` feature and the high level APIs
+/// on a `Regex` are used, then a spin-lock will be used to synchronize access
+/// to an internal pool of `Cache` values. This may be undesirable because
+/// a spin-lock is [effectively impossible to implement correctly in user
+/// space][spinlocks-are-bad]. That is, more concretely, the spin-lock could
+/// result in a deadlock.
+///
+/// [spinlocks-are-bad]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
+///
+/// If one wants to avoid the use of spin-locks when the `std` feature is
+/// disabled, then you must use APIs that accept a `Cache` value explicitly.
+/// For example, [`Regex::search_with`].
+///
+/// # Example
+///
+/// ```
+/// use regex_automata::meta::Regex;
+///
+/// let re = Regex::new(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")?;
+/// assert!(re.is_match("2010-03-14"));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: anchored search
+///
+/// This example shows how to use [`Input::anchored`] to run an anchored
+/// search, even when the regex pattern itself isn't anchored. An anchored
+/// search guarantees that if a match is found, then the start offset of the
+/// match corresponds to the offset at which the search was started.
+///
+/// ```
+/// use regex_automata::{meta::Regex, Anchored, Input, Match};
+///
+/// let re = Regex::new(r"\bfoo\b")?;
+/// let input = Input::new("xx foo xx").range(3..).anchored(Anchored::Yes);
+/// // The offsets are in terms of the original haystack.
+/// assert_eq!(Some(Match::must(0, 3..6)), re.find(input));
+///
+/// // Notice that no match occurs here, because \b still takes the
+/// // surrounding context into account, even if it means looking back
+/// // before the start of your search.
+/// let hay = "xxfoo xx";
+/// let input = Input::new(hay).range(2..).anchored(Anchored::Yes);
+/// assert_eq!(None, re.find(input));
+/// // Indeed, you cannot achieve the above by simply slicing the
+/// // haystack itself, since the regex engine can't see the
+/// // surrounding context. This is why 'Input' permits setting
+/// // the bounds of a search!
+/// let input = Input::new(&hay[2..]).anchored(Anchored::Yes);
+/// // WRONG!
+/// assert_eq!(Some(Match::must(0, 0..3)), re.find(input));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: earliest search
+///
+/// This example shows how to use [`Input::earliest`] to run a search that
+/// might stop before finding the typical leftmost match.
+///
+/// ```
+/// use regex_automata::{meta::Regex, Anchored, Input, Match};
+///
+/// let re = Regex::new(r"[a-z]{3}|b")?;
+/// let input = Input::new("abc").earliest(true);
+/// assert_eq!(Some(Match::must(0, 1..2)), re.find(input));
+///
+/// // Note that "earliest" isn't really a match semantic unto itself.
+/// // Instead, it is merely an instruction to whatever regex engine
+/// // gets used internally to quit as soon as it can. For example,
+/// // this regex uses a different search technique, and winds up
+/// // producing a different (but valid) match!
+/// let re = Regex::new(r"abc|b")?;
+/// let input = Input::new("abc").earliest(true);
+/// assert_eq!(Some(Match::must(0, 0..3)), re.find(input));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: change the line terminator
+///
+/// This example shows how to enable multi-line mode by default and change
+/// the line terminator to the NUL byte:
+///
+/// ```
+/// use regex_automata::{meta::Regex, util::syntax, Match};
+///
+/// let re = Regex::builder()
+/// .syntax(syntax::Config::new().multi_line(true))
+/// .configure(Regex::config().line_terminator(b'\x00'))
+/// .build(r"^foo$")?;
+/// let hay = "\x00foo\x00";
+/// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Debug)]
+pub struct Regex {
+ /// The actual regex implementation.
+ imp: Arc<RegexI>,
+ /// A thread safe pool of caches.
+ ///
+ /// For the higher level search APIs, a `Cache` is automatically plucked
+ /// from this pool before running a search. The lower level `with` methods
+ /// permit the caller to provide their own cache, thereby bypassing
+ /// accesses to this pool.
+ ///
+ /// Note that we put this outside the `Arc` so that cloning a `Regex`
+ /// results in creating a fresh `CachePool`. This in turn permits callers
+ /// to clone regexes into separate threads where each such regex gets
+ /// the pool's "thread owner" optimization. Otherwise, if one shares the
+ /// `Regex` directly, then the pool will go through a slower mutex path for
+ /// all threads except for the "owner."
+ pool: CachePool,
+}
+
+/// The internal implementation of `Regex`, split out so that it can be wrapped
+/// in an `Arc`.
+#[derive(Debug)]
+struct RegexI {
+ /// The core matching engine.
+ ///
+ /// Why is this reference counted when RegexI is already wrapped in an Arc?
+ /// Well, we need to capture this in a closure to our `Pool` below in order
+ /// to create new `Cache` values when needed. So since it needs to be in
+ /// two places, we make it reference counted.
+ ///
+ /// We make `RegexI` itself reference counted too so that `Regex` itself
+ /// stays extremely small and very cheap to clone.
+ strat: Arc<dyn Strategy>,
+ /// Metadata about the regexes driving the strategy. The metadata is also
+ /// usually stored inside the strategy too, but we put it here as well
+ /// so that we can get quick access to it (without virtual calls) before
+ /// executing the regex engine. For example, we use this metadata to
+ /// detect a subset of cases where we know a match is impossible, and can
+ /// thus avoid calling into the strategy at all.
+ ///
+ /// Since `RegexInfo` is stored in multiple places, it is also reference
+ /// counted.
+ info: RegexInfo,
+}
+
+/// Convenience constructors for a `Regex` using the default configuration.
+impl Regex {
+ /// Builds a `Regex` from a single pattern string using the default
+ /// configuration.
+ ///
+ /// If there was a problem parsing the pattern or a problem turning it into
+ /// a regex matcher, then an error is returned.
+ ///
+ /// If you want to change the configuration of a `Regex`, use a [`Builder`]
+ /// with a [`Config`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Match};
+ ///
+ /// let re = Regex::new(r"(?Rm)^foo$")?;
+ /// let hay = "\r\nfoo\r\n";
+ /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new(pattern: &str) -> Result<Regex, BuildError> {
+ Regex::builder().build(pattern)
+ }
+
+ /// Builds a `Regex` from many pattern strings using the default
+ /// configuration.
+ ///
+ /// If there was a problem parsing any of the patterns or a problem turning
+ /// them into a regex matcher, then an error is returned.
+ ///
+ /// If you want to change the configuration of a `Regex`, use a [`Builder`]
+ /// with a [`Config`].
+ ///
+ /// # Example: simple lexer
+ ///
+ /// This simplistic example leverages the multi-pattern support to build a
+ /// simple little lexer. The pattern ID in the match tells you which regex
+ /// matched, which in turn might be used to map back to the "type" of the
+ /// token returned by the lexer.
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Match};
+ ///
+ /// let re = Regex::new_many(&[
+ /// r"[[:space:]]",
+ /// r"[A-Za-z0-9][A-Za-z0-9_]+",
+ /// r"->",
+ /// r".",
+ /// ])?;
+ /// let haystack = "fn is_boss(bruce: i32, springsteen: String) -> bool;";
+ /// let matches: Vec<Match> = re.find_iter(haystack).collect();
+ /// assert_eq!(matches, vec![
+ /// Match::must(1, 0..2), // 'fn'
+ /// Match::must(0, 2..3), // ' '
+ /// Match::must(1, 3..10), // 'is_boss'
+ /// Match::must(3, 10..11), // '('
+ /// Match::must(1, 11..16), // 'bruce'
+ /// Match::must(3, 16..17), // ':'
+ /// Match::must(0, 17..18), // ' '
+ /// Match::must(1, 18..21), // 'i32'
+ /// Match::must(3, 21..22), // ','
+ /// Match::must(0, 22..23), // ' '
+ /// Match::must(1, 23..34), // 'springsteen'
+ /// Match::must(3, 34..35), // ':'
+ /// Match::must(0, 35..36), // ' '
+ /// Match::must(1, 36..42), // 'String'
+ /// Match::must(3, 42..43), // ')'
+ /// Match::must(0, 43..44), // ' '
+ /// Match::must(2, 44..46), // '->'
+ /// Match::must(0, 46..47), // ' '
+ /// Match::must(1, 47..51), // 'bool'
+ /// Match::must(3, 51..52), // ';'
+ /// ]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// One can write a lexer like the above using a regex like
+ /// `(?P<space>[[:space:]])|(?P<ident>[A-Za-z0-9][A-Za-z0-9_]+)|...`,
+    /// but then you need to ask which capture group matched in order to
+    /// determine which branch of the regex matched, and thus, which token
+    /// the match corresponds to. In contrast, the above example includes the
+    /// pattern ID in the match. There's no need to use capture groups at all.
+ ///
+ /// # Example: finding the pattern that caused an error
+ ///
+ /// When a syntax error occurs, it is possible to ask which pattern
+ /// caused the syntax error.
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, PatternID};
+ ///
+ /// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err();
+ /// assert_eq!(Some(PatternID::must(2)), err.pattern());
+ /// ```
+ ///
+ /// # Example: zero patterns is valid
+ ///
+ /// Building a regex with zero patterns results in a regex that never
+ /// matches anything. Because this routine is generic, passing an empty
+ /// slice usually requires a turbo-fish (or something else to help type
+ /// inference).
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, util::syntax, Match};
+ ///
+ /// let re = Regex::new_many::<&str>(&[])?;
+ /// assert_eq!(None, re.find(""));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<Regex, BuildError> {
+ Regex::builder().build_many(patterns)
+ }
+
+ /// Return a default configuration for a `Regex`.
+ ///
+ /// This is a convenience routine to avoid needing to import the [`Config`]
+ /// type when customizing the construction of a `Regex`.
+ ///
+ /// # Example: lower the NFA size limit
+ ///
+ /// In some cases, the default size limit might be too big. The size limit
+ /// can be lowered, which will prevent large regex patterns from compiling.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let result = Regex::builder()
+ /// .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10))))
+ /// // Not even 20KB is enough to build a single large Unicode class!
+ /// .build(r"\pL");
+ /// assert!(result.is_err());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Return a builder for configuring the construction of a `Regex`.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases.
+ ///
+ /// # Example: change the line terminator
+ ///
+ /// This example shows how to enable multi-line mode by default and change
+ /// the line terminator to the NUL byte:
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, util::syntax, Match};
+ ///
+ /// let re = Regex::builder()
+ /// .syntax(syntax::Config::new().multi_line(true))
+ /// .configure(Regex::config().line_terminator(b'\x00'))
+ /// .build(r"^foo$")?;
+ /// let hay = "\x00foo\x00";
+ /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+}
+
+/// High level convenience routines for using a regex to search a haystack.
+impl Regex {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+    /// This routine may short-circuit if it knows that scanning future input
+ /// will never lead to a different result. (Consider how this might make
+ /// a difference given the regex `a+` on the haystack `aaaaaaaaaaaaaaa`.
+ /// This routine _may_ stop after it sees the first `a`, but routines like
+ /// `find` need to continue searching because `+` is greedy by default.)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ ///
+ /// assert!(re.is_match("foo12345bar"));
+ /// assert!(!re.is_match("foobar"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: consistency with search APIs
+ ///
+ /// `is_match` is guaranteed to return `true` whenever `find` returns a
+ /// match. This includes searches that are executed entirely within a
+ /// codepoint:
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Input};
+ ///
+ /// let re = Regex::new("a*")?;
+ ///
+ /// // This doesn't match because the default configuration bans empty
+ /// // matches from splitting a codepoint.
+ /// assert!(!re.is_match(Input::new("☃").span(1..2)));
+ /// assert_eq!(None, re.find(Input::new("☃").span(1..2)));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+    /// Notice that when UTF-8 mode is disabled, the above reports a
+ /// match because the restriction against zero-width matches that split a
+ /// codepoint has been lifted:
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Input, Match};
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().utf8_empty(false))
+ /// .build("a*")?;
+ ///
+ /// assert!(re.is_match(Input::new("☃").span(1..2)));
+ /// assert_eq!(
+ /// Some(Match::must(0, 1..1)),
+ /// re.find(Input::new("☃").span(1..2)),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// A similar idea applies when using line anchors with CRLF mode enabled,
+ /// which prevents them from matching between a `\r` and a `\n`.
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Input, Match};
+ ///
+ /// let re = Regex::new(r"(?Rm:$)")?;
+ /// assert!(!re.is_match(Input::new("\r\n").span(1..1)));
+ /// // A regular line anchor, which only considers \n as a
+ /// // line terminator, will match.
+ /// let re = Regex::new(r"(?m:$)")?;
+ /// assert!(re.is_match(Input::new("\r\n").span(1..1)));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
+ let input = input.into().earliest(true);
+ if self.imp.info.is_impossible(&input) {
+ return false;
+ }
+ let mut guard = self.pool.get();
+ let result = self.imp.strat.is_match(&mut guard, &input);
+ // See 'Regex::search' for why we put the guard back explicitly.
+ PoolGuard::put(guard);
+ result
+ }
+
+ /// Executes a leftmost search and returns the first match that is found,
+ /// if one exists.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Match};
+ ///
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(Some(Match::must(0, 0..8)), re.find("foo12345"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
+ self.search(&input.into())
+ }
+
+ /// Executes a leftmost forward search and writes the spans of capturing
+ /// groups that participated in a match into the provided [`Captures`]
+ /// value. If no match was found, then [`Captures::is_match`] is guaranteed
+ /// to return `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Span};
+ ///
+ /// let re = Regex::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
+ /// let mut caps = re.create_captures();
+ ///
+ /// re.captures("2010-03-14", &mut caps);
+ /// assert!(caps.is_match());
+ /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
+ /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
+ /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn captures<'h, I: Into<Input<'h>>>(
+ &self,
+ input: I,
+ caps: &mut Captures,
+ ) {
+ self.search_captures(&input.into(), caps)
+ }
+
+ /// Returns an iterator over all non-overlapping leftmost matches in
+ /// the given haystack. If no match exists, then the iterator yields no
+ /// elements.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Match};
+ ///
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// let haystack = "foo1 foo12 foo123";
+ /// let matches: Vec<Match> = re.find_iter(haystack).collect();
+ /// assert_eq!(matches, vec![
+ /// Match::must(0, 0..4),
+ /// Match::must(0, 5..10),
+ /// Match::must(0, 11..17),
+ /// ]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
+ &'r self,
+ input: I,
+ ) -> FindMatches<'r, 'h> {
+ let cache = self.pool.get();
+ let it = iter::Searcher::new(input.into());
+ FindMatches { re: self, cache, it }
+ }
+
+ /// Returns an iterator over all non-overlapping `Captures` values. If no
+ /// match exists, then the iterator yields no elements.
+ ///
+ /// This yields the same matches as [`Regex::find_iter`], but it includes
+ /// the spans of all capturing groups that participate in each match.
+ ///
+ /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
+ /// how to correctly iterate over all matches in a haystack while avoiding
+ /// the creation of a new `Captures` value for every match. (Which you are
+ /// forced to do with an `Iterator`.)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Span};
+ ///
+ /// let re = Regex::new("foo(?P<numbers>[0-9]+)")?;
+ ///
+ /// let haystack = "foo1 foo12 foo123";
+ /// let matches: Vec<Span> = re
+ /// .captures_iter(haystack)
+ /// // The unwrap is OK since 'numbers' matches if the pattern matches.
+ /// .map(|caps| caps.get_group_by_name("numbers").unwrap())
+ /// .collect();
+ /// assert_eq!(matches, vec![
+ /// Span::from(3..4),
+ /// Span::from(8..10),
+ /// Span::from(14..17),
+ /// ]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn captures_iter<'r, 'h, I: Into<Input<'h>>>(
+ &'r self,
+ input: I,
+ ) -> CapturesMatches<'r, 'h> {
+ let cache = self.pool.get();
+ let caps = self.create_captures();
+ let it = iter::Searcher::new(input.into());
+ CapturesMatches { re: self, cache, caps, it }
+ }
+
+ /// Returns an iterator of spans of the haystack given, delimited by a
+ /// match of the regex. Namely, each element of the iterator corresponds to
+ /// a part of the haystack that *isn't* matched by the regular expression.
+ ///
+ /// # Example
+ ///
+ /// To split a string delimited by arbitrary amounts of spaces or tabs:
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new(r"[ \t]+")?;
+ /// let hay = "a b \t c\td e";
+ /// let fields: Vec<&str> = re.split(hay).map(|span| &hay[span]).collect();
+ /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: more cases
+ ///
+ /// Basic usage:
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new(r" ")?;
+ /// let hay = "Mary had a little lamb";
+ /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["Mary", "had", "a", "little", "lamb"]);
+ ///
+ /// let re = Regex::new(r"X")?;
+ /// let hay = "";
+ /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec![""]);
+ ///
+ /// let re = Regex::new(r"X")?;
+ /// let hay = "lionXXtigerXleopard";
+ /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["lion", "", "tiger", "leopard"]);
+ ///
+ /// let re = Regex::new(r"::")?;
+ /// let hay = "lion::tiger::leopard";
+ /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["lion", "tiger", "leopard"]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// If a haystack contains multiple contiguous matches, you will end up
+ /// with empty spans yielded by the iterator:
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new(r"X")?;
+ /// let hay = "XXXXaXXbXc";
+ /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]);
+ ///
+ /// let re = Regex::new(r"/")?;
+ /// let hay = "(///)";
+ /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["(", "", "", ")"]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Separators at the start or end of a haystack are neighbored by empty
+ /// spans.
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new(r"0")?;
+ /// let hay = "010";
+ /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["", "1", ""]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// When the empty string is used as a regex, it splits at every valid
+ /// UTF-8 boundary by default (which includes the beginning and end of the
+ /// haystack):
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new(r"")?;
+ /// let hay = "rust";
+ /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["", "r", "u", "s", "t", ""]);
+ ///
+ /// // Splitting by an empty string is UTF-8 aware by default!
+ /// let re = Regex::new(r"")?;
+ /// let hay = "☃";
+ /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["", "☃", ""]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// But note that UTF-8 mode for empty strings can be disabled, which will
+ /// then result in a match at every byte offset in the haystack,
+ /// including between every UTF-8 code unit.
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().utf8_empty(false))
+ /// .build(r"")?;
+ /// let hay = "☃".as_bytes();
+ /// let got: Vec<&[u8]> = re.split(hay).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec![
+ /// // Writing byte string slices is just brutal. The problem is that
+ /// // b"foo" has type &[u8; 3] instead of &[u8].
+ /// &[][..], &[b'\xE2'][..], &[b'\x98'][..], &[b'\x83'][..], &[][..],
+ /// ]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+    /// Contiguous separators (which commonly show up with whitespace) can
+    /// lead to possibly surprising behavior. For example, this code is
+    /// correct:
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new(r" ")?;
+ /// let hay = " a b c";
+ /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// It does *not* give you `["a", "b", "c"]`. For that behavior, you'd want
+ /// to match contiguous space characters:
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new(r" +")?;
+ /// let hay = " a b c";
+ /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect();
+ /// // N.B. This does still include a leading empty span because ' +'
+ /// // matches at the beginning of the haystack.
+ /// assert_eq!(got, vec!["", "a", "b", "c"]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn split<'r, 'h, I: Into<Input<'h>>>(
+ &'r self,
+ input: I,
+ ) -> Split<'r, 'h> {
+ Split { finder: self.find_iter(input), last: 0 }
+ }
+
+ /// Returns an iterator of at most `limit` spans of the haystack given,
+ /// delimited by a match of the regex. (A `limit` of `0` will return no
+ /// spans.) Namely, each element of the iterator corresponds to a part
+ /// of the haystack that *isn't* matched by the regular expression. The
+ /// remainder of the haystack that is not split will be the last element in
+ /// the iterator.
+ ///
+ /// # Example
+ ///
+ /// Get the first two words in some haystack:
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new(r"\W+").unwrap();
+ /// let hay = "Hey! How are you?";
+ /// let fields: Vec<&str> =
+ /// re.splitn(hay, 3).map(|span| &hay[span]).collect();
+ /// assert_eq!(fields, vec!["Hey", "How", "are you?"]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+    /// # Example: more cases
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new(r" ")?;
+ /// let hay = "Mary had a little lamb";
+ /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["Mary", "had", "a little lamb"]);
+ ///
+ /// let re = Regex::new(r"X")?;
+ /// let hay = "";
+ /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec![""]);
+ ///
+ /// let re = Regex::new(r"X")?;
+ /// let hay = "lionXXtigerXleopard";
+ /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["lion", "", "tigerXleopard"]);
+ ///
+ /// let re = Regex::new(r"::")?;
+ /// let hay = "lion::tiger::leopard";
+ /// let got: Vec<&str> = re.splitn(hay, 2).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["lion", "tiger::leopard"]);
+ ///
+ /// let re = Regex::new(r"X")?;
+ /// let hay = "abcXdef";
+ /// let got: Vec<&str> = re.splitn(hay, 1).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["abcXdef"]);
+ ///
+ /// let re = Regex::new(r"X")?;
+ /// let hay = "abcdef";
+ /// let got: Vec<&str> = re.splitn(hay, 2).map(|sp| &hay[sp]).collect();
+ /// assert_eq!(got, vec!["abcdef"]);
+ ///
+ /// let re = Regex::new(r"X")?;
+ /// let hay = "abcXdef";
+ /// let got: Vec<&str> = re.splitn(hay, 0).map(|sp| &hay[sp]).collect();
+ /// assert!(got.is_empty());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn splitn<'r, 'h, I: Into<Input<'h>>>(
+ &'r self,
+ input: I,
+ limit: usize,
+ ) -> SplitN<'r, 'h> {
+ SplitN { splits: self.split(input), limit }
+ }
+}
+
+/// Lower level search routines that give more control.
+impl Regex {
+ /// Returns the start and end offset of the leftmost match. If no match
+ /// exists, then `None` is returned.
+ ///
+    /// This is like [`Regex::find`], but it accepts a concrete `&Input`
+ /// instead of an `Into<Input>`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Input, Match};
+ ///
+ /// let re = Regex::new(r"Samwise|Sam")?;
+ /// let input = Input::new(
+ /// "one of the chief characters, Samwise the Brave",
+ /// );
+ /// assert_eq!(Some(Match::must(0, 29..36)), re.search(&input));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn search(&self, input: &Input<'_>) -> Option<Match> {
+ if self.imp.info.is_impossible(input) {
+ return None;
+ }
+ let mut guard = self.pool.get();
+ let result = self.imp.strat.search(&mut guard, input);
+ // We do this dance with the guard and explicitly put it back in the
+ // pool because it seems to result in better codegen. If we let the
+ // guard's Drop impl put it back in the pool, then functions like
+ // ptr::drop_in_place get called and they *don't* get inlined. This
+ // isn't usually a big deal, but in latency sensitive benchmarks the
+ // extra function call can matter.
+ //
+ // I used `rebar measure -f '^grep/every-line$' -e meta` to measure
+ // the effects here.
+ //
+ // Note that this doesn't eliminate the latency effects of using the
+ // pool. There is still some (minor) cost for the "thread owner" of the
+ // pool. (i.e., The thread that first calls a regex search routine.)
+ // However, for other threads using the regex, the pool access can be
+ // quite expensive as it goes through a mutex. Callers can avoid this
+ // by either cloning the Regex (which creates a distinct copy of the
+ // pool), or callers can use the lower level APIs that accept a 'Cache'
+ // directly and do their own handling.
+ PoolGuard::put(guard);
+ result
+ }
+
+ /// Returns the end offset of the leftmost match. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This is distinct from [`Regex::search`] in that it only returns the end
+ /// of a match and not the start of the match. Depending on a variety of
+ /// implementation details, this _may_ permit the regex engine to do less
+ /// overall work. For example, if a DFA is being used to execute a search,
+ /// then the start of a match usually requires running a separate DFA in
+    /// reverse to find the start of a match. If one only needs the end of
+ /// a match, then the separate reverse scan to find the start of a match
+    /// can be skipped. (Note that, when possible, the reverse scan is
+    /// avoided even when using `Regex::search`, for example, in the case of
+    /// an anchored search.)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Input, HalfMatch};
+ ///
+ /// let re = Regex::new(r"Samwise|Sam")?;
+ /// let input = Input::new(
+ /// "one of the chief characters, Samwise the Brave",
+ /// );
+ /// assert_eq!(Some(HalfMatch::must(0, 36)), re.search_half(&input));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn search_half(&self, input: &Input<'_>) -> Option<HalfMatch> {
+ if self.imp.info.is_impossible(input) {
+ return None;
+ }
+ let mut guard = self.pool.get();
+ let result = self.imp.strat.search_half(&mut guard, input);
+ // See 'Regex::search' for why we put the guard back explicitly.
+ PoolGuard::put(guard);
+ result
+ }
+
+ /// Executes a leftmost forward search and writes the spans of capturing
+ /// groups that participated in a match into the provided [`Captures`]
+ /// value. If no match was found, then [`Captures::is_match`] is guaranteed
+ /// to return `false`.
+ ///
+ /// This is like [`Regex::captures`], but it accepts a concrete `&Input`
+ /// instead of an `Into<Input>`.
+ ///
+ /// # Example: specific pattern search
+ ///
+ /// This example shows how to build a multi-pattern `Regex` that permits
+ /// searching for specific patterns.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// meta::Regex,
+ /// Anchored, Match, PatternID, Input,
+ /// };
+ ///
+ /// let re = Regex::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
+ /// let mut caps = re.create_captures();
+ /// let haystack = "foo123";
+ ///
+ /// // Since we are using the default leftmost-first match and both
+ /// // patterns match at the same starting position, only the first pattern
+ /// // will be returned in this case when doing a search for any of the
+ /// // patterns.
+ /// let expected = Some(Match::must(0, 0..6));
+ /// re.search_captures(&Input::new(haystack), &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// // But if we want to check whether some other pattern matches, then we
+ /// // can provide its pattern ID.
+ /// let expected = Some(Match::must(1, 0..6));
+ /// let input = Input::new(haystack)
+ /// .anchored(Anchored::Pattern(PatternID::must(1)));
+ /// re.search_captures(&input, &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specifying the bounds of a search
+ ///
+ /// This example shows how providing the bounds of a search can produce
+ /// different results than simply sub-slicing the haystack.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{meta::Regex, Match, Input};
+ ///
+ /// let re = Regex::new(r"\b[0-9]{3}\b")?;
+ /// let mut caps = re.create_captures();
+ /// let haystack = "foo123bar";
+ ///
+ /// // Since we sub-slice the haystack, the search doesn't know about
+ /// // the larger context and assumes that `123` is surrounded by word
+ /// // boundaries. And of course, the match position is reported relative
+ /// // to the sub-slice as well, which means we get `0..3` instead of
+ /// // `3..6`.
+ /// let expected = Some(Match::must(0, 0..3));
+ /// let input = Input::new(&haystack[3..6]);
+ /// re.search_captures(&input, &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// // But if we provide the bounds of the search within the context of the
+ /// // entire haystack, then the search can take the surrounding context
+ /// // into account. (And if we did find a match, it would be reported
+ /// // as a valid offset into `haystack` instead of its sub-slice.)
+ /// let expected = None;
+ /// let input = Input::new(haystack).range(3..6);
+ /// re.search_captures(&input, &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn search_captures(&self, input: &Input<'_>, caps: &mut Captures) {
+ caps.set_pattern(None);
+ let pid = self.search_slots(input, caps.slots_mut());
+ caps.set_pattern(pid);
+ }
+
+ /// Executes a leftmost forward search and writes the spans of capturing
+ /// groups that participated in a match into the provided `slots`, and
+ /// returns the matching pattern ID. The contents of the slots for patterns
+ /// other than the matching pattern are unspecified. If no match was found,
+    /// then `None` is returned and the contents of `slots` are unspecified.
+ ///
+ /// This is like [`Regex::search`], but it accepts a raw slots slice
+ /// instead of a `Captures` value. This is useful in contexts where you
+ /// don't want or need to allocate a `Captures`.
+ ///
+ /// It is legal to pass _any_ number of slots to this routine. If the regex
+ /// engine would otherwise write a slot offset that doesn't fit in the
+ /// provided slice, then it is simply skipped. In general though, there are
+ /// usually three slice lengths you might want to use:
+ ///
+ /// * An empty slice, if you only care about which pattern matched.
+ /// * A slice with [`pattern_len() * 2`](Regex::pattern_len) slots, if you
+ /// only care about the overall match spans for each matching pattern.
+ /// * A slice with
+ /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
+ /// permits recording match offsets for every capturing group in every
+ /// pattern.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find the overall match offsets in a
+ /// multi-pattern search without allocating a `Captures` value. Indeed, we
+ /// can put our slots right on the stack.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{meta::Regex, PatternID, Input};
+ ///
+ /// let re = Regex::new_many(&[
+ /// r"\pL+",
+ /// r"\d+",
+ /// ])?;
+ /// let input = Input::new("!@#123");
+ ///
+ /// // We only care about the overall match offsets here, so we just
+ /// // allocate two slots for each pattern. Each slot records the start
+ /// // and end of the match.
+ /// let mut slots = [None; 4];
+ /// let pid = re.search_slots(&input, &mut slots);
+ /// assert_eq!(Some(PatternID::must(1)), pid);
+ ///
+ /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
+ /// // See 'GroupInfo' for more details on the mapping between groups and
+ /// // slot indices.
+ /// let slot_start = pid.unwrap().as_usize() * 2;
+ /// let slot_end = slot_start + 1;
+ /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
+ /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn search_slots(
+ &self,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ if self.imp.info.is_impossible(input) {
+ return None;
+ }
+ let mut guard = self.pool.get();
+ let result = self.imp.strat.search_slots(&mut guard, input, slots);
+ // See 'Regex::search' for why we put the guard back explicitly.
+ PoolGuard::put(guard);
+ result
+ }
+
+ /// Writes the set of patterns that match anywhere in the given search
+ /// configuration to `patset`. If multiple patterns match at the same
+ /// position and this `Regex` was configured with [`MatchKind::All`]
+ /// semantics, then all matching patterns are written to the given set.
+ ///
+    /// Unless all of the patterns in this `Regex` are anchored, this will,
+    /// generally speaking, scan the entire haystack.
+ ///
+ /// This search routine *does not* clear the pattern set. This gives some
+ /// flexibility to the caller (e.g., running multiple searches with the
+ /// same pattern set), but does make the API bug-prone if you're reusing
+ /// the same pattern set for multiple searches but intended them to be
+ /// independent.
+ ///
+ /// If a pattern ID matched but the given `PatternSet` does not have
+ /// sufficient capacity to store it, then it is not inserted and silently
+ /// dropped.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find all matching patterns in a haystack,
+ /// even when some patterns match at the same position as other patterns.
+ /// It is important that we configure the `Regex` with [`MatchKind::All`]
+ /// semantics here, or else overlapping matches will not be reported.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{meta::Regex, Input, MatchKind, PatternSet};
+ ///
+ /// let patterns = &[
+ /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
+ /// ];
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().match_kind(MatchKind::All))
+ /// .build_many(patterns)?;
+ ///
+ /// let input = Input::new("foobar");
+ /// let mut patset = PatternSet::new(re.pattern_len());
+ /// re.which_overlapping_matches(&input, &mut patset);
+ /// let expected = vec![0, 2, 3, 4, 6];
+ /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn which_overlapping_matches(
+ &self,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ if self.imp.info.is_impossible(input) {
+ return;
+ }
+ let mut guard = self.pool.get();
+ let result = self
+ .imp
+ .strat
+ .which_overlapping_matches(&mut guard, input, patset);
+ // See 'Regex::search' for why we put the guard back explicitly.
+ PoolGuard::put(guard);
+ result
+ }
+}
+
+/// Lower level search routines that give more control, and require the caller
+/// to provide an explicit [`Cache`] parameter.
+impl Regex {
+ /// This is like [`Regex::search`], but requires the caller to
+ /// explicitly pass a [`Cache`].
+ ///
+ /// # Why pass a `Cache` explicitly?
+ ///
+ /// Passing a `Cache` explicitly will bypass the use of an internal memory
+ /// pool used by `Regex` to get a `Cache` for a search. The use of this
+ /// pool can be slower in some cases when a `Regex` is used from multiple
+ /// threads simultaneously. Typically, performance only becomes an issue
+ /// when there is heavy contention, which in turn usually only occurs
+ /// when each thread's primary unit of work is a regex search on a small
+ /// haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Input, Match};
+ ///
+ /// let re = Regex::new(r"Samwise|Sam")?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new(
+ /// "one of the chief characters, Samwise the Brave",
+ /// );
+ /// assert_eq!(
+ /// Some(Match::must(0, 29..36)),
+ /// re.search_with(&mut cache, &input),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn search_with(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<Match> {
+ if self.imp.info.is_impossible(input) {
+ return None;
+ }
+ self.imp.strat.search(cache, input)
+ }
+
+ /// This is like [`Regex::search_half`], but requires the caller to
+ /// explicitly pass a [`Cache`].
+ ///
+ /// # Why pass a `Cache` explicitly?
+ ///
+ /// Passing a `Cache` explicitly will bypass the use of an internal memory
+ /// pool used by `Regex` to get a `Cache` for a search. The use of this
+ /// pool can be slower in some cases when a `Regex` is used from multiple
+ /// threads simultaneously. Typically, performance only becomes an issue
+ /// when there is heavy contention, which in turn usually only occurs
+ /// when each thread's primary unit of work is a regex search on a small
+ /// haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Input, HalfMatch};
+ ///
+ /// let re = Regex::new(r"Samwise|Sam")?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new(
+ /// "one of the chief characters, Samwise the Brave",
+ /// );
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 36)),
+ /// re.search_half_with(&mut cache, &input),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn search_half_with(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ if self.imp.info.is_impossible(input) {
+ return None;
+ }
+ self.imp.strat.search_half(cache, input)
+ }
+
+ /// This is like [`Regex::search_captures`], but requires the caller to
+ /// explicitly pass a [`Cache`].
+ ///
+ /// # Why pass a `Cache` explicitly?
+ ///
+ /// Passing a `Cache` explicitly will bypass the use of an internal memory
+ /// pool used by `Regex` to get a `Cache` for a search. The use of this
+ /// pool can be slower in some cases when a `Regex` is used from multiple
+ /// threads simultaneously. Typically, performance only becomes an issue
+ /// when there is heavy contention, which in turn usually only occurs
+ /// when each thread's primary unit of work is a regex search on a small
+ /// haystack.
+ ///
+ /// # Example: specific pattern search
+ ///
+ /// This example shows how to build a multi-pattern `Regex` that permits
+ /// searching for specific patterns.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// meta::Regex,
+ /// Anchored, Match, PatternID, Input,
+ /// };
+ ///
+ /// let re = Regex::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "foo123";
+ ///
+ /// // Since we are using the default leftmost-first match and both
+ /// // patterns match at the same starting position, only the first pattern
+ /// // will be returned in this case when doing a search for any of the
+ /// // patterns.
+ /// let expected = Some(Match::must(0, 0..6));
+ /// re.search_captures_with(&mut cache, &Input::new(haystack), &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// // But if we want to check whether some other pattern matches, then we
+ /// // can provide its pattern ID.
+ /// let expected = Some(Match::must(1, 0..6));
+ /// let input = Input::new(haystack)
+ /// .anchored(Anchored::Pattern(PatternID::must(1)));
+ /// re.search_captures_with(&mut cache, &input, &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specifying the bounds of a search
+ ///
+ /// This example shows how providing the bounds of a search can produce
+ /// different results than simply sub-slicing the haystack.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{meta::Regex, Match, Input};
+ ///
+ /// let re = Regex::new(r"\b[0-9]{3}\b")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "foo123bar";
+ ///
+ /// // Since we sub-slice the haystack, the search doesn't know about
+ /// // the larger context and assumes that `123` is surrounded by word
+ /// // boundaries. And of course, the match position is reported relative
+ /// // to the sub-slice as well, which means we get `0..3` instead of
+ /// // `3..6`.
+ /// let expected = Some(Match::must(0, 0..3));
+ /// let input = Input::new(&haystack[3..6]);
+ /// re.search_captures_with(&mut cache, &input, &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// // But if we provide the bounds of the search within the context of the
+ /// // entire haystack, then the search can take the surrounding context
+ /// // into account. (And if we did find a match, it would be reported
+ /// // as a valid offset into `haystack` instead of its sub-slice.)
+ /// let expected = None;
+ /// let input = Input::new(haystack).range(3..6);
+ /// re.search_captures_with(&mut cache, &input, &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn search_captures_with(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ caps: &mut Captures,
+ ) {
+ caps.set_pattern(None);
+ let pid = self.search_slots_with(cache, input, caps.slots_mut());
+ caps.set_pattern(pid);
+ }
+
+ /// This is like [`Regex::search_slots`], but requires the caller to
+ /// explicitly pass a [`Cache`].
+ ///
+ /// # Why pass a `Cache` explicitly?
+ ///
+ /// Passing a `Cache` explicitly will bypass the use of an internal memory
+ /// pool used by `Regex` to get a `Cache` for a search. The use of this
+ /// pool can be slower in some cases when a `Regex` is used from multiple
+ /// threads simultaneously. Typically, performance only becomes an issue
+ /// when there is heavy contention, which in turn usually only occurs
+ /// when each thread's primary unit of work is a regex search on a small
+ /// haystack.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find the overall match offsets in a
+ /// multi-pattern search without allocating a `Captures` value. Indeed, we
+ /// can put our slots right on the stack.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{meta::Regex, PatternID, Input};
+ ///
+ /// let re = Regex::new_many(&[
+ /// r"\pL+",
+ /// r"\d+",
+ /// ])?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("!@#123");
+ ///
+ /// // We only care about the overall match offsets here, so we just
+ /// // allocate two slots for each pattern. Each slot records the start
+ /// // and end of the match.
+ /// let mut slots = [None; 4];
+ /// let pid = re.search_slots_with(&mut cache, &input, &mut slots);
+ /// assert_eq!(Some(PatternID::must(1)), pid);
+ ///
+ /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
+ /// // See 'GroupInfo' for more details on the mapping between groups and
+ /// // slot indices.
+ /// let slot_start = pid.unwrap().as_usize() * 2;
+ /// let slot_end = slot_start + 1;
+ /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
+ /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn search_slots_with(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ if self.imp.info.is_impossible(input) {
+ return None;
+ }
+ self.imp.strat.search_slots(cache, input, slots)
+ }
+
+ /// This is like [`Regex::which_overlapping_matches`], but requires the
+ /// caller to explicitly pass a [`Cache`].
+    ///
+    /// # Why pass a `Cache` explicitly?
+    ///
+    /// Passing a `Cache` explicitly will bypass the use of an internal memory
+    /// pool used by `Regex` to get a `Cache` for a search. The use of this
+    /// pool can be slower in some cases when a `Regex` is used from multiple
+    /// threads simultaneously. Typically, performance only becomes an issue
+    /// when there is heavy contention, which in turn usually only occurs
+    /// when each thread's primary unit of work is a regex search on a small
+    /// haystack.
+    ///
+ /// # Example
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{meta::Regex, Input, MatchKind, PatternSet};
+ ///
+ /// let patterns = &[
+ /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
+ /// ];
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().match_kind(MatchKind::All))
+ /// .build_many(patterns)?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let input = Input::new("foobar");
+ /// let mut patset = PatternSet::new(re.pattern_len());
+ /// re.which_overlapping_matches_with(&mut cache, &input, &mut patset);
+ /// let expected = vec![0, 2, 3, 4, 6];
+ /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn which_overlapping_matches_with(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ if self.imp.info.is_impossible(input) {
+ return;
+ }
+ self.imp.strat.which_overlapping_matches(cache, input, patset)
+ }
+}
+
+/// Various non-search routines for querying properties of a `Regex` and
+/// convenience routines for creating [`Captures`] and [`Cache`] values.
+impl Regex {
+ /// Creates a new object for recording capture group offsets. This is used
+ /// in search APIs like [`Regex::captures`] and [`Regex::search_captures`].
+ ///
+ /// This is a convenience routine for
+ /// `Captures::all(re.group_info().clone())`. Callers may build other types
+ /// of `Captures` values that record less information (and thus require
+ /// less work from the regex engine) using [`Captures::matches`] and
+ /// [`Captures::empty`].
+ ///
+ /// # Example
+ ///
+ /// This shows some alternatives to [`Regex::create_captures`]:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// meta::Regex,
+ /// util::captures::Captures,
+ /// Match, PatternID, Span,
+ /// };
+ ///
+ /// let re = Regex::new(r"(?<first>[A-Z][a-z]+) (?<last>[A-Z][a-z]+)")?;
+ ///
+ /// // This is equivalent to Regex::create_captures. It stores matching
+ /// // offsets for all groups in the regex.
+ /// let mut all = Captures::all(re.group_info().clone());
+ /// re.captures("Bruce Springsteen", &mut all);
+ /// assert_eq!(Some(Match::must(0, 0..17)), all.get_match());
+ /// assert_eq!(Some(Span::from(0..5)), all.get_group_by_name("first"));
+ /// assert_eq!(Some(Span::from(6..17)), all.get_group_by_name("last"));
+ ///
+ /// // In this version, we only care about the implicit groups, which
+ /// // means offsets for the explicit groups will be unavailable. It can
+ /// // sometimes be faster to ask for fewer groups, since the underlying
+ /// // regex engine needs to do less work to keep track of them.
+ /// let mut matches = Captures::matches(re.group_info().clone());
+ /// re.captures("Bruce Springsteen", &mut matches);
+ /// // We still get the overall match info.
+ /// assert_eq!(Some(Match::must(0, 0..17)), matches.get_match());
+ /// // But now the explicit groups are unavailable.
+ /// assert_eq!(None, matches.get_group_by_name("first"));
+ /// assert_eq!(None, matches.get_group_by_name("last"));
+ ///
+ /// // Finally, in this version, we don't ask to keep track of offsets for
+ /// // *any* groups. All we get back is whether a match occurred, and if
+ /// // so, the ID of the pattern that matched.
+ /// let mut empty = Captures::empty(re.group_info().clone());
+ /// re.captures("Bruce Springsteen", &mut empty);
+ /// // it's a match!
+ /// assert!(empty.is_match());
+ /// // for pattern ID 0
+ /// assert_eq!(Some(PatternID::ZERO), empty.pattern());
+ /// // Match offsets are unavailable.
+ /// assert_eq!(None, empty.get_match());
+ /// // And of course, explicit groups are unavailable too.
+ /// assert_eq!(None, empty.get_group_by_name("first"));
+ /// assert_eq!(None, empty.get_group_by_name("last"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn create_captures(&self) -> Captures {
+ Captures::all(self.group_info().clone())
+ }
+
+ /// Creates a new cache for use with lower level search APIs like
+ /// [`Regex::search_with`].
+ ///
+ /// The cache returned should only be used for searches for this `Regex`.
+ /// If you want to reuse the cache for another `Regex`, then you must call
+ /// [`Cache::reset`] with that `Regex`.
+ ///
+ /// This is a convenience routine for [`Cache::new`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Input, Match};
+ ///
+ /// let re = Regex::new(r"(?-u)m\w+\s+m\w+")?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("crazy janey and her mission man");
+ /// assert_eq!(
+ /// Some(Match::must(0, 20..31)),
+ /// re.search_with(&mut cache, &input),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
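+    ///
+    /// # Example: reusing a cache with a different `Regex`
+    ///
+    /// A minimal sketch of cache reuse via [`Cache::reset`]:
+    ///
+    /// ```
+    /// use regex_automata::{meta::Regex, Input, Match};
+    ///
+    /// let re1 = Regex::new(r"\w")?;
+    /// let re2 = Regex::new(r"\W")?;
+    /// let mut cache = re1.create_cache();
+    /// assert_eq!(
+    ///     Some(Match::must(0, 0..1)),
+    ///     re1.search_with(&mut cache, &Input::new("a")),
+    /// );
+    /// // Reusing 'cache' with a different Regex requires a reset first.
+    /// cache.reset(&re2);
+    /// assert_eq!(
+    ///     Some(Match::must(0, 0..1)),
+    ///     re2.search_with(&mut cache, &Input::new("!")),
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```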
+ pub fn create_cache(&self) -> Cache {
+ self.imp.strat.create_cache()
+ }
+
+ /// Returns the total number of patterns in this regex.
+ ///
+ /// The standard [`Regex::new`] constructor always results in a `Regex`
+ /// with a single pattern, but [`Regex::new_many`] permits building a
+ /// multi-pattern regex.
+ ///
+ /// A `Regex` guarantees that the maximum possible `PatternID` returned in
+ /// any match is `Regex::pattern_len() - 1`. In the case where the number
+ /// of patterns is `0`, a match is impossible.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new(r"(?m)^[a-z]$")?;
+ /// assert_eq!(1, re.pattern_len());
+ ///
+ /// let re = Regex::new_many::<&str>(&[])?;
+ /// assert_eq!(0, re.pattern_len());
+ ///
+ /// let re = Regex::new_many(&["a", "b", "c"])?;
+ /// assert_eq!(3, re.pattern_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn pattern_len(&self) -> usize {
+ self.imp.info.pattern_len()
+ }
+
+ /// Returns the total number of capturing groups.
+ ///
+ /// This includes the implicit capturing group corresponding to the
+ /// entire match. Therefore, the minimum value returned is `1`.
+ ///
+ /// # Example
+ ///
+ /// This shows a few patterns and how many capture groups they have.
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let len = |pattern| {
+ /// Regex::new(pattern).map(|re| re.captures_len())
+ /// };
+ ///
+ /// assert_eq!(1, len("a")?);
+ /// assert_eq!(2, len("(a)")?);
+ /// assert_eq!(3, len("(a)|(b)")?);
+ /// assert_eq!(5, len("(a)(b)|(c)(d)")?);
+ /// assert_eq!(2, len("(a)|b")?);
+ /// assert_eq!(2, len("a|(b)")?);
+ /// assert_eq!(2, len("(b)*")?);
+ /// assert_eq!(2, len("(b)+")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: multiple patterns
+ ///
+ /// This routine also works for multiple patterns. The total number is
+ /// the sum of the capture groups of each pattern.
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let len = |patterns| {
+ /// Regex::new_many(patterns).map(|re| re.captures_len())
+ /// };
+ ///
+ /// assert_eq!(2, len(&["a", "b"])?);
+ /// assert_eq!(4, len(&["(a)", "(b)"])?);
+ /// assert_eq!(6, len(&["(a)|(b)", "(c)|(d)"])?);
+ /// assert_eq!(8, len(&["(a)(b)|(c)(d)", "(x)(y)"])?);
+ /// assert_eq!(3, len(&["(a)", "b"])?);
+ /// assert_eq!(3, len(&["a", "(b)"])?);
+ /// assert_eq!(4, len(&["(a)", "(b)*"])?);
+ /// assert_eq!(4, len(&["(a)+", "(b)+"])?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn captures_len(&self) -> usize {
+ self.imp
+ .info
+ .props_union()
+ .explicit_captures_len()
+ .saturating_add(self.pattern_len())
+ }
+
+ /// Returns the total number of capturing groups that appear in every
+ /// possible match.
+ ///
+ /// If the number of capture groups can vary depending on the match, then
+ /// this returns `None`. That is, a value is only returned when the number
+ /// of matching groups is invariant or "static."
+ ///
+ /// Note that like [`Regex::captures_len`], this **does** include the
+ /// implicit capturing group corresponding to the entire match. Therefore,
+ /// when a non-None value is returned, it is guaranteed to be at least `1`.
+ /// Stated differently, a return value of `Some(0)` is impossible.
+ ///
+ /// # Example
+ ///
+ /// This shows a few cases where a static number of capture groups is
+ /// available and a few cases where it is not.
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let len = |pattern| {
+ /// Regex::new(pattern).map(|re| re.static_captures_len())
+ /// };
+ ///
+ /// assert_eq!(Some(1), len("a")?);
+ /// assert_eq!(Some(2), len("(a)")?);
+ /// assert_eq!(Some(2), len("(a)|(b)")?);
+ /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
+ /// assert_eq!(None, len("(a)|b")?);
+ /// assert_eq!(None, len("a|(b)")?);
+ /// assert_eq!(None, len("(b)*")?);
+ /// assert_eq!(Some(2), len("(b)+")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: multiple patterns
+ ///
+ /// This property extends to regexes with multiple patterns as well. In
+    /// order for there to be a static number of capture groups in this case,
+ /// every pattern must have the same static number.
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let len = |patterns| {
+ /// Regex::new_many(patterns).map(|re| re.static_captures_len())
+ /// };
+ ///
+ /// assert_eq!(Some(1), len(&["a", "b"])?);
+ /// assert_eq!(Some(2), len(&["(a)", "(b)"])?);
+ /// assert_eq!(Some(2), len(&["(a)|(b)", "(c)|(d)"])?);
+ /// assert_eq!(Some(3), len(&["(a)(b)|(c)(d)", "(x)(y)"])?);
+ /// assert_eq!(None, len(&["(a)", "b"])?);
+ /// assert_eq!(None, len(&["a", "(b)"])?);
+ /// assert_eq!(None, len(&["(a)", "(b)*"])?);
+ /// assert_eq!(Some(2), len(&["(a)+", "(b)+"])?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn static_captures_len(&self) -> Option<usize> {
+ self.imp
+ .info
+ .props_union()
+ .static_explicit_captures_len()
+ .map(|len| len.saturating_add(1))
+ }
+
+ /// Return information about the capture groups in this `Regex`.
+ ///
+ /// A `GroupInfo` is an immutable object that can be cheaply cloned. It
+ /// is responsible for maintaining a mapping between the capture groups
+ /// in the concrete syntax of zero or more regex patterns and their
+ /// internal representation used by some of the regex matchers. It is also
+ /// responsible for maintaining a mapping between the name of each group
+ /// (if one exists) and its corresponding group index.
+ ///
+ /// A `GroupInfo` is ultimately what is used to build a [`Captures`] value,
+ /// which is some mutable space where group offsets are stored as a result
+ /// of a search.
+ ///
+ /// # Example
+ ///
+ /// This shows some alternatives to [`Regex::create_captures`]:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// meta::Regex,
+ /// util::captures::Captures,
+ /// Match, PatternID, Span,
+ /// };
+ ///
+ /// let re = Regex::new(r"(?<first>[A-Z][a-z]+) (?<last>[A-Z][a-z]+)")?;
+ ///
+ /// // This is equivalent to Regex::create_captures. It stores matching
+ /// // offsets for all groups in the regex.
+ /// let mut all = Captures::all(re.group_info().clone());
+ /// re.captures("Bruce Springsteen", &mut all);
+ /// assert_eq!(Some(Match::must(0, 0..17)), all.get_match());
+ /// assert_eq!(Some(Span::from(0..5)), all.get_group_by_name("first"));
+ /// assert_eq!(Some(Span::from(6..17)), all.get_group_by_name("last"));
+ ///
+ /// // In this version, we only care about the implicit groups, which
+ /// // means offsets for the explicit groups will be unavailable. It can
+ /// // sometimes be faster to ask for fewer groups, since the underlying
+ /// // regex engine needs to do less work to keep track of them.
+ /// let mut matches = Captures::matches(re.group_info().clone());
+ /// re.captures("Bruce Springsteen", &mut matches);
+ /// // We still get the overall match info.
+ /// assert_eq!(Some(Match::must(0, 0..17)), matches.get_match());
+ /// // But now the explicit groups are unavailable.
+ /// assert_eq!(None, matches.get_group_by_name("first"));
+ /// assert_eq!(None, matches.get_group_by_name("last"));
+ ///
+ /// // Finally, in this version, we don't ask to keep track of offsets for
+ /// // *any* groups. All we get back is whether a match occurred, and if
+ /// // so, the ID of the pattern that matched.
+ /// let mut empty = Captures::empty(re.group_info().clone());
+ /// re.captures("Bruce Springsteen", &mut empty);
+ /// // it's a match!
+ /// assert!(empty.is_match());
+ /// // for pattern ID 0
+ /// assert_eq!(Some(PatternID::ZERO), empty.pattern());
+ /// // Match offsets are unavailable.
+ /// assert_eq!(None, empty.get_match());
+ /// // And of course, explicit groups are unavailable too.
+ /// assert_eq!(None, empty.get_group_by_name("first"));
+ /// assert_eq!(None, empty.get_group_by_name("last"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
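+    ///
+    /// # Example: name-to-index mapping
+    ///
+    /// A minimal sketch of querying the name-to-index mapping mentioned
+    /// above (assuming the [`GroupInfo::to_index`] accessor):
+    ///
+    /// ```
+    /// use regex_automata::{meta::Regex, PatternID};
+    ///
+    /// let re = Regex::new(r"(?<first>[A-Z][a-z]+) (?<last>[A-Z][a-z]+)")?;
+    /// let info = re.group_info();
+    /// // Group 0 is always the implicit group for the overall match, so
+    /// // the explicit groups start at index 1.
+    /// assert_eq!(Some(1), info.to_index(PatternID::ZERO, "first"));
+    /// assert_eq!(Some(2), info.to_index(PatternID::ZERO, "last"));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```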
+ #[inline]
+ pub fn group_info(&self) -> &GroupInfo {
+ self.imp.strat.group_info()
+ }
+
+ /// Returns the configuration object used to build this `Regex`.
+ ///
+ /// If no configuration object was explicitly passed, then the
+ /// configuration returned represents the default.
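+ ///
+ /// # Example
+ ///
+ /// With no explicit configuration, the getters on [`Config`] report the
+ /// documented defaults:
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, MatchKind};
+ ///
+ /// let re = Regex::new(r"foo")?;
+ /// assert_eq!(MatchKind::LeftmostFirst, re.get_config().get_match_kind());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```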
+ #[inline]
+ pub fn get_config(&self) -> &Config {
+ self.imp.info.config()
+ }
+
+ /// Returns true if this regex has a high chance of being "accelerated."
+ ///
+ /// The precise meaning of "accelerated" is specifically left unspecified,
+ /// but the general meaning is that the search has a high likelihood of
+ /// running faster than a character-at-a-time loop inside a standard
+ /// regex engine.
+ ///
+ /// Acceleration is only ever a *probabilistic* claim. That is, just
+ /// because the regex is believed to be accelerated, that doesn't mean it
+ /// will definitely execute searches very fast. Similarly, if a
+ /// regex is *not* accelerated, that is also a probabilistic claim. That
+ /// is, a regex for which `is_accelerated` returns `false` could still run
+ /// searches more quickly than a regex for which `is_accelerated` returns
+ /// `true`.
+ ///
+ /// Whether a regex is marked as accelerated or not is dependent on
+ /// implementation details that may change in a semver compatible release.
+ /// That is, a regex that is accelerated in a `x.y.1` release might not be
+ /// accelerated in a `x.y.2` release.
+ ///
+ /// Basically, the value of acceleration boils down to a hedge: a hodge
+ /// podge of internal heuristics combine to make a probabilistic guess
+ /// that this regex search may run "fast." The value in knowing this from
+ /// a caller's perspective is that it may act as a signal that no further
+ /// work should be done to accelerate a search. For example, a grep-like
+ /// tool might try to do some extra work extracting literals from a regex
+ /// to create its own heuristic acceleration strategies. But it might
+ /// choose to defer to this crate's acceleration strategy if one exists.
+ /// This routine permits querying whether such a strategy is active for a
+ /// particular regex.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// // A simple literal is very likely to be accelerated.
+ /// let re = Regex::new(r"foo")?;
+ /// assert!(re.is_accelerated());
+ ///
+ /// // A regex with no literals is likely to not be accelerated.
+ /// let re = Regex::new(r"\w")?;
+ /// assert!(!re.is_accelerated());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_accelerated(&self) -> bool {
+ self.imp.strat.is_accelerated()
+ }
+
+ /// Return the total approximate heap memory, in bytes, used by this `Regex`.
+ ///
+ /// Note that currently, there is no high level configuration for setting
+ /// a limit on the specific value returned by this routine. Instead, the
+ /// following routines can be used to control heap memory at a bit of a
+ /// lower level:
+ ///
+ /// * [`Config::nfa_size_limit`] controls how big _any_ of the NFAs are
+ /// allowed to be.
+ /// * [`Config::onepass_size_limit`] controls how big the one-pass DFA is
+ /// allowed to be.
+ /// * [`Config::hybrid_cache_capacity`] controls how much memory the lazy
+ /// DFA is permitted to allocate to store its transition table.
+ /// * [`Config::dfa_size_limit`] controls how big a fully compiled DFA is
+ /// allowed to be.
+ /// * [`Config::dfa_state_limit`] controls the conditions under which the
+ /// meta regex engine will even attempt to build a fully compiled DFA.
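+ ///
+ /// # Example
+ ///
+ /// A brief illustration. The exact value returned is unspecified and may
+ /// change across releases, so this only asserts that a compiled regex
+ /// uses some heap memory:
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new(r"\w+")?;
+ /// assert!(re.memory_usage() > 0);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```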
+ #[inline]
+ pub fn memory_usage(&self) -> usize {
+ self.imp.strat.memory_usage()
+ }
+}
+
+impl Clone for Regex {
+ fn clone(&self) -> Regex {
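+ // The cache pool cannot simply be cloned (its factory is a boxed
+ // closure), so each clone shares the read-only `imp` via `Arc` and
+ // builds a fresh pool that creates caches from the shared strategy.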
+ let imp = Arc::clone(&self.imp);
+ let pool = {
+ let strat = Arc::clone(&imp.strat);
+ let create: CachePoolFn = Box::new(move || strat.create_cache());
+ Pool::new(create)
+ };
+ Regex { imp, pool }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct RegexInfo(Arc<RegexInfoI>);
+
+#[derive(Clone, Debug)]
+struct RegexInfoI {
+ config: Config,
+ props: Vec<hir::Properties>,
+ props_union: hir::Properties,
+}
+
+impl RegexInfo {
+ fn new(config: Config, hirs: &[&Hir]) -> RegexInfo {
+ // Collect all of the properties from each of the HIRs, and also
+ // union them into one big set of properties representing all HIRs
+ // as if they were in one big alternation.
+ let mut props = vec![];
+ for hir in hirs.iter() {
+ props.push(hir.properties().clone());
+ }
+ let props_union = hir::Properties::union(&props);
+
+ RegexInfo(Arc::new(RegexInfoI { config, props, props_union }))
+ }
+
+ pub(crate) fn config(&self) -> &Config {
+ &self.0.config
+ }
+
+ pub(crate) fn props(&self) -> &[hir::Properties] {
+ &self.0.props
+ }
+
+ pub(crate) fn props_union(&self) -> &hir::Properties {
+ &self.0.props_union
+ }
+
+ pub(crate) fn pattern_len(&self) -> usize {
+ self.props().len()
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.props().iter().map(|p| p.memory_usage()).sum::<usize>()
+ + self.props_union().memory_usage()
+ }
+
+ /// Returns true when the search is guaranteed to be anchored. That is,
+ /// when a match is reported, its offset is guaranteed to correspond to
+ /// the start of the search.
+ ///
+ /// This includes returning true when `input` _isn't_ anchored but the
+ /// underlying regex is.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn is_anchored_start(&self, input: &Input<'_>) -> bool {
+ input.get_anchored().is_anchored() || self.is_always_anchored_start()
+ }
+
+ /// Returns true when this regex is always anchored to the start of a
+ /// search. In particular, regardless of an `Input` configuration,
+ /// if any match is reported it must start at `0`.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn is_always_anchored_start(&self) -> bool {
+ use regex_syntax::hir::Look;
+ self.props_union().look_set_prefix().contains(Look::Start)
+ }
+
+ /// Returns true when this regex is always anchored to the end of a
+ /// search. In particular, regardless of an `Input` configuration,
+ /// if any match is reported it must end at the end of the haystack.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn is_always_anchored_end(&self) -> bool {
+ use regex_syntax::hir::Look;
+ self.props_union().look_set_suffix().contains(Look::End)
+ }
+
+ /// Returns true if and only if it is known that a match is impossible
+ /// for the given input. This is useful for short-circuiting and avoiding
+ /// running the regex engine if it's known no match can be reported.
+ ///
+ /// Note that this doesn't necessarily detect every possible case. For
+ /// example, when `pattern_len() == 0`, a match is impossible, but that
+ /// case is so rare that it's fine to be handled by the regex engine
+ /// itself. That is, it's not worth the cost of adding it here in order to
+ /// make it a little faster. The reason is that this is called for every
+ /// search, so there is some cost to adding checks here. Arguably, some of
+ /// the checks that are here already probably shouldn't be here...
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_impossible(&self, input: &Input<'_>) -> bool {
+ // The underlying regex is anchored, so if we don't start the search
+ // at position 0, a match is impossible, because the anchor can only
+ // match at position 0.
+ if input.start() > 0 && self.is_always_anchored_start() {
+ return true;
+ }
+ // Same idea, but for the end anchor.
+ if input.end() < input.haystack().len()
+ && self.is_always_anchored_end()
+ {
+ return true;
+ }
+ // If the haystack is smaller than the minimum length required, then
+ // we know there can be no match.
+ let minlen = match self.props_union().minimum_len() {
+ None => return false,
+ Some(minlen) => minlen,
+ };
+ if input.get_span().len() < minlen {
+ return true;
+ }
+ // Same idea as minimum, but for maximum. This is trickier. We can
+ // only apply the maximum when we know the entire span that we're
+ // searching *has* to match according to the regex (and possibly the
+ // input configuration). If we know there is too much for the regex
+ // to match, we can bail early.
+ //
+ // I don't think we can apply the maximum otherwise unfortunately.
+ if self.is_anchored_start(input) && self.is_always_anchored_end() {
+ let maxlen = match self.props_union().maximum_len() {
+ None => return false,
+ Some(maxlen) => maxlen,
+ };
+ if input.get_span().len() > maxlen {
+ return true;
+ }
+ }
+ false
+ }
+}
+
+/// An iterator over all non-overlapping matches.
+///
+/// The iterator yields a [`Match`] value until no more matches could be found.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'r` represents the lifetime of the `Regex` that produced this iterator.
+/// * `'h` represents the lifetime of the haystack being searched.
+///
+/// This iterator can be created with the [`Regex::find_iter`] method.
+#[derive(Debug)]
+pub struct FindMatches<'r, 'h> {
+ re: &'r Regex,
+ cache: CachePoolGuard<'r>,
+ it: iter::Searcher<'h>,
+}
+
+impl<'r, 'h> FindMatches<'r, 'h> {
+ /// Returns the `Regex` value that created this iterator.
+ #[inline]
+ pub fn regex(&self) -> &'r Regex {
+ self.re
+ }
+
+ /// Returns the current `Input` associated with this iterator.
+ ///
+ /// The `start` position on the given `Input` may change during iteration,
+ /// but all other values are guaranteed to remain invariant.
+ #[inline]
+ pub fn input<'s>(&'s self) -> &'s Input<'h> {
+ self.it.input()
+ }
+}
+
+impl<'r, 'h> Iterator for FindMatches<'r, 'h> {
+ type Item = Match;
+
+ #[inline]
+ fn next(&mut self) -> Option<Match> {
+ let FindMatches { re, ref mut cache, ref mut it } = *self;
+ it.advance(|input| Ok(re.search_with(cache, input)))
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ // If all we care about is a count of matches, then we only need to
+ // find the end position of each match. This can give us a 2x perf
+ // boost in some cases, because it avoids needing to do a reverse scan
+ // to find the start of a match.
+ let FindMatches { re, mut cache, it } = self;
+ // This does the deref for PoolGuard once instead of every iter.
+ let cache = &mut *cache;
+ it.into_half_matches_iter(
+ |input| Ok(re.search_half_with(cache, input)),
+ )
+ .count()
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for FindMatches<'r, 'h> {}
+
+/// An iterator over all non-overlapping leftmost matches with their capturing
+/// groups.
+///
+/// The iterator yields a [`Captures`] value until no more matches could be
+/// found.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'r` represents the lifetime of the `Regex` that produced this iterator.
+/// * `'h` represents the lifetime of the haystack being searched.
+///
+/// This iterator can be created with the [`Regex::captures_iter`] method.
+#[derive(Debug)]
+pub struct CapturesMatches<'r, 'h> {
+ re: &'r Regex,
+ cache: CachePoolGuard<'r>,
+ caps: Captures,
+ it: iter::Searcher<'h>,
+}
+
+impl<'r, 'h> CapturesMatches<'r, 'h> {
+ /// Returns the `Regex` value that created this iterator.
+ #[inline]
+ pub fn regex(&self) -> &'r Regex {
+ self.re
+ }
+
+ /// Returns the current `Input` associated with this iterator.
+ ///
+ /// The `start` position on the given `Input` may change during iteration,
+ /// but all other values are guaranteed to remain invariant.
+ #[inline]
+ pub fn input<'s>(&'s self) -> &'s Input<'h> {
+ self.it.input()
+ }
+}
+
+impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> {
+ type Item = Captures;
+
+ #[inline]
+ fn next(&mut self) -> Option<Captures> {
+ // Splitting 'self' apart seems necessary to appease borrowck.
+ let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
+ *self;
+ let _ = it.advance(|input| {
+ re.search_captures_with(cache, input, caps);
+ Ok(caps.get_match())
+ });
+ if caps.is_match() {
+ Some(caps.clone())
+ } else {
+ None
+ }
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ let CapturesMatches { re, mut cache, it, .. } = self;
+ // This does the deref for PoolGuard once instead of every iter.
+ let cache = &mut *cache;
+ it.into_half_matches_iter(
+ |input| Ok(re.search_half_with(cache, input)),
+ )
+ .count()
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for CapturesMatches<'r, 'h> {}
+
+/// Yields all substrings delimited by a regular expression match.
+///
+/// The spans correspond to the offsets between matches.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'r` represents the lifetime of the `Regex` that produced this iterator.
+/// * `'h` represents the lifetime of the haystack being searched.
+///
+/// This iterator can be created with the [`Regex::split`] method.
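+ ///
+ /// # Example
+ ///
+ /// A short sketch of splitting on a comma. The spans returned cover the
+ /// text between matches:
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Span};
+ ///
+ /// let re = Regex::new(r",")?;
+ /// let spans: Vec<Span> = re.split("a,b,c").collect();
+ /// assert_eq!(spans, vec![
+ /// Span::from(0..1),
+ /// Span::from(2..3),
+ /// Span::from(4..5),
+ /// ]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```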
+#[derive(Debug)]
+pub struct Split<'r, 'h> {
+ finder: FindMatches<'r, 'h>,
+ last: usize,
+}
+
+impl<'r, 'h> Split<'r, 'h> {
+ /// Returns the current `Input` associated with this iterator.
+ ///
+ /// The `start` position on the given `Input` may change during iteration,
+ /// but all other values are guaranteed to remain invariant.
+ #[inline]
+ pub fn input<'s>(&'s self) -> &'s Input<'h> {
+ self.finder.input()
+ }
+}
+
+impl<'r, 'h> Iterator for Split<'r, 'h> {
+ type Item = Span;
+
+ fn next(&mut self) -> Option<Span> {
+ match self.finder.next() {
+ None => {
+ let len = self.finder.it.input().haystack().len();
+ if self.last > len {
+ None
+ } else {
+ let span = Span::from(self.last..len);
+ self.last = len + 1; // Next call will return None
+ Some(span)
+ }
+ }
+ Some(m) => {
+ let span = Span::from(self.last..m.start());
+ self.last = m.end();
+ Some(span)
+ }
+ }
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
+
+/// Yields at most `N` spans delimited by a regular expression match.
+///
+/// The spans correspond to the offsets between matches. The last span will be
+/// whatever remains after splitting.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'r` represents the lifetime of the `Regex` that produced this iterator.
+/// * `'h` represents the lifetime of the haystack being searched.
+///
+/// This iterator can be created with the [`Regex::splitn`] method.
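+ ///
+ /// # Example
+ ///
+ /// A short sketch with a limit of 2. Once the limit is reached, the final
+ /// span covers the remainder of the haystack:
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Span};
+ ///
+ /// let re = Regex::new(r",")?;
+ /// let spans: Vec<Span> = re.splitn("a,b,c", 2).collect();
+ /// assert_eq!(spans, vec![
+ /// Span::from(0..1),
+ /// Span::from(2..5),
+ /// ]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```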
+#[derive(Debug)]
+pub struct SplitN<'r, 'h> {
+ splits: Split<'r, 'h>,
+ limit: usize,
+}
+
+impl<'r, 'h> SplitN<'r, 'h> {
+ /// Returns the current `Input` associated with this iterator.
+ ///
+ /// The `start` position on the given `Input` may change during iteration,
+ /// but all other values are guaranteed to remain invariant.
+ #[inline]
+ pub fn input<'s>(&'s self) -> &'s Input<'h> {
+ self.splits.input()
+ }
+}
+
+impl<'r, 'h> Iterator for SplitN<'r, 'h> {
+ type Item = Span;
+
+ fn next(&mut self) -> Option<Span> {
+ if self.limit == 0 {
+ return None;
+ }
+
+ self.limit -= 1;
+ if self.limit > 0 {
+ return self.splits.next();
+ }
+
+ let len = self.splits.finder.it.input().haystack().len();
+ if self.splits.last > len {
+ // We've already returned all substrings.
+ None
+ } else {
+ // self.limit == 0, so future calls will return None immediately
+ Some(Span::from(self.splits.last..len))
+ }
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ (0, Some(self.limit))
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
+
+/// Represents mutable scratch space used by regex engines during a search.
+///
+/// Most of the regex engines in this crate require some kind of
+/// mutable state in order to execute a search. This mutable state is
+ /// explicitly separated from the core regex object (such as a
+/// [`thompson::NFA`](crate::nfa::thompson::NFA)) so that the read-only regex
+/// object can be shared across multiple threads simultaneously without any
+/// synchronization. Conversely, a `Cache` must either be duplicated if using
+/// the same `Regex` from multiple threads, or else there must be some kind of
+/// synchronization that guarantees exclusive access while it's in use by one
+/// thread.
+///
+ /// A `Regex` attempts to do this synchronization for you by using a
+ /// thread safe pool of caches internally. The pool's size scales roughly
+ /// with the number of simultaneous regex searches.
+///
+ /// For cases where one does not want to rely on a `Regex`'s internal cache
+ /// pool, lower level routines such as [`Regex::search_with`] are provided
+/// that permit callers to pass a `Cache` into the search routine explicitly.
+///
+ /// General advice is that the cache pool is often more than good enough.
+/// However, it may be possible to observe the effects of its latency,
+/// especially when searching many small haystacks from many threads
+/// simultaneously.
+///
+/// Caches can be created from their corresponding `Regex` via
+/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`
+/// that created it, or the `Regex` that was most recently used to reset it
+/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in
+/// panics or incorrect results.
+///
+/// # Example
+///
+/// ```
+/// use regex_automata::{meta::Regex, Input, Match};
+///
+/// let re = Regex::new(r"(?-u)m\w+\s+m\w+")?;
+/// let mut cache = re.create_cache();
+/// let input = Input::new("crazy janey and her mission man");
+/// assert_eq!(
+/// Some(Match::must(0, 20..31)),
+/// re.search_with(&mut cache, &input),
+/// );
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Debug, Clone)]
+pub struct Cache {
+ pub(crate) capmatches: Captures,
+ pub(crate) pikevm: wrappers::PikeVMCache,
+ pub(crate) backtrack: wrappers::BoundedBacktrackerCache,
+ pub(crate) onepass: wrappers::OnePassCache,
+ pub(crate) hybrid: wrappers::HybridCache,
+ pub(crate) revhybrid: wrappers::ReverseHybridCache,
+}
+
+impl Cache {
+ /// Creates a new `Cache` for use with this regex.
+ ///
+ /// The cache returned should only be used for searches for the given
+ /// `Regex`. If you want to reuse the cache for another `Regex`, then you
+ /// must call [`Cache::reset`] with that `Regex`.
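+ ///
+ /// # Example
+ ///
+ /// This is equivalent to [`Regex::create_cache`]:
+ ///
+ /// ```
+ /// use regex_automata::{meta::{Cache, Regex}, Input, Match};
+ ///
+ /// let re = Regex::new(r"\w")?;
+ /// let mut cache = Cache::new(&re);
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..1)),
+ /// re.search_with(&mut cache, &Input::new("a")),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```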
+ pub fn new(re: &Regex) -> Cache {
+ re.create_cache()
+ }
+
+ /// Reset this cache such that it can be used for searching with the given
+ /// `Regex` (and only that `Regex`).
+ ///
+ /// A cache reset permits potentially reusing memory already allocated in
+ /// this cache with a different `Regex`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different `Regex`.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{meta::Regex, Match, Input};
+ ///
+ /// let re1 = Regex::new(r"\w")?;
+ /// let re2 = Regex::new(r"\W")?;
+ ///
+ /// let mut cache = re1.create_cache();
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..2)),
+ /// re1.search_with(&mut cache, &Input::new("Δ")),
+ /// );
+ ///
+ /// // Using 'cache' with re2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the Regex we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 're1' is also not
+ /// // allowed.
+ /// cache.reset(&re2);
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..3)),
+ /// re2.search_with(&mut cache, &Input::new("☃")),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset(&mut self, re: &Regex) {
+ re.imp.strat.reset_cache(self)
+ }
+
+ /// Returns the heap memory usage, in bytes, of this cache.
+ ///
+ /// This does **not** include the stack size used up by this cache. To
+ /// compute that, use `std::mem::size_of::<Cache>()`.
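+ ///
+ /// # Example
+ ///
+ /// The exact value is unspecified and depends on which internal regex
+ /// engines are in use, so this just queries it:
+ ///
+ /// ```
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let re = Regex::new(r"\w+")?;
+ /// let cache = re.create_cache();
+ /// let _bytes = cache.memory_usage();
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```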
+ pub fn memory_usage(&self) -> usize {
+ let mut bytes = 0;
+ bytes += self.pikevm.memory_usage();
+ bytes += self.backtrack.memory_usage();
+ bytes += self.onepass.memory_usage();
+ bytes += self.hybrid.memory_usage();
+ bytes += self.revhybrid.memory_usage();
+ bytes
+ }
+}
+
+/// An object describing the configuration of a `Regex`.
+///
+/// This configuration only includes options for the
+/// non-syntax behavior of a `Regex`, and can be applied via the
+/// [`Builder::configure`] method. For configuring the syntax options, see
+/// [`util::syntax::Config`](crate::util::syntax::Config).
+///
+/// # Example: lower the NFA size limit
+///
+/// In some cases, the default size limit might be too big. The size limit can
+/// be lowered, which will prevent large regex patterns from compiling.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::meta::Regex;
+///
+/// let result = Regex::builder()
+/// .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10))))
+/// // Not even 20KB is enough to build a single large Unicode class!
+/// .build(r"\pL");
+/// assert!(result.is_err());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug, Default)]
+pub struct Config {
+ // As with other configuration types in this crate, we put all our knobs
+ // in options so that we can distinguish between "default" and "not set."
+ // This makes it possible to easily combine multiple configurations
+ // without default values overwriting explicitly specified values. See the
+ // 'overwrite' method.
+ //
+ // For docs on the fields below, see the corresponding method setters.
+ match_kind: Option<MatchKind>,
+ utf8_empty: Option<bool>,
+ autopre: Option<bool>,
+ pre: Option<Option<Prefilter>>,
+ which_captures: Option<WhichCaptures>,
+ nfa_size_limit: Option<Option<usize>>,
+ onepass_size_limit: Option<Option<usize>>,
+ hybrid_cache_capacity: Option<usize>,
+ hybrid: Option<bool>,
+ dfa: Option<bool>,
+ dfa_size_limit: Option<Option<usize>>,
+ dfa_state_limit: Option<Option<usize>>,
+ onepass: Option<bool>,
+ backtrack: Option<bool>,
+ byte_classes: Option<bool>,
+ line_terminator: Option<u8>,
+}
+
+impl Config {
+ /// Create a new configuration object for a `Regex`.
+ pub fn new() -> Config {
+ Config::default()
+ }
+
+ /// Set the match semantics for a `Regex`.
+ ///
+ /// The default value is [`MatchKind::LeftmostFirst`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Match, MatchKind};
+ ///
+ /// // By default, leftmost-first semantics are used, which
+ /// // disambiguates matches at the same position by selecting
+ /// // the one that corresponds earlier in the pattern.
+ /// let re = Regex::new("sam|samwise")?;
+ /// assert_eq!(Some(Match::must(0, 0..3)), re.find("samwise"));
+ ///
+ /// // But with 'all' semantics, match priority is ignored
+ /// // and all match states are included. When coupled with
+ /// // a leftmost search, the search will report the last
+ /// // possible match.
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().match_kind(MatchKind::All))
+ /// .build("sam|samwise")?;
+ /// assert_eq!(Some(Match::must(0, 0..7)), re.find("samwise"));
+ /// // Beware that this can lead to skipping matches!
+ /// // Usually 'all' is used for anchored reverse searches
+ /// // only, or for overlapping searches.
+ /// assert_eq!(Some(Match::must(0, 4..11)), re.find("sam samwise"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn match_kind(self, kind: MatchKind) -> Config {
+ Config { match_kind: Some(kind), ..self }
+ }
+
+ /// Toggles whether empty matches are permitted to occur between the code
+ /// units of a UTF-8 encoded codepoint.
+ ///
+ /// This should generally be enabled when searching a `&str` or anything
+ /// you otherwise know is valid UTF-8. It should be disabled in all other
+ /// cases. Namely, if the haystack is not valid UTF-8 and this is enabled,
+ /// then behavior is unspecified.
+ ///
+ /// By default, this is enabled.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Match};
+ ///
+ /// let re = Regex::new("")?;
+ /// let got: Vec<Match> = re.find_iter("☃").collect();
+ /// // Matches only occur at the beginning and end of the snowman.
+ /// assert_eq!(got, vec![
+ /// Match::must(0, 0..0),
+ /// Match::must(0, 3..3),
+ /// ]);
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().utf8_empty(false))
+ /// .build("")?;
+ /// let got: Vec<Match> = re.find_iter("☃").collect();
+ /// // Matches now occur at every position!
+ /// assert_eq!(got, vec![
+ /// Match::must(0, 0..0),
+ /// Match::must(0, 1..1),
+ /// Match::must(0, 2..2),
+ /// Match::must(0, 3..3),
+ /// ]);
+ ///
+ /// Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn utf8_empty(self, yes: bool) -> Config {
+ Config { utf8_empty: Some(yes), ..self }
+ }
+
+ /// Toggles whether automatic prefilter support is enabled.
+ ///
+ /// If this is disabled and [`Config::prefilter`] is not set, then the
+ /// meta regex engine will not use any prefilters. This can sometimes
+ /// be beneficial in cases where you know (or have measured) that the
+ /// prefilter leads to overall worse search performance.
+ ///
+ /// By default, this is enabled.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{meta::Regex, Match};
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().auto_prefilter(false))
+ /// .build(r"Bruce \w+")?;
+ /// let hay = "Hello Bruce Springsteen!";
+ /// assert_eq!(Some(Match::must(0, 6..23)), re.find(hay));
+ ///
+ /// Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn auto_prefilter(self, yes: bool) -> Config {
+ Config { autopre: Some(yes), ..self }
+ }
+
+ /// Overrides and sets the prefilter to use inside a `Regex`.
+ ///
+ /// This permits one to forcefully set a prefilter in cases where the
+ /// caller knows better than whatever the automatic prefilter logic is
+ /// capable of.
+ ///
+ /// By default, this is set to `None` and an automatic prefilter will be
+ /// used if one could be built. (Assuming [`Config::auto_prefilter`] is
+ /// enabled, which it is by default.)
+ ///
+ /// # Example
+ ///
+ /// This example shows how to set your own prefilter. In the case of a
+ /// pattern like `Bruce \w+`, the automatic prefilter is likely to be
+ /// constructed in a way that it will look for occurrences of `Bruce `.
+ /// In most cases, this is the best choice. But in some cases, it may be
+ /// the case that running `memchr` on `B` is the best choice. One can
+ /// achieve that behavior by overriding the automatic prefilter logic
+ /// and providing a prefilter that just matches `B`.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// meta::Regex,
+ /// util::prefilter::Prefilter,
+ /// Match, MatchKind,
+ /// };
+ ///
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["B"])
+ /// .expect("a prefilter");
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().prefilter(Some(pre)))
+ /// .build(r"Bruce \w+")?;
+ /// let hay = "Hello Bruce Springsteen!";
+ /// assert_eq!(Some(Match::must(0, 6..23)), re.find(hay));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: incorrect prefilters can lead to incorrect results!
+ ///
+ /// Be warned that setting an incorrect prefilter can lead to missed
+ /// matches. So if you use this option, ensure your prefilter can _never_
+ /// report false negatives. (A false positive is, on the other hand, quite
+ /// okay and generally unavoidable.)
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// meta::Regex,
+ /// util::prefilter::Prefilter,
+ /// Match, MatchKind,
+ /// };
+ ///
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Z"])
+ /// .expect("a prefilter");
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().prefilter(Some(pre)))
+ /// .build(r"Bruce \w+")?;
+ /// let hay = "Hello Bruce Springsteen!";
+ /// // Oops! No match found, but there should be one!
+ /// assert_eq!(None, re.find(hay));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn prefilter(self, pre: Option<Prefilter>) -> Config {
+ Config { pre: Some(pre), ..self }
+ }
+
+ /// Configures what kinds of groups are compiled as "capturing" in the
+ /// underlying regex engine.
+ ///
+ /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
+ /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the
+ /// overhead of capture states for explicit groups.
+ ///
+ /// Note that another approach to avoiding the overhead of capture groups
+ /// is by using non-capturing groups in the regex pattern. That is,
+ /// `(?:a)` instead of `(a)`. This option is useful when you can't control
+ /// the concrete syntax but know that you don't need the underlying capture
+ /// states. For example, using `WhichCaptures::Implicit` will behave as if
+ /// all explicit capturing groups in the pattern were non-capturing.
+ ///
+ /// Setting this to `WhichCaptures::None` is usually not the right thing to
+ /// do. When no capture states are compiled, some regex engines (such as
+ /// the `PikeVM`) won't be able to report match offsets. This will manifest
+ /// as no match being found.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates how the results of capture groups can change
+ /// based on this option. First we show the default (all capture groups in
+ /// the pattern are capturing):
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Match, Span};
+ ///
+ /// let re = Regex::new(r"foo([0-9]+)bar")?;
+ /// let hay = "foo123bar";
+ ///
+ /// let mut caps = re.create_captures();
+ /// re.captures(hay, &mut caps);
+ /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0));
+ /// assert_eq!(Some(Span::from(3..6)), caps.get_group(1));
+ ///
+ /// Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And now we show the behavior when we only include implicit capture
+ /// groups. In this case, we can only find the overall match span, but the
+ /// spans of any other explicit group don't exist because they are treated
+ /// as non-capturing. (In effect, when `WhichCaptures::Implicit` is used,
+ /// there is no real point in using [`Regex::captures`] since it will never
+ /// be able to report more information than [`Regex::find`].)
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// meta::Regex,
+ /// nfa::thompson::WhichCaptures,
+ /// Match,
+ /// Span,
+ /// };
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().which_captures(WhichCaptures::Implicit))
+ /// .build(r"foo([0-9]+)bar")?;
+ /// let hay = "foo123bar";
+ ///
+ /// let mut caps = re.create_captures();
+ /// re.captures(hay, &mut caps);
+ /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0));
+ /// assert_eq!(None, caps.get_group(1));
+ ///
+ /// Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
+ self.which_captures = Some(which_captures);
+ self
+ }
+
+ /// Sets the size limit, in bytes, to enforce on the construction of every
+ /// NFA built by the meta regex engine.
+ ///
+ /// Setting it to `None` disables the limit. This is not recommended if
+ /// you're compiling untrusted patterns.
+ ///
+ /// Note that this limit is applied to _each_ NFA built, and if any of
+ /// them exceeds the limit, then construction will fail. This limit does
+ /// _not_ correspond to the total memory used by all NFAs in the meta regex
+ /// engine.
+ ///
+ /// This defaults to some reasonable number that permits most reasonable
+ /// patterns.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let result = Regex::builder()
+ /// .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10))))
+ /// // Not even 20KB is enough to build a single large Unicode class!
+ /// .build(r"\pL");
+ /// assert!(result.is_err());
+ ///
+ /// // But notice that building such a regex with the exact same limit
+ /// // can succeed depending on other aspects of the configuration. For
+ /// // example, a single *forward* NFA will (at time of writing) fit into
+ /// // the 20KB limit, but a *reverse* NFA of the same pattern will not.
+ /// // So if one configures a meta regex such that a reverse NFA is never
+ /// // needed and thus never built, then the 20KB limit will be enough for
+ /// // a pattern like \pL!
+ /// let result = Regex::builder()
+ /// .configure(Regex::config()
+ /// .nfa_size_limit(Some(20 * (1<<10)))
+ /// // The DFAs are the only thing that (currently) need a reverse
+ /// // NFA. So if both are disabled, the meta regex engine will
+ /// // skip building the reverse NFA. Note that this isn't an API
+ /// // guarantee. A future semver compatible version may introduce
+ /// // new use cases for a reverse NFA.
+ /// .hybrid(false)
+ /// .dfa(false)
+ /// )
+ /// // Not even 20KB is enough to build a single large Unicode class!
+ /// .build(r"\pL");
+ /// assert!(result.is_ok());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn nfa_size_limit(self, limit: Option<usize>) -> Config {
+ Config { nfa_size_limit: Some(limit), ..self }
+ }
+
+ /// Sets the size limit, in bytes, for the one-pass DFA.
+ ///
+ /// Setting it to `None` disables the limit. Disabling the limit is
+ /// strongly discouraged when compiling untrusted patterns. Even if the
+ /// patterns are trusted, it still may not be a good idea, since a one-pass
+ /// DFA can use a lot of memory. With that said, as the size of a regex
+ /// increases, the likelihood of it being one-pass decreases.
+ ///
+ /// This defaults to some reasonable number that permits most reasonable
+ /// one-pass patterns.
+ ///
+ /// # Example
+ ///
+ /// This shows how to set the one-pass DFA size limit. Note that since
+ /// a one-pass DFA is an optional component of the meta regex engine,
+ /// this size limit only impacts what is built internally and will never
+ /// determine whether a `Regex` itself fails to build.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let result = Regex::builder()
+ /// .configure(Regex::config().onepass_size_limit(Some(2 * (1<<20))))
+ /// .build(r"\pL{5}");
+ /// assert!(result.is_ok());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn onepass_size_limit(self, limit: Option<usize>) -> Config {
+ Config { onepass_size_limit: Some(limit), ..self }
+ }
+
+ /// Set the cache capacity, in bytes, for the lazy DFA.
+ ///
+ /// The cache capacity of the lazy DFA determines approximately how much
+ /// heap memory it is allowed to use to store its state transitions. The
+ /// state transitions are computed at search time, and if the cache fills
+ /// up, it is cleared. At this point, any previously generated state
+ /// transitions are lost and are re-generated if they're needed again.
+ ///
+ /// This sort of cache filling and clearing works quite well _so long as
+ /// cache clearing happens infrequently_. If it happens too often, then the
+ /// meta regex engine will stop using the lazy DFA and switch over to a
+ /// different regex engine.
+ ///
+ /// In cases where the cache is cleared too often, it may be possible to
+ /// give the cache more space and reduce (or eliminate) how often it is
+ /// cleared. Similarly, sometimes a regex is so big that the lazy DFA isn't
+ /// used at all if its cache capacity isn't big enough.
+ ///
+ /// The capacity set here is a _limit_ on how much memory is used. The
+ /// actual memory used is only allocated as it's needed.
+ ///
+ /// Determining the right value for this is a little tricky and will likely
+ /// require some profiling. Enabling the `logging` feature and setting the
+ /// log level to `trace` will also tell you how often the cache is being
+ /// cleared.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let result = Regex::builder()
+ /// .configure(Regex::config().hybrid_cache_capacity(20 * (1<<20)))
+ /// .build(r"\pL{5}");
+ /// assert!(result.is_ok());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn hybrid_cache_capacity(self, limit: usize) -> Config {
+ Config { hybrid_cache_capacity: Some(limit), ..self }
+ }
+
+ /// Sets the size limit, in bytes, for heap memory used for a fully
+ /// compiled DFA.
+ ///
+ /// **NOTE:** If you increase this, you'll likely also need to increase
+ /// [`Config::dfa_state_limit`].
+ ///
+ /// In contrast to the lazy DFA, building a full DFA requires computing
+ /// all of its state transitions up front. This can be a very expensive
+ /// process, and runs in worst case `2^n` time and space (where `n` is
+ /// proportional to the size of the regex). However, a full DFA unlocks
+ /// some additional optimization opportunities.
+ ///
+ /// Because full DFAs can be so expensive, the default limits for them are
+ /// incredibly small. Generally speaking, if your regex is moderately big
+ /// or if you're using Unicode features (`\w` is Unicode-aware by default
+ /// for example), then you can expect that the meta regex engine won't even
+ /// attempt to build a DFA for it.
+ ///
+ /// If this and [`Config::dfa_state_limit`] are set to `None`, then the
+ /// meta regex will not use any sort of limits when deciding whether to
+ /// build a DFA. This in turn makes construction of a `Regex` take
+ /// worst case exponential time and space. Even short patterns can result
+ /// in huge space blow ups. So it is strongly recommended to keep some kind
+ /// of limit set!
+ ///
+ /// The default is set to a small number that permits some simple regexes
+ /// to get compiled into DFAs in reasonable time.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let result = Regex::builder()
+ /// // 100MB is much bigger than the default.
+ /// .configure(Regex::config()
+ /// .dfa_size_limit(Some(100 * (1<<20)))
+ /// // We don't care about size too much here, so just
+ /// // remove the NFA state limit altogether.
+ /// .dfa_state_limit(None))
+ /// .build(r"\pL{5}");
+ /// assert!(result.is_ok());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn dfa_size_limit(self, limit: Option<usize>) -> Config {
+ Config { dfa_size_limit: Some(limit), ..self }
+ }
+
+ /// Sets a limit on the total number of NFA states, beyond which
+ /// compilation of a full DFA will not even be attempted.
+ ///
+ /// This limit works in concert with [`Config::dfa_size_limit`]. Namely,
+ /// whereas `Config::dfa_size_limit` is enforced while attempting to construct
+ /// a DFA, this limit is used to avoid the attempt in the first place. This
+ /// is useful to avoid hefty initialization costs associated with building
+ /// a DFA for cases where it is obvious the DFA will ultimately be too big.
+ ///
+ /// By default, this is set to a very small number.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::meta::Regex;
+ ///
+ /// let result = Regex::builder()
+ /// .configure(Regex::config()
+ /// // Sometimes the default state limit rejects DFAs even
+ /// // if they would fit in the size limit. Here, we disable
+ /// // the check on the number of NFA states and just rely on
+ /// // the size limit.
+ /// .dfa_state_limit(None))
+ /// .build(r"(?-u)\w{30}");
+ /// assert!(result.is_ok());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn dfa_state_limit(self, limit: Option<usize>) -> Config {
+ Config { dfa_state_limit: Some(limit), ..self }
+ }
+
+ /// Whether to attempt to shrink the size of the alphabet for the regex
+ /// pattern or not. When enabled, the alphabet is shrunk into a set of
+ /// equivalence classes, where every byte in the same equivalence class
+ /// cannot discriminate between a match and a non-match.
+ ///
+ /// **WARNING:** This is only useful for debugging DFAs. Disabling this
+ /// does not yield any speed advantages. Indeed, disabling it can result
+ /// in much higher memory usage. Disabling byte classes is useful for
+ /// debugging the actual generated transitions because it lets one see the
+ /// transitions defined on actual bytes instead of the equivalence classes.
+ ///
+ /// This option is enabled by default and should never be disabled unless
+ /// one is debugging the meta regex engine's internals.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Match};
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().byte_classes(false))
+ /// .build(r"[a-z]+")?;
+ /// let hay = "!!quux!!";
+ /// assert_eq!(Some(Match::must(0, 2..6)), re.find(hay));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn byte_classes(self, yes: bool) -> Config {
+ Config { byte_classes: Some(yes), ..self }
+ }
+
+ /// Set the line terminator to be used by the `^` and `$` anchors in
+ /// multi-line mode.
+ ///
+ /// This option has no effect when CRLF mode is enabled. That is,
+ /// regardless of this setting, `(?Rm:^)` and `(?Rm:$)` will always treat
+ /// `\r` and `\n` as line terminators (and will never match between a `\r`
+ /// and a `\n`).
+ ///
+ /// By default, `\n` is the line terminator.
+ ///
+ /// **Warning**: This does not change the behavior of `.`. To do that,
+ /// you'll need to configure the syntax option
+ /// [`syntax::Config::line_terminator`](crate::util::syntax::Config::line_terminator)
+ /// in addition to this. Otherwise, `.` will continue to match any
+ /// character other than `\n`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, util::syntax, Match};
+ ///
+ /// let re = Regex::builder()
+ /// .syntax(syntax::Config::new().multi_line(true))
+ /// .configure(Regex::config().line_terminator(b'\x00'))
+ /// .build(r"^foo$")?;
+ /// let hay = "\x00foo\x00";
+ /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn line_terminator(self, byte: u8) -> Config {
+ Config { line_terminator: Some(byte), ..self }
+ }
+
+ /// Toggle whether the hybrid NFA/DFA (also known as the "lazy DFA") should
+ /// be available for use by the meta regex engine.
+ ///
+ /// Enabling this does not necessarily mean that the lazy DFA will
+ /// definitely be used. It just means that it will be _available_ for use
+ /// if the meta regex engine thinks it will be useful.
+ ///
+ /// When the `hybrid` crate feature is enabled, then this is enabled by
+ /// default. Otherwise, if the crate feature is disabled, then this is
+ /// always disabled, regardless of its setting by the caller.
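+ ///
+ /// # Example
+ ///
+ /// A sketch showing that disabling the lazy DFA merely removes it from
+ /// the set of engines the meta regex engine may choose from. Searches
+ /// still work:
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Match};
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().hybrid(false))
+ /// .build(r"\w+")?;
+ /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```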
+ pub fn hybrid(self, yes: bool) -> Config {
+ Config { hybrid: Some(yes), ..self }
+ }
+
+ /// Toggle whether a fully compiled DFA should be available for use by the
+ /// meta regex engine.
+ ///
+ /// Enabling this does not necessarily mean that a DFA will definitely be
+ /// used. It just means that it will be _available_ for use if the meta
+ /// regex engine thinks it will be useful.
+ ///
+ /// When the `dfa-build` crate feature is enabled, then this is enabled by
+ /// default. Otherwise, if the crate feature is disabled, then this is
+ /// always disabled, regardless of its setting by the caller.
+ pub fn dfa(self, yes: bool) -> Config {
+ Config { dfa: Some(yes), ..self }
+ }
+
+ /// Toggle whether a one-pass DFA should be available for use by the meta
+ /// regex engine.
+ ///
+ /// Enabling this does not necessarily mean that a one-pass DFA will
+ /// definitely be used. It just means that it will be _available_ for
+ /// use if the meta regex engine thinks it will be useful. (Indeed, a
+ /// one-pass DFA can only be used when the regex is one-pass. See the
+ /// [`dfa::onepass`](crate::dfa::onepass) module for more details.)
+ ///
+ /// When the `dfa-onepass` crate feature is enabled, then this is enabled
+ /// by default. Otherwise, if the crate feature is disabled, then this is
+ /// always disabled, regardless of its setting by the caller.
+ pub fn onepass(self, yes: bool) -> Config {
+ Config { onepass: Some(yes), ..self }
+ }
+
+ /// Toggle whether a bounded backtracking regex engine should be available
+ /// for use by the meta regex engine.
+ ///
+ /// Enabling this does not necessarily mean that a bounded backtracker will
+ /// definitely be used. It just means that it will be _available_ for use
+ /// if the meta regex engine thinks it will be useful.
+ ///
+ /// When the `nfa-backtrack` crate feature is enabled, then this is enabled
+ /// by default. Otherwise, if the crate feature is disabled, then this is
+ /// always disabled, regardless of its setting by the caller.
+ pub fn backtrack(self, yes: bool) -> Config {
+ Config { backtrack: Some(yes), ..self }
+ }
+
+ /// Returns the match kind on this configuration, as set by
+ /// [`Config::match_kind`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_match_kind(&self) -> MatchKind {
+ self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
+ }
+
+ /// Returns whether empty matches must fall on valid UTF-8 boundaries, as
+ /// set by [`Config::utf8_empty`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_utf8_empty(&self) -> bool {
+ self.utf8_empty.unwrap_or(true)
+ }
+
+ /// Returns whether automatic prefilters are enabled, as set by
+ /// [`Config::auto_prefilter`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_auto_prefilter(&self) -> bool {
+ self.autopre.unwrap_or(true)
+ }
+
+ /// Returns a manually set prefilter, if one was set by
+ /// [`Config::prefilter`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_prefilter(&self) -> Option<&Prefilter> {
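+ // The outer `Option` distinguishes "not set" from "explicitly set to
+ // `None`"; both cases collapse to `None` here.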
+ self.pre.as_ref().unwrap_or(&None).as_ref()
+ }
+
+ /// Returns the capture configuration, as set by
+ /// [`Config::which_captures`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_which_captures(&self) -> WhichCaptures {
+ self.which_captures.unwrap_or(WhichCaptures::All)
+ }
+
+ /// Returns the NFA size limit, as set by [`Config::nfa_size_limit`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_nfa_size_limit(&self) -> Option<usize> {
+ self.nfa_size_limit.unwrap_or(Some(10 * (1 << 20)))
+ }
+
+ /// Returns the one-pass DFA size limit, as set by
+ /// [`Config::onepass_size_limit`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_onepass_size_limit(&self) -> Option<usize> {
+ self.onepass_size_limit.unwrap_or(Some(1 * (1 << 20)))
+ }
+
+ /// Returns the hybrid NFA/DFA cache capacity, as set by
+ /// [`Config::hybrid_cache_capacity`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_hybrid_cache_capacity(&self) -> usize {
+ self.hybrid_cache_capacity.unwrap_or(2 * (1 << 20))
+ }
+
+ /// Returns the DFA size limit, as set by [`Config::dfa_size_limit`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_dfa_size_limit(&self) -> Option<usize> {
+ // The default for this is VERY small because building a full DFA is
+ // ridiculously costly. But for regexes that are very small, it can be
+ // beneficial to use a full DFA. In particular, a full DFA can enable
+ // additional optimizations via something called "accelerated" states.
+ // Namely, when there's a state with only a few outgoing transitions,
+ // we can temporarily suspend walking the transition table and use memchr
+ // for just those outgoing transitions to skip ahead very quickly.
+ //
+ // Generally speaking, if Unicode is enabled in your regex and you're
+ // using some kind of Unicode feature, then it's going to blow this
+ // size limit. Moreover, Unicode tends to defeat the "accelerated"
+ // state optimization too, so it's a double whammy.
+ //
+ // We also use a limit on the number of NFA states to avoid even
+ // starting the DFA construction process. Namely, DFA construction
+ // itself could make lots of initial allocs proportional to the size
+ // of the NFA, and if the NFA is large, it doesn't make sense to pay
+ // that cost if we know it's likely to be blown by a large margin.
+ self.dfa_size_limit.unwrap_or(Some(40 * (1 << 10)))
+ }
+
+ /// Returns the DFA state limit, in terms of the number of NFA states, as
+ /// set by [`Config::dfa_state_limit`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_dfa_state_limit(&self) -> Option<usize> {
+ // Again, as with the size limit, we keep this very small.
+ self.dfa_state_limit.unwrap_or(Some(30))
+ }
+
+ /// Returns whether byte classes are enabled, as set by
+ /// [`Config::byte_classes`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_byte_classes(&self) -> bool {
+ self.byte_classes.unwrap_or(true)
+ }
+
+ /// Returns the line terminator for this configuration, as set by
+ /// [`Config::line_terminator`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_line_terminator(&self) -> u8 {
+ self.line_terminator.unwrap_or(b'\n')
+ }
+
+ /// Returns whether the hybrid NFA/DFA regex engine may be used, as set by
+ /// [`Config::hybrid`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_hybrid(&self) -> bool {
+ #[cfg(feature = "hybrid")]
+ {
+ self.hybrid.unwrap_or(true)
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ false
+ }
+ }
+
+ /// Returns whether the DFA regex engine may be used, as set by
+ /// [`Config::dfa`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_dfa(&self) -> bool {
+ #[cfg(feature = "dfa-build")]
+ {
+ self.dfa.unwrap_or(true)
+ }
+ #[cfg(not(feature = "dfa-build"))]
+ {
+ false
+ }
+ }
+
+ /// Returns whether the one-pass DFA regex engine may be used, as set by
+ /// [`Config::onepass`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_onepass(&self) -> bool {
+ #[cfg(feature = "dfa-onepass")]
+ {
+ self.onepass.unwrap_or(true)
+ }
+ #[cfg(not(feature = "dfa-onepass"))]
+ {
+ false
+ }
+ }
+
+ /// Returns whether the bounded backtracking regex engine may be used, as
+ /// set by [`Config::backtrack`].
+ ///
+ /// If it was not explicitly set, then a default value is returned.
+ pub fn get_backtrack(&self) -> bool {
+ #[cfg(feature = "nfa-backtrack")]
+ {
+ self.backtrack.unwrap_or(true)
+ }
+ #[cfg(not(feature = "nfa-backtrack"))]
+ {
+ false
+ }
+ }
+
+ /// Overwrite the default configuration such that the options in `o` are
+ /// always used. If an option in `o` is not set, then the corresponding
+ /// option in `self` is used. If it's not set in `self` either, then it
+ /// remains not set.
+ pub(crate) fn overwrite(&self, o: Config) -> Config {
+ Config {
+ match_kind: o.match_kind.or(self.match_kind),
+ utf8_empty: o.utf8_empty.or(self.utf8_empty),
+ autopre: o.autopre.or(self.autopre),
+ pre: o.pre.or_else(|| self.pre.clone()),
+ which_captures: o.which_captures.or(self.which_captures),
+ nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
+ onepass_size_limit: o
+ .onepass_size_limit
+ .or(self.onepass_size_limit),
+ hybrid_cache_capacity: o
+ .hybrid_cache_capacity
+ .or(self.hybrid_cache_capacity),
+ hybrid: o.hybrid.or(self.hybrid),
+ dfa: o.dfa.or(self.dfa),
+ dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit),
+ dfa_state_limit: o.dfa_state_limit.or(self.dfa_state_limit),
+ onepass: o.onepass.or(self.onepass),
+ backtrack: o.backtrack.or(self.backtrack),
+ byte_classes: o.byte_classes.or(self.byte_classes),
+ line_terminator: o.line_terminator.or(self.line_terminator),
+ }
+ }
+}
+
+/// A builder for configuring and constructing a `Regex`.
+///
+/// The builder permits configuring two different aspects of a `Regex`:
+///
+/// * [`Builder::configure`] will set high-level configuration options as
+/// described by a [`Config`].
+/// * [`Builder::syntax`] will set the syntax level configuration options
+/// as described by a [`util::syntax::Config`](crate::util::syntax::Config).
+/// This only applies when building a `Regex` from pattern strings.
+///
+/// Once configured, the builder can then be used to construct a `Regex` from
+/// one of 4 different inputs:
+///
+/// * [`Builder::build`] creates a regex from a single pattern string.
+/// * [`Builder::build_many`] creates a regex from many pattern strings.
+/// * [`Builder::build_from_hir`] creates a regex from a
+/// [`regex-syntax::Hir`](Hir) expression.
+/// * [`Builder::build_many_from_hir`] creates a regex from many
+/// [`regex-syntax::Hir`](Hir) expressions.
+///
+/// The latter two methods in particular provide a way to construct a fully
+ /// featured regular expression matcher directly from an `Hir` expression
+/// without having to first convert it to a string. (This is in contrast to the
+/// top-level `regex` crate which intentionally provides no such API in order
+/// to avoid making `regex-syntax` a public dependency.)
+///
+/// As a convenience, this builder may be created via [`Regex::builder`], which
+/// may help avoid an extra import.
+///
+/// # Example: change the line terminator
+///
+/// This example shows how to enable multi-line mode by default and change the
+/// line terminator to the NUL byte:
+///
+/// ```
+/// use regex_automata::{meta::Regex, util::syntax, Match};
+///
+/// let re = Regex::builder()
+/// .syntax(syntax::Config::new().multi_line(true))
+/// .configure(Regex::config().line_terminator(b'\x00'))
+/// .build(r"^foo$")?;
+/// let hay = "\x00foo\x00";
+/// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: disable UTF-8 requirement
+///
+/// By default, regex patterns are required to match UTF-8. This includes
+/// regex patterns that can produce matches of length zero. In the case of an
+/// empty match, by default, matches will not appear between the code units of
+/// a UTF-8 encoded codepoint.
+///
+/// However, it can be useful to disable this requirement, particularly if
+/// you're searching things like `&[u8]` that are not known to be valid UTF-8.
+///
+/// ```
+/// use regex_automata::{meta::Regex, util::syntax, Match};
+///
+/// let mut builder = Regex::builder();
+/// // Disables the requirement that non-empty matches match UTF-8.
+/// builder.syntax(syntax::Config::new().utf8(false));
+/// // Disables the requirement that empty matches match UTF-8 boundaries.
+/// builder.configure(Regex::config().utf8_empty(false));
+///
+/// // We can match raw bytes via \xZZ syntax, but we need to disable
+/// // Unicode mode to do that. We could disable it everywhere, or just
+/// // selectively, as shown here.
+/// let re = builder.build(r"(?-u:\xFF)foo(?-u:\xFF)")?;
+/// let hay = b"\xFFfoo\xFF";
+/// assert_eq!(Some(Match::must(0, 0..5)), re.find(hay));
+///
+/// // We can also match between code units.
+/// let re = builder.build(r"")?;
+/// let hay = "☃";
+/// assert_eq!(re.find_iter(hay).collect::<Vec<Match>>(), vec![
+/// Match::must(0, 0..0),
+/// Match::must(0, 1..1),
+/// Match::must(0, 2..2),
+/// Match::must(0, 3..3),
+/// ]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ config: Config,
+ ast: ast::parse::ParserBuilder,
+ hir: hir::translate::TranslatorBuilder,
+}
+
+impl Builder {
+ /// Creates a new builder for configuring and constructing a [`Regex`].
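+ ///
+ /// # Example
+ ///
+ /// This is equivalent to [`Regex::builder`]:
+ ///
+ /// ```
+ /// use regex_automata::meta::Builder;
+ ///
+ /// let re = Builder::new().build(r"foo")?;
+ /// assert!(re.is_match("foo"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```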
+ pub fn new() -> Builder {
+ Builder {
+ config: Config::default(),
+ ast: ast::parse::ParserBuilder::new(),
+ hir: hir::translate::TranslatorBuilder::new(),
+ }
+ }
+
+ /// Builds a `Regex` from a single pattern string.
+ ///
+ /// If there was a problem parsing the pattern or a problem turning it into
+ /// a regex matcher, then an error is returned.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to configure syntax options.
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, util::syntax, Match};
+ ///
+ /// let re = Regex::builder()
+ /// .syntax(syntax::Config::new().crlf(true).multi_line(true))
+ /// .build(r"^foo$")?;
+ /// let hay = "\r\nfoo\r\n";
+ /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
+ self.build_many(&[pattern])
+ }
+
+ /// Builds a `Regex` from many pattern strings.
+ ///
+ /// If there was a problem parsing any of the patterns or a problem turning
+ /// them into a regex matcher, then an error is returned.
+ ///
+ /// # Example: finding the pattern that caused an error
+ ///
+ /// When a syntax error occurs, it is possible to ask which pattern
+ /// caused the syntax error.
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, PatternID};
+ ///
+ /// let err = Regex::builder()
+ /// .build_many(&["a", "b", r"\p{Foo}", "c"])
+ /// .unwrap_err();
+ /// assert_eq!(Some(PatternID::must(2)), err.pattern());
+ /// ```
+ ///
+ /// # Example: zero patterns is valid
+ ///
+ /// Building a regex with zero patterns results in a regex that never
+ /// matches anything. Because this routine is generic, passing an empty
+ /// slice usually requires a turbo-fish (or something else to help type
+ /// inference).
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, util::syntax, Match};
+ ///
+ /// let re = Regex::builder()
+ /// .build_many::<&str>(&[])?;
+ /// assert_eq!(None, re.find(""));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<Regex, BuildError> {
+ use crate::util::primitives::IteratorIndexExt;
+ log! {
+ debug!("building meta regex with {} patterns:", patterns.len());
+ for (pid, p) in patterns.iter().with_pattern_ids() {
+ let p = p.as_ref();
+ // We might split a grapheme with this truncation logic, but
+ // that's fine. We at least avoid splitting a codepoint.
+ let maxoff = p
+ .char_indices()
+ .map(|(i, ch)| i + ch.len_utf8())
+ .take(1000)
+ .last()
+ .unwrap_or(0);
+ if maxoff < p.len() {
+ debug!("{:?}: {}[... snip ...]", pid, &p[..maxoff]);
+ } else {
+ debug!("{:?}: {}", pid, p);
+ }
+ }
+ }
+ let (mut asts, mut hirs) = (vec![], vec![]);
+ for (pid, p) in patterns.iter().with_pattern_ids() {
+ let ast = self
+ .ast
+ .build()
+ .parse(p.as_ref())
+ .map_err(|err| BuildError::ast(pid, err))?;
+ asts.push(ast);
+ }
+ for ((pid, p), ast) in
+ patterns.iter().with_pattern_ids().zip(asts.iter())
+ {
+ let hir = self
+ .hir
+ .build()
+ .translate(p.as_ref(), ast)
+ .map_err(|err| BuildError::hir(pid, err))?;
+ hirs.push(hir);
+ }
+ self.build_many_from_hir(&hirs)
+ }
+
+ /// Builds a `Regex` directly from an `Hir` expression.
+ ///
+ /// This is useful if you needed to parse a pattern string into an `Hir`
+ /// for other reasons (such as analysis or transformations). This routine
+ /// permits building a `Regex` directly from the `Hir` expression instead
+ /// of first converting the `Hir` back to a pattern string.
+ ///
+ /// When using this method, any options set via [`Builder::syntax`] are
+ /// ignored. Namely, the syntax options only apply when parsing a pattern
+ /// string, which isn't relevant here.
+ ///
+ /// If there was a problem building the underlying regex matcher for the
+ /// given `Hir`, then an error is returned.
+ ///
+ /// # Example
+ ///
+ /// This example shows how one can hand-construct an `Hir` expression and
+ /// build a regex from it without doing any parsing at all.
+ ///
+ /// ```
+ /// use {
+ /// regex_automata::{meta::Regex, Match},
+ /// regex_syntax::hir::{Hir, Look},
+ /// };
+ ///
+ /// // (?Rm)^foo$
+ /// let hir = Hir::concat(vec![
+ /// Hir::look(Look::StartCRLF),
+ /// Hir::literal("foo".as_bytes()),
+ /// Hir::look(Look::EndCRLF),
+ /// ]);
+ /// let re = Regex::builder()
+ /// .build_from_hir(&hir)?;
+ /// let hay = "\r\nfoo\r\n";
+ /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
+ ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_hir(&self, hir: &Hir) -> Result<Regex, BuildError> {
+ self.build_many_from_hir(&[hir])
+ }
+
+ /// Builds a `Regex` directly from many `Hir` expressions.
+ ///
+ /// This is useful if you needed to parse pattern strings into `Hir`
+ /// expressions for other reasons (such as analysis or transformations).
+ /// This routine permits building a `Regex` directly from the `Hir`
+ /// expressions instead of first converting the `Hir` expressions back to
+ /// pattern strings.
+ ///
+ /// When using this method, any options set via [`Builder::syntax`] are
+ /// ignored. Namely, the syntax options only apply when parsing a pattern
+ /// string, which isn't relevant here.
+ ///
+ /// If there was a problem building the underlying regex matcher for the
+ /// given `Hir` expressions, then an error is returned.
+ ///
+ /// Note that unlike [`Builder::build_many`], this can only fail as a
+ /// result of building the underlying matcher. In that case, there is
+ /// no single `Hir` expression that can be isolated as a reason for the
+ /// failure. So if this routine fails, it's not possible to determine which
+ /// `Hir` expression caused the failure.
+ ///
+ /// # Example
+ ///
+ /// This example shows how one can hand-construct multiple `Hir`
+ /// expressions and build a single regex from them without doing any
+ /// parsing at all.
+ ///
+ /// ```
+ /// use {
+ /// regex_automata::{meta::Regex, Match},
+ /// regex_syntax::hir::{Hir, Look},
+ /// };
+ ///
+ /// // (?Rm)^foo$
+ /// let hir1 = Hir::concat(vec![
+ /// Hir::look(Look::StartCRLF),
+ /// Hir::literal("foo".as_bytes()),
+ /// Hir::look(Look::EndCRLF),
+ /// ]);
+ /// // (?Rm)^bar$
+ /// let hir2 = Hir::concat(vec![
+ /// Hir::look(Look::StartCRLF),
+ /// Hir::literal("bar".as_bytes()),
+ /// Hir::look(Look::EndCRLF),
+ /// ]);
+ /// let re = Regex::builder()
+ /// .build_many_from_hir(&[&hir1, &hir2])?;
+ /// let hay = "\r\nfoo\r\nbar";
+ /// let got: Vec<Match> = re.find_iter(hay).collect();
+ /// let expected = vec![
+ /// Match::must(0, 2..5),
+ /// Match::must(1, 7..10),
+ /// ];
+ /// assert_eq!(expected, got);
+ ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_many_from_hir<H: Borrow<Hir>>(
+ &self,
+ hirs: &[H],
+ ) -> Result<Regex, BuildError> {
+ let config = self.config.clone();
+ // We collect the HIRs into a vec so we can write internal routines
+        // with '&[&Hir]'. That is, we don't use generics everywhere, in
+        // order to keep code bloat down.
+ let hirs: Vec<&Hir> = hirs.iter().map(|hir| hir.borrow()).collect();
+ let info = RegexInfo::new(config, &hirs);
+ let strat = strategy::new(&info, &hirs)?;
+ let pool = {
+ let strat = Arc::clone(&strat);
+ let create: CachePoolFn = Box::new(move || strat.create_cache());
+ Pool::new(create)
+ };
+ Ok(Regex { imp: Arc::new(RegexI { strat, info }), pool })
+ }
+
+ /// Configure the behavior of a `Regex`.
+ ///
+ /// This configuration controls non-syntax options related to the behavior
+ /// of a `Regex`. This includes things like whether empty matches can split
+ /// a codepoint, prefilters, line terminators and a long list of options
+ /// for configuring which regex engines the meta regex engine will be able
+ /// to use internally.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to disable UTF-8 empty mode. This will permit
+ /// empty matches to occur between the UTF-8 encoding of a codepoint.
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, Match};
+ ///
+ /// let re = Regex::new("")?;
+ /// let got: Vec<Match> = re.find_iter("☃").collect();
+ /// // Matches only occur at the beginning and end of the snowman.
+ /// assert_eq!(got, vec![
+ /// Match::must(0, 0..0),
+ /// Match::must(0, 3..3),
+ /// ]);
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().utf8_empty(false))
+ /// .build("")?;
+ /// let got: Vec<Match> = re.find_iter("☃").collect();
+ /// // Matches now occur at every position!
+ /// assert_eq!(got, vec![
+ /// Match::must(0, 0..0),
+ /// Match::must(0, 1..1),
+ /// Match::must(0, 2..2),
+ /// Match::must(0, 3..3),
+ /// ]);
+ ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn configure(&mut self, config: Config) -> &mut Builder {
+ self.config = self.config.overwrite(config);
+ self
+ }
+
+ /// Configure the syntax options when parsing a pattern string while
+ /// building a `Regex`.
+ ///
+ /// These options _only_ apply when [`Builder::build`] or [`Builder::build_many`]
+ /// are used. The other build methods accept `Hir` values, which have
+ /// already been parsed.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to enable case insensitive mode.
+ ///
+ /// ```
+ /// use regex_automata::{meta::Regex, util::syntax, Match};
+ ///
+ /// let re = Regex::builder()
+ /// .syntax(syntax::Config::new().case_insensitive(true))
+ /// .build(r"δ")?;
+ /// assert_eq!(Some(Match::must(0, 0..2)), re.find(r"Δ"));
+ ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::Config,
+ ) -> &mut Builder {
+ config.apply_ast(&mut self.ast);
+ config.apply_hir(&mut self.hir);
+ self
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // I found this in the course of building out the benchmark suite for
+ // rebar.
+ #[test]
+ fn regression() {
+ env_logger::init();
+
+ let re = Regex::new(r"[a-zA-Z]+ing").unwrap();
+ assert_eq!(1, re.find_iter("tingling").count());
+ }
+}
diff --git a/vendor/regex-automata/src/meta/reverse_inner.rs b/vendor/regex-automata/src/meta/reverse_inner.rs
new file mode 100644
index 000000000..3d78779f6
--- /dev/null
+++ b/vendor/regex-automata/src/meta/reverse_inner.rs
@@ -0,0 +1,220 @@
+/*!
+A module dedicated to plucking inner literals out of a regex pattern, and
+then constructing a prefilter for them. We also include a regex pattern
+"prefix" that corresponds to the bits of the regex that need to match before
+the literals do. The reverse inner optimization then proceeds by looking for
+matches of the inner literal(s), and then doing a reverse search of the prefix
+from the start of the literal match to find the overall start position of the
+match.
+
+The essential invariant we want to uphold here is that the literals we return
+reflect a set where *at least* one of them must match in order for the overall
+regex to match. We also need to maintain the invariant that the regex prefix
+returned corresponds to the entirety of the regex up until the literals we
+return.
+
+This somewhat limits what we can do. That is, if we have a regex like
+`\w+(@!|%%)\w+`, then we can pluck the `{@!, %%}` out and build a prefilter
+from it. Then we just need to compile `\w+` in reverse. No fuss no muss. But if
+we have a regex like `\d+@!|\w+%%`, then we get kind of stymied. Technically,
+we could still extract `{@!, %%}`, and it is true that at least one of them must
+match. But then, what is our regex prefix? Again, in theory, that could be
+`\d+|\w+`, but that's not quite right, because the `\d+` only matches when `@!`
+matches, and `\w+` only matches when `%%` matches.
+
+All of that is technically possible to do, but it seemingly requires a lot of
+sophistication and machinery. Probably the way to tackle that is with some kind
+of formalism, approaching this problem more generally.
+
+For now, the code below basically just looks for a top-level concatenation.
+And if it can find one, it looks for literals in each of the direct child
+sub-expressions of that concatenation. If some good ones are found, we return
+those and a concatenation of the Hir expressions seen up to that point.
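+
+For example, given `\w+(@!|%%)\w+` from above, the meta regex engine may
+first hunt for `@!` or `%%` and only then run `\w+` in reverse from the
+literal hit. The observable behavior is the same either way; below is a
+minimal sketch of that behavior via the public API. (Whether this strategy
+is actually selected for a given pattern is an internal heuristic decision.)
+
+```
+use regex_automata::{meta::Regex, Match};
+
+let re = Regex::new(r"\w+(@!|%%)\w+")?;
+let hay = "xyz abc@!def xyz";
+assert_eq!(Some(Match::must(0, 4..12)), re.find(hay));
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```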
+*/
+
+use alloc::vec::Vec;
+
+use regex_syntax::hir::{self, literal, Hir, HirKind};
+
+use crate::{util::prefilter::Prefilter, MatchKind};
+
+/// Attempts to extract an "inner" prefilter from the given HIR expressions. If
+/// one was found, then a concatenation of the HIR expressions that precede it
+/// is returned.
+///
+/// The idea here is that the prefilter returned can be used to find candidate
+/// matches. And then the HIR returned can be used to build a reverse regex
+/// matcher, which will find the start of the candidate match. Finally, the
+/// match still has to be confirmed with a normal anchored forward scan to find
+/// the end position of the match.
+///
+/// Note that this assumes leftmost-first match semantics, so callers must
+/// not call this otherwise.
+pub(crate) fn extract(hirs: &[&Hir]) -> Option<(Hir, Prefilter)> {
+ if hirs.len() != 1 {
+ debug!(
+ "skipping reverse inner optimization since it only \
+ supports 1 pattern, {} were given",
+ hirs.len(),
+ );
+ return None;
+ }
+ let mut concat = match top_concat(hirs[0]) {
+ Some(concat) => concat,
+ None => {
+ debug!(
+ "skipping reverse inner optimization because a top-level \
+                 concatenation could not be found",
+ );
+ return None;
+ }
+ };
+ // We skip the first HIR because if it did have a prefix prefilter in it,
+ // we probably wouldn't be here looking for an inner prefilter.
+ for i in 1..concat.len() {
+ let hir = &concat[i];
+ let pre = match prefilter(hir) {
+ None => continue,
+ Some(pre) => pre,
+ };
+        // Even if we got a prefilter, if it isn't considered "fast," then we
+        // probably don't want to bother with it. Namely, since the reverse
+        // inner optimization requires some overhead, it likely only makes
+        // sense if the prefilter scan itself is believed to be much faster
+ // than the regex engine.
+ if !pre.is_fast() {
+ debug!(
+ "skipping extracted inner prefilter because \
+ it probably isn't fast"
+ );
+ continue;
+ }
+ let concat_suffix = Hir::concat(concat.split_off(i));
+ let concat_prefix = Hir::concat(concat);
+ // Look for a prefilter again. Why? Because above we only looked for
+ // a prefilter on the individual 'hir', but we might be able to find
+ // something better and more discriminatory by looking at the entire
+ // suffix. We don't do this above to avoid making this loop worst case
+ // quadratic in the length of 'concat'.
+ let pre2 = match prefilter(&concat_suffix) {
+ None => pre,
+ Some(pre2) => {
+ if pre2.is_fast() {
+ pre2
+ } else {
+ pre
+ }
+ }
+ };
+ return Some((concat_prefix, pre2));
+ }
+ debug!(
+ "skipping reverse inner optimization because a top-level \
+ sub-expression with a fast prefilter could not be found"
+ );
+ None
+}
+
+/// Attempt to extract a prefilter from an HIR expression.
+///
+/// We do a little massaging here to try to ensure that the prefilter we get out
+/// of this is *probably* fast. Basically, the false positive rate has a much
+/// higher impact for things like the reverse inner optimization because more
+/// work needs to potentially be done for each candidate match.
+///
+/// Note that this assumes leftmost-first match semantics, so callers must
+/// not call this otherwise.
+fn prefilter(hir: &Hir) -> Option<Prefilter> {
+ let mut extractor = literal::Extractor::new();
+ extractor.kind(literal::ExtractKind::Prefix);
+ let mut prefixes = extractor.extract(hir);
+ debug!(
+ "inner prefixes (len={:?}) extracted before optimization: {:?}",
+ prefixes.len(),
+ prefixes
+ );
+ // Since these are inner literals, we know they cannot be exact. But the
+ // extractor doesn't know this. We mark them as inexact because this might
+ // impact literal optimization. Namely, optimization weights "all literals
+ // are exact" as very high, because it presumes that any match results in
+ // an overall match. But of course, that is not the case here.
+ //
+    // In practice, this avoids plucking out an ASCII-only \s as an alternation
+ // of single-byte whitespace characters.
+ prefixes.make_inexact();
+ prefixes.optimize_for_prefix_by_preference();
+ debug!(
+ "inner prefixes (len={:?}) extracted after optimization: {:?}",
+ prefixes.len(),
+ prefixes
+ );
+ prefixes
+ .literals()
+ .and_then(|lits| Prefilter::new(MatchKind::LeftmostFirst, lits))
+}
+
+/// Looks for a "top level" HirKind::Concat item in the given HIR. This will
+/// try to return one even if it's embedded in a capturing group, but is
+/// otherwise pretty conservative in what is returned.
+///
+/// The HIR returned is a complete copy of the concat with all capturing
+/// groups removed. In effect, the concat returned is "flattened" with respect
+/// to capturing groups. This makes the detection logic above for prefixes
+/// a bit simpler, and it works because 1) capturing groups never influence
+/// whether a match occurs or not and 2) capturing groups are not used when
+/// doing the reverse inner search to find the start of the match.
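+///
+/// For example, the top-level concat in 'a(b)c' flattens to three adjacent
+/// literals, which the smart 'Hir::concat' constructor may coalesce into the
+/// single literal 'abc'. In that case, this returns 'None' (see the comment
+/// in the 'Concat' arm below).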
+fn top_concat(mut hir: &Hir) -> Option<Vec<Hir>> {
+ loop {
+ hir = match hir.kind() {
+ HirKind::Empty
+ | HirKind::Literal(_)
+ | HirKind::Class(_)
+ | HirKind::Look(_)
+ | HirKind::Repetition(_)
+ | HirKind::Alternation(_) => return None,
+ HirKind::Capture(hir::Capture { ref sub, .. }) => sub,
+ HirKind::Concat(ref subs) => {
+ // We are careful to only do the flattening/copy when we know
+ // we have a "top level" concat we can inspect. This avoids
+ // doing extra work in cases where we definitely won't use it.
+ // (This might still be wasted work if we can't go on to find
+ // some literals to extract.)
+ let concat =
+ Hir::concat(subs.iter().map(|h| flatten(h)).collect());
+ return match concat.into_kind() {
+ HirKind::Concat(xs) => Some(xs),
+ // It is actually possible for this case to occur, because
+ // 'Hir::concat' might simplify the expression to the point
+ // that concatenations are actually removed. One wonders
+ // whether this leads to other cases where we should be
+ // extracting literals, but in theory, I believe if we do
+ // get here, then it means that a "real" prefilter failed
+ // to be extracted and we should probably leave well enough
+ // alone. (A "real" prefilter is unbothered by "top-level
+ // concats" and "capturing groups.")
+ _ => return None,
+ };
+ }
+ };
+ }
+}
+
+/// Returns a copy of the given HIR but with all capturing groups removed.
+fn flatten(hir: &Hir) -> Hir {
+ match hir.kind() {
+ HirKind::Empty => Hir::empty(),
+ HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()),
+ HirKind::Class(ref x) => Hir::class(x.clone()),
+ HirKind::Look(ref x) => Hir::look(x.clone()),
+ HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))),
+ // This is the interesting case. We just drop the group information
+ // entirely and use the child HIR itself.
+ HirKind::Capture(hir::Capture { ref sub, .. }) => flatten(sub),
+ HirKind::Alternation(ref xs) => {
+ Hir::alternation(xs.iter().map(|x| flatten(x)).collect())
+ }
+ HirKind::Concat(ref xs) => {
+ Hir::concat(xs.iter().map(|x| flatten(x)).collect())
+ }
+ }
+}
diff --git a/vendor/regex-automata/src/meta/stopat.rs b/vendor/regex-automata/src/meta/stopat.rs
new file mode 100644
index 000000000..e8d716689
--- /dev/null
+++ b/vendor/regex-automata/src/meta/stopat.rs
@@ -0,0 +1,224 @@
+/*!
+This module defines two bespoke forward DFA search routines. One for the lazy
+DFA and one for the fully compiled DFA. These routines differ from the normal
+ones by reporting the position at which the search terminates when a match
+*isn't* found.
+
+This position at which a search terminates is useful in contexts where the meta
+regex engine runs optimizations that could go quadratic if we aren't careful.
+Namely, a regex search *could* scan to the end of the haystack only to report a
+non-match. If the caller doesn't know that the search scanned to the end of the
+haystack, it might restart the search at the next literal candidate it finds
+and repeat the process.
+
+Providing the caller with the position at which the search stopped provides a
+way for the caller to determine the point at which subsequent scans should not
+pass. This is principally used in the "reverse inner" optimization, which works
+like this:
+
+1. Look for a match of an inner literal. Say, 'Z' in '\w+Z\d+'.
+2. At the spot where 'Z' matches, do a reverse anchored search from there for
+'\w+'.
+3. If the reverse search matches, it corresponds to the start position of a
+(possible) match. At this point, do a forward anchored search to find the end
+position. If an end position is found, then we have a match and we know its
+bounds.
+
+If the forward anchored search in (3) searches the entire rest of the haystack
+but reports a non-match, then a naive implementation of the above will continue
+back at step 1 looking for more candidates. There might still be a match to be
+found! It's possible. But we already scanned the whole haystack. So if we keep
+repeating the process, then we might wind up taking quadratic time in the size
+of the haystack, which is not great.
+
+So if the forward anchored search in (3) reports the position at which it
+stops, then we can detect whether quadratic behavior might be occurring in
+steps (1) and (2). For (1), it occurs if the literal candidate found occurs
+*before* the end of the previous search in (3), since that means we're now
+going to look for another match in a place where the forward search has already
+scanned. It is *correct* to do so, but our technique has become inefficient.
+For (2), quadratic behavior occurs similarly when its reverse search extends
+past the point where the previous forward search in (3) terminated. Indeed, to
+implement (2), we use the sibling 'limited' module for ensuring our reverse
+scan doesn't go further than we want.
+
+See the 'opt/reverse-inner' benchmarks in rebar for a real demonstration of
+how quadratic behavior is mitigated.
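+
+Below is a self-contained toy model (not the real code) of the caller's
+candidate loop: 'candidates' stands in for the positions of literal hits
+found in (1), and 'scan' stands in for the forward search in (3), returning
+the position at which it stopped.
+
+```
+let candidates = [3usize, 5, 9, 20];
+// Pretend the forward search always scans up to at least offset 15 before
+// stopping without finding a match.
+let scan = |start: usize| start.max(15);
+let mut stopped_at = 0;
+for &c in &candidates {
+    if c < stopped_at {
+        // This candidate sits inside a region the forward search already
+        // covered. Rescanning from here is what goes quadratic, so skip it.
+        continue;
+    }
+    stopped_at = scan(c);
+}
+assert_eq!(stopped_at, 20);
+```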
+*/
+
+use crate::{meta::error::RetryFailError, HalfMatch, Input, MatchError};
+
+#[cfg(feature = "dfa-build")]
+pub(crate) fn dfa_try_search_half_fwd(
+ dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
+ input: &Input<'_>,
+) -> Result<Result<HalfMatch, usize>, RetryFailError> {
+ use crate::dfa::{accel, Automaton};
+
+ let mut mat = None;
+ let mut sid = dfa.start_state_forward(input)?;
+ let mut at = input.start();
+ while at < input.end() {
+ sid = dfa.next_state(sid, input.haystack()[at]);
+ if dfa.is_special_state(sid) {
+ if dfa.is_match_state(sid) {
+ let pattern = dfa.match_pattern(sid, 0);
+ mat = Some(HalfMatch::new(pattern, at));
+ if input.get_earliest() {
+ return Ok(mat.ok_or(at));
+ }
+ if dfa.is_accel_state(sid) {
+ let needs = dfa.accelerator(sid);
+ at = accel::find_fwd(needs, input.haystack(), at)
+ .unwrap_or(input.end());
+ continue;
+ }
+ } else if dfa.is_accel_state(sid) {
+ let needs = dfa.accelerator(sid);
+ at = accel::find_fwd(needs, input.haystack(), at)
+ .unwrap_or(input.end());
+ continue;
+ } else if dfa.is_dead_state(sid) {
+ return Ok(mat.ok_or(at));
+ } else if dfa.is_quit_state(sid) {
+ if mat.is_some() {
+ return Ok(mat.ok_or(at));
+ }
+ return Err(MatchError::quit(input.haystack()[at], at).into());
+ } else {
+ // Ideally we wouldn't use a DFA that specialized start states
+ // and thus 'is_start_state()' could never be true here, but in
+ // practice we reuse the DFA created for the full regex which
+ // will specialize start states whenever there is a prefilter.
+ debug_assert!(dfa.is_start_state(sid));
+ }
+ }
+ at += 1;
+ }
+ dfa_eoi_fwd(dfa, input, &mut sid, &mut mat)?;
+ Ok(mat.ok_or(at))
+}
+
+#[cfg(feature = "hybrid")]
+pub(crate) fn hybrid_try_search_half_fwd(
+ dfa: &crate::hybrid::dfa::DFA,
+ cache: &mut crate::hybrid::dfa::Cache,
+ input: &Input<'_>,
+) -> Result<Result<HalfMatch, usize>, RetryFailError> {
+ let mut mat = None;
+ let mut sid = dfa.start_state_forward(cache, input)?;
+ let mut at = input.start();
+ while at < input.end() {
+ sid = dfa
+ .next_state(cache, sid, input.haystack()[at])
+ .map_err(|_| MatchError::gave_up(at))?;
+ if sid.is_tagged() {
+ if sid.is_match() {
+ let pattern = dfa.match_pattern(cache, sid, 0);
+ mat = Some(HalfMatch::new(pattern, at));
+ if input.get_earliest() {
+ return Ok(mat.ok_or(at));
+ }
+ } else if sid.is_dead() {
+ return Ok(mat.ok_or(at));
+ } else if sid.is_quit() {
+ if mat.is_some() {
+ return Ok(mat.ok_or(at));
+ }
+ return Err(MatchError::quit(input.haystack()[at], at).into());
+ } else {
+ // We should NEVER get an unknown state ID back from
+ // dfa.next_state().
+ debug_assert!(!sid.is_unknown());
+ // Ideally we wouldn't use a lazy DFA that specialized start
+ // states and thus 'sid.is_start()' could never be true here,
+ // but in practice we reuse the lazy DFA created for the full
+ // regex which will specialize start states whenever there is
+ // a prefilter.
+ debug_assert!(sid.is_start());
+ }
+ }
+ at += 1;
+ }
+ hybrid_eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?;
+ Ok(mat.ok_or(at))
+}
+
+#[cfg(feature = "dfa-build")]
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn dfa_eoi_fwd(
+ dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
+ input: &Input<'_>,
+ sid: &mut crate::util::primitives::StateID,
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ use crate::dfa::Automaton;
+
+ let sp = input.get_span();
+ match input.haystack().get(sp.end) {
+ Some(&b) => {
+ *sid = dfa.next_state(*sid, b);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.end));
+ } else if dfa.is_quit_state(*sid) {
+ if mat.is_some() {
+ return Ok(());
+ }
+ return Err(MatchError::quit(b, sp.end));
+ }
+ }
+ None => {
+ *sid = dfa.next_eoi_state(*sid);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, input.haystack().len()));
+ }
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!dfa.is_quit_state(*sid));
+ }
+ }
+ Ok(())
+}
+
+#[cfg(feature = "hybrid")]
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn hybrid_eoi_fwd(
+ dfa: &crate::hybrid::dfa::DFA,
+ cache: &mut crate::hybrid::dfa::Cache,
+ input: &Input<'_>,
+ sid: &mut crate::hybrid::LazyStateID,
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ let sp = input.get_span();
+ match input.haystack().get(sp.end) {
+ Some(&b) => {
+ *sid = dfa
+ .next_state(cache, *sid, b)
+ .map_err(|_| MatchError::gave_up(sp.end))?;
+ if sid.is_match() {
+ let pattern = dfa.match_pattern(cache, *sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.end));
+ } else if sid.is_quit() {
+ if mat.is_some() {
+ return Ok(());
+ }
+ return Err(MatchError::quit(b, sp.end));
+ }
+ }
+ None => {
+ *sid = dfa
+ .next_eoi_state(cache, *sid)
+ .map_err(|_| MatchError::gave_up(input.haystack().len()))?;
+ if sid.is_match() {
+ let pattern = dfa.match_pattern(cache, *sid, 0);
+ *mat = Some(HalfMatch::new(pattern, input.haystack().len()));
+ }
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!sid.is_quit());
+ }
+ }
+ Ok(())
+}
diff --git a/vendor/regex-automata/src/meta/strategy.rs b/vendor/regex-automata/src/meta/strategy.rs
new file mode 100644
index 000000000..ea6c6ab57
--- /dev/null
+++ b/vendor/regex-automata/src/meta/strategy.rs
@@ -0,0 +1,1908 @@
+use core::{
+ fmt::Debug,
+ panic::{RefUnwindSafe, UnwindSafe},
+};
+
+use alloc::sync::Arc;
+
+use regex_syntax::hir::{literal, Hir};
+
+use crate::{
+ meta::{
+ error::{BuildError, RetryError, RetryFailError, RetryQuadraticError},
+ regex::{Cache, RegexInfo},
+ reverse_inner, wrappers,
+ },
+ nfa::thompson::{self, WhichCaptures, NFA},
+ util::{
+ captures::{Captures, GroupInfo},
+ look::LookMatcher,
+ prefilter::{self, Prefilter, PrefilterI},
+ primitives::{NonMaxUsize, PatternID},
+ search::{Anchored, HalfMatch, Input, Match, MatchKind, PatternSet},
+ },
+};
+
+/// A trait that represents a single meta strategy. Its main utility is in
+/// providing a way to do dynamic dispatch over a few choices.
+///
+/// Why dynamic dispatch? I actually don't have a super compelling reason, and
+/// importantly, I have not benchmarked it with the main alternative: an enum.
+/// I went with dynamic dispatch initially because the regex engine search code
+/// really can't be inlined into caller code in most cases because it's just
+/// too big. In other words, it is already expected that every regex search
+/// will entail at least the cost of a function call.
+///
+/// I do wonder whether using enums would result in better codegen overall
+/// though. It's a worthwhile experiment to try. Probably the most interesting
+/// benchmark to run in such a case would be one with a high match count. That
+/// is, a benchmark to test the overall latency of a search call.
+pub(super) trait Strategy:
+ Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static
+{
+ fn group_info(&self) -> &GroupInfo;
+
+ fn create_cache(&self) -> Cache;
+
+ fn reset_cache(&self, cache: &mut Cache);
+
+ fn is_accelerated(&self) -> bool;
+
+ fn memory_usage(&self) -> usize;
+
+ fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match>;
+
+ fn search_half(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch>;
+
+ fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool;
+
+ fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID>;
+
+ fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ );
+}
+
+pub(super) fn new(
+ info: &RegexInfo,
+ hirs: &[&Hir],
+) -> Result<Arc<dyn Strategy>, BuildError> {
+ // At this point, we're committed to a regex engine of some kind. So pull
+ // out a prefilter if we can, which will feed to each of the constituent
+ // regex engines.
+ let pre = if info.is_always_anchored_start() {
+ // PERF: I'm not sure we necessarily want to do this... We may want to
+ // run a prefilter for quickly rejecting in some cases. The problem
+ // is that anchored searches overlap quite a bit with the use case
+ // of "run a regex on every line to extract data." In that case, the
+ // regex always matches, so running a prefilter doesn't really help us
+ // there. The main place where a prefilter helps in an anchored search
+ // is if the anchored search is not expected to match frequently. That
+ // is, the prefilter gives us a way to possibly reject a haystack very
+ // quickly.
+ //
+    // Maybe we should use a prefilter, but only for longer haystacks?
+ // Or maybe we should only use a prefilter when we think it's "fast"?
+ //
+ // Interestingly, I think we currently lack the infrastructure for
+ // disabling a prefilter based on haystack length. That would probably
+ // need to be a new 'Input' option. (Interestingly, an 'Input' used to
+ // carry a 'Prefilter' with it, but I moved away from that.)
+ debug!("skipping literal extraction since regex is anchored");
+ None
+ } else if let Some(pre) = info.config().get_prefilter() {
+ debug!(
+ "skipping literal extraction since the caller provided a prefilter"
+ );
+ Some(pre.clone())
+ } else if info.config().get_auto_prefilter() {
+ let kind = info.config().get_match_kind();
+ let prefixes = crate::util::prefilter::prefixes(kind, hirs);
+ // If we can build a full `Strategy` from just the extracted prefixes,
+ // then we can short-circuit and avoid building a regex engine at all.
+ if let Some(pre) = Pre::from_prefixes(info, &prefixes) {
+ debug!(
+ "found that the regex can be broken down to a literal \
+ search, avoiding the regex engine entirely",
+ );
+ return Ok(pre);
+ }
+ // This now attempts another short-circuit of the regex engine: if we
+ // have a huge alternation of just plain literals, then we can just use
+ // Aho-Corasick for that and avoid the regex engine entirely.
+ //
+ // You might think this case would just be handled by
+ // `Pre::from_prefixes`, but that technique relies on heuristic literal
+        // extraction from the corresponding `Hir`. That works, but the
+        // heuristics limit the size and number of literals returned. This case
+ // will specifically handle patterns with very large alternations.
+ //
+        // One wonders if we should just roll this into our heuristic literal
+ // extraction, and then I think this case could disappear entirely.
+ if let Some(pre) = Pre::from_alternation_literals(info, hirs) {
+ debug!(
+ "found plain alternation of literals, \
+ avoiding regex engine entirely and using Aho-Corasick"
+ );
+ return Ok(pre);
+ }
+ prefixes.literals().and_then(|strings| {
+ debug!(
+ "creating prefilter from {} literals: {:?}",
+ strings.len(),
+ strings,
+ );
+ Prefilter::new(kind, strings)
+ })
+ } else {
+ debug!("skipping literal extraction since prefilters were disabled");
+ None
+ };
+ let mut core = Core::new(info.clone(), pre.clone(), hirs)?;
+ // Now that we have our core regex engines built, there are a few cases
+ // where we can do a little bit better than just a normal "search forward
+ // and maybe use a prefilter when in a start state." However, these cases
+ // may not always work or otherwise build on top of the Core searcher.
+ // For example, the reverse anchored optimization seems like it might
+ // always work, but only the DFAs support reverse searching and the DFAs
+    // might give up or quit for various reasons. If we had, e.g., a PikeVM that
+ // supported reverse searching, then we could avoid building a full Core
+ // engine for this case.
+ core = match ReverseAnchored::new(core) {
+ Err(core) => core,
+ Ok(ra) => {
+ debug!("using reverse anchored strategy");
+ return Ok(Arc::new(ra));
+ }
+ };
+ core = match ReverseSuffix::new(core, hirs) {
+ Err(core) => core,
+ Ok(rs) => {
+ debug!("using reverse suffix strategy");
+ return Ok(Arc::new(rs));
+ }
+ };
+ core = match ReverseInner::new(core, hirs) {
+ Err(core) => core,
+ Ok(ri) => {
+ debug!("using reverse inner strategy");
+ return Ok(Arc::new(ri));
+ }
+ };
+ debug!("using core strategy");
+ Ok(Arc::new(core))
+}
+
+#[derive(Clone, Debug)]
+struct Pre<P> {
+ pre: P,
+ group_info: GroupInfo,
+}
+
+impl<P: PrefilterI> Pre<P> {
+ fn new(pre: P) -> Arc<dyn Strategy> {
+ // The only thing we support when we use prefilters directly as a
+ // strategy is the start and end of the overall match for a single
+ // pattern. In other words, exactly one implicit capturing group. Which
+ // is exactly what we use here for a GroupInfo.
+ let group_info = GroupInfo::new([[None::<&str>]]).unwrap();
+ Arc::new(Pre { pre, group_info })
+ }
+}
+
+// This is a little weird, but we don't actually care about the type parameter
+// here because we're selecting which underlying prefilter to use. So we just
+// define it on an arbitrary type.
+impl Pre<()> {
+ /// Given a sequence of prefixes, attempt to return a full `Strategy` using
+ /// just the prefixes.
+ ///
+    /// Basically, this occurs when the prefixes given are not just prefixes,
+ /// but an enumeration of the entire language matched by the regular
+ /// expression.
+ ///
+ /// A number of other conditions need to be true too. For example, there
+ /// can be only one pattern, the number of explicit capture groups is 0, no
+ /// look-around assertions and so on.
+ ///
+ /// Note that this ignores `Config::get_auto_prefilter` because if this
+ /// returns something, then it isn't a prefilter but a matcher itself.
+ /// Therefore, it shouldn't suffer from the problems typical to prefilters
+ /// (such as a high false positive rate).
+ fn from_prefixes(
+ info: &RegexInfo,
+ prefixes: &literal::Seq,
+ ) -> Option<Arc<dyn Strategy>> {
+ let kind = info.config().get_match_kind();
+ // Check to see if our prefixes are exact, which means we might be
+ // able to bypass the regex engine entirely and just rely on literal
+ // searches.
+ if !prefixes.is_exact() {
+ return None;
+ }
+ // We also require that we have a single regex pattern. Namely,
+        // we reuse the prefilter infrastructure to implement search, and
+ // prefilters only report spans. Prefilters don't know about pattern
+ // IDs. The multi-regex case isn't a lost cause, we might still use
+ // Aho-Corasick and we might still just use a regular prefilter, but
+ // that's done below.
+ if info.pattern_len() != 1 {
+ return None;
+ }
+ // We can't have any capture groups either. The literal engines don't
+ // know how to deal with things like '(foo)(bar)'. In that case, a
+ // prefilter will just be used and then the regex engine will resolve
+ // the capture groups.
+ if info.props()[0].explicit_captures_len() != 0 {
+ return None;
+ }
+ // We also require that it has zero look-around assertions. Namely,
+ // literal extraction treats look-around assertions as if they match
+ // *every* empty string. But of course, that isn't true. So for
+ // example, 'foo\bquux' never matches anything, but 'fooquux' is
+ // extracted from that as an exact literal. Such cases should just run
+ // the regex engine. 'fooquux' will be used as a normal prefilter, and
+ // then the regex engine will try to look for an actual match.
+ if !info.props()[0].look_set().is_empty() {
+ return None;
+ }
+ // Finally, currently, our prefilters are all oriented around
+ // leftmost-first match semantics, so don't try to use them if the
+ // caller asked for anything else.
+ if kind != MatchKind::LeftmostFirst {
+ return None;
+ }
+ // The above seems like a lot of requirements to meet, but it applies
+ // to a lot of cases. 'foo', '[abc][123]' and 'foo|bar|quux' all meet
+ // the above criteria, for example.
+ //
+ // Note that this is effectively a latency optimization. If we didn't
+ // do this, then the extracted literals would still get bundled into
+ // a prefilter, and every regex engine capable of running unanchored
+ // searches supports prefilters. So this optimization merely sidesteps
+ // having to run the regex engine at all to confirm the match. Thus, it
+ // decreases the latency of a match.
+
+ // OK because we know the set is exact and thus finite.
+ let prefixes = prefixes.literals().unwrap();
+ debug!(
+ "trying to bypass regex engine by creating \
+ prefilter from {} literals: {:?}",
+ prefixes.len(),
+ prefixes,
+ );
+ let choice = match prefilter::Choice::new(kind, prefixes) {
+ Some(choice) => choice,
+ None => {
+ debug!(
+ "regex bypass failed because no prefilter could be built"
+ );
+ return None;
+ }
+ };
+ let strat: Arc<dyn Strategy> = match choice {
+ prefilter::Choice::Memchr(pre) => Pre::new(pre),
+ prefilter::Choice::Memchr2(pre) => Pre::new(pre),
+ prefilter::Choice::Memchr3(pre) => Pre::new(pre),
+ prefilter::Choice::Memmem(pre) => Pre::new(pre),
+ prefilter::Choice::Teddy(pre) => Pre::new(pre),
+ prefilter::Choice::ByteSet(pre) => Pre::new(pre),
+ prefilter::Choice::AhoCorasick(pre) => Pre::new(pre),
+ };
+ Some(strat)
+ }
+
+ /// Attempts to extract an alternation of literals, and if it's deemed
+ /// worth doing, returns an Aho-Corasick prefilter as a strategy.
+ ///
+ /// And currently, this only returns something when 'hirs.len() == 1'. This
+ /// could in theory do something if there are multiple HIRs where all of
+ /// them are alternation of literals, but I haven't had the time to go down
+ /// that path yet.
+ fn from_alternation_literals(
+ info: &RegexInfo,
+ hirs: &[&Hir],
+ ) -> Option<Arc<dyn Strategy>> {
+ use crate::util::prefilter::AhoCorasick;
+
+ let lits = crate::meta::literal::alternation_literals(info, hirs)?;
+ let ac = AhoCorasick::new(MatchKind::LeftmostFirst, &lits)?;
+ Some(Pre::new(ac))
+ }
+}
+
+// This implements Strategy for anything that implements PrefilterI.
+//
+// Note that this must only be used for regexes of length 1. Multi-regexes
+// don't work here. The prefilter interface only provides the span of a match
+// and not the pattern ID. (I did consider making it more expressive, but I
+// couldn't figure out how to tie everything together elegantly.) Thus, so long
+// as the regex only contains one pattern, we can simply assume that a match
+// corresponds to PatternID::ZERO. And indeed, that's what we do here.
+//
+// In practice, since this impl is used to report matches directly and thus
+// completely bypasses the regex engine, we only wind up using this under the
+// following restrictions:
+//
+// * There must be only one pattern. As explained above.
+// * The literal sequence must be finite and only contain exact literals.
+// * There must not be any look-around assertions. If there are, the literals
+// extracted might be exact, but a match doesn't necessarily imply an overall
+// match. As a trivial example, 'foo\bbar' does not match 'foobar'.
+// * The pattern must not have any explicit capturing groups. If it does, the
+// caller might expect them to be resolved. e.g., 'foo(bar)'.
+//
+// So when all of those things are true, we use a prefilter directly as a
+// strategy.
+//
+// In the case where the number of patterns is more than 1, we don't use this
+// but do use a special Aho-Corasick strategy if all of the regexes are just
+// simple literals or alternations of literals. (We also use the Aho-Corasick
+// strategy when len(patterns)==1 if the number of literals is large. In that
+// case, literal extraction gives up and will return an infinite set.)
+impl<P: PrefilterI> Strategy for Pre<P> {
+ fn group_info(&self) -> &GroupInfo {
+ &self.group_info
+ }
+
+ fn create_cache(&self) -> Cache {
+ Cache {
+ capmatches: Captures::all(self.group_info().clone()),
+ pikevm: wrappers::PikeVMCache::none(),
+ backtrack: wrappers::BoundedBacktrackerCache::none(),
+ onepass: wrappers::OnePassCache::none(),
+ hybrid: wrappers::HybridCache::none(),
+ revhybrid: wrappers::ReverseHybridCache::none(),
+ }
+ }
+
+ fn reset_cache(&self, _cache: &mut Cache) {}
+
+ fn is_accelerated(&self) -> bool {
+ self.pre.is_fast()
+ }
+
+ fn memory_usage(&self) -> usize {
+ self.pre.memory_usage()
+ }
+
+ fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
+ if input.is_done() {
+ return None;
+ }
+ if input.get_anchored().is_anchored() {
+ return self
+ .pre
+ .prefix(input.haystack(), input.get_span())
+ .map(|sp| Match::new(PatternID::ZERO, sp));
+ }
+ self.pre
+ .find(input.haystack(), input.get_span())
+ .map(|sp| Match::new(PatternID::ZERO, sp))
+ }
+
+ fn search_half(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end()))
+ }
+
+ fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
+ self.search(cache, input).is_some()
+ }
+
+ fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
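+        // This strategy only supports a single pattern with no explicit
+        // capture groups, so the only slots that can be filled are the two
+        // implicit ones for the overall match.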
+ let m = self.search(cache, input)?;
+ if let Some(slot) = slots.get_mut(0) {
+ *slot = NonMaxUsize::new(m.start());
+ }
+ if let Some(slot) = slots.get_mut(1) {
+ *slot = NonMaxUsize::new(m.end());
+ }
+ Some(m.pattern())
+ }
+
+ fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ if self.search(cache, input).is_some() {
+ patset.insert(PatternID::ZERO);
+ }
+ }
+}
+
+#[derive(Debug)]
+struct Core {
+ info: RegexInfo,
+ pre: Option<Prefilter>,
+ nfa: NFA,
+ nfarev: Option<NFA>,
+ pikevm: wrappers::PikeVM,
+ backtrack: wrappers::BoundedBacktracker,
+ onepass: wrappers::OnePass,
+ hybrid: wrappers::Hybrid,
+ dfa: wrappers::DFA,
+}
+
+impl Core {
+ fn new(
+ info: RegexInfo,
+ pre: Option<Prefilter>,
+ hirs: &[&Hir],
+ ) -> Result<Core, BuildError> {
+ let mut lookm = LookMatcher::new();
+ lookm.set_line_terminator(info.config().get_line_terminator());
+ let thompson_config = thompson::Config::new()
+ .utf8(info.config().get_utf8_empty())
+ .nfa_size_limit(info.config().get_nfa_size_limit())
+ .shrink(false)
+ .which_captures(info.config().get_which_captures())
+ .look_matcher(lookm);
+ let nfa = thompson::Compiler::new()
+ .configure(thompson_config.clone())
+ .build_many_from_hir(hirs)
+ .map_err(BuildError::nfa)?;
+ // It's possible for the PikeVM or the BB to fail to build, even though
+ // at this point, we already have a full NFA in hand. They can fail
+ // when a Unicode word boundary is used but where Unicode word boundary
+ // support is disabled at compile time, thus making it impossible to
+ // match. (Construction can also fail if the NFA was compiled without
+ // captures, but we always enable that above.)
+ let pikevm = wrappers::PikeVM::new(&info, pre.clone(), &nfa)?;
+ let backtrack =
+ wrappers::BoundedBacktracker::new(&info, pre.clone(), &nfa)?;
+ // The onepass engine can of course fail to build, but we expect it to
+ // fail in many cases because it is an optimization that doesn't apply
+ // to all regexes. The 'OnePass' wrapper encapsulates this failure (and
+ // logs a message if it occurs).
+ let onepass = wrappers::OnePass::new(&info, &nfa);
+ // We try to encapsulate whether a particular regex engine should be
+ // used within each respective wrapper, but the DFAs need a reverse NFA
+        // to build themselves, and we really do not want to build a reverse NFA
+        // if
+ // we know we aren't going to use the lazy DFA. So we do a config check
+ // up front, which is in practice the only way we won't try to use the
+ // DFA.
+ let (nfarev, hybrid, dfa) =
+ if !info.config().get_hybrid() && !info.config().get_dfa() {
+ (None, wrappers::Hybrid::none(), wrappers::DFA::none())
+ } else {
+ // FIXME: Technically, we don't quite yet KNOW that we need
+ // a reverse NFA. It's possible for the DFAs below to both
+ // fail to build just based on the forward NFA. In which case,
+ // building the reverse NFA was totally wasted work. But...
+ // fixing this requires breaking DFA construction apart into
+ // two pieces: one for the forward part and another for the
+ // reverse part. Quite annoying. Making it worse, when building
+ // both DFAs fails, it's quite likely that the NFA is large and
+ // that it will take quite some time to build the reverse NFA
+ // too. So... it's really probably worth it to do this!
+ let nfarev = thompson::Compiler::new()
+ // Currently, reverse NFAs don't support capturing groups,
+ // so we MUST disable them. But even if we didn't have to,
+ // we would, because nothing in this crate does anything
+ // useful with capturing groups in reverse. And of course,
+ // the lazy DFA ignores capturing groups in all cases.
+ .configure(
+ thompson_config
+ .clone()
+ .which_captures(WhichCaptures::None)
+ .reverse(true),
+ )
+ .build_many_from_hir(hirs)
+ .map_err(BuildError::nfa)?;
+ let dfa = if !info.config().get_dfa() {
+ wrappers::DFA::none()
+ } else {
+ wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev)
+ };
+ let hybrid = if !info.config().get_hybrid() {
+ wrappers::Hybrid::none()
+ } else if dfa.is_some() {
+ debug!("skipping lazy DFA because we have a full DFA");
+ wrappers::Hybrid::none()
+ } else {
+ wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev)
+ };
+ (Some(nfarev), hybrid, dfa)
+ };
+ Ok(Core {
+ info,
+ pre,
+ nfa,
+ nfarev,
+ pikevm,
+ backtrack,
+ onepass,
+ hybrid,
+ dfa,
+ })
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_mayfail(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<Result<Option<Match>, RetryFailError>> {
+ if let Some(e) = self.dfa.get(input) {
+ trace!("using full DFA for search at {:?}", input.get_span());
+ Some(e.try_search(input))
+ } else if let Some(e) = self.hybrid.get(input) {
+ trace!("using lazy DFA for search at {:?}", input.get_span());
+ Some(e.try_search(&mut cache.hybrid, input))
+ } else {
+ None
+ }
+ }
+
+ fn search_nofail(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<Match> {
+ let caps = &mut cache.capmatches;
+ caps.set_pattern(None);
+        // We manually inline 'search_slots_nofail' here because we need to
+        // borrow from 'cache.capmatches' in this method, but if we do, then
+        // we can't pass 'cache' wholesale to 'search_slots_nofail'. It's a
+ // classic example of how the borrow checker inhibits decomposition.
+ // There are of course work-arounds (more types and/or interior
+ // mutability), but that's more annoying than this IMO.
+ let pid = if let Some(ref e) = self.onepass.get(input) {
+ trace!("using OnePass for search at {:?}", input.get_span());
+ e.search_slots(&mut cache.onepass, input, caps.slots_mut())
+ } else if let Some(ref e) = self.backtrack.get(input) {
+ trace!(
+ "using BoundedBacktracker for search at {:?}",
+ input.get_span()
+ );
+ e.search_slots(&mut cache.backtrack, input, caps.slots_mut())
+ } else {
+ trace!("using PikeVM for search at {:?}", input.get_span());
+ let e = self.pikevm.get();
+ e.search_slots(&mut cache.pikevm, input, caps.slots_mut())
+ };
+ caps.set_pattern(pid);
+ caps.get_match()
+ }
+
+ fn search_half_nofail(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ // Only the lazy/full DFA returns half-matches, since the DFA requires
+ // a reverse scan to find the start position. These fallback regex
+ // engines can find the start and end in a single pass, so we just do
+ // that and throw away the start offset to conform to the API.
+ let m = self.search_nofail(cache, input)?;
+ Some(HalfMatch::new(m.pattern(), m.end()))
+ }
+
+ fn search_slots_nofail(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ if let Some(ref e) = self.onepass.get(input) {
+ trace!(
+ "using OnePass for capture search at {:?}",
+ input.get_span()
+ );
+ e.search_slots(&mut cache.onepass, input, slots)
+ } else if let Some(ref e) = self.backtrack.get(input) {
+ trace!(
+ "using BoundedBacktracker for capture search at {:?}",
+ input.get_span()
+ );
+ e.search_slots(&mut cache.backtrack, input, slots)
+ } else {
+ trace!(
+ "using PikeVM for capture search at {:?}",
+ input.get_span()
+ );
+ let e = self.pikevm.get();
+ e.search_slots(&mut cache.pikevm, input, slots)
+ }
+ }
+
+ fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
+ if let Some(ref e) = self.onepass.get(input) {
+ trace!(
+ "using OnePass for is-match search at {:?}",
+ input.get_span()
+ );
+ e.search_slots(&mut cache.onepass, input, &mut []).is_some()
+ } else if let Some(ref e) = self.backtrack.get(input) {
+ trace!(
+ "using BoundedBacktracker for is-match search at {:?}",
+ input.get_span()
+ );
+ e.is_match(&mut cache.backtrack, input)
+ } else {
+ trace!(
+ "using PikeVM for is-match search at {:?}",
+ input.get_span()
+ );
+ let e = self.pikevm.get();
+ e.is_match(&mut cache.pikevm, input)
+ }
+ }
+
+ fn is_capture_search_needed(&self, slots_len: usize) -> bool {
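+        // Every pattern gets two implicit slots (the start and end of the
+        // overall match), so asking for anything beyond the implicit slot
+        // length implies the caller wants explicit capture group offsets.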
+ slots_len > self.nfa.group_info().implicit_slot_len()
+ }
+}
+
+impl Strategy for Core {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn group_info(&self) -> &GroupInfo {
+ self.nfa.group_info()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn create_cache(&self) -> Cache {
+ Cache {
+ capmatches: Captures::all(self.group_info().clone()),
+ pikevm: self.pikevm.create_cache(),
+ backtrack: self.backtrack.create_cache(),
+ onepass: self.onepass.create_cache(),
+ hybrid: self.hybrid.create_cache(),
+ revhybrid: wrappers::ReverseHybridCache::none(),
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn reset_cache(&self, cache: &mut Cache) {
+ cache.pikevm.reset(&self.pikevm);
+ cache.backtrack.reset(&self.backtrack);
+ cache.onepass.reset(&self.onepass);
+ cache.hybrid.reset(&self.hybrid);
+ }
+
+ fn is_accelerated(&self) -> bool {
+ self.pre.as_ref().map_or(false, |pre| pre.is_fast())
+ }
+
+ fn memory_usage(&self) -> usize {
+ self.info.memory_usage()
+ + self.pre.as_ref().map_or(0, |pre| pre.memory_usage())
+ + self.nfa.memory_usage()
+ + self.nfarev.as_ref().map_or(0, |nfa| nfa.memory_usage())
+ + self.onepass.memory_usage()
+ + self.dfa.memory_usage()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
+ // We manually inline try_search_mayfail here because letting the
+ // compiler do it seems to produce pretty crappy codegen.
+ return if let Some(e) = self.dfa.get(input) {
+ trace!("using full DFA for full search at {:?}", input.get_span());
+ match e.try_search(input) {
+ Ok(x) => x,
+ Err(_err) => {
+ trace!("full DFA search failed: {}", _err);
+ self.search_nofail(cache, input)
+ }
+ }
+ } else if let Some(e) = self.hybrid.get(input) {
+ trace!("using lazy DFA for full search at {:?}", input.get_span());
+ match e.try_search(&mut cache.hybrid, input) {
+ Ok(x) => x,
+ Err(_err) => {
+ trace!("lazy DFA search failed: {}", _err);
+ self.search_nofail(cache, input)
+ }
+ }
+ } else {
+ self.search_nofail(cache, input)
+ };
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_half(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ // The main difference with 'search' is that if we're using a DFA, we
+ // can use a single forward scan without needing to run the reverse
+ // DFA.
+ if let Some(e) = self.dfa.get(input) {
+ trace!("using full DFA for half search at {:?}", input.get_span());
+ match e.try_search_half_fwd(input) {
+ Ok(x) => x,
+ Err(_err) => {
+ trace!("full DFA half search failed: {}", _err);
+ self.search_half_nofail(cache, input)
+ }
+ }
+ } else if let Some(e) = self.hybrid.get(input) {
+ trace!("using lazy DFA for half search at {:?}", input.get_span());
+ match e.try_search_half_fwd(&mut cache.hybrid, input) {
+ Ok(x) => x,
+ Err(_err) => {
+ trace!("lazy DFA half search failed: {}", _err);
+ self.search_half_nofail(cache, input)
+ }
+ }
+ } else {
+ self.search_half_nofail(cache, input)
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
+ if let Some(e) = self.dfa.get(input) {
+ trace!(
+ "using full DFA for is-match search at {:?}",
+ input.get_span()
+ );
+ match e.try_search_half_fwd(input) {
+ Ok(x) => x.is_some(),
+ Err(_err) => {
+ trace!("full DFA half search failed: {}", _err);
+ self.is_match_nofail(cache, input)
+ }
+ }
+ } else if let Some(e) = self.hybrid.get(input) {
+ trace!(
+ "using lazy DFA for is-match search at {:?}",
+ input.get_span()
+ );
+ match e.try_search_half_fwd(&mut cache.hybrid, input) {
+ Ok(x) => x.is_some(),
+ Err(_err) => {
+ trace!("lazy DFA half search failed: {}", _err);
+ self.is_match_nofail(cache, input)
+ }
+ }
+ } else {
+ self.is_match_nofail(cache, input)
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ // Even if the regex has explicit capture groups, if the caller didn't
+ // provide any explicit slots, then it doesn't make sense to try and do
+ // extra work to get offsets for those slots. Ideally the caller should
+ // realize this and not call this routine in the first place, but alas,
+ // we try to save the caller from themselves if they do.
+ if !self.is_capture_search_needed(slots.len()) {
+ trace!("asked for slots unnecessarily, trying fast path");
+ let m = self.search(cache, input)?;
+ copy_match_to_slots(m, slots);
+ return Some(m.pattern());
+ }
+ // If the onepass DFA is available for this search (which only happens
+ // when it's anchored), then skip running a fallible DFA. The onepass
+ // DFA isn't as fast as a full or lazy DFA, but it is typically quite
+ // a bit faster than the backtracker or the PikeVM. So it isn't as
+ // advantageous to try and do a full/lazy DFA scan first.
+ //
+ // We still theorize that it's better to do a full/lazy DFA scan, even
+ // when it's anchored, because it's usually much faster and permits us
+ // to say "no match" much more quickly. This does hurt the case of,
+ // say, parsing each line in a log file into capture groups, because
+ // in that case, the line always matches. So the lazy DFA scan is
+ // usually just wasted work. But, the lazy DFA is usually quite fast
+ // and doesn't cost too much here.
+ if self.onepass.get(&input).is_some() {
+ return self.search_slots_nofail(cache, &input, slots);
+ }
+ let m = match self.try_search_mayfail(cache, input) {
+ Some(Ok(Some(m))) => m,
+ Some(Ok(None)) => return None,
+ Some(Err(_err)) => {
+ trace!("fast capture search failed: {}", _err);
+ return self.search_slots_nofail(cache, input, slots);
+ }
+ None => {
+ return self.search_slots_nofail(cache, input, slots);
+ }
+ };
+ // At this point, now that we've found the bounds of the
+ // match, we need to re-run something that can resolve
+        // capturing groups. But we only need to run it on the
+ // match bounds and not the entire haystack.
+ trace!(
+ "match found at {}..{} in capture search, \
+ using another engine to find captures",
+ m.start(),
+ m.end(),
+ );
+ let input = input
+ .clone()
+ .span(m.start()..m.end())
+ .anchored(Anchored::Pattern(m.pattern()));
+ Some(
+ self.search_slots_nofail(cache, &input, slots)
+ .expect("should find a match"),
+ )
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ if let Some(e) = self.dfa.get(input) {
+ trace!(
+ "using full DFA for overlapping search at {:?}",
+ input.get_span()
+ );
+ let _err = match e.try_which_overlapping_matches(input, patset) {
+ Ok(()) => return,
+ Err(err) => err,
+ };
+ trace!("fast overlapping search failed: {}", _err);
+ } else if let Some(e) = self.hybrid.get(input) {
+ trace!(
+ "using lazy DFA for overlapping search at {:?}",
+ input.get_span()
+ );
+ let _err = match e.try_which_overlapping_matches(
+ &mut cache.hybrid,
+ input,
+ patset,
+ ) {
+ Ok(()) => {
+ return;
+ }
+ Err(err) => err,
+ };
+ trace!("fast overlapping search failed: {}", _err);
+ }
+ trace!(
+ "using PikeVM for overlapping search at {:?}",
+ input.get_span()
+ );
+ let e = self.pikevm.get();
+ e.which_overlapping_matches(&mut cache.pikevm, input, patset)
+ }
+}
+
+#[derive(Debug)]
+struct ReverseAnchored {
+ core: Core,
+}
+
+impl ReverseAnchored {
+ fn new(core: Core) -> Result<ReverseAnchored, Core> {
+ if !core.info.is_always_anchored_end() {
+ debug!(
+ "skipping reverse anchored optimization because \
+ the regex is not always anchored at the end"
+ );
+ return Err(core);
+ }
+ // Note that the caller can still request an anchored search even when
+ // the regex isn't anchored at the start. We detect that case in the
+ // search routines below and just fallback to the core engine. This
+ // is fine because both searches are anchored. It's just a matter of
+ // picking one. Falling back to the core engine is a little simpler,
+ // since if we used the reverse anchored approach, we'd have to add an
+ // extra check to ensure the match reported starts at the place where
+ // the caller requested the search to start.
+ if core.info.is_always_anchored_start() {
+ debug!(
+ "skipping reverse anchored optimization because \
+ the regex is also anchored at the start"
+ );
+ return Err(core);
+ }
+ // Only DFAs can do reverse searches (currently), so we need one of
+ // them in order to do this optimization. It's possible (although
+ // pretty unlikely) that we have neither and need to give up.
+ if !core.hybrid.is_some() && !core.dfa.is_some() {
+ debug!(
+ "skipping reverse anchored optimization because \
+ we don't have a lazy DFA or a full DFA"
+ );
+ return Err(core);
+ }
+ Ok(ReverseAnchored { core })
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_half_anchored_rev(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Result<Option<HalfMatch>, RetryFailError> {
+ // We of course always want an anchored search. In theory, the
+ // underlying regex engines should automatically enable anchored
+ // searches since the regex is itself anchored, but this more clearly
+ // expresses intent and is always correct.
+ let input = input.clone().anchored(Anchored::Yes);
+ if let Some(e) = self.core.dfa.get(&input) {
+ trace!(
+ "using full DFA for reverse anchored search at {:?}",
+ input.get_span()
+ );
+ e.try_search_half_rev(&input)
+ } else if let Some(e) = self.core.hybrid.get(&input) {
+ trace!(
+ "using lazy DFA for reverse anchored search at {:?}",
+ input.get_span()
+ );
+ e.try_search_half_rev(&mut cache.hybrid, &input)
+ } else {
+ unreachable!("ReverseAnchored always has a DFA")
+ }
+ }
+}
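+
+// Editor's sketch (illustrative test, not part of upstream): a regex that is
+// always anchored at the end but not the start is eligible for the
+// 'ReverseAnchored' strategy above. Observable results are the same either
+// way; this just pins down the expected span for such a regex.
+#[cfg(test)]
+mod reverse_anchored_example {
+    #[test]
+    fn end_anchored_match_runs_to_haystack_end() {
+        // '[a-z]+ing$' is anchored at the end only. The leftmost match
+        // covers all of "tingling", ending at the haystack's end.
+        let re = crate::meta::Regex::new("[a-z]+ing$").unwrap();
+        let m = re.find("tingling").unwrap();
+        assert_eq!((m.start(), m.end()), (0, 8));
+    }
+}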
+
+// Note that in this impl, we don't check that 'input.end() ==
+// input.haystack().len()'. In particular, when that condition is false, a
+// match is always impossible because we know that the regex is always anchored
+// at the end (or else 'ReverseAnchored' won't be built). We don't check that
+// here because the 'Regex' wrapper actually does that for us in all cases.
+// Thus, in this impl, we can actually assume that the end position in 'input'
+// is equivalent to the length of the haystack.
+impl Strategy for ReverseAnchored {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn group_info(&self) -> &GroupInfo {
+ self.core.group_info()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn create_cache(&self) -> Cache {
+ self.core.create_cache()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn reset_cache(&self, cache: &mut Cache) {
+ self.core.reset_cache(cache);
+ }
+
+ fn is_accelerated(&self) -> bool {
+        // Since the regex is anchored at the end, a reverse anchored search
+        // is almost certain to be much faster than a standard forward
+        // search.
+ true
+ }
+
+ fn memory_usage(&self) -> usize {
+ self.core.memory_usage()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search(cache, input);
+ }
+ match self.try_search_half_anchored_rev(cache, input) {
+ Err(_err) => {
+ trace!("fast reverse anchored search failed: {}", _err);
+ self.core.search_nofail(cache, input)
+ }
+ Ok(None) => None,
+ Ok(Some(hm)) => {
+ Some(Match::new(hm.pattern(), hm.offset()..input.end()))
+ }
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_half(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search_half(cache, input);
+ }
+ match self.try_search_half_anchored_rev(cache, input) {
+ Err(_err) => {
+ trace!("fast reverse anchored search failed: {}", _err);
+ self.core.search_half_nofail(cache, input)
+ }
+ Ok(None) => None,
+ Ok(Some(hm)) => {
+                // Careful here! 'search_half' has *forward* semantics: it
+                // only reports the *end* position of a match. But the
+                // reverse scan's 'hm.offset()' is the *start* of the match.
+                // So we throw that away and, since we know we have a match
+                // and the regex is anchored at the end, return the only
+                // possible end position: input.end(). For example, 'ing$'
+                // against "singing" matches at 4..7; the reverse scan
+                // reports 4, but the half match we want is the end, 7.
+ Some(HalfMatch::new(hm.pattern(), input.end()))
+ }
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
+ if input.get_anchored().is_anchored() {
+ return self.core.is_match(cache, input);
+ }
+ match self.try_search_half_anchored_rev(cache, input) {
+ Err(_err) => {
+ trace!("fast reverse anchored search failed: {}", _err);
+ self.core.is_match_nofail(cache, input)
+ }
+ Ok(None) => false,
+ Ok(Some(_)) => true,
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search_slots(cache, input, slots);
+ }
+ match self.try_search_half_anchored_rev(cache, input) {
+ Err(_err) => {
+ trace!("fast reverse anchored search failed: {}", _err);
+ self.core.search_slots_nofail(cache, input, slots)
+ }
+ Ok(None) => None,
+ Ok(Some(hm)) => {
+ if !self.core.is_capture_search_needed(slots.len()) {
+ trace!("asked for slots unnecessarily, skipping captures");
+ let m = Match::new(hm.pattern(), hm.offset()..input.end());
+ copy_match_to_slots(m, slots);
+ return Some(m.pattern());
+ }
+ let start = hm.offset();
+ let input = input
+ .clone()
+ .span(start..input.end())
+ .anchored(Anchored::Pattern(hm.pattern()));
+ self.core.search_slots_nofail(cache, &input, slots)
+ }
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+        // It seems like this could probably benefit from a reverse anchored
+        // optimization, perhaps by doing an overlapping reverse search (which
+        // the DFAs do support). I haven't given it much thought though, and
+        // I'm currently focused more on the single pattern case.
+ self.core.which_overlapping_matches(cache, input, patset)
+ }
+}
+
+#[derive(Debug)]
+struct ReverseSuffix {
+ core: Core,
+ pre: Prefilter,
+}
+
+impl ReverseSuffix {
+ fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseSuffix, Core> {
+ if !core.info.config().get_auto_prefilter() {
+ debug!(
+ "skipping reverse suffix optimization because \
+ automatic prefilters are disabled"
+ );
+ return Err(core);
+ }
+        // Like the reverse inner optimization, we don't do this for regexes
+        // that are always anchored at the start. A literal scan could wind
+        // up scanning too much, although when the initial literal scan finds
+        // nothing, it can say "no match" much more quickly than running the
+        // regex engine. With that said, the reverse suffix optimization has
+        // lower overhead than reverse inner, since it only requires a
+        // reverse scan after a literal match to confirm or reject the match.
+        // (Although, in the case of confirmation, it then needs to do
+        // another forward scan to find the end position.)
+ //
+ // Note that the caller can still request an anchored search even
+ // when the regex isn't anchored. We detect that case in the search
+ // routines below and just fallback to the core engine. Currently this
+ // optimization assumes all searches are unanchored, so if we do want
+ // to enable this optimization for anchored searches, it will need a
+ // little work to support it.
+ if core.info.is_always_anchored_start() {
+ debug!(
+ "skipping reverse suffix optimization because \
+ the regex is always anchored at the start",
+ );
+ return Err(core);
+ }
+ // Only DFAs can do reverse searches (currently), so we need one of
+ // them in order to do this optimization. It's possible (although
+ // pretty unlikely) that we have neither and need to give up.
+ if !core.hybrid.is_some() && !core.dfa.is_some() {
+ debug!(
+ "skipping reverse suffix optimization because \
+ we don't have a lazy DFA or a full DFA"
+ );
+ return Err(core);
+ }
+ if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
+ debug!(
+ "skipping reverse suffix optimization because \
+ we already have a prefilter that we think is fast"
+ );
+ return Err(core);
+ }
+ let kind = core.info.config().get_match_kind();
+ let suffixes = crate::util::prefilter::suffixes(kind, hirs);
+ let lcs = match suffixes.longest_common_suffix() {
+ None => {
+ debug!(
+ "skipping reverse suffix optimization because \
+ a longest common suffix could not be found",
+ );
+ return Err(core);
+ }
+ Some(lcs) if lcs.is_empty() => {
+ debug!(
+ "skipping reverse suffix optimization because \
+ the longest common suffix is the empty string",
+ );
+ return Err(core);
+ }
+ Some(lcs) => lcs,
+ };
+ let pre = match Prefilter::new(kind, &[lcs]) {
+ Some(pre) => pre,
+ None => {
+ debug!(
+ "skipping reverse suffix optimization because \
+ a prefilter could not be constructed from the \
+ longest common suffix",
+ );
+ return Err(core);
+ }
+ };
+ if !pre.is_fast() {
+ debug!(
+ "skipping reverse suffix optimization because \
+ while we have a suffix prefilter, it is not \
+ believed to be 'fast'"
+ );
+ return Err(core);
+ }
+ Ok(ReverseSuffix { core, pre })
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_half_start(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Result<Option<HalfMatch>, RetryError> {
+ let mut span = input.get_span();
+ let mut min_start = 0;
+ loop {
+ let litmatch = match self.pre.find(input.haystack(), span) {
+ None => return Ok(None),
+ Some(span) => span,
+ };
+ trace!("reverse suffix scan found suffix match at {:?}", litmatch);
+ let revinput = input
+ .clone()
+ .anchored(Anchored::Yes)
+ .span(input.start()..litmatch.end);
+ match self
+ .try_search_half_rev_limited(cache, &revinput, min_start)?
+ {
+ None => {
+ if span.start >= span.end {
+ break;
+ }
+ span.start = litmatch.start.checked_add(1).unwrap();
+ }
+ Some(hm) => return Ok(Some(hm)),
+ }
+ min_start = litmatch.end;
+ }
+ Ok(None)
+ }
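+
+    // Editor's note (worked example, not upstream): consider /[a-z]+ing/
+    // against "tingling" with the suffix prefilter "ing". The first
+    // prefilter hit is 1..4, so the anchored reverse scan runs over 0..4
+    // and reports a match start of 0, which callers then extend to the true
+    // end (8) with a forward scan. Had the reverse scan failed instead, the
+    // loop would resume the literal scan at offset 2 ('litmatch.start + 1')
+    // with 'min_start' raised to 4 so later reverse scans can't re-walk the
+    // same bytes and go quadratic.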
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_half_fwd(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Result<Option<HalfMatch>, RetryFailError> {
+ if let Some(e) = self.core.dfa.get(&input) {
+ trace!(
+ "using full DFA for forward reverse suffix search at {:?}",
+ input.get_span()
+ );
+ e.try_search_half_fwd(&input)
+ } else if let Some(e) = self.core.hybrid.get(&input) {
+ trace!(
+ "using lazy DFA for forward reverse suffix search at {:?}",
+ input.get_span()
+ );
+ e.try_search_half_fwd(&mut cache.hybrid, &input)
+ } else {
+ unreachable!("ReverseSuffix always has a DFA")
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_half_rev_limited(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ min_start: usize,
+ ) -> Result<Option<HalfMatch>, RetryError> {
+ if let Some(e) = self.core.dfa.get(&input) {
+ trace!(
+ "using full DFA for reverse suffix search at {:?}, \
+ but will be stopped at {} to avoid quadratic behavior",
+ input.get_span(),
+ min_start,
+ );
+ e.try_search_half_rev_limited(&input, min_start)
+ } else if let Some(e) = self.core.hybrid.get(&input) {
+ trace!(
+ "using lazy DFA for reverse inner search at {:?}, \
+ but will be stopped at {} to avoid quadratic behavior",
+ input.get_span(),
+ min_start,
+ );
+ e.try_search_half_rev_limited(&mut cache.hybrid, &input, min_start)
+ } else {
+ unreachable!("ReverseSuffix always has a DFA")
+ }
+ }
+}
+
+impl Strategy for ReverseSuffix {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn group_info(&self) -> &GroupInfo {
+ self.core.group_info()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn create_cache(&self) -> Cache {
+ self.core.create_cache()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn reset_cache(&self, cache: &mut Cache) {
+ self.core.reset_cache(cache);
+ }
+
+ fn is_accelerated(&self) -> bool {
+ self.pre.is_fast()
+ }
+
+ fn memory_usage(&self) -> usize {
+ self.core.memory_usage() + self.pre.memory_usage()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search(cache, input);
+ }
+ match self.try_search_half_start(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse suffix optimization failed: {}", _err);
+ self.core.search(cache, input)
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!("reverse suffix reverse fast search failed: {}", _err);
+ self.core.search_nofail(cache, input)
+ }
+ Ok(None) => None,
+ Ok(Some(hm_start)) => {
+ let fwdinput = input
+ .clone()
+ .anchored(Anchored::Pattern(hm_start.pattern()))
+ .span(hm_start.offset()..input.end());
+ match self.try_search_half_fwd(cache, &fwdinput) {
+ Err(_err) => {
+ trace!(
+ "reverse suffix forward fast search failed: {}",
+ _err
+ );
+ self.core.search_nofail(cache, input)
+ }
+ Ok(None) => {
+ unreachable!(
+ "suffix match plus reverse match implies \
+ there must be a match",
+ )
+ }
+ Ok(Some(hm_end)) => Some(Match::new(
+ hm_start.pattern(),
+ hm_start.offset()..hm_end.offset(),
+ )),
+ }
+ }
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_half(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search_half(cache, input);
+ }
+ match self.try_search_half_start(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse suffix half optimization failed: {}", _err);
+ self.core.search_half(cache, input)
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!(
+ "reverse suffix reverse fast half search failed: {}",
+ _err
+ );
+ self.core.search_half_nofail(cache, input)
+ }
+ Ok(None) => None,
+ Ok(Some(hm_start)) => {
+ // This is a bit subtle. It is tempting to just stop searching
+ // at this point and return a half-match with an offset
+ // corresponding to where the suffix was found. But the suffix
+ // match does not necessarily correspond to the end of the
+ // proper leftmost-first match. Consider /[a-z]+ing/ against
+ // 'tingling'. The first suffix match is the first 'ing', and
+ // the /[a-z]+/ matches the 't'. So if we stopped here, then
+ // we'd report 'ting' as the match. But 'tingling' is the
+ // correct match because of greediness.
+ let fwdinput = input
+ .clone()
+ .anchored(Anchored::Pattern(hm_start.pattern()))
+ .span(hm_start.offset()..input.end());
+ match self.try_search_half_fwd(cache, &fwdinput) {
+ Err(_err) => {
+ trace!(
+ "reverse suffix forward fast search failed: {}",
+ _err
+ );
+ self.core.search_half_nofail(cache, input)
+ }
+ Ok(None) => {
+ unreachable!(
+ "suffix match plus reverse match implies \
+ there must be a match",
+ )
+ }
+ Ok(Some(hm_end)) => Some(hm_end),
+ }
+ }
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
+ if input.get_anchored().is_anchored() {
+ return self.core.is_match(cache, input);
+ }
+ match self.try_search_half_start(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse suffix half optimization failed: {}", _err);
+ self.core.is_match_nofail(cache, input)
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!(
+ "reverse suffix reverse fast half search failed: {}",
+ _err
+ );
+ self.core.is_match_nofail(cache, input)
+ }
+ Ok(None) => false,
+ Ok(Some(_)) => true,
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search_slots(cache, input, slots);
+ }
+ if !self.core.is_capture_search_needed(slots.len()) {
+ trace!("asked for slots unnecessarily, trying fast path");
+ let m = self.search(cache, input)?;
+ copy_match_to_slots(m, slots);
+ return Some(m.pattern());
+ }
+ let hm_start = match self.try_search_half_start(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!(
+ "reverse suffix captures optimization failed: {}",
+ _err
+ );
+ return self.core.search_slots(cache, input, slots);
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!(
+ "reverse suffix reverse fast captures search failed: {}",
+ _err
+ );
+ return self.core.search_slots_nofail(cache, input, slots);
+ }
+ Ok(None) => return None,
+ Ok(Some(hm_start)) => hm_start,
+ };
+ trace!(
+ "match found at {}..{} in capture search, \
+ using another engine to find captures",
+ hm_start.offset(),
+ input.end(),
+ );
+ let start = hm_start.offset();
+ let input = input
+ .clone()
+ .span(start..input.end())
+ .anchored(Anchored::Pattern(hm_start.pattern()));
+ self.core.search_slots_nofail(cache, &input, slots)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ self.core.which_overlapping_matches(cache, input, patset)
+ }
+}
+
+#[derive(Debug)]
+struct ReverseInner {
+ core: Core,
+ preinner: Prefilter,
+ nfarev: NFA,
+ hybrid: wrappers::ReverseHybrid,
+ dfa: wrappers::ReverseDFA,
+}
+
+impl ReverseInner {
+ fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseInner, Core> {
+ if !core.info.config().get_auto_prefilter() {
+ debug!(
+ "skipping reverse inner optimization because \
+ automatic prefilters are disabled"
+ );
+ return Err(core);
+ }
+ // Currently we hard-code the assumption of leftmost-first match
+ // semantics. This isn't a huge deal because 'all' semantics tend to
+ // only be used for forward overlapping searches with multiple regexes,
+ // and this optimization only supports a single pattern at the moment.
+ if core.info.config().get_match_kind() != MatchKind::LeftmostFirst {
+ debug!(
+ "skipping reverse inner optimization because \
+ match kind is {:?} but this only supports leftmost-first",
+ core.info.config().get_match_kind(),
+ );
+ return Err(core);
+ }
+        // It's likely that a reverse inner scan has too much overhead to be
+        // worth it when the regex is anchored at the start. It can be quite
+        // a bit faster when the initial literal scan fails to find a match,
+        // since that lets us say "no match" very quickly. But it can also be
+        // undesirable, e.g., by scanning too far, or when the literal scan
+        // does match: confirming that match requires a reverse scan followed
+        // by a forward scan, which is a fair bit of work.
+ //
+ // Note that the caller can still request an anchored search even
+ // when the regex isn't anchored. We detect that case in the search
+ // routines below and just fallback to the core engine. Currently this
+ // optimization assumes all searches are unanchored, so if we do want
+ // to enable this optimization for anchored searches, it will need a
+ // little work to support it.
+ if core.info.is_always_anchored_start() {
+ debug!(
+ "skipping reverse inner optimization because \
+ the regex is always anchored at the start",
+ );
+ return Err(core);
+ }
+ // Only DFAs can do reverse searches (currently), so we need one of
+ // them in order to do this optimization. It's possible (although
+ // pretty unlikely) that we have neither and need to give up.
+ if !core.hybrid.is_some() && !core.dfa.is_some() {
+ debug!(
+ "skipping reverse inner optimization because \
+ we don't have a lazy DFA or a full DFA"
+ );
+ return Err(core);
+ }
+ if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
+ debug!(
+ "skipping reverse inner optimization because \
+ we already have a prefilter that we think is fast"
+ );
+ return Err(core);
+ } else if core.pre.is_some() {
+ debug!(
+ "core engine has a prefix prefilter, but it is \
+ probably not fast, so continuing with attempt to \
+ use reverse inner prefilter"
+ );
+ }
+ let (concat_prefix, preinner) = match reverse_inner::extract(hirs) {
+ Some(x) => x,
+ // N.B. the 'extract' function emits debug messages explaining
+ // why we bailed out here.
+ None => return Err(core),
+ };
+ debug!("building reverse NFA for prefix before inner literal");
+ let mut lookm = LookMatcher::new();
+ lookm.set_line_terminator(core.info.config().get_line_terminator());
+ let thompson_config = thompson::Config::new()
+ .reverse(true)
+ .utf8(core.info.config().get_utf8_empty())
+ .nfa_size_limit(core.info.config().get_nfa_size_limit())
+ .shrink(false)
+ .which_captures(WhichCaptures::None)
+ .look_matcher(lookm);
+ let result = thompson::Compiler::new()
+ .configure(thompson_config)
+ .build_from_hir(&concat_prefix);
+ let nfarev = match result {
+ Ok(nfarev) => nfarev,
+ Err(_err) => {
+ debug!(
+ "skipping reverse inner optimization because the \
+ reverse NFA failed to build: {}",
+ _err,
+ );
+ return Err(core);
+ }
+ };
+ debug!("building reverse DFA for prefix before inner literal");
+ let dfa = if !core.info.config().get_dfa() {
+ wrappers::ReverseDFA::none()
+ } else {
+ wrappers::ReverseDFA::new(&core.info, &nfarev)
+ };
+ let hybrid = if !core.info.config().get_hybrid() {
+ wrappers::ReverseHybrid::none()
+ } else if dfa.is_some() {
+ debug!(
+ "skipping lazy DFA for reverse inner optimization \
+ because we have a full DFA"
+ );
+ wrappers::ReverseHybrid::none()
+ } else {
+ wrappers::ReverseHybrid::new(&core.info, &nfarev)
+ };
+ Ok(ReverseInner { core, preinner, nfarev, hybrid, dfa })
+ }
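+
+    // Editor's note (illustrative, not upstream): for a pattern like
+    // '[a-z]+@[a-z]+', 'reverse_inner::extract' would plausibly split off
+    // the inner literal '@' with the prefix '[a-z]+' before it. The reverse
+    // NFA/DFA built above then answers "does the prefix end here?" by
+    // scanning backwards from each '@' candidate, and a forward scan with
+    // the core engines finds the end of the full match. (The concrete
+    // pattern is an assumed example; 'extract' applies its own eligibility
+    // rules.)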
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_full(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Result<Option<Match>, RetryError> {
+ let mut span = input.get_span();
+ let mut min_match_start = 0;
+ let mut min_pre_start = 0;
+ loop {
+ let litmatch = match self.preinner.find(input.haystack(), span) {
+ None => return Ok(None),
+ Some(span) => span,
+ };
+ if litmatch.start < min_pre_start {
+ trace!(
+ "found inner prefilter match at {:?}, which starts \
+ before the end of the last forward scan at {}, \
+ quitting to avoid quadratic behavior",
+ litmatch,
+ min_pre_start,
+ );
+ return Err(RetryError::Quadratic(RetryQuadraticError::new()));
+ }
+ trace!("reverse inner scan found inner match at {:?}", litmatch);
+ let revinput = input
+ .clone()
+ .anchored(Anchored::Yes)
+ .span(input.start()..litmatch.start);
+ // Note that in addition to the literal search above scanning past
+ // our minimum start point, this routine can also return an error
+ // as a result of detecting possible quadratic behavior if the
+ // reverse scan goes past the minimum start point. That is, the
+ // literal search might not, but the reverse regex search for the
+ // prefix might!
+ match self.try_search_half_rev_limited(
+ cache,
+ &revinput,
+ min_match_start,
+ )? {
+ None => {
+ if span.start >= span.end {
+ break;
+ }
+ span.start = litmatch.start.checked_add(1).unwrap();
+ }
+ Some(hm_start) => {
+ let fwdinput = input
+ .clone()
+ .anchored(Anchored::Pattern(hm_start.pattern()))
+ .span(hm_start.offset()..input.end());
+ match self.try_search_half_fwd_stopat(cache, &fwdinput)? {
+ Err(stopat) => {
+ min_pre_start = stopat;
+ span.start =
+ litmatch.start.checked_add(1).unwrap();
+ }
+ Ok(hm_end) => {
+ return Ok(Some(Match::new(
+ hm_start.pattern(),
+ hm_start.offset()..hm_end.offset(),
+ )))
+ }
+ }
+ }
+ }
+ min_match_start = litmatch.end;
+ }
+ Ok(None)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_half_fwd_stopat(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Result<Result<HalfMatch, usize>, RetryFailError> {
+ if let Some(e) = self.core.dfa.get(&input) {
+ trace!(
+ "using full DFA for forward reverse inner search at {:?}",
+ input.get_span()
+ );
+ e.try_search_half_fwd_stopat(&input)
+ } else if let Some(e) = self.core.hybrid.get(&input) {
+ trace!(
+ "using lazy DFA for forward reverse inner search at {:?}",
+ input.get_span()
+ );
+ e.try_search_half_fwd_stopat(&mut cache.hybrid, &input)
+ } else {
+ unreachable!("ReverseInner always has a DFA")
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn try_search_half_rev_limited(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ min_start: usize,
+ ) -> Result<Option<HalfMatch>, RetryError> {
+ if let Some(e) = self.dfa.get(&input) {
+ trace!(
+ "using full DFA for reverse inner search at {:?}, \
+ but will be stopped at {} to avoid quadratic behavior",
+ input.get_span(),
+ min_start,
+ );
+ e.try_search_half_rev_limited(&input, min_start)
+ } else if let Some(e) = self.hybrid.get(&input) {
+ trace!(
+ "using lazy DFA for reverse inner search at {:?}, \
+ but will be stopped at {} to avoid quadratic behavior",
+ input.get_span(),
+ min_start,
+ );
+ e.try_search_half_rev_limited(
+ &mut cache.revhybrid,
+ &input,
+ min_start,
+ )
+ } else {
+ unreachable!("ReverseInner always has a DFA")
+ }
+ }
+}
+
+impl Strategy for ReverseInner {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn group_info(&self) -> &GroupInfo {
+ self.core.group_info()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn create_cache(&self) -> Cache {
+ let mut cache = self.core.create_cache();
+ cache.revhybrid = self.hybrid.create_cache();
+ cache
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn reset_cache(&self, cache: &mut Cache) {
+ self.core.reset_cache(cache);
+ cache.revhybrid.reset(&self.hybrid);
+ }
+
+ fn is_accelerated(&self) -> bool {
+ self.preinner.is_fast()
+ }
+
+ fn memory_usage(&self) -> usize {
+ self.core.memory_usage()
+ + self.preinner.memory_usage()
+ + self.nfarev.memory_usage()
+ + self.dfa.memory_usage()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search(cache, input);
+ }
+ match self.try_search_full(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse inner optimization failed: {}", _err);
+ self.core.search(cache, input)
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!("reverse inner fast search failed: {}", _err);
+ self.core.search_nofail(cache, input)
+ }
+ Ok(matornot) => matornot,
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_half(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ ) -> Option<HalfMatch> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search_half(cache, input);
+ }
+ match self.try_search_full(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse inner half optimization failed: {}", _err);
+ self.core.search_half(cache, input)
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!("reverse inner fast half search failed: {}", _err);
+ self.core.search_half_nofail(cache, input)
+ }
+ Ok(None) => None,
+ Ok(Some(m)) => Some(HalfMatch::new(m.pattern(), m.end())),
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
+ if input.get_anchored().is_anchored() {
+ return self.core.is_match(cache, input);
+ }
+ match self.try_search_full(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse inner half optimization failed: {}", _err);
+ self.core.is_match_nofail(cache, input)
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!("reverse inner fast half search failed: {}", _err);
+ self.core.is_match_nofail(cache, input)
+ }
+ Ok(None) => false,
+ Ok(Some(_)) => true,
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ if input.get_anchored().is_anchored() {
+ return self.core.search_slots(cache, input, slots);
+ }
+ if !self.core.is_capture_search_needed(slots.len()) {
+ trace!("asked for slots unnecessarily, trying fast path");
+ let m = self.search(cache, input)?;
+ copy_match_to_slots(m, slots);
+ return Some(m.pattern());
+ }
+ let m = match self.try_search_full(cache, input) {
+ Err(RetryError::Quadratic(_err)) => {
+ trace!("reverse inner captures optimization failed: {}", _err);
+ return self.core.search_slots(cache, input, slots);
+ }
+ Err(RetryError::Fail(_err)) => {
+ trace!("reverse inner fast captures search failed: {}", _err);
+ return self.core.search_slots_nofail(cache, input, slots);
+ }
+ Ok(None) => return None,
+ Ok(Some(m)) => m,
+ };
+ trace!(
+ "match found at {}..{} in capture search, \
+ using another engine to find captures",
+ m.start(),
+ m.end(),
+ );
+ let input = input
+ .clone()
+ .span(m.start()..m.end())
+ .anchored(Anchored::Pattern(m.pattern()));
+ self.core.search_slots_nofail(cache, &input, slots)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ self.core.which_overlapping_matches(cache, input, patset)
+ }
+}
+
+/// Copies the offsets in the given match to the corresponding positions in
+/// `slots`.
+///
+/// In effect, this sets the slots corresponding to the implicit group for the
+/// pattern in the given match. If the indices for the corresponding slots do
+/// not exist, then no slots are set.
+///
+/// This is useful when the caller provides slots (or captures), but you use a
+/// regex engine that doesn't operate on slots (like a lazy DFA). This function
+/// lets you map the match you get back to the slots provided by the caller.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn copy_match_to_slots(m: Match, slots: &mut [Option<NonMaxUsize>]) {
+ let slot_start = m.pattern().as_usize() * 2;
+ let slot_end = slot_start + 1;
+ if let Some(slot) = slots.get_mut(slot_start) {
+ *slot = NonMaxUsize::new(m.start());
+ }
+ if let Some(slot) = slots.get_mut(slot_end) {
+ *slot = NonMaxUsize::new(m.end());
+ }
+}
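+
+// Editor's sketch (illustrative test, not part of upstream): pattern 'i'
+// owns slots '2*i' (start) and '2*i + 1' (end) for its implicit capture
+// group, which is the layout 'copy_match_to_slots' assumes.
+#[cfg(test)]
+mod copy_match_to_slots_example {
+    use super::*;
+
+    #[test]
+    fn writes_implicit_slots_for_matched_pattern() {
+        // A match for pattern 1 at 3..7 lands in slots 2 and 3.
+        let m = Match::new(PatternID::must(1), 3..7);
+        let mut slots = [None; 4];
+        copy_match_to_slots(m, &mut slots);
+        assert_eq!(slots[2], NonMaxUsize::new(3));
+        assert_eq!(slots[3], NonMaxUsize::new(7));
+        // Pattern 0's slots are left untouched.
+        assert_eq!(slots[0], None);
+        assert_eq!(slots[1], None);
+    }
+}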
diff --git a/vendor/regex-automata/src/meta/wrappers.rs b/vendor/regex-automata/src/meta/wrappers.rs
new file mode 100644
index 000000000..08110d9bb
--- /dev/null
+++ b/vendor/regex-automata/src/meta/wrappers.rs
@@ -0,0 +1,1348 @@
+/*!
+This module contains a boat load of wrappers around each of our internal regex
+engines. They encapsulate a few things:
+
+1. The wrappers manage the conditional existence of the regex engine. Namely,
+the PikeVM is the only required regex engine. The rest are optional. These
+wrappers present a uniform API regardless of which engines are available. And
+availability might be determined by compile time features or by dynamic
+configuration via `meta::Config`. Encapsulating the conditional compilation
+features is in particular a huge simplification for the higher level code that
+composes these engines.
+2. The wrappers manage construction of each engine, including skipping it if
+the engine is unavailable or configured to not be used.
+3. The wrappers manage whether an engine *can* be used for a particular
+search configuration. For example, `BoundedBacktracker::get` only returns a
+backtracking engine when the haystack is no bigger than the maximum supported
+length. The wrappers also sometimes take a position on when an engine *ought*
+to be used, but only in cases where the logic is extremely local to the engine
+itself. Otherwise, things like "choose between the backtracker and the one-pass
+DFA" are managed by the higher level meta strategy code.
+
+There are also corresponding wrappers for the various `Cache` types for each
+regex engine that needs them. If an engine is unavailable or not used, then a
+cache for it will *not* actually be allocated.
+*/
+
+use alloc::vec::Vec;
+
+use crate::{
+ meta::{
+ error::{BuildError, RetryError, RetryFailError},
+ regex::RegexInfo,
+ },
+ nfa::thompson::{pikevm, NFA},
+ util::{prefilter::Prefilter, primitives::NonMaxUsize},
+ HalfMatch, Input, Match, MatchKind, PatternID, PatternSet,
+};
+
+#[cfg(feature = "dfa-build")]
+use crate::dfa;
+#[cfg(feature = "dfa-onepass")]
+use crate::dfa::onepass;
+#[cfg(feature = "hybrid")]
+use crate::hybrid;
+#[cfg(feature = "nfa-backtrack")]
+use crate::nfa::thompson::backtrack;
+
+#[derive(Debug)]
+pub(crate) struct PikeVM(PikeVMEngine);
+
+impl PikeVM {
+ pub(crate) fn new(
+ info: &RegexInfo,
+ pre: Option<Prefilter>,
+ nfa: &NFA,
+ ) -> Result<PikeVM, BuildError> {
+ PikeVMEngine::new(info, pre, nfa).map(PikeVM)
+ }
+
+ pub(crate) fn create_cache(&self) -> PikeVMCache {
+ PikeVMCache::new(self)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn get(&self) -> &PikeVMEngine {
+ &self.0
+ }
+}
+
+#[derive(Debug)]
+pub(crate) struct PikeVMEngine(pikevm::PikeVM);
+
+impl PikeVMEngine {
+ pub(crate) fn new(
+ info: &RegexInfo,
+ pre: Option<Prefilter>,
+ nfa: &NFA,
+ ) -> Result<PikeVMEngine, BuildError> {
+ let pikevm_config = pikevm::Config::new()
+ .match_kind(info.config().get_match_kind())
+ .prefilter(pre);
+ let engine = pikevm::Builder::new()
+ .configure(pikevm_config)
+ .build_from_nfa(nfa.clone())
+ .map_err(BuildError::nfa)?;
+ debug!("PikeVM built");
+ Ok(PikeVMEngine(engine))
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn is_match(
+ &self,
+ cache: &mut PikeVMCache,
+ input: &Input<'_>,
+ ) -> bool {
+ self.0.is_match(cache.0.as_mut().unwrap(), input.clone())
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn search_slots(
+ &self,
+ cache: &mut PikeVMCache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ self.0.search_slots(cache.0.as_mut().unwrap(), input, slots)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn which_overlapping_matches(
+ &self,
+ cache: &mut PikeVMCache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ self.0.which_overlapping_matches(
+ cache.0.as_mut().unwrap(),
+ input,
+ patset,
+ )
+ }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct PikeVMCache(Option<pikevm::Cache>);
+
+impl PikeVMCache {
+ pub(crate) fn none() -> PikeVMCache {
+ PikeVMCache(None)
+ }
+
+ pub(crate) fn new(builder: &PikeVM) -> PikeVMCache {
+ PikeVMCache(Some(builder.get().0.create_cache()))
+ }
+
+ pub(crate) fn reset(&mut self, builder: &PikeVM) {
+ self.0.as_mut().unwrap().reset(&builder.get().0);
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.0.as_ref().map_or(0, |c| c.memory_usage())
+ }
+}
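+
+// Editor's note (illustrative, not upstream): cache wrappers in this module
+// follow a 'none()/new()/reset()' lifecycle. A rough usage sketch:
+//
+//     let mut cache = PikeVMCache::none(); // placeholder, no allocation
+//     cache = PikeVMCache::new(&pikevm);   // allocate for this engine
+//     cache.reset(&pikevm);                // reuse across searches
+//
+// 'none()' is what the meta 'Cache' holds for engines that were never built,
+// which is how unused engines avoid paying for cache allocations.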
+
+#[derive(Debug)]
+pub(crate) struct BoundedBacktracker(Option<BoundedBacktrackerEngine>);
+
+impl BoundedBacktracker {
+ pub(crate) fn new(
+ info: &RegexInfo,
+ pre: Option<Prefilter>,
+ nfa: &NFA,
+ ) -> Result<BoundedBacktracker, BuildError> {
+ BoundedBacktrackerEngine::new(info, pre, nfa).map(BoundedBacktracker)
+ }
+
+ pub(crate) fn create_cache(&self) -> BoundedBacktrackerCache {
+ BoundedBacktrackerCache::new(self)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn get(
+ &self,
+ input: &Input<'_>,
+ ) -> Option<&BoundedBacktrackerEngine> {
+ let engine = self.0.as_ref()?;
+ // It is difficult to make the backtracker give up early if it is
+ // guaranteed to eventually wind up in a match state. This is because
+ // of the greedy nature of a backtracker: it just blindly mushes
+ // forward. Every other regex engine is able to give up more quickly,
+ // so even if the backtracker might be able to zip through faster than
+ // (say) the PikeVM, we prefer the theoretical benefit that some other
+ // engine might be able to scan much less of the haystack than the
+ // backtracker.
+ //
+ // Now, if the haystack is really short already, then we allow the
+ // backtracker to run. (This hasn't been litigated quantitatively with
+ // benchmarks. Just a hunch.)
+ if input.get_earliest() && input.haystack().len() > 128 {
+ return None;
+ }
+ // If the backtracker is just going to return an error because the
+ // haystack is too long, then obviously do not use it.
+ if input.get_span().len() > engine.max_haystack_len() {
+ return None;
+ }
+ Some(engine)
+ }
+}
+
+#[derive(Debug)]
+pub(crate) struct BoundedBacktrackerEngine(
+ #[cfg(feature = "nfa-backtrack")] backtrack::BoundedBacktracker,
+ #[cfg(not(feature = "nfa-backtrack"))] (),
+);
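+
+// Editor's note (illustrative, not upstream): the cfg'd tuple field above is
+// a recurring trick in this module. The wrapper type always exists, but its
+// payload compiles down to '()' when the feature is disabled. A minimal
+// standalone version of the pattern:
+//
+//     struct Engine(
+//         #[cfg(feature = "x")] RealEngine,
+//         #[cfg(not(feature = "x"))] (),
+//     );
+//
+// Each method then carries matching '#[cfg(...)]' blocks, so higher level
+// code never has to repeat the feature checks.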
+
+impl BoundedBacktrackerEngine {
+ pub(crate) fn new(
+ info: &RegexInfo,
+ pre: Option<Prefilter>,
+ nfa: &NFA,
+ ) -> Result<Option<BoundedBacktrackerEngine>, BuildError> {
+ #[cfg(feature = "nfa-backtrack")]
+ {
+ if !info.config().get_backtrack()
+ || info.config().get_match_kind() != MatchKind::LeftmostFirst
+ {
+ return Ok(None);
+ }
+ let backtrack_config = backtrack::Config::new().prefilter(pre);
+ let engine = backtrack::Builder::new()
+ .configure(backtrack_config)
+ .build_from_nfa(nfa.clone())
+ .map_err(BuildError::nfa)?;
+ debug!("BoundedBacktracker built");
+ Ok(Some(BoundedBacktrackerEngine(engine)))
+ }
+ #[cfg(not(feature = "nfa-backtrack"))]
+ {
+ Ok(None)
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn is_match(
+ &self,
+ cache: &mut BoundedBacktrackerCache,
+ input: &Input<'_>,
+ ) -> bool {
+ #[cfg(feature = "nfa-backtrack")]
+ {
+ // OK because we only permit access to this engine when we know
+ // the haystack is short enough for the backtracker to run without
+ // reporting an error.
+ self.0
+ .try_is_match(cache.0.as_mut().unwrap(), input.clone())
+ .unwrap()
+ }
+ #[cfg(not(feature = "nfa-backtrack"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn search_slots(
+ &self,
+ cache: &mut BoundedBacktrackerCache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ #[cfg(feature = "nfa-backtrack")]
+ {
+ // OK because we only permit access to this engine when we know
+ // the haystack is short enough for the backtracker to run without
+ // reporting an error.
+ self.0
+ .try_search_slots(cache.0.as_mut().unwrap(), input, slots)
+ .unwrap()
+ }
+ #[cfg(not(feature = "nfa-backtrack"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn max_haystack_len(&self) -> usize {
+ #[cfg(feature = "nfa-backtrack")]
+ {
+ self.0.max_haystack_len()
+ }
+ #[cfg(not(feature = "nfa-backtrack"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct BoundedBacktrackerCache(
+ #[cfg(feature = "nfa-backtrack")] Option<backtrack::Cache>,
+ #[cfg(not(feature = "nfa-backtrack"))] (),
+);
+
+impl BoundedBacktrackerCache {
+ pub(crate) fn none() -> BoundedBacktrackerCache {
+ #[cfg(feature = "nfa-backtrack")]
+ {
+ BoundedBacktrackerCache(None)
+ }
+ #[cfg(not(feature = "nfa-backtrack"))]
+ {
+ BoundedBacktrackerCache(())
+ }
+ }
+
+ pub(crate) fn new(
+ builder: &BoundedBacktracker,
+ ) -> BoundedBacktrackerCache {
+ #[cfg(feature = "nfa-backtrack")]
+ {
+ BoundedBacktrackerCache(
+ builder.0.as_ref().map(|e| e.0.create_cache()),
+ )
+ }
+ #[cfg(not(feature = "nfa-backtrack"))]
+ {
+ BoundedBacktrackerCache(())
+ }
+ }
+
+ pub(crate) fn reset(&mut self, builder: &BoundedBacktracker) {
+ #[cfg(feature = "nfa-backtrack")]
+ if let Some(ref e) = builder.0 {
+ self.0.as_mut().unwrap().reset(&e.0);
+ }
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ #[cfg(feature = "nfa-backtrack")]
+ {
+ self.0.as_ref().map_or(0, |c| c.memory_usage())
+ }
+ #[cfg(not(feature = "nfa-backtrack"))]
+ {
+ 0
+ }
+ }
+}
+
+#[derive(Debug)]
+pub(crate) struct OnePass(Option<OnePassEngine>);
+
+impl OnePass {
+ pub(crate) fn new(info: &RegexInfo, nfa: &NFA) -> OnePass {
+ OnePass(OnePassEngine::new(info, nfa))
+ }
+
+ pub(crate) fn create_cache(&self) -> OnePassCache {
+ OnePassCache::new(self)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn get(&self, input: &Input<'_>) -> Option<&OnePassEngine> {
+ let engine = self.0.as_ref()?;
+ if !input.get_anchored().is_anchored()
+ && !engine.get_nfa().is_always_start_anchored()
+ {
+ return None;
+ }
+ Some(engine)
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.0.as_ref().map_or(0, |e| e.memory_usage())
+ }
+}
+
+#[derive(Debug)]
+pub(crate) struct OnePassEngine(
+ #[cfg(feature = "dfa-onepass")] onepass::DFA,
+ #[cfg(not(feature = "dfa-onepass"))] (),
+);
+
+impl OnePassEngine {
+ pub(crate) fn new(info: &RegexInfo, nfa: &NFA) -> Option<OnePassEngine> {
+ #[cfg(feature = "dfa-onepass")]
+ {
+ if !info.config().get_onepass() {
+ return None;
+ }
+ // In order to even attempt building a one-pass DFA, we require
+ // that we either have at least one explicit capturing group or
+ // there's a Unicode word boundary somewhere. If we don't have
+            // either of these things, then the lazy DFA will almost certainly
+            // be usable and much faster. The only case where it might
+ // not is if the lazy DFA isn't utilizing its cache effectively,
+ // but in those cases, the underlying regex is almost certainly
+ // not one-pass or is too big to fit within the current one-pass
+ // implementation limits.
+ if info.props_union().explicit_captures_len() == 0
+ && !info.props_union().look_set().contains_word_unicode()
+ {
+ debug!("not building OnePass because it isn't worth it");
+ return None;
+ }
+ let onepass_config = onepass::Config::new()
+ .match_kind(info.config().get_match_kind())
+ // Like for the lazy DFA, we unconditionally enable this
+ // because it doesn't cost much and makes the API more
+ // flexible.
+ .starts_for_each_pattern(true)
+ .byte_classes(info.config().get_byte_classes())
+ .size_limit(info.config().get_onepass_size_limit());
+ let result = onepass::Builder::new()
+ .configure(onepass_config)
+ .build_from_nfa(nfa.clone());
+ let engine = match result {
+ Ok(engine) => engine,
+ Err(_err) => {
+ debug!("OnePass failed to build: {}", _err);
+ return None;
+ }
+ };
+ debug!("OnePass built, {} bytes", engine.memory_usage());
+ Some(OnePassEngine(engine))
+ }
+ #[cfg(not(feature = "dfa-onepass"))]
+ {
+ None
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn search_slots(
+ &self,
+ cache: &mut OnePassCache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ #[cfg(feature = "dfa-onepass")]
+ {
+ // OK because we only permit getting a OnePassEngine when we know
+ // the search is anchored and thus an error cannot occur.
+ self.0
+ .try_search_slots(cache.0.as_mut().unwrap(), input, slots)
+ .unwrap()
+ }
+ #[cfg(not(feature = "dfa-onepass"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ #[cfg(feature = "dfa-onepass")]
+ {
+ self.0.memory_usage()
+ }
+ #[cfg(not(feature = "dfa-onepass"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn get_nfa(&self) -> &NFA {
+ #[cfg(feature = "dfa-onepass")]
+ {
+ self.0.get_nfa()
+ }
+ #[cfg(not(feature = "dfa-onepass"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct OnePassCache(
+ #[cfg(feature = "dfa-onepass")] Option<onepass::Cache>,
+ #[cfg(not(feature = "dfa-onepass"))] (),
+);
+
+impl OnePassCache {
+ pub(crate) fn none() -> OnePassCache {
+ #[cfg(feature = "dfa-onepass")]
+ {
+ OnePassCache(None)
+ }
+ #[cfg(not(feature = "dfa-onepass"))]
+ {
+ OnePassCache(())
+ }
+ }
+
+ pub(crate) fn new(builder: &OnePass) -> OnePassCache {
+ #[cfg(feature = "dfa-onepass")]
+ {
+ OnePassCache(builder.0.as_ref().map(|e| e.0.create_cache()))
+ }
+ #[cfg(not(feature = "dfa-onepass"))]
+ {
+ OnePassCache(())
+ }
+ }
+
+ pub(crate) fn reset(&mut self, builder: &OnePass) {
+ #[cfg(feature = "dfa-onepass")]
+ if let Some(ref e) = builder.0 {
+ self.0.as_mut().unwrap().reset(&e.0);
+ }
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ #[cfg(feature = "dfa-onepass")]
+ {
+ self.0.as_ref().map_or(0, |c| c.memory_usage())
+ }
+ #[cfg(not(feature = "dfa-onepass"))]
+ {
+ 0
+ }
+ }
+}
+
+#[derive(Debug)]
+pub(crate) struct Hybrid(Option<HybridEngine>);
+
+impl Hybrid {
+ pub(crate) fn none() -> Hybrid {
+ Hybrid(None)
+ }
+
+ pub(crate) fn new(
+ info: &RegexInfo,
+ pre: Option<Prefilter>,
+ nfa: &NFA,
+ nfarev: &NFA,
+ ) -> Hybrid {
+ Hybrid(HybridEngine::new(info, pre, nfa, nfarev))
+ }
+
+ pub(crate) fn create_cache(&self) -> HybridCache {
+ HybridCache::new(self)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn get(&self, _input: &Input<'_>) -> Option<&HybridEngine> {
+ let engine = self.0.as_ref()?;
+ Some(engine)
+ }
+
+ pub(crate) fn is_some(&self) -> bool {
+ self.0.is_some()
+ }
+}
+
+#[derive(Debug)]
+pub(crate) struct HybridEngine(
+ #[cfg(feature = "hybrid")] hybrid::regex::Regex,
+ #[cfg(not(feature = "hybrid"))] (),
+);
+
+impl HybridEngine {
+ pub(crate) fn new(
+ info: &RegexInfo,
+ pre: Option<Prefilter>,
+ nfa: &NFA,
+ nfarev: &NFA,
+ ) -> Option<HybridEngine> {
+ #[cfg(feature = "hybrid")]
+ {
+ if !info.config().get_hybrid() {
+ return None;
+ }
+ let dfa_config = hybrid::dfa::Config::new()
+ .match_kind(info.config().get_match_kind())
+ .prefilter(pre.clone())
+ // Enabling this is necessary for ensuring we can service any
+ // kind of 'Input' search without error. For the lazy DFA,
+ // this is not particularly costly, since the start states are
+ // generated lazily.
+ .starts_for_each_pattern(true)
+ .byte_classes(info.config().get_byte_classes())
+ .unicode_word_boundary(true)
+ .specialize_start_states(pre.is_some())
+ .cache_capacity(info.config().get_hybrid_cache_capacity())
+ // This makes it possible for building a lazy DFA to
+ // fail even though the NFA has already been built. Namely,
+ // if the cache capacity is too small to fit some minimum
+ // number of states (which is small, like 4 or 5), then the
+ // DFA will refuse to build.
+ //
+ // We shouldn't enable this to make building always work, since
+ // this could cause the allocation of a cache bigger than the
+ // provided capacity amount.
+ //
+ // This is effectively the only reason why building a lazy DFA
+ // could fail. If it does, then we simply suppress the error
+ // and return None.
+ .skip_cache_capacity_check(false)
+ // This and enabling heuristic Unicode word boundary support
+ // above make it so the lazy DFA can quit at match time.
+ .minimum_cache_clear_count(Some(3))
+ .minimum_bytes_per_state(Some(10));
+ let result = hybrid::dfa::Builder::new()
+ .configure(dfa_config.clone())
+ .build_from_nfa(nfa.clone());
+ let fwd = match result {
+ Ok(fwd) => fwd,
+ Err(_err) => {
+ debug!("forward lazy DFA failed to build: {}", _err);
+ return None;
+ }
+ };
+ let result = hybrid::dfa::Builder::new()
+ .configure(
+ dfa_config
+ .clone()
+ .match_kind(MatchKind::All)
+ .prefilter(None)
+ .specialize_start_states(false),
+ )
+ .build_from_nfa(nfarev.clone());
+ let rev = match result {
+ Ok(rev) => rev,
+ Err(_err) => {
+ debug!("reverse lazy DFA failed to build: {}", _err);
+ return None;
+ }
+ };
+ let engine =
+ hybrid::regex::Builder::new().build_from_dfas(fwd, rev);
+ debug!("lazy DFA built");
+ Some(HybridEngine(engine))
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ None
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn try_search(
+ &self,
+ cache: &mut HybridCache,
+ input: &Input<'_>,
+ ) -> Result<Option<Match>, RetryFailError> {
+ #[cfg(feature = "hybrid")]
+ {
+ let cache = cache.0.as_mut().unwrap();
+ self.0.try_search(cache, input).map_err(|e| e.into())
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn try_search_half_fwd(
+ &self,
+ cache: &mut HybridCache,
+ input: &Input<'_>,
+ ) -> Result<Option<HalfMatch>, RetryFailError> {
+ #[cfg(feature = "hybrid")]
+ {
+ let fwd = self.0.forward();
+ let mut fwdcache = cache.0.as_mut().unwrap().as_parts_mut().0;
+ fwd.try_search_fwd(&mut fwdcache, input).map_err(|e| e.into())
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn try_search_half_fwd_stopat(
+ &self,
+ cache: &mut HybridCache,
+ input: &Input<'_>,
+ ) -> Result<Result<HalfMatch, usize>, RetryFailError> {
+ #[cfg(feature = "hybrid")]
+ {
+ let dfa = self.0.forward();
+ let mut cache = cache.0.as_mut().unwrap().as_parts_mut().0;
+ crate::meta::stopat::hybrid_try_search_half_fwd(
+ dfa, &mut cache, input,
+ )
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn try_search_half_rev(
+ &self,
+ cache: &mut HybridCache,
+ input: &Input<'_>,
+ ) -> Result<Option<HalfMatch>, RetryFailError> {
+ #[cfg(feature = "hybrid")]
+ {
+ let rev = self.0.reverse();
+ let mut revcache = cache.0.as_mut().unwrap().as_parts_mut().1;
+ rev.try_search_rev(&mut revcache, input).map_err(|e| e.into())
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn try_search_half_rev_limited(
+ &self,
+ cache: &mut HybridCache,
+ input: &Input<'_>,
+ min_start: usize,
+ ) -> Result<Option<HalfMatch>, RetryError> {
+ #[cfg(feature = "hybrid")]
+ {
+ let dfa = self.0.reverse();
+ let mut cache = cache.0.as_mut().unwrap().as_parts_mut().1;
+ crate::meta::limited::hybrid_try_search_half_rev(
+ dfa, &mut cache, input, min_start,
+ )
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[inline]
+ pub(crate) fn try_which_overlapping_matches(
+ &self,
+ cache: &mut HybridCache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) -> Result<(), RetryFailError> {
+ #[cfg(feature = "hybrid")]
+ {
+ let fwd = self.0.forward();
+ let mut fwdcache = cache.0.as_mut().unwrap().as_parts_mut().0;
+ fwd.try_which_overlapping_matches(&mut fwdcache, input, patset)
+ .map_err(|e| e.into())
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct HybridCache(
+ #[cfg(feature = "hybrid")] Option<hybrid::regex::Cache>,
+ #[cfg(not(feature = "hybrid"))] (),
+);
+
+impl HybridCache {
+ pub(crate) fn none() -> HybridCache {
+ #[cfg(feature = "hybrid")]
+ {
+ HybridCache(None)
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ HybridCache(())
+ }
+ }
+
+ pub(crate) fn new(builder: &Hybrid) -> HybridCache {
+ #[cfg(feature = "hybrid")]
+ {
+ HybridCache(builder.0.as_ref().map(|e| e.0.create_cache()))
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ HybridCache(())
+ }
+ }
+
+ pub(crate) fn reset(&mut self, builder: &Hybrid) {
+ #[cfg(feature = "hybrid")]
+ if let Some(ref e) = builder.0 {
+ self.0.as_mut().unwrap().reset(&e.0);
+ }
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ #[cfg(feature = "hybrid")]
+ {
+ self.0.as_ref().map_or(0, |c| c.memory_usage())
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ 0
+ }
+ }
+}
+
+#[derive(Debug)]
+pub(crate) struct DFA(Option<DFAEngine>);
+
+impl DFA {
+ pub(crate) fn none() -> DFA {
+ DFA(None)
+ }
+
+ pub(crate) fn new(
+ info: &RegexInfo,
+ pre: Option<Prefilter>,
+ nfa: &NFA,
+ nfarev: &NFA,
+ ) -> DFA {
+ DFA(DFAEngine::new(info, pre, nfa, nfarev))
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn get(&self, _input: &Input<'_>) -> Option<&DFAEngine> {
+ let engine = self.0.as_ref()?;
+ Some(engine)
+ }
+
+ pub(crate) fn is_some(&self) -> bool {
+ self.0.is_some()
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.0.as_ref().map_or(0, |e| e.memory_usage())
+ }
+}
+
+#[derive(Debug)]
+pub(crate) struct DFAEngine(
+ #[cfg(feature = "dfa-build")] dfa::regex::Regex,
+ #[cfg(not(feature = "dfa-build"))] (),
+);
+
+impl DFAEngine {
+ pub(crate) fn new(
+ info: &RegexInfo,
+ pre: Option<Prefilter>,
+ nfa: &NFA,
+ nfarev: &NFA,
+ ) -> Option<DFAEngine> {
+ #[cfg(feature = "dfa-build")]
+ {
+ if !info.config().get_dfa() {
+ return None;
+ }
+ // If our NFA is anything but small, don't even bother with a DFA.
+ if let Some(state_limit) = info.config().get_dfa_state_limit() {
+ if nfa.states().len() > state_limit {
+ debug!(
+ "skipping full DFA because NFA has {} states, \
+ which exceeds the heuristic limit of {}",
+ nfa.states().len(),
+ state_limit,
+ );
+ return None;
+ }
+ }
+ // We cut the size limit in four because the total heap used by
+ // DFA construction is determinization aux memory and the DFA
+ // itself, and those things are configured independently in the
+ // lower level DFA builder API. And then split that in two because
+ // of forward and reverse DFAs.
+ let size_limit = info.config().get_dfa_size_limit().map(|n| n / 4);
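+            // Editor's note (worked example): with a configured limit of,
+            // say, 40_000 bytes, each of the four consumers (determinization
+            // scratch and the DFA itself, for both forward and reverse) gets
+            // a 10_000 byte cap.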
+ let dfa_config = dfa::dense::Config::new()
+ .match_kind(info.config().get_match_kind())
+ .prefilter(pre.clone())
+ // Enabling this is necessary for ensuring we can service any
+ // kind of 'Input' search without error. For the full DFA, this
+ // can be quite costly. But since we have such a small bound
+                // on the size of the DFA, in practice, any multi-regexes are
+ // probably going to blow the limit anyway.
+ .starts_for_each_pattern(true)
+ .byte_classes(info.config().get_byte_classes())
+ .unicode_word_boundary(true)
+ .specialize_start_states(pre.is_some())
+ .determinize_size_limit(size_limit)
+ .dfa_size_limit(size_limit);
+ let result = dfa::dense::Builder::new()
+ .configure(dfa_config.clone())
+ .build_from_nfa(&nfa);
+ let fwd = match result {
+ Ok(fwd) => fwd,
+ Err(_err) => {
+ debug!("forward full DFA failed to build: {}", _err);
+ return None;
+ }
+ };
+ let result = dfa::dense::Builder::new()
+ .configure(
+ dfa_config
+ .clone()
+ // We never need unanchored reverse searches, so
+ // there's no point in building it into the DFA, which
+ // WILL take more space. (This isn't done for the lazy
+ // DFA because the DFA is, well, lazy. It doesn't pay
+ // the cost for supporting unanchored searches unless
+ // you actually do an unanchored search, which we
+ // don't.)
+ .start_kind(dfa::StartKind::Anchored)
+ .match_kind(MatchKind::All)
+ .prefilter(None)
+ .specialize_start_states(false),
+ )
+ .build_from_nfa(&nfarev);
+ let rev = match result {
+ Ok(rev) => rev,
+ Err(_err) => {
+ debug!("reverse full DFA failed to build: {}", _err);
+ return None;
+ }
+ };
+ let engine = dfa::regex::Builder::new().build_from_dfas(fwd, rev);
+ debug!(
+ "fully compiled forward and reverse DFAs built, {} bytes",
+ engine.forward().memory_usage()
+ + engine.reverse().memory_usage(),
+ );
+ Some(DFAEngine(engine))
+ }
+ #[cfg(not(feature = "dfa-build"))]
+ {
+ None
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn try_search(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<Option<Match>, RetryFailError> {
+ #[cfg(feature = "dfa-build")]
+ {
+ self.0.try_search(input).map_err(|e| e.into())
+ }
+ #[cfg(not(feature = "dfa-build"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn try_search_half_fwd(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<Option<HalfMatch>, RetryFailError> {
+ #[cfg(feature = "dfa-build")]
+ {
+ use crate::dfa::Automaton;
+ self.0.forward().try_search_fwd(input).map_err(|e| e.into())
+ }
+ #[cfg(not(feature = "dfa-build"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn try_search_half_fwd_stopat(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<Result<HalfMatch, usize>, RetryFailError> {
+ #[cfg(feature = "dfa-build")]
+ {
+ let dfa = self.0.forward();
+ crate::meta::stopat::dfa_try_search_half_fwd(dfa, input)
+ }
+ #[cfg(not(feature = "dfa-build"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn try_search_half_rev(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<Option<HalfMatch>, RetryFailError> {
+ #[cfg(feature = "dfa-build")]
+ {
+ use crate::dfa::Automaton;
+ self.0.reverse().try_search_rev(&input).map_err(|e| e.into())
+ }
+ #[cfg(not(feature = "dfa-build"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn try_search_half_rev_limited(
+ &self,
+ input: &Input<'_>,
+ min_start: usize,
+ ) -> Result<Option<HalfMatch>, RetryError> {
+ #[cfg(feature = "dfa-build")]
+ {
+ let dfa = self.0.reverse();
+ crate::meta::limited::dfa_try_search_half_rev(
+ dfa, input, min_start,
+ )
+ }
+ #[cfg(not(feature = "dfa-build"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ #[inline]
+ pub(crate) fn try_which_overlapping_matches(
+ &self,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) -> Result<(), RetryFailError> {
+ #[cfg(feature = "dfa-build")]
+ {
+ use crate::dfa::Automaton;
+ self.0
+ .forward()
+ .try_which_overlapping_matches(input, patset)
+ .map_err(|e| e.into())
+ }
+ #[cfg(not(feature = "dfa-build"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ #[cfg(feature = "dfa-build")]
+ {
+ self.0.forward().memory_usage() + self.0.reverse().memory_usage()
+ }
+ #[cfg(not(feature = "dfa-build"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+}
+
+#[derive(Debug)]
+pub(crate) struct ReverseHybrid(Option<ReverseHybridEngine>);
+
+impl ReverseHybrid {
+ pub(crate) fn none() -> ReverseHybrid {
+ ReverseHybrid(None)
+ }
+
+ pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> ReverseHybrid {
+ ReverseHybrid(ReverseHybridEngine::new(info, nfarev))
+ }
+
+ pub(crate) fn create_cache(&self) -> ReverseHybridCache {
+ ReverseHybridCache::new(self)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn get(
+ &self,
+ _input: &Input<'_>,
+ ) -> Option<&ReverseHybridEngine> {
+ let engine = self.0.as_ref()?;
+ Some(engine)
+ }
+}
+
+#[derive(Debug)]
+pub(crate) struct ReverseHybridEngine(
+ #[cfg(feature = "hybrid")] hybrid::dfa::DFA,
+ #[cfg(not(feature = "hybrid"))] (),
+);
+
+impl ReverseHybridEngine {
+ pub(crate) fn new(
+ info: &RegexInfo,
+ nfarev: &NFA,
+ ) -> Option<ReverseHybridEngine> {
+ #[cfg(feature = "hybrid")]
+ {
+ if !info.config().get_hybrid() {
+ return None;
+ }
+ // Since we only use this for reverse searches, we can hard-code
+ // a number of things like match semantics, prefilters, starts
+ // for each pattern and so on.
+ let dfa_config = hybrid::dfa::Config::new()
+ .match_kind(MatchKind::All)
+ .prefilter(None)
+ .starts_for_each_pattern(false)
+ .byte_classes(info.config().get_byte_classes())
+ .unicode_word_boundary(true)
+ .specialize_start_states(false)
+ .cache_capacity(info.config().get_hybrid_cache_capacity())
+ .skip_cache_capacity_check(false)
+ .minimum_cache_clear_count(Some(3))
+ .minimum_bytes_per_state(Some(10));
+ let result = hybrid::dfa::Builder::new()
+ .configure(dfa_config)
+ .build_from_nfa(nfarev.clone());
+ let rev = match result {
+ Ok(rev) => rev,
+ Err(_err) => {
+ debug!("lazy reverse DFA failed to build: {}", _err);
+ return None;
+ }
+ };
+ debug!("lazy reverse DFA built");
+ Some(ReverseHybridEngine(rev))
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ None
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn try_search_half_rev_limited(
+ &self,
+ cache: &mut ReverseHybridCache,
+ input: &Input<'_>,
+ min_start: usize,
+ ) -> Result<Option<HalfMatch>, RetryError> {
+ #[cfg(feature = "hybrid")]
+ {
+ let dfa = &self.0;
+ let mut cache = cache.0.as_mut().unwrap();
+ crate::meta::limited::hybrid_try_search_half_rev(
+ dfa, &mut cache, input, min_start,
+ )
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct ReverseHybridCache(
+ #[cfg(feature = "hybrid")] Option<hybrid::dfa::Cache>,
+ #[cfg(not(feature = "hybrid"))] (),
+);
+
+impl ReverseHybridCache {
+ pub(crate) fn none() -> ReverseHybridCache {
+ #[cfg(feature = "hybrid")]
+ {
+ ReverseHybridCache(None)
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ ReverseHybridCache(())
+ }
+ }
+
+ pub(crate) fn new(builder: &ReverseHybrid) -> ReverseHybridCache {
+ #[cfg(feature = "hybrid")]
+ {
+ ReverseHybridCache(builder.0.as_ref().map(|e| e.0.create_cache()))
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ ReverseHybridCache(())
+ }
+ }
+
+ pub(crate) fn reset(&mut self, builder: &ReverseHybrid) {
+ #[cfg(feature = "hybrid")]
+ if let Some(ref e) = builder.0 {
+ self.0.as_mut().unwrap().reset(&e.0);
+ }
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ #[cfg(feature = "hybrid")]
+ {
+ self.0.as_ref().map_or(0, |c| c.memory_usage())
+ }
+ #[cfg(not(feature = "hybrid"))]
+ {
+ 0
+ }
+ }
+}
+
+#[derive(Debug)]
+pub(crate) struct ReverseDFA(Option<ReverseDFAEngine>);
+
+impl ReverseDFA {
+ pub(crate) fn none() -> ReverseDFA {
+ ReverseDFA(None)
+ }
+
+ pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> ReverseDFA {
+ ReverseDFA(ReverseDFAEngine::new(info, nfarev))
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn get(&self, _input: &Input<'_>) -> Option<&ReverseDFAEngine> {
+ let engine = self.0.as_ref()?;
+ Some(engine)
+ }
+
+ pub(crate) fn is_some(&self) -> bool {
+ self.0.is_some()
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.0.as_ref().map_or(0, |e| e.memory_usage())
+ }
+}
+
+#[derive(Debug)]
+pub(crate) struct ReverseDFAEngine(
+ #[cfg(feature = "dfa-build")] dfa::dense::DFA<Vec<u32>>,
+ #[cfg(not(feature = "dfa-build"))] (),
+);
+
+impl ReverseDFAEngine {
+ pub(crate) fn new(
+ info: &RegexInfo,
+ nfarev: &NFA,
+ ) -> Option<ReverseDFAEngine> {
+ #[cfg(feature = "dfa-build")]
+ {
+ if !info.config().get_dfa() {
+ return None;
+ }
+ // If our NFA is anything but small, don't even bother with a DFA.
+ if let Some(state_limit) = info.config().get_dfa_state_limit() {
+ if nfarev.states().len() > state_limit {
+ debug!(
+ "skipping full reverse DFA because NFA has {} states, \
+ which exceeds the heuristic limit of {}",
+ nfarev.states().len(),
+ state_limit,
+ );
+ return None;
+ }
+ }
+ // We cut the size limit in two because the total heap used by DFA
+ // construction is determinization aux memory and the DFA itself,
+ // and those things are configured independently in the lower level
+ // DFA builder API.
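+ // For example, a configured dfa_size_limit of 2000 bytes gives 1000
+ // bytes to determinization memory and 1000 bytes to the DFA itself.
+ // (Illustrative numbers only.)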
+ let size_limit = info.config().get_dfa_size_limit().map(|n| n / 2);
+ // Since we only use this for reverse searches, we can hard-code
+ // a number of things like match semantics, prefilters, starts
+ // for each pattern and so on. We also disable acceleration since
+ // it's incompatible with limited searches (which is the only
+ // operation we support for this kind of engine at the moment).
+ let dfa_config = dfa::dense::Config::new()
+ .match_kind(MatchKind::All)
+ .prefilter(None)
+ .accelerate(false)
+ .start_kind(dfa::StartKind::Anchored)
+ .starts_for_each_pattern(false)
+ .byte_classes(info.config().get_byte_classes())
+ .unicode_word_boundary(true)
+ .specialize_start_states(false)
+ .determinize_size_limit(size_limit)
+ .dfa_size_limit(size_limit);
+ let result = dfa::dense::Builder::new()
+ .configure(dfa_config)
+ .build_from_nfa(&nfarev);
+ let rev = match result {
+ Ok(rev) => rev,
+ Err(_err) => {
+ debug!("full reverse DFA failed to build: {}", _err);
+ return None;
+ }
+ };
+ debug!(
+ "fully compiled reverse DFA built, {} bytes",
+ rev.memory_usage()
+ );
+ Some(ReverseDFAEngine(rev))
+ }
+ #[cfg(not(feature = "dfa-build"))]
+ {
+ None
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn try_search_half_rev_limited(
+ &self,
+ input: &Input<'_>,
+ min_start: usize,
+ ) -> Result<Option<HalfMatch>, RetryError> {
+ #[cfg(feature = "dfa-build")]
+ {
+ let dfa = &self.0;
+ crate::meta::limited::dfa_try_search_half_rev(
+ dfa, input, min_start,
+ )
+ }
+ #[cfg(not(feature = "dfa-build"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ #[cfg(feature = "dfa-build")]
+ {
+ self.0.memory_usage()
+ }
+ #[cfg(not(feature = "dfa-build"))]
+ {
+ // Impossible to reach because this engine is never constructed
+ // if the requisite features aren't enabled.
+ unreachable!()
+ }
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/mod.rs b/vendor/regex-automata/src/nfa/mod.rs
index 61ce5ef47..0c36f598a 100644
--- a/vendor/regex-automata/src/nfa/mod.rs
+++ b/vendor/regex-automata/src/nfa/mod.rs
@@ -1 +1,55 @@
+/*!
+Provides non-deterministic finite automata (NFA) and regex engines that use
+them.
+
+While NFAs and DFAs (deterministic finite automata) have equivalent *theoretical*
+power, their usage in practice tends to result in different engineering trade
+offs. While this isn't meant to be a comprehensive treatment of the topic, here
+are a few key trade offs that are, at minimum, true for this crate:
+
+* NFAs tend to be represented sparsely whereas DFAs are represented densely.
+Sparse representations use less memory, but are slower to traverse. Conversely,
+dense representations use more memory, but are faster to traverse. (Sometimes
+these lines are blurred. For example, an `NFA` might choose to represent a
+particular state in a dense fashion, and a DFA can be built using a sparse
+representation via [`sparse::DFA`](crate::dfa::sparse::DFA).)
+* NFAs have epsilon transitions and DFAs don't. In practice, this means that
+handling a single byte in a haystack with an NFA at search time may require
+visiting multiple NFA states. In a DFA, each byte only requires visiting
+a single state. Stated differently, NFAs require a variable number of CPU
+instructions to process one byte in a haystack whereas a DFA uses a constant
+number of CPU instructions to process one byte.
+* NFAs are generally easier to augment with secondary storage. For example, the
+[`thompson::pikevm::PikeVM`] uses an NFA to match, but also uses additional
+memory beyond the model of a finite state machine to track offsets for matching
+capturing groups. Conversely, the most a DFA can do is report the offset (and
+pattern ID) at which a match occurred. This is generally why we also compile
+DFAs in reverse, so that we can run them after finding the end of a match to
+also find the start of a match.
+* NFAs take worst case linear time to build, but DFAs take worst case
+exponential time to build. The [hybrid NFA/DFA](crate::hybrid) mitigates this
+challenge for DFAs in many practical cases.
+
+There are likely other differences, but the bottom line is that NFAs tend to be
+more memory efficient and give easier opportunities for increasing expressive
+power, whereas DFAs are faster to search with.
+
+# Why only a Thompson NFA?
+
+Currently, the only kind of NFA we support in this crate is a [Thompson
+NFA](https://en.wikipedia.org/wiki/Thompson%27s_construction). This refers
+to a specific construction algorithm that takes the syntax of a regex
+pattern and converts it to an NFA. Specifically, it makes gratuitous use of
+epsilon transitions in order to keep its structure simple. In exchange, its
+construction time is linear in the size of the regex. A Thompson NFA also makes
+the guarantee that given any state and a character in a haystack, there is at
+most one transition defined for it. (Although there may be many epsilon
+transitions.)
+
+It is possible that other types of NFAs will be added in the future, such as a
+[Glushkov NFA](https://en.wikipedia.org/wiki/Glushkov%27s_construction_algorithm).
+But currently, this crate only provides a Thompson NFA.
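+
+As a quick illustration, here is a minimal sketch of building a Thompson NFA
+directly from a pattern (this requires the default `syntax` feature; the
+pattern is an arbitrary choice):
+
+```
+use regex_automata::nfa::thompson::NFA;
+
+// Construction time is linear in the size of the pattern.
+let nfa = NFA::new(r"[a-z]+")?;
+assert!(!nfa.states().is_empty());
+# Ok::<(), Box<dyn std::error::Error>>(())
+```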
+*/
+
+#[cfg(feature = "nfa-thompson")]
pub mod thompson;
diff --git a/vendor/regex-automata/src/nfa/thompson/backtrack.rs b/vendor/regex-automata/src/nfa/thompson/backtrack.rs
new file mode 100644
index 000000000..eba037c1d
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/thompson/backtrack.rs
@@ -0,0 +1,1884 @@
+/*!
+An NFA backed bounded backtracker for executing regex searches with capturing
+groups.
+
+This module provides a [`BoundedBacktracker`] that works by simulating an NFA
+using the classical backtracking algorithm with a twist: it avoids redoing
+work that it has done before and thereby avoids worst case exponential time.
+In exchange, it can only be used on "short" haystacks. Its advantage is that
+is can be faster than the [`PikeVM`](thompson::pikevm::PikeVM) in many cases
+because it does less book-keeping.
+*/
+
+use alloc::{vec, vec::Vec};
+
+use crate::{
+ nfa::thompson::{self, BuildError, State, NFA},
+ util::{
+ captures::Captures,
+ empty, iter,
+ prefilter::Prefilter,
+ primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
+ search::{Anchored, HalfMatch, Input, Match, MatchError, Span},
+ },
+};
+
+/// Returns the minimum visited capacity for the given haystack.
+///
+/// This function can be used as the argument to [`Config::visited_capacity`]
+/// in order to guarantee that a backtracking search for the given `input`
+/// won't return an error when using a [`BoundedBacktracker`] built from the
+/// given `NFA`.
+///
+/// This routine exists primarily as a way to test that the bounded backtracker
+/// works correctly when its capacity is set to the smallest possible amount.
+/// Still, it may be useful in cases where you know you want to use the bounded
+/// backtracker for a specific input, and just need to know what visited
+/// capacity to provide to make it work.
+///
+/// Be warned that this number could be quite large as it is multiplicative in
+/// the size of the given NFA and the haystack.
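+///
+/// # Example
+///
+/// A minimal sketch of pairing this routine with [`Config::visited_capacity`]
+/// (the pattern and haystack are arbitrary choices):
+///
+/// ```
+/// use regex_automata::{
+///     nfa::thompson::{backtrack, NFA},
+///     Input,
+/// };
+///
+/// let nfa = NFA::new(r"[0-9]+")?;
+/// let input = Input::new("all 4567 digits");
+/// let cap = backtrack::min_visited_capacity(&nfa, &input);
+///
+/// let re = backtrack::BoundedBacktracker::builder()
+///     .configure(
+///         backtrack::BoundedBacktracker::config().visited_capacity(cap),
+///     )
+///     .build_from_nfa(nfa)?;
+/// let mut cache = re.create_cache();
+/// // Guaranteed not to fail due to the haystack length.
+/// assert!(re.try_is_match(&mut cache, input)?);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```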
+pub fn min_visited_capacity(nfa: &NFA, input: &Input<'_>) -> usize {
+ div_ceil(nfa.states().len() * (input.get_span().len() + 1), 8)
+}
+
+/// The configuration used for building a bounded backtracker.
+///
+/// A bounded backtracker configuration is a simple data object that is
+/// typically used with [`Builder::configure`].
+#[derive(Clone, Debug, Default)]
+pub struct Config {
+ pre: Option<Option<Prefilter>>,
+ visited_capacity: Option<usize>,
+}
+
+impl Config {
+ /// Return a new default regex configuration.
+ pub fn new() -> Config {
+ Config::default()
+ }
+
+ /// Set a prefilter to be used whenever a start state is entered.
+ ///
+ /// A [`Prefilter`] in this context is meant to accelerate searches by
+ /// looking for literal prefixes that every match for the corresponding
+ /// pattern (or patterns) must start with. Once a prefilter produces a
+ /// match, the underlying search routine continues on to try and confirm
+ /// the match.
+ ///
+ /// Be warned that setting a prefilter does not guarantee that the search
+ /// will be faster. While it's usually a good bet, if the prefilter
+ /// produces a lot of false positive candidates (i.e., positions matched
+ /// by the prefilter but not by the regex), then the overall result can
+ /// be slower than if you had just executed the regex engine without any
+ /// prefilters.
+ ///
+ /// By default no prefilter is set.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// util::prefilter::Prefilter,
+ /// Input, Match, MatchKind,
+ /// };
+ ///
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
+ /// let re = BoundedBacktracker::builder()
+ /// .configure(BoundedBacktracker::config().prefilter(pre))
+ /// .build(r"(foo|bar)[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("foo1 barfox bar");
+ /// assert_eq!(
+ /// Some(Match::must(0, 5..11)),
+ /// re.try_find(&mut cache, input)?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Be warned though that an incorrect prefilter can lead to incorrect
+ /// results!
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// util::prefilter::Prefilter,
+ /// Input, HalfMatch, MatchKind,
+ /// };
+ ///
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
+ /// let re = BoundedBacktracker::builder()
+ /// .configure(BoundedBacktracker::config().prefilter(pre))
+ /// .build(r"(foo|bar)[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("foo1 barfox bar");
+ /// // No match reported even though there clearly is one!
+ /// assert_eq!(None, re.try_find(&mut cache, input)?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
+ self.pre = Some(pre);
+ self
+ }
+
+ /// Set the visited capacity used to bound backtracking.
+ ///
+ /// The visited capacity represents the amount of heap memory (in bytes) to
+ /// allocate toward tracking which parts of the backtracking search have
+ /// been done before. The heap memory needed for any particular search is
+ /// proportional to `haystack.len() * nfa.states().len()`, which can be
+ /// quite large. Therefore, the bounded backtracker is typically only able
+ /// to run on shorter haystacks.
+ ///
+ /// For a given regex, increasing the visited capacity means that the
+ /// maximum haystack length that can be searched is increased. The
+ /// [`BoundedBacktracker::max_haystack_len`] method returns that maximum.
+ ///
+ /// The default capacity is a reasonable but empirically chosen size.
+ ///
+ /// # Example
+ ///
+ /// As with other regex engines, Unicode is what tends to make the bounded
+ /// backtracker less useful by making the maximum haystack length quite
+ /// small. If necessary, increasing the visited capacity using this routine
+ /// will increase the maximum haystack length at the cost of using more
+ /// memory.
+ ///
+ /// Note though that the specific maximum values here are not an API
+ /// guarantee. The default visited capacity is subject to change and not
+ /// covered by semver.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
+ ///
+ /// // Unicode inflates the size of the underlying NFA quite a bit, and
+ /// // thus means that the backtracker can only handle smaller haystacks,
+ /// // assuming that the visited capacity remains unchanged.
+ /// let re = BoundedBacktracker::new(r"\w+")?;
+ /// assert!(re.max_haystack_len() <= 7_000);
+ /// // But we can increase the visited capacity to handle bigger haystacks!
+ /// let re = BoundedBacktracker::builder()
+ /// .configure(BoundedBacktracker::config().visited_capacity(1<<20))
+ /// .build(r"\w+")?;
+ /// assert!(re.max_haystack_len() >= 25_000);
+ /// assert!(re.max_haystack_len() <= 28_000);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn visited_capacity(mut self, capacity: usize) -> Config {
+ self.visited_capacity = Some(capacity);
+ self
+ }
+
+ /// Returns the prefilter set in this configuration, if one at all.
+ pub fn get_prefilter(&self) -> Option<&Prefilter> {
+ self.pre.as_ref().unwrap_or(&None).as_ref()
+ }
+
+ /// Returns the configured visited capacity.
+ ///
+ /// Note that the actual capacity used may be slightly bigger than the
+ /// configured capacity.
+ pub fn get_visited_capacity(&self) -> usize {
+ const DEFAULT: usize = 256 * (1 << 10); // 256 KB
+ self.visited_capacity.unwrap_or(DEFAULT)
+ }
+
+ /// Overwrite the default configuration such that the options in `o` are
+ /// always used. If an option in `o` is not set, then the corresponding
+ /// option in `self` is used. If it's not set in `self` either, then it
+ /// remains not set.
+ pub(crate) fn overwrite(&self, o: Config) -> Config {
+ Config {
+ pre: o.pre.or_else(|| self.pre.clone()),
+ visited_capacity: o.visited_capacity.or(self.visited_capacity),
+ }
+ }
+}
+
+/// A builder for a bounded backtracker.
+///
+/// This builder permits configuring options for the syntax of a pattern, the
+/// NFA construction and the `BoundedBacktracker` construction. This builder
+/// is different from a general purpose regex builder in that it permits
+/// fine-grained configuration of the construction process. The trade off for
+/// this is complexity, and the possibility of setting a configuration that
+/// might not make sense. For example, there are two different UTF-8 modes:
+///
+/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
+/// whether the pattern itself can contain sub-expressions that match invalid
+/// UTF-8.
+/// * [`thompson::Config::utf8`] controls how the regex iterators themselves
+/// advance the starting position of the next search when a match with zero
+/// length is found.
+///
+/// Generally speaking, callers will want to either enable all of these or
+/// disable all of these.
+///
+/// # Example
+///
+/// This example shows how to disable UTF-8 mode in the syntax and the regex
+/// itself. This is generally what you want for matching on arbitrary bytes.
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::{self, backtrack::BoundedBacktracker},
+/// util::syntax,
+/// Match,
+/// };
+///
+/// let re = BoundedBacktracker::builder()
+/// .syntax(syntax::Config::new().utf8(false))
+/// .thompson(thompson::Config::new().utf8(false))
+/// .build(r"foo(?-u:[^b])ar.*")?;
+/// let mut cache = re.create_cache();
+///
+/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+/// let expected = Some(Ok(Match::must(0, 1..9)));
+/// let got = re.try_find_iter(&mut cache, haystack).next();
+/// assert_eq!(expected, got);
+/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
+/// // but the subsequent `.*` does not! Disabling UTF-8
+/// // on the syntax permits this.
+/// //
+/// // N.B. This example does not show the impact of
+/// // disabling UTF-8 mode on a BoundedBacktracker Config, since that
+/// // only impacts regexes that can produce matches of
+/// // length 0.
+/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap()?.range()]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ config: Config,
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler,
+}
+
+impl Builder {
+ /// Create a new BoundedBacktracker builder with its default configuration.
+ pub fn new() -> Builder {
+ Builder {
+ config: Config::default(),
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler::new(),
+ }
+ }
+
+ /// Build a `BoundedBacktracker` from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ #[cfg(feature = "syntax")]
+ pub fn build(
+ &self,
+ pattern: &str,
+ ) -> Result<BoundedBacktracker, BuildError> {
+ self.build_many(&[pattern])
+ }
+
+ /// Build a `BoundedBacktracker` from the given patterns.
+ #[cfg(feature = "syntax")]
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<BoundedBacktracker, BuildError> {
+ let nfa = self.thompson.build_many(patterns)?;
+ self.build_from_nfa(nfa)
+ }
+
+ /// Build a `BoundedBacktracker` directly from its NFA.
+ ///
+ /// Note that when using this method, any configuration that applies to the
+ /// construction of the NFA itself will of course be ignored, since the NFA
+ /// given here is already built.
+ pub fn build_from_nfa(
+ &self,
+ nfa: NFA,
+ ) -> Result<BoundedBacktracker, BuildError> {
+ nfa.look_set_any().available().map_err(BuildError::word)?;
+ Ok(BoundedBacktracker { config: self.config.clone(), nfa })
+ }
+
+ /// Apply the given `BoundedBacktracker` configuration options to this
+ /// builder.
+ pub fn configure(&mut self, config: Config) -> &mut Builder {
+ self.config = self.config.overwrite(config);
+ self
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`syntax::Config`](crate::util::syntax::Config).
+ ///
+ /// This permits setting things like case insensitivity, Unicode and multi
+ /// line mode.
+ ///
+ /// These settings only apply when constructing a `BoundedBacktracker`
+ /// directly from a pattern.
+ #[cfg(feature = "syntax")]
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::Config,
+ ) -> &mut Builder {
+ self.thompson.syntax(config);
+ self
+ }
+
+ /// Set the Thompson NFA configuration for this builder using
+ /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+ ///
+ /// This permits setting things like if additional time should be spent
+ /// shrinking the size of the NFA.
+ ///
+ /// These settings only apply when constructing a `BoundedBacktracker`
+ /// directly from a pattern.
+ #[cfg(feature = "syntax")]
+ pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+ self.thompson.configure(config);
+ self
+ }
+}
+
+/// A backtracking regex engine that bounds its execution to avoid exponential
+/// blow-up.
+///
+/// This regex engine only implements leftmost-first match semantics and
+/// only supports leftmost searches. It effectively does the same thing as a
+/// [`PikeVM`](thompson::pikevm::PikeVM), but typically does it faster because
+/// it doesn't have to worry about copying capturing group spans for most NFA
+/// states. Instead, the backtracker can maintain one set of captures (provided
+/// by the caller) and never needs to copy them. In exchange, the backtracker
+/// bounds itself to ensure it doesn't exhibit worst case exponential time.
+/// This results in the backtracker only being able to handle short haystacks
+/// given reasonable memory usage.
+///
+/// # Searches may return an error!
+///
+/// By design, this backtracking regex engine is bounded. This bound is
+/// implemented by not visiting any combination of NFA state ID and position
+/// in a haystack more than once. Thus, the total memory required to bound
+/// backtracking is proportional to `haystack.len() * nfa.states().len()`.
+/// This can obviously get quite large, since large haystacks aren't terribly
+/// uncommon. To avoid using exorbitant memory, the capacity is bounded by
+/// a fixed limit set via [`Config::visited_capacity`]. Thus, if the total
+/// capacity required for a particular regex and a haystack exceeds this
+/// capacity, then the search routine will return an error.
+///
+/// Unlike other regex engines that may return an error at search time (like
+/// the DFA or the hybrid NFA/DFA), there is no way to guarantee that a bounded
+/// backtracker will work for every haystack. Therefore, this regex engine
+/// _only_ exposes fallible search routines to avoid the footgun of panicking
+/// when running a search on a haystack that is too big.
+///
+/// If one wants to use the fallible search APIs without handling the
+/// error, the only way to guarantee an error won't occur from the
+/// haystack length is to ensure the haystack length does not exceed
+/// [`BoundedBacktracker::max_haystack_len`].
+///
+/// # Example: Unicode word boundaries
+///
+/// This example shows that the bounded backtracker implements Unicode word
+/// boundaries correctly by default.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
+///
+/// let re = BoundedBacktracker::new(r"\b\w+\b")?;
+/// let mut cache = re.create_cache();
+///
+/// let mut it = re.try_find_iter(&mut cache, "Шерлок Холмс");
+/// assert_eq!(Some(Ok(Match::must(0, 0..12))), it.next());
+/// assert_eq!(Some(Ok(Match::must(0, 13..23))), it.next());
+/// assert_eq!(None, it.next());
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: multiple regex patterns
+///
+/// The bounded backtracker supports searching for multiple patterns
+/// simultaneously, just like other regex engines. Note though that because it
+/// uses a backtracking strategy, this regex engine is unlikely to scale well
+/// as more patterns are added. But then again, as more patterns are added, the
+/// maximum haystack length allowed will also shorten (assuming the visited
+/// capacity remains invariant).
+///
+/// ```
+/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
+///
+/// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
+/// let mut cache = re.create_cache();
+///
+/// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
+/// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
+/// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
+/// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
+/// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
+/// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
+/// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
+/// assert_eq!(None, it.next());
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct BoundedBacktracker {
+ config: Config,
+ nfa: NFA,
+}
+
+impl BoundedBacktracker {
+ /// Parse the given regular expression using the default configuration and
+ /// return the corresponding `BoundedBacktracker`.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Match,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(
+ /// Some(Ok(Match::must(0, 3..14))),
+ /// re.try_find_iter(&mut cache, "zzzfoo12345barzzz").next(),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn new(pattern: &str) -> Result<BoundedBacktracker, BuildError> {
+ BoundedBacktracker::builder().build(pattern)
+ }
+
+ /// Like `new`, but parses multiple patterns into a single "multi regex."
+ /// This similarly uses the default regex configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Match,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
+ /// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
+ /// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
+ /// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
+ /// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
+ /// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
+ /// assert_eq!(None, it.next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn new_many<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<BoundedBacktracker, BuildError> {
+ BoundedBacktracker::builder().build_many(patterns)
+ }
+
+ /// Like `new`, but builds a `BoundedBacktracker` directly from an NFA.
+ /// This is useful if you already have an NFA, or if you hand-assemble
+ /// one yourself.
+ ///
+ /// # Example
+ ///
+ /// This shows how to hand assemble a regular expression via its HIR,
+ /// compile an NFA from it and build a BoundedBacktracker from the NFA.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{NFA, backtrack::BoundedBacktracker},
+ /// Match,
+ /// };
+ /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
+ ///
+ /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
+ /// ClassBytesRange::new(b'0', b'9'),
+ /// ClassBytesRange::new(b'A', b'Z'),
+ /// ClassBytesRange::new(b'_', b'_'),
+ /// ClassBytesRange::new(b'a', b'z'),
+ /// ])));
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
+ ///
+ /// let re = BoundedBacktracker::new_from_nfa(nfa)?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let expected = Some(Match::must(0, 3..4));
+ /// re.try_captures(&mut cache, "!@#A#@!", &mut caps)?;
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_from_nfa(nfa: NFA) -> Result<BoundedBacktracker, BuildError> {
+ BoundedBacktracker::builder().build_from_nfa(nfa)
+ }
+
+ /// Create a new `BoundedBacktracker` that matches every input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Match,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::always_match()?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let expected = Some(Ok(Match::must(0, 0..0)));
+ /// assert_eq!(expected, re.try_find_iter(&mut cache, "").next());
+ /// assert_eq!(expected, re.try_find_iter(&mut cache, "foo").next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn always_match() -> Result<BoundedBacktracker, BuildError> {
+ let nfa = thompson::NFA::always_match();
+ BoundedBacktracker::new_from_nfa(nfa)
+ }
+
+ /// Create a new `BoundedBacktracker` that never matches any input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
+ ///
+ /// let re = BoundedBacktracker::never_match()?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert_eq!(None, re.try_find_iter(&mut cache, "").next());
+ /// assert_eq!(None, re.try_find_iter(&mut cache, "foo").next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn never_match() -> Result<BoundedBacktracker, BuildError> {
+ let nfa = thompson::NFA::never_match();
+ BoundedBacktracker::new_from_nfa(nfa)
+ }
+
+ /// Return a default configuration for a `BoundedBacktracker`.
+ ///
+ /// This is a convenience routine to avoid needing to import the `Config`
+ /// type when customizing the construction of a `BoundedBacktracker`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to disable UTF-8 mode. When UTF-8 mode is
+ /// disabled, zero-width matches that split a codepoint are allowed.
+ /// Otherwise they are never reported.
+ ///
+ /// In the code below, notice that `""` is permitted to match positions
+ /// that split the encoding of a codepoint.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{self, backtrack::BoundedBacktracker},
+ /// Match,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::builder()
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = "a☃z";
+ /// let mut it = re.try_find_iter(&mut cache, haystack);
+ /// assert_eq!(Some(Ok(Match::must(0, 0..0))), it.next());
+ /// assert_eq!(Some(Ok(Match::must(0, 1..1))), it.next());
+ /// assert_eq!(Some(Ok(Match::must(0, 2..2))), it.next());
+ /// assert_eq!(Some(Ok(Match::must(0, 3..3))), it.next());
+ /// assert_eq!(Some(Ok(Match::must(0, 4..4))), it.next());
+ /// assert_eq!(Some(Ok(Match::must(0, 5..5))), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Return a builder for configuring the construction of a
+ /// `BoundedBacktracker`.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use the builder to disable UTF-8 mode
+ /// everywhere.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// nfa::thompson::{self, backtrack::BoundedBacktracker},
+ /// util::syntax,
+ /// Match,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::builder()
+ /// .syntax(syntax::Config::new().utf8(false))
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"foo(?-u:[^b])ar.*")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+ /// let expected = Some(Match::must(0, 1..9));
+ /// re.try_captures(&mut cache, haystack, &mut caps)?;
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+
+ /// Create a new cache for this regex.
+ ///
+ /// The cache returned should only be used for searches for this
+ /// regex. If you want to reuse the cache for another regex, then you
+ /// must call [`Cache::reset`] with that regex (or, equivalently,
+ /// [`BoundedBacktracker::reset_cache`]).
+ pub fn create_cache(&self) -> Cache {
+ Cache::new(self)
+ }
+
+ /// Create a new empty set of capturing groups that is guaranteed to be
+ /// valid for the search APIs on this `BoundedBacktracker`.
+ ///
+ /// A `Captures` value created for a specific `BoundedBacktracker` cannot
+ /// be used with any other `BoundedBacktracker`.
+ ///
+ /// This is a convenience function for [`Captures::all`]. See the
+ /// [`Captures`] documentation for an explanation of its alternative
+ /// constructors that permit the `BoundedBacktracker` to do less work
+ /// during a search, and thus might make it faster.
+ pub fn create_captures(&self) -> Captures {
+ Captures::all(self.get_nfa().group_info().clone())
+ }
+
+ /// Reset the given cache such that it can be used for searching with
+ /// this `BoundedBacktracker` (and only this `BoundedBacktracker`).
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different `BoundedBacktracker`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different
+ /// `BoundedBacktracker`.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Match,
+ /// };
+ ///
+ /// let re1 = BoundedBacktracker::new(r"\w")?;
+ /// let re2 = BoundedBacktracker::new(r"\W")?;
+ ///
+ /// let mut cache = re1.create_cache();
+ /// assert_eq!(
+ /// Some(Ok(Match::must(0, 0..2))),
+ /// re1.try_find_iter(&mut cache, "Δ").next(),
+ /// );
+ ///
+ /// // Using 'cache' with re2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the BoundedBacktracker we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 're1' is also not
+ /// // allowed.
+ /// cache.reset(&re2);
+ /// assert_eq!(
+ /// Some(Ok(Match::must(0, 0..3))),
+ /// re2.try_find_iter(&mut cache, "☃").next(),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset_cache(&self, cache: &mut Cache) {
+ cache.reset(self);
+ }
+
+ /// Returns the total number of patterns compiled into this
+ /// `BoundedBacktracker`.
+ ///
+ /// In the case of a `BoundedBacktracker` that contains no patterns, this
+ /// returns `0`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the pattern length for a `BoundedBacktracker` that
+ /// never matches:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
+ ///
+ /// let re = BoundedBacktracker::never_match()?;
+ /// assert_eq!(re.pattern_len(), 0);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And another example for a `BoundedBacktracker` that matches at every
+ /// position:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
+ ///
+ /// let re = BoundedBacktracker::always_match()?;
+ /// assert_eq!(re.pattern_len(), 1);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And finally, a `BoundedBacktracker` that was constructed from multiple
+ /// patterns:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
+ ///
+ /// let re = BoundedBacktracker::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+ /// assert_eq!(re.pattern_len(), 3);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn pattern_len(&self) -> usize {
+ self.nfa.pattern_len()
+ }
+
+ /// Return the config for this `BoundedBacktracker`.
+ #[inline]
+ pub fn get_config(&self) -> &Config {
+ &self.config
+ }
+
+ /// Returns a reference to the underlying NFA.
+ #[inline]
+ pub fn get_nfa(&self) -> &NFA {
+ &self.nfa
+ }
+
+ /// Returns the maximum haystack length supported by this backtracker.
+ ///
+ /// This routine is a function of both [`Config::visited_capacity`] and the
+ /// internal size of the backtracker's NFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows how the maximum haystack length can vary depending
+ /// on the size of the regex itself. Note though that the specific maximum
+ /// values here are not an API guarantee. The default visited capacity is
+ /// subject to change and not covered by semver.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Match, MatchError,
+ /// };
+ ///
+ /// // If you're only using ASCII, you get a big budget.
+ /// let re = BoundedBacktracker::new(r"(?-u)\w+")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(re.max_haystack_len(), 299_592);
+ /// // Things work up to the max.
+ /// let mut haystack = "a".repeat(299_592);
+ /// let expected = Some(Ok(Match::must(0, 0..299_592)));
+ /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
+ /// // But you'll get an error if you provide a haystack that's too big.
+ /// // Notice that we use the 'try_find_iter' routine instead, which
+ /// // yields Result<Match, MatchError> instead of Match.
+ /// haystack.push('a');
+ /// let expected = Some(Err(MatchError::haystack_too_long(299_593)));
+ /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
+ ///
+ /// // Unicode inflates the size of the underlying NFA quite a bit, and
+ /// // thus means that the backtracker can only handle smaller haystacks,
+ /// // assuming that the visited capacity remains unchanged.
+ /// let re = BoundedBacktracker::new(r"\w+")?;
+ /// assert!(re.max_haystack_len() <= 7_000);
+ /// // But we can increase the visited capacity to handle bigger haystacks!
+ /// let re = BoundedBacktracker::builder()
+ /// .configure(BoundedBacktracker::config().visited_capacity(1<<20))
+ /// .build(r"\w+")?;
+ /// assert!(re.max_haystack_len() >= 25_000);
+ /// assert!(re.max_haystack_len() <= 28_000);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn max_haystack_len(&self) -> usize {
+ // The capacity given in the config is "bytes of heap memory," but the
+ // capacity we use here is "number of bits." So convert the capacity in
+ // bytes to the capacity in bits.
+ let capacity = 8 * self.get_config().get_visited_capacity();
+ let blocks = div_ceil(capacity, Visited::BLOCK_SIZE);
+ let real_capacity = blocks * Visited::BLOCK_SIZE;
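+ // For example, with the default 256 KB visited capacity (2,097,152
+ // bits) and an NFA with 7 states, this works out to
+ // 2_097_152 / 7 - 1 = 299_592, which matches the `(?-u)\w+` example
+ // above. (The 7 state figure is an illustrative assumption; exact NFA
+ // sizes are an implementation detail.)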
+ (real_capacity / self.nfa.states().len()) - 1
+ }
+}
+
+impl BoundedBacktracker {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+ /// In the case of a backtracking regex engine, and unlike most other
+ /// regex engines in this crate, short circuiting isn't practical. However,
+ /// this routine may still be faster because it instructs backtracking to
+ /// not keep track of any capturing groups.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For this
+ /// backtracking regex engine, this only occurs when the haystack length
+ /// exceeds [`BoundedBacktracker::max_haystack_len`].
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
+ ///
+ /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.try_is_match(&mut cache, "foo12345bar")?);
+ /// assert!(!re.try_is_match(&mut cache, "foobar")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: consistency with search APIs
+ ///
+ /// `is_match` is guaranteed to return `true` whenever `find` returns a
+ /// match. This includes searches that are executed entirely within a
+ /// codepoint:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Input,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::new("a*")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(!re.try_is_match(&mut cache, Input::new("☃").span(1..2))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Notice that when UTF-8 mode is disabled, then the above reports a
+ /// match because the restriction against zero-width matches that split a
+ /// codepoint has been lifted:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{backtrack::BoundedBacktracker, NFA},
+ /// Input,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::builder()
+ /// .thompson(NFA::config().utf8(false))
+ /// .build("a*")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.try_is_match(&mut cache, Input::new("☃").span(1..2))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn try_is_match<'h, I: Into<Input<'h>>>(
+ &self,
+ cache: &mut Cache,
+ input: I,
+ ) -> Result<bool, MatchError> {
+ let input = input.into().earliest(true);
+ self.try_search_slots(cache, &input, &mut []).map(|pid| pid.is_some())
+ }
+
+ /// Executes a leftmost forward search and returns a `Match` if one exists.
+ ///
+ /// This routine only includes the overall match span. To get
+ /// access to the individual spans of each capturing group, use
+ /// [`BoundedBacktracker::try_captures`].
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For this
+ /// backtracking regex engine, this only occurs when the haystack length
+ /// exceeds [`BoundedBacktracker::max_haystack_len`].
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Match,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::new("foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ /// let expected = Match::must(0, 0..8);
+ /// assert_eq!(Some(expected), re.try_find(&mut cache, "foo12345")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn try_find<'h, I: Into<Input<'h>>>(
+ &self,
+ cache: &mut Cache,
+ input: I,
+ ) -> Result<Option<Match>, MatchError> {
+ let input = input.into();
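+ // When there is exactly one pattern, the overall match span lives in
+ // slots 0 and 1, so a stack-allocated pair of slots suffices and no
+ // Vec allocation is needed.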
+ if self.get_nfa().pattern_len() == 1 {
+ let mut slots = [None, None];
+ let pid = match self.try_search_slots(cache, &input, &mut slots)? {
+ None => return Ok(None),
+ Some(pid) => pid,
+ };
+ let start = match slots[0] {
+ None => return Ok(None),
+ Some(s) => s.get(),
+ };
+ let end = match slots[1] {
+ None => return Ok(None),
+ Some(s) => s.get(),
+ };
+ return Ok(Some(Match::new(pid, Span { start, end })));
+ }
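+ // With multiple patterns, we need one implicit (start, end) slot pair
+ // per pattern. The span for the matching pattern `pid` lives at slot
+ // indices `pid * 2` and `pid * 2 + 1`.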
+ let ginfo = self.get_nfa().group_info();
+ let slots_len = ginfo.implicit_slot_len();
+ let mut slots = vec![None; slots_len];
+ let pid = match self.try_search_slots(cache, &input, &mut slots)? {
+ None => return Ok(None),
+ Some(pid) => pid,
+ };
+ let start = match slots[pid.as_usize() * 2] {
+ None => return Ok(None),
+ Some(s) => s.get(),
+ };
+ let end = match slots[pid.as_usize() * 2 + 1] {
+ None => return Ok(None),
+ Some(s) => s.get(),
+ };
+ Ok(Some(Match::new(pid, Span { start, end })))
+ }
+
+ /// Executes a leftmost forward search and writes the spans of capturing
+ /// groups that participated in a match into the provided [`Captures`]
+ /// value. If no match was found, then [`Captures::is_match`] is guaranteed
+ /// to return `false`.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For this
+ /// backtracking regex engine, this only occurs when the haystack length
+ /// exceeds [`BoundedBacktracker::max_haystack_len`].
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Span,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::new(
+ /// r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$",
+ /// )?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.try_captures(&mut cache, "2010-03-14", &mut caps)?;
+ /// assert!(caps.is_match());
+ /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
+ /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
+ /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn try_captures<'h, I: Into<Input<'h>>>(
+ &self,
+ cache: &mut Cache,
+ input: I,
+ caps: &mut Captures,
+ ) -> Result<(), MatchError> {
+ self.try_search(cache, &input.into(), caps)
+ }
+
+ /// Returns an iterator over all non-overlapping leftmost matches in the
+ /// given bytes. If no match exists, then the iterator yields no elements.
+ ///
+ /// If the regex engine returns an error at any point, then the iterator
+ /// will yield that error.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Match, MatchError,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::new("foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let text = "foo1 foo12 foo123";
+ /// let result: Result<Vec<Match>, MatchError> = re
+ /// .try_find_iter(&mut cache, text)
+ /// .collect();
+ /// let matches = result?;
+ /// assert_eq!(matches, vec![
+ /// Match::must(0, 0..4),
+ /// Match::must(0, 5..10),
+ /// Match::must(0, 11..17),
+ /// ]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn try_find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
+ &'r self,
+ cache: &'c mut Cache,
+ input: I,
+ ) -> TryFindMatches<'r, 'c, 'h> {
+ let caps = Captures::matches(self.get_nfa().group_info().clone());
+ let it = iter::Searcher::new(input.into());
+ TryFindMatches { re: self, cache, caps, it }
+ }
+
+ /// Returns an iterator over all non-overlapping `Captures` values. If no
+ /// match exists, then the iterator yields no elements.
+ ///
+ /// This yields the same matches as [`BoundedBacktracker::try_find_iter`],
+ /// but it includes the spans of all capturing groups that participate in
+ /// each match.
+ ///
+ /// If the regex engine returns an error at any point, then the iterator
+ /// will yield that error.
+ ///
+ /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
+ /// how to correctly iterate over all matches in a haystack while avoiding
+ /// the creation of a new `Captures` value for every match. (Which you are
+ /// forced to do with an `Iterator`.)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Span,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::new("foo(?P<numbers>[0-9]+)")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let text = "foo1 foo12 foo123";
+ /// let mut spans = vec![];
+ /// for result in re.try_captures_iter(&mut cache, text) {
+ /// let caps = result?;
+ /// // The unwrap is OK since 'numbers' matches if the pattern matches.
+ /// spans.push(caps.get_group_by_name("numbers").unwrap());
+ /// }
+ /// assert_eq!(spans, vec![
+ /// Span::from(3..4),
+ /// Span::from(8..10),
+ /// Span::from(14..17),
+ /// ]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn try_captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
+ &'r self,
+ cache: &'c mut Cache,
+ input: I,
+ ) -> TryCapturesMatches<'r, 'c, 'h> {
+ let caps = self.create_captures();
+ let it = iter::Searcher::new(input.into());
+ TryCapturesMatches { re: self, cache, caps, it }
+ }
+}
+
+impl BoundedBacktracker {
+ /// Executes a leftmost forward search and writes the spans of capturing
+ /// groups that participated in a match into the provided [`Captures`]
+ /// value. If no match was found, then [`Captures::is_match`] is guaranteed
+ /// to return `false`.
+ ///
+ /// This is like [`BoundedBacktracker::try_captures`], but it accepts a
+ /// concrete `&Input` instead of an `Into<Input>`.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For this
+ /// backtracking regex engine, this only occurs when the haystack length
+ /// exceeds [`BoundedBacktracker::max_haystack_len`].
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example: specific pattern search
+ ///
+ /// This example shows how to build a multi bounded backtracker that
+ /// permits searching for specific patterns.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Anchored, Input, Match, PatternID,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::new_many(&[
+ /// "[a-z0-9]{6}",
+ /// "[a-z][a-z0-9]{5}",
+ /// ])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "foo123";
+ ///
+ /// // Since we are using the default leftmost-first match and both
+ /// // patterns match at the same starting position, only the first pattern
+ /// // will be returned in this case when doing a search for any of the
+ /// // patterns.
+ /// let expected = Some(Match::must(0, 0..6));
+ /// re.try_search(&mut cache, &Input::new(haystack), &mut caps)?;
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// // But if we want to check whether some other pattern matches, then we
+ /// // can provide its pattern ID.
+ /// let expected = Some(Match::must(1, 0..6));
+ /// let input = Input::new(haystack)
+ /// .anchored(Anchored::Pattern(PatternID::must(1)));
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specifying the bounds of a search
+ ///
+ /// This example shows how providing the bounds of a search can produce
+ /// different results than simply sub-slicing the haystack.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Match, Input,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::new(r"\b[0-9]{3}\b")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "foo123bar";
+ ///
+ /// // Since we sub-slice the haystack, the search doesn't know about
+ /// // the larger context and assumes that `123` is surrounded by word
+ /// // boundaries. And of course, the match position is reported relative
+ /// // to the sub-slice as well, which means we get `0..3` instead of
+ /// // `3..6`.
+ /// let expected = Some(Match::must(0, 0..3));
+ /// re.try_search(&mut cache, &Input::new(&haystack[3..6]), &mut caps)?;
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// // But if we provide the bounds of the search within the context of the
+ /// // entire haystack, then the search can take the surrounding context
+ /// // into account. (And if we did find a match, it would be reported
+ /// // as a valid offset into `haystack` instead of its sub-slice.)
+ /// let expected = None;
+ /// re.try_search(
+ /// &mut cache, &Input::new(haystack).range(3..6), &mut caps,
+ /// )?;
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn try_search(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ caps: &mut Captures,
+ ) -> Result<(), MatchError> {
+ caps.set_pattern(None);
+ let pid = self.try_search_slots(cache, input, caps.slots_mut())?;
+ caps.set_pattern(pid);
+ Ok(())
+ }
+
+ /// Executes a leftmost forward search and writes the spans of capturing
+ /// groups that participated in a match into the provided `slots`, and
+ /// returns the matching pattern ID. The contents of the slots for patterns
+ /// other than the matching pattern are unspecified. If no match was found,
+    /// then `None` is returned and the contents of all `slots` are
+    /// unspecified.
+ ///
+ /// This is like [`BoundedBacktracker::try_search`], but it accepts a raw
+ /// slots slice instead of a `Captures` value. This is useful in contexts
+ /// where you don't want or need to allocate a `Captures`.
+ ///
+ /// It is legal to pass _any_ number of slots to this routine. If the regex
+ /// engine would otherwise write a slot offset that doesn't fit in the
+ /// provided slice, then it is simply skipped. In general though, there are
+ /// usually three slice lengths you might want to use:
+ ///
+ /// * An empty slice, if you only care about which pattern matched.
+ /// * A slice with
+ /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len)
+ /// slots, if you only care about the overall match spans for each matching
+ /// pattern.
+ /// * A slice with
+ /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
+ /// permits recording match offsets for every capturing group in every
+ /// pattern.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For this
+ /// backtracking regex engine, this only occurs when the haystack length
+ /// exceeds [`BoundedBacktracker::max_haystack_len`].
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find the overall match offsets in a
+ /// multi-pattern search without allocating a `Captures` value. Indeed, we
+ /// can put our slots right on the stack.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// PatternID, Input,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::new_many(&[
+ /// r"\pL+",
+ /// r"\d+",
+ /// ])?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("!@#123");
+ ///
+ /// // We only care about the overall match offsets here, so we just
+ /// // allocate two slots for each pattern. Each slot records the start
+ /// // and end of the match.
+ /// let mut slots = [None; 4];
+ /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?;
+ /// assert_eq!(Some(PatternID::must(1)), pid);
+ ///
+ /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
+ /// // See 'GroupInfo' for more details on the mapping between groups and
+ /// // slot indices.
+ /// let slot_start = pid.unwrap().as_usize() * 2;
+ /// let slot_end = slot_start + 1;
+ /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
+ /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
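+    ///
+    /// # Example: empty slots
+    ///
+    /// As a supplementary sketch, passing an empty slot slice is the
+    /// cheapest way to learn only which pattern matched:
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson::backtrack::BoundedBacktracker,
+    ///     Input, PatternID,
+    /// };
+    ///
+    /// let re = BoundedBacktracker::new_many(&[r"[a-z]+", r"[0-9]+"])?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// // No slots are provided, so no spans are recorded, but the
+    /// // matching pattern ID is still returned.
+    /// let pid = re.try_search_slots(&mut cache, &Input::new("123"), &mut [])?;
+    /// assert_eq!(Some(PatternID::must(1)), pid);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```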
+ #[inline]
+ pub fn try_search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Result<Option<PatternID>, MatchError> {
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ if !utf8empty {
+ let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
+ return Ok(maybe_hm.map(|hm| hm.pattern()));
+ }
+ // See PikeVM::try_search_slots for why we do this.
+ let min = self.get_nfa().group_info().implicit_slot_len();
+ if slots.len() >= min {
+ let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
+ return Ok(maybe_hm.map(|hm| hm.pattern()));
+ }
+ if self.get_nfa().pattern_len() == 1 {
+ let mut enough = [None, None];
+ let got = self.try_search_slots_imp(cache, input, &mut enough)?;
+            // This is OK because we know `enough` is strictly bigger
+            // than `slots`, otherwise this special case isn't reached.
+ slots.copy_from_slice(&enough[..slots.len()]);
+ return Ok(got.map(|hm| hm.pattern()));
+ }
+ let mut enough = vec![None; min];
+ let got = self.try_search_slots_imp(cache, input, &mut enough)?;
+        // This is OK because we know `enough` is strictly bigger than
+        // `slots`, otherwise this special case isn't reached.
+ slots.copy_from_slice(&enough[..slots.len()]);
+ Ok(got.map(|hm| hm.pattern()))
+ }
+
+    /// This is the actual implementation of `try_search_slots` that
+ /// doesn't account for the special case when 1) the NFA has UTF-8 mode
+ /// enabled, 2) the NFA can match the empty string and 3) the caller has
+ /// provided an insufficient number of slots to record match offsets.
+ #[inline(never)]
+ fn try_search_slots_imp(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ let hm = match self.search_imp(cache, input, slots)? {
+ None => return Ok(None),
+ Some(hm) if !utf8empty => return Ok(Some(hm)),
+ Some(hm) => hm,
+ };
+ empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
+ Ok(self
+ .search_imp(cache, input, slots)?
+ .map(|hm| (hm, hm.offset())))
+ })
+ }
+
+ /// The implementation of standard leftmost backtracking search.
+ ///
+    /// Capturing group spans are written to 'slots', but only if requested.
+    /// 'slots' can be one of three things: 1) totally empty, in which case
+    /// we only report the pattern that matched, 2) big enough to record only
+    /// the overall match offsets for any pattern, or 3) big enough to record
+    /// the spans of any groups participating in a match.
+ fn search_imp(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+        // Unlike in the PikeVM, we write our capturing group spans directly
+        // into the caller's provided slots. So we have to make sure we're
+        // starting with a blank slate first. The PikeVM avoids this by
+        // construction: the spans that are copied to every slot in the
+        // 'Captures' value already account for presence/absence. That is,
+        // the PikeVM writes into scratch space first and only copies spans
+        // to the caller's slots when a match is found, whereas this
+        // backtracker writes directly into the caller's slots.
+ for slot in slots.iter_mut() {
+ *slot = None;
+ }
+ cache.setup_search(&self, input)?;
+ if input.is_done() {
+ return Ok(None);
+ }
+ let (anchored, start_id) = match input.get_anchored() {
+ // Only way we're unanchored is if both the caller asked for an
+ // unanchored search *and* the pattern is itself not anchored.
+ Anchored::No => (
+ self.nfa.is_always_start_anchored(),
+ // We always use the anchored starting state here, even if
+ // doing an unanchored search. The "unanchored" part of it is
+ // implemented in the loop below, by simply trying the next
+ // byte offset if the previous backtracking exploration failed.
+ self.nfa.start_anchored(),
+ ),
+ Anchored::Yes => (true, self.nfa.start_anchored()),
+ Anchored::Pattern(pid) => match self.nfa.start_pattern(pid) {
+ None => return Ok(None),
+ Some(sid) => (true, sid),
+ },
+ };
+ if anchored {
+ let at = input.start();
+ return Ok(self.backtrack(cache, input, at, start_id, slots));
+ }
+ let pre = self.get_config().get_prefilter();
+ let mut at = input.start();
+ while at <= input.end() {
+ if let Some(ref pre) = pre {
+ let span = Span::from(at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => break,
+ Some(ref span) => at = span.start,
+ }
+ }
+ if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
+ {
+ return Ok(Some(hm));
+ }
+ at += 1;
+ }
+ Ok(None)
+ }
+
+    /// Look for a match starting at `at` in `input`. The spans of any
+    /// matching groups are written to `slots`, and the matching pattern ID
+    /// (if any) is returned via `HalfMatch`. The search uses `start_id` as
+    /// its starting state in the underlying NFA.
+ ///
+ /// If no match was found, then the caller should increment `at` and try
+ /// at the next position.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn backtrack(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ at: usize,
+ start_id: StateID,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<HalfMatch> {
+ cache.stack.push(Frame::Step { sid: start_id, at });
+ while let Some(frame) = cache.stack.pop() {
+ match frame {
+ Frame::Step { sid, at } => {
+ if let Some(hm) = self.step(cache, input, sid, at, slots) {
+ return Some(hm);
+ }
+ }
+ Frame::RestoreCapture { slot, offset } => {
+ slots[slot] = offset;
+ }
+ }
+ }
+ None
+ }
+
+ // LAMENTATION: The actual backtracking search is implemented in about
+ // 75 lines below. Yet this file is over 2,000 lines long. What have I
+ // done?
+
+ /// Execute a "step" in the backtracing algorithm.
+ ///
+ /// A "step" is somewhat of a misnomer, because this routine keeps going
+ /// until it either runs out of things to try or fins a match. In the
+ /// former case, it may have pushed some things on to the backtracking
+ /// stack, in which case, those will be tried next as part of the
+ /// 'backtrack' routine above.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn step(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ mut sid: StateID,
+ mut at: usize,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<HalfMatch> {
+ loop {
+ if !cache.visited.insert(sid, at - input.start()) {
+ return None;
+ }
+ match *self.nfa.state(sid) {
+ State::ByteRange { ref trans } => {
+ // Why do we need this? Unlike other regex engines in this
+                    // crate, the backtracker can steamroll ahead in the
+ // haystack outside of the main loop over the bytes in the
+ // haystack. While 'trans.matches()' below handles the case
+ // of 'at' being out of bounds of 'input.haystack()', we
+ // also need to handle the case of 'at' going out of bounds
+ // of the span the caller asked to search.
+ //
+ // We should perhaps make the 'trans.matches()' API accept
+ // an '&Input' instead of a '&[u8]'. Or at least, add a new
+ // API that does it.
+ if at >= input.end() {
+ return None;
+ }
+ if !trans.matches(input.haystack(), at) {
+ return None;
+ }
+ sid = trans.next;
+ at += 1;
+ }
+ State::Sparse(ref sparse) => {
+ if at >= input.end() {
+ return None;
+ }
+ sid = sparse.matches(input.haystack(), at)?;
+ at += 1;
+ }
+ State::Dense(ref dense) => {
+ if at >= input.end() {
+ return None;
+ }
+ sid = dense.matches(input.haystack(), at)?;
+ at += 1;
+ }
+ State::Look { look, next } => {
+ // OK because we don't permit building a searcher with a
+ // Unicode word boundary if the requisite Unicode data is
+ // unavailable.
+ if !self.nfa.look_matcher().matches_inline(
+ look,
+ input.haystack(),
+ at,
+ ) {
+ return None;
+ }
+ sid = next;
+ }
+ State::Union { ref alternates } => {
+ sid = match alternates.get(0) {
+ None => return None,
+ Some(&sid) => sid,
+ };
+ cache.stack.extend(
+ alternates[1..]
+ .iter()
+ .copied()
+ .rev()
+ .map(|sid| Frame::Step { sid, at }),
+ );
+ }
+ State::BinaryUnion { alt1, alt2 } => {
+ sid = alt1;
+ cache.stack.push(Frame::Step { sid: alt2, at });
+ }
+ State::Capture { next, slot, .. } => {
+ if slot.as_usize() < slots.len() {
+ cache.stack.push(Frame::RestoreCapture {
+ slot,
+ offset: slots[slot],
+ });
+ slots[slot] = NonMaxUsize::new(at);
+ }
+ sid = next;
+ }
+ State::Fail => return None,
+ State::Match { pattern_id } => {
+ return Some(HalfMatch::new(pattern_id, at));
+ }
+ }
+ }
+ }
+}
+
+/// An iterator over all non-overlapping matches for a fallible search.
+///
+/// The iterator yields a `Result<Match, MatchError>` value until no more
+/// matches could be found.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'r` represents the lifetime of the BoundedBacktracker.
+/// * `'c` represents the lifetime of the BoundedBacktracker's cache.
+/// * `'h` represents the lifetime of the haystack being searched.
+///
+/// This iterator can be created with the [`BoundedBacktracker::try_find_iter`]
+/// method.
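+///
+/// # Example
+///
+/// A minimal sketch of draining this iterator by collecting its fallible
+/// results:
+///
+/// ```
+/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
+///
+/// let re = BoundedBacktracker::new(r"[0-9]+")?;
+/// let mut cache = re.create_cache();
+/// let matches = re
+///     .try_find_iter(&mut cache, "a1b22c333")
+///     .collect::<Result<Vec<_>, _>>()?;
+/// assert_eq!(
+///     vec![
+///         Match::must(0, 1..2),
+///         Match::must(0, 3..5),
+///         Match::must(0, 6..9),
+///     ],
+///     matches,
+/// );
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```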
+#[derive(Debug)]
+pub struct TryFindMatches<'r, 'c, 'h> {
+ re: &'r BoundedBacktracker,
+ cache: &'c mut Cache,
+ caps: Captures,
+ it: iter::Searcher<'h>,
+}
+
+impl<'r, 'c, 'h> Iterator for TryFindMatches<'r, 'c, 'h> {
+ type Item = Result<Match, MatchError>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Result<Match, MatchError>> {
+ // Splitting 'self' apart seems necessary to appease borrowck.
+ let TryFindMatches { re, ref mut cache, ref mut caps, ref mut it } =
+ *self;
+ it.try_advance(|input| {
+ re.try_search(cache, input, caps)?;
+ Ok(caps.get_match())
+ })
+ .transpose()
+ }
+}
+
+/// An iterator over all non-overlapping leftmost matches, with their capturing
+/// groups, for a fallible search.
+///
+/// The iterator yields a `Result<Captures, MatchError>` value until no more
+/// matches could be found.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'r` represents the lifetime of the BoundedBacktracker.
+/// * `'c` represents the lifetime of the BoundedBacktracker's cache.
+/// * `'h` represents the lifetime of the haystack being searched.
+///
+/// This iterator can be created with the
+/// [`BoundedBacktracker::try_captures_iter`] method.
+#[derive(Debug)]
+pub struct TryCapturesMatches<'r, 'c, 'h> {
+ re: &'r BoundedBacktracker,
+ cache: &'c mut Cache,
+ caps: Captures,
+ it: iter::Searcher<'h>,
+}
+
+impl<'r, 'c, 'h> Iterator for TryCapturesMatches<'r, 'c, 'h> {
+ type Item = Result<Captures, MatchError>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Result<Captures, MatchError>> {
+ // Splitting 'self' apart seems necessary to appease borrowck.
+ let TryCapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
+ *self;
+ let _ = it
+ .try_advance(|input| {
+ re.try_search(cache, input, caps)?;
+ Ok(caps.get_match())
+ })
+ .transpose()?;
+ if caps.is_match() {
+ Some(Ok(caps.clone()))
+ } else {
+ None
+ }
+ }
+}
+
+/// A cache represents mutable state that a [`BoundedBacktracker`] requires
+/// during a search.
+///
+/// For a given [`BoundedBacktracker`], its corresponding cache may be created
+/// either via [`BoundedBacktracker::create_cache`], or via [`Cache::new`].
+/// They are equivalent in every way, except the former does not require
+/// explicitly importing `Cache`.
+///
+/// A particular `Cache` is coupled with the [`BoundedBacktracker`] from which
+/// it was created. It may only be used with that `BoundedBacktracker`. A cache
+/// and its allocations may be re-purposed via [`Cache::reset`], in which case,
+/// it can only be used with the new `BoundedBacktracker` (and not the old
+/// one).
+#[derive(Clone, Debug)]
+pub struct Cache {
+ /// Stack used on the heap for doing backtracking instead of the
+ /// traditional recursive approach. We don't want recursion because then
+ /// we're likely to hit a stack overflow for bigger regexes.
+ stack: Vec<Frame>,
+ /// The set of (StateID, HaystackOffset) pairs that have been visited
+ /// by the backtracker within a single search. If such a pair has been
+ /// visited, then we avoid doing the work for that pair again. This is
+ /// what "bounds" the backtracking and prevents it from having worst case
+ /// exponential time.
+ visited: Visited,
+}
+
+impl Cache {
+ /// Create a new [`BoundedBacktracker`] cache.
+ ///
+ /// A potentially more convenient routine to create a cache is
+ /// [`BoundedBacktracker::create_cache`], as it does not require also
+ /// importing the `Cache` type.
+ ///
+ /// If you want to reuse the returned `Cache` with some other
+ /// `BoundedBacktracker`, then you must call [`Cache::reset`] with the
+ /// desired `BoundedBacktracker`.
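+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of creating a cache directly and using it for a
+    /// search:
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson::backtrack::{BoundedBacktracker, Cache},
+    ///     Match,
+    /// };
+    ///
+    /// let re = BoundedBacktracker::new(r"[0-9]+")?;
+    /// let mut cache = Cache::new(&re);
+    /// assert_eq!(
+    ///     Some(Ok(Match::must(0, 1..2))),
+    ///     re.try_find_iter(&mut cache, "a1b22").next(),
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```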
+ pub fn new(re: &BoundedBacktracker) -> Cache {
+ Cache { stack: vec![], visited: Visited::new(re) }
+ }
+
+ /// Reset this cache such that it can be used for searching with different
+ /// [`BoundedBacktracker`].
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different `BoundedBacktracker`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different
+ /// `BoundedBacktracker`.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Match,
+ /// };
+ ///
+ /// let re1 = BoundedBacktracker::new(r"\w")?;
+ /// let re2 = BoundedBacktracker::new(r"\W")?;
+ ///
+ /// let mut cache = re1.create_cache();
+ /// assert_eq!(
+ /// Some(Ok(Match::must(0, 0..2))),
+ /// re1.try_find_iter(&mut cache, "Δ").next(),
+ /// );
+ ///
+ /// // Using 'cache' with re2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the BoundedBacktracker we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 're1' is also not
+ /// // allowed.
+ /// cache.reset(&re2);
+ /// assert_eq!(
+ /// Some(Ok(Match::must(0, 0..3))),
+ /// re2.try_find_iter(&mut cache, "☃").next(),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset(&mut self, re: &BoundedBacktracker) {
+ self.visited.reset(re);
+ }
+
+ /// Returns the heap memory usage, in bytes, of this cache.
+ ///
+ /// This does **not** include the stack size used up by this cache. To
+ /// compute that, use `std::mem::size_of::<Cache>()`.
+ pub fn memory_usage(&self) -> usize {
+ self.stack.len() * core::mem::size_of::<Frame>()
+ + self.visited.memory_usage()
+ }
+
+ /// Clears this cache. This should be called at the start of every search
+ /// to ensure we start with a clean slate.
+ ///
+    /// This also prepares the 'visited' set for the bounds of the given
+    /// search. This is where the haystack length check happens: if the
+    /// haystack is too long for the configured visited capacity, then an
+    /// error is returned here.
+ fn setup_search(
+ &mut self,
+ re: &BoundedBacktracker,
+ input: &Input<'_>,
+ ) -> Result<(), MatchError> {
+ self.stack.clear();
+ self.visited.setup_search(re, input)?;
+ Ok(())
+ }
+}
+
+/// Represents a stack frame on the heap while doing backtracking.
+///
+/// Instead of using explicit recursion for backtracking, we use a stack on
+/// the heap to keep track of things that we want to explore if the current
+/// backtracking branch turns out to not lead to a match.
+#[derive(Clone, Debug)]
+enum Frame {
+ /// Look for a match starting at `sid` and the given position in the
+ /// haystack.
+ Step { sid: StateID, at: usize },
+ /// Reset the given `slot` to the given `offset` (which might be `None`).
+ /// This effectively gives a "scope" to capturing groups, such that an
+ /// offset for a particular group only gets returned if the match goes
+ /// through that capturing group. If backtracking ends up going down a
+ /// different branch that results in a different offset (or perhaps none at
+ /// all), then this "restore capture" frame will cause the offset to get
+ /// reset.
+ RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
+}
+
+/// A bitset that keeps track of whether a particular (StateID, offset) has
+/// been considered during backtracking. If it has already been visited, then
+/// backtracking skips it. This is what gives backtracking its "bound."
+#[derive(Clone, Debug)]
+struct Visited {
+ /// The actual underlying bitset. Each element in the bitset corresponds
+ /// to a particular (StateID, offset) pair. States correspond to the rows
+ /// and the offsets correspond to the columns.
+ ///
+ /// If our underlying NFA has N states and the haystack we're searching
+ /// has M bytes, then we have N*(M+1) entries in our bitset table. The
+ /// M+1 occurs because our matches are delayed by one byte (to support
+ /// look-around), and so we need to handle the end position itself rather
+ /// than stopping just before the end. (If there is no end position, then
+ /// it's treated as "end-of-input," which is matched by things like '$'.)
+ ///
+ /// Given BITS=N*(M+1), we wind up with div_ceil(BITS, sizeof(usize))
+ /// blocks.
+ ///
+ /// We use 'usize' to represent our blocks because it makes some of the
+ /// arithmetic in 'insert' a bit nicer. For example, if we used 'u32' for
+ /// our block, we'd either need to cast u32s to usizes or usizes to u32s.
+ bitset: Vec<usize>,
+    /// The stride represents one plus the length of the haystack we're
+    /// searching (as described above). The stride must be initialized for
+    /// each search.
+ stride: usize,
+}
+
+impl Visited {
+ /// The size of each block, in bits.
+ const BLOCK_SIZE: usize = 8 * core::mem::size_of::<usize>();
+
+ /// Create a new visited set for the given backtracker.
+ ///
+ /// The set is ready to use, but must be setup at the beginning of each
+ /// search by calling `setup_search`.
+ fn new(re: &BoundedBacktracker) -> Visited {
+ let mut visited = Visited { bitset: vec![], stride: 0 };
+ visited.reset(re);
+ visited
+ }
+
+ /// Insert the given (StateID, offset) pair into this set. If it already
+ /// exists, then this is a no-op and it returns false. Otherwise this
+ /// returns true.
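+    ///
+    /// As a worked illustration of the arithmetic below: with a stride of 8
+    /// (i.e., a haystack of length 7) and 64-bit blocks, the pair
+    /// (StateID=5, at=3) maps to table index 5*8 + 3 = 43, which is bit 43
+    /// of block 0.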
+ fn insert(&mut self, sid: StateID, at: usize) -> bool {
+ let table_index = sid.as_usize() * self.stride + at;
+ let block_index = table_index / Visited::BLOCK_SIZE;
+ let bit = table_index % Visited::BLOCK_SIZE;
+ let block_with_bit = 1 << bit;
+ if self.bitset[block_index] & block_with_bit != 0 {
+ return false;
+ }
+ self.bitset[block_index] |= block_with_bit;
+ true
+ }
+
+ /// Reset this visited set to work with the given bounded backtracker.
+ fn reset(&mut self, _: &BoundedBacktracker) {
+ self.bitset.truncate(0);
+ }
+
+ /// Setup this visited set to work for a search using the given NFA
+ /// and input configuration. The NFA must be the same NFA used by the
+ /// BoundedBacktracker given to Visited::reset. Failing to call this might
+ /// result in panics or silently incorrect search behavior.
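+    ///
+    /// As a worked illustration of the capacity check below: if the
+    /// configured visited capacity were 256KB, then up to
+    /// 8 * 262144 = 2097152 (StateID, offset) pairs could be tracked, so an
+    /// NFA with 100 states would permit haystacks of up to about 20,970
+    /// bytes before the search is rejected as too long.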
+ fn setup_search(
+ &mut self,
+ re: &BoundedBacktracker,
+ input: &Input<'_>,
+ ) -> Result<(), MatchError> {
+ // Our haystack length is only the length of the span of the entire
+ // haystack that we'll be searching.
+ let haylen = input.get_span().len();
+ let err = || MatchError::haystack_too_long(haylen);
+ // Our stride is one more than the length of the input because our main
+ // search loop includes the position at input.end(). (And it does this
+ // because matches are delayed by one byte to account for look-around.)
+ self.stride = haylen + 1;
+ let needed_capacity =
+ match re.get_nfa().states().len().checked_mul(self.stride) {
+ None => return Err(err()),
+ Some(capacity) => capacity,
+ };
+ let max_capacity = 8 * re.get_config().get_visited_capacity();
+ if needed_capacity > max_capacity {
+ return Err(err());
+ }
+ let needed_blocks = div_ceil(needed_capacity, Visited::BLOCK_SIZE);
+ self.bitset.truncate(needed_blocks);
+ for block in self.bitset.iter_mut() {
+ *block = 0;
+ }
+ if needed_blocks > self.bitset.len() {
+ self.bitset.resize(needed_blocks, 0);
+ }
+ Ok(())
+ }
+
+ /// Return the heap memory usage, in bytes, of this visited set.
+ fn memory_usage(&self) -> usize {
+ self.bitset.len() * core::mem::size_of::<usize>()
+ }
+}
+
+/// Integer division, but rounds up instead of down.
+fn div_ceil(lhs: usize, rhs: usize) -> usize {
+ if lhs % rhs == 0 {
+ lhs / rhs
+ } else {
+ (lhs / rhs) + 1
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/thompson/builder.rs b/vendor/regex-automata/src/nfa/thompson/builder.rs
new file mode 100644
index 000000000..b57e5bc0f
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/thompson/builder.rs
@@ -0,0 +1,1337 @@
+use core::mem;
+
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use crate::{
+ nfa::thompson::{
+ error::BuildError,
+ nfa::{self, SparseTransitions, Transition, NFA},
+ },
+ util::{
+ look::{Look, LookMatcher},
+ primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
+ },
+};
+
+/// An intermediate NFA state used during construction.
+///
+/// During construction of an NFA, it is often convenient to work with states
+/// that are amenable to mutation and that carry more information than we
+/// otherwise need once an NFA has been built. This type represents those
+/// needs.
+///
+/// Once construction is finished, the builder will convert these states to a
+/// [`nfa::thompson::State`](crate::nfa::thompson::State). This conversion not
+/// only results in a simpler representation, but in some cases, entire classes
+/// of states are completely removed (such as [`State::Empty`]).
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum State {
+ /// An empty state whose only purpose is to forward the automaton to
+ /// another state via an unconditional epsilon transition.
+ ///
+ /// Unconditional epsilon transitions are quite useful during the
+ /// construction of an NFA, as they permit the insertion of no-op
+ /// placeholders that make it easier to compose NFA sub-graphs. When
+ /// the Thompson NFA builder produces a final NFA, all unconditional
+ /// epsilon transitions are removed, and state identifiers are remapped
+ /// accordingly.
+ Empty {
+ /// The next state that this state should transition to.
+ next: StateID,
+ },
+ /// A state that only transitions to another state if the current input
+ /// byte is in a particular range of bytes.
+ ByteRange { trans: Transition },
+ /// A state with possibly many transitions, represented in a sparse
+ /// fashion. Transitions must be ordered lexicographically by input range
+ /// and be non-overlapping. As such, this may only be used when every
+ /// transition has equal priority. (In practice, this is only used for
+ /// encoding large UTF-8 automata.) In contrast, a `Union` state has each
+ /// alternate in order of priority. Priority is used to implement greedy
+ /// matching and also alternations themselves, e.g., `abc|a` where `abc`
+ /// has priority over `a`.
+ ///
+ /// To clarify, it is possible to remove `Sparse` and represent all things
+ /// that `Sparse` is used for via `Union`. But this creates a more bloated
+ /// NFA with more epsilon transitions than is necessary in the special case
+ /// of character classes.
+ Sparse { transitions: Vec<Transition> },
+ /// A conditional epsilon transition satisfied via some sort of
+ /// look-around.
+ Look { look: Look, next: StateID },
+ /// An empty state that records the start of a capture location. This is an
+ /// unconditional epsilon transition like `Empty`, except it can be used to
+    /// record position information for a capture group when using the NFA for
+ /// search.
+ CaptureStart {
+        /// The ID of the pattern in which this capture was defined.
+ pattern_id: PatternID,
+ /// The capture group index that this capture state corresponds to.
+ /// The capture group index is always relative to its corresponding
+ /// pattern. Therefore, in the presence of multiple patterns, both the
+ /// pattern ID and the capture group index are required to uniquely
+ /// identify a capturing group.
+ group_index: SmallIndex,
+ /// The next state that this state should transition to.
+ next: StateID,
+ },
+ /// An empty state that records the end of a capture location. This is an
+ /// unconditional epsilon transition like `Empty`, except it can be used to
+    /// record position information for a capture group when using the NFA for
+ /// search.
+ CaptureEnd {
+ /// The ID of the pattern that this capture was defined.
+ pattern_id: PatternID,
+ /// The capture group index that this capture state corresponds to.
+ /// The capture group index is always relative to its corresponding
+ /// pattern. Therefore, in the presence of multiple patterns, both the
+ /// pattern ID and the capture group index are required to uniquely
+ /// identify a capturing group.
+ group_index: SmallIndex,
+ /// The next state that this state should transition to.
+ next: StateID,
+ },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via earlier transitions
+ /// are preferred over later transitions.
+ Union { alternates: Vec<StateID> },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via later transitions are
+ /// preferred over earlier transitions.
+ ///
+ /// This "reverse" state exists for convenience during compilation that
+ /// permits easy construction of non-greedy combinations of NFA states. At
+ /// the end of compilation, Union and UnionReverse states are merged into
+ /// one Union type of state, where the latter has its epsilon transitions
+ /// reversed to reflect the priority inversion.
+ ///
+ /// The "convenience" here arises from the fact that as new states are
+ /// added to the list of `alternates`, we would like that add operation
+ /// to be amortized constant time. But if we used a `Union`, we'd need to
+ /// prepend the state, which takes O(n) time. There are other approaches we
+ /// could use to solve this, but this seems simple enough.
+ UnionReverse { alternates: Vec<StateID> },
+ /// A state that cannot be transitioned out of. This is useful for cases
+ /// where you want to prevent matching from occurring. For example, if your
+ /// regex parser permits empty character classes, then one could choose a
+ /// `Fail` state to represent it.
+ Fail,
+ /// A match state. There is at most one such occurrence of this state in
+ /// an NFA for each pattern compiled into the NFA. At time of writing, a
+ /// match state is always produced for every pattern given, but in theory,
+ /// if a pattern can never lead to a match, then the match state could be
+ /// omitted.
+ ///
+ /// `pattern_id` refers to the ID of the pattern itself, which corresponds
+ /// to the pattern's index (starting at 0).
+ Match { pattern_id: PatternID },
+}
+
+impl State {
+    /// If this state is an unconditional epsilon transition, then this returns
+ /// the target of the transition.
+ fn goto(&self) -> Option<StateID> {
+ match *self {
+ State::Empty { next } => Some(next),
+ State::Union { ref alternates } if alternates.len() == 1 => {
+ Some(alternates[0])
+ }
+ State::UnionReverse { ref alternates }
+ if alternates.len() == 1 =>
+ {
+ Some(alternates[0])
+ }
+ _ => None,
+ }
+ }
+
+ /// Returns the heap memory usage, in bytes, of this state.
+ fn memory_usage(&self) -> usize {
+ match *self {
+ State::Empty { .. }
+ | State::ByteRange { .. }
+ | State::Look { .. }
+ | State::CaptureStart { .. }
+ | State::CaptureEnd { .. }
+ | State::Fail
+ | State::Match { .. } => 0,
+ State::Sparse { ref transitions } => {
+ transitions.len() * mem::size_of::<Transition>()
+ }
+ State::Union { ref alternates } => {
+ alternates.len() * mem::size_of::<StateID>()
+ }
+ State::UnionReverse { ref alternates } => {
+ alternates.len() * mem::size_of::<StateID>()
+ }
+ }
+ }
+}
+
+/// An abstraction for building Thompson NFAs by hand.
+///
+/// A builder is what a [`thompson::Compiler`](crate::nfa::thompson::Compiler)
+/// uses internally to translate a regex's high-level intermediate
+/// representation into an [`NFA`].
+///
+/// The primary function of this builder is to abstract away the internal
+/// representation of an NFA and make it difficult to produce NFAs that are
+/// internally invalid or inconsistent. This builder also provides a way to
+/// add "empty" states (which can be thought of as unconditional epsilon
+/// transitions), despite the fact that [`thompson::State`](nfa::State) does
+/// not have any "empty" representation. The advantage of "empty" states is
+/// that they make the code for constructing a Thompson NFA logically simpler.
+///
+/// Many of the routines on this builder may panic or return errors. Generally
+/// speaking, panics occur when an invalid sequence of method calls is made,
+/// whereas an error occurs if things get too big. (Where "too big" might mean
+/// exhausting identifier space or using up too much heap memory in accordance
+/// with the configured [`size_limit`](Builder::set_size_limit).)
+///
+/// # Overview
+///
+/// ## Adding multiple patterns
+///
+/// Each pattern you add to an NFA should correspond to a pair of
+/// [`Builder::start_pattern`] and [`Builder::finish_pattern`] calls, with
+/// calls in between that add NFA states for that pattern. NFA states may be
+/// added without first calling `start_pattern`, with the exception of adding
+/// capturing states.
+///
+/// ## Adding NFA states
+///
+/// Here is a very brief overview of each of the methods that add NFA states.
+/// Every method adds a single state.
+///
+/// * [`add_empty`](Builder::add_empty): Add a state with a single
+/// unconditional epsilon transition to another state.
+/// * [`add_union`](Builder::add_union): Adds a state with unconditional
+/// epsilon transitions to two or more states, with earlier transitions
+/// preferred over later ones.
+/// * [`add_union_reverse`](Builder::add_union_reverse): Adds a state with
+/// unconditional epsilon transitions to two or more states, with later
+/// transitions preferred over earlier ones.
+/// * [`add_range`](Builder::add_range): Adds a state with a single transition
+/// to another state that can only be followed if the current input byte is
+/// within the range given.
+/// * [`add_sparse`](Builder::add_sparse): Adds a state with two or more
+/// range transitions to other states, where a transition is only followed
+/// if the current input byte is within one of the ranges. All transitions
+/// in this state have equal priority, and the corresponding ranges must be
+/// non-overlapping.
+/// * [`add_look`](Builder::add_look): Adds a state with a single *conditional*
+/// epsilon transition to another state, where the condition depends on a
+/// limited look-around property.
+/// * [`add_capture_start`](Builder::add_capture_start): Adds a state with
+/// a single unconditional epsilon transition that also instructs an NFA
+/// simulation to record the current input position to a specific location in
+/// memory. This is intended to represent the starting location of a capturing
+/// group.
+/// * [`add_capture_end`](Builder::add_capture_end): Adds a state with
+/// a single unconditional epsilon transition that also instructs an NFA
+/// simulation to record the current input position to a specific location in
+/// memory. This is intended to represent the ending location of a capturing
+/// group.
+/// * [`add_fail`](Builder::add_fail): Adds a state that never transitions to
+/// another state.
+/// * [`add_match`](Builder::add_match): Add a state that indicates a match has
+/// been found for a particular pattern. A match state is a final state with
+/// no outgoing transitions.
+///
+/// ## Setting transitions between NFA states
+///
+/// The [`Builder::patch`] method creates a transition from one state to the
+/// next. If the `from` state corresponds to a state that supports multiple
+/// outgoing transitions (such as "union"), then this adds the corresponding
+/// transition. Otherwise, it sets the single transition. (This routine panics
+/// if `from` corresponds to a state added by `add_sparse`, since sparse states
+/// need more specialized handling.)
+///
+/// # Example
+///
+/// This annotated example shows how to hand construct the regex `[a-z]+`
+/// (without an unanchored prefix).
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::{pikevm::PikeVM, Builder, Transition},
+/// util::primitives::StateID,
+/// Match,
+/// };
+///
+/// let mut builder = Builder::new();
+/// // Before adding NFA states for our pattern, we need to tell the builder
+/// // that we are starting the pattern.
+/// builder.start_pattern()?;
+/// // Since we use the Pike VM below for searching, we need to add capturing
+/// // states. If you're just going to build a DFA from the NFA, then capturing
+/// // states do not need to be added.
+/// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
+/// let range = builder.add_range(Transition {
+/// // We don't know the state ID of the 'next' state yet, so we just fill
+/// // in a dummy 'ZERO' value.
+/// start: b'a', end: b'z', next: StateID::ZERO,
+/// })?;
+/// // This state will point back to 'range', but also enable us to move ahead.
+/// // That is, this implements the '+' repetition operator. We add 'range' and
+/// // then 'end' below to this alternation.
+/// let alt = builder.add_union(vec![])?;
+/// // The final state before the match state, which serves to capture the
+/// // end location of the match.
+/// let end = builder.add_capture_end(StateID::ZERO, 0)?;
+/// // The match state for our pattern.
+/// let mat = builder.add_match()?;
+/// // Now we fill in the transitions between states.
+/// builder.patch(start, range)?;
+/// builder.patch(range, alt)?;
+/// // If we added 'end' before 'range', then we'd implement non-greedy
+/// // matching, i.e., '+?'.
+/// builder.patch(alt, range)?;
+/// builder.patch(alt, end)?;
+/// builder.patch(end, mat)?;
+/// // We must explicitly finish pattern and provide the starting state ID for
+/// // this particular pattern.
+/// builder.finish_pattern(start)?;
+/// // Finally, when we build the NFA, we provide the anchored and unanchored
+/// // starting state IDs. Since we didn't bother with an unanchored prefix
+/// // here, we only support anchored searching. Thus, both starting states are
+/// // the same.
+/// let nfa = builder.build(start, start)?;
+///
+/// // Now build a Pike VM from our NFA, and use it for searching. This shows
+/// // how we can use a regex engine without ever worrying about syntax!
+/// let re = PikeVM::new_from_nfa(nfa)?;
+/// let mut cache = re.create_cache();
+/// let mut caps = re.create_captures();
+/// let expected = Some(Match::must(0, 0..3));
+/// re.captures(&mut cache, "foo0", &mut caps);
+/// assert_eq!(expected, caps.get_match());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug, Default)]
+pub struct Builder {
+ /// The ID of the pattern that we're currently building.
+ ///
+ /// Callers are required to set (and unset) this by calling
+ /// {start,finish}_pattern. Otherwise, most methods will panic.
+ pattern_id: Option<PatternID>,
+ /// A sequence of intermediate NFA states. Once a state is added to this
+ /// sequence, it is assigned a state ID equivalent to its index. Once a
+ /// state is added, it is still expected to be mutated, e.g., to set its
+ /// transition to a state that didn't exist at the time it was added.
+ states: Vec<State>,
+ /// The starting states for each individual pattern. Starting at any
+ /// of these states will result in only an anchored search for the
+ /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
+ /// contains a single regex, then `start_pattern[0]` and `start_anchored`
+ /// are always equivalent.
+ start_pattern: Vec<StateID>,
+ /// A map from pattern ID to capture group index to name. (If no name
+ /// exists, then a None entry is present. Thus, all capturing groups are
+ /// present in this mapping.)
+ ///
+ /// The outer vec is indexed by pattern ID, while the inner vec is indexed
+ /// by capture index offset for the corresponding pattern.
+ ///
+ /// The first capture group for each pattern is always unnamed and is thus
+ /// always None.
+ captures: Vec<Vec<Option<Arc<str>>>>,
+ /// The combined memory used by each of the 'State's in 'states'. This
+ /// only includes heap usage by each state, and not the size of the state
+ /// itself. In other words, this tracks heap memory used that isn't
+ /// captured via `size_of::<State>() * states.len()`.
+ memory_states: usize,
+ /// Whether this NFA only matches UTF-8 and whether regex engines using
+ /// this NFA for searching should report empty matches that split a
+ /// codepoint.
+ utf8: bool,
+ /// Whether this NFA should be matched in reverse or not.
+ reverse: bool,
+ /// The matcher to use for look-around assertions.
+ look_matcher: LookMatcher,
+ /// A size limit to respect when building an NFA. If the total heap memory
+ /// of the intermediate NFA states exceeds (or would exceed) this amount,
+ /// then an error is returned.
+ size_limit: Option<usize>,
+}
+
+impl Builder {
+ /// Create a new builder for hand-assembling NFAs.
+ pub fn new() -> Builder {
+ Builder::default()
+ }
+
+ /// Clear this builder.
+ ///
+ /// Clearing removes all state associated with building an NFA, but does
+ /// not reset configuration (such as size limits and whether the NFA
+ /// should only match UTF-8). After clearing, the builder can be reused to
+ /// assemble an entirely new NFA.
+ pub fn clear(&mut self) {
+ self.pattern_id = None;
+ self.states.clear();
+ self.start_pattern.clear();
+ self.captures.clear();
+ self.memory_states = 0;
+ }
+
+ /// Assemble a [`NFA`] from the states added so far.
+ ///
+ /// After building an NFA, more states may be added and `build` may be
+ /// called again. To reuse a builder to produce an entirely new NFA from
+ /// scratch, call the [`clear`](Builder::clear) method first.
+ ///
+ /// `start_anchored` refers to the ID of the starting state that anchored
+    /// searches should use. That is, searches whose matches are limited to the
+ /// starting position of the search.
+ ///
+ /// `start_unanchored` refers to the ID of the starting state that
+ /// unanchored searches should use. This permits searches to report matches
+ /// that start after the beginning of the search. In cases where unanchored
+ /// searches are not supported, the unanchored starting state ID must be
+ /// the same as the anchored starting state ID.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if there was a problem producing the final NFA.
+ /// In particular, this might include an error if the capturing groups
+ /// added to this builder violate any of the invariants documented on
+ /// [`GroupInfo`](crate::util::captures::GroupInfo).
+ ///
+ /// # Panics
+ ///
+ /// If `start_pattern` was called, then `finish_pattern` must be called
+ /// before `build`, otherwise this panics.
+ ///
+ /// This may panic for other invalid uses of a builder. For example, if
+ /// a "start capture" state was added without a corresponding "end capture"
+ /// state.
+ pub fn build(
+ &self,
+ start_anchored: StateID,
+ start_unanchored: StateID,
+ ) -> Result<NFA, BuildError> {
+ assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");
+ debug!(
+ "intermediate NFA compilation via builder is complete, \
+ intermediate NFA size: {} states, {} bytes on heap",
+ self.states.len(),
+ self.memory_usage(),
+ );
+
+ let mut nfa = nfa::Inner::default();
+ nfa.set_utf8(self.utf8);
+ nfa.set_reverse(self.reverse);
+ nfa.set_look_matcher(self.look_matcher.clone());
+ // A set of compiler internal state IDs that correspond to states
+ // that are exclusively epsilon transitions, i.e., goto instructions,
+ // combined with the state that they point to. This is used to
+ // record said states while transforming the compiler's internal NFA
+ // representation to the external form.
+ let mut empties = vec![];
+ // A map used to re-map state IDs when translating this builder's
+ // internal NFA state representation to the final NFA representation.
+ let mut remap = vec![];
+ remap.resize(self.states.len(), StateID::ZERO);
+
+ nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
+ nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
+ // The idea here is to convert our intermediate states to their final
+ // form. The only real complexity here is the process of converting
+ // transitions, which are expressed in terms of state IDs. The new
+ // set of states will be smaller because of partial epsilon removal,
+ // so the state IDs will not be the same.
+ for (sid, state) in self.states.iter().with_state_ids() {
+ match *state {
+ State::Empty { next } => {
+ // Since we're removing empty states, we need to handle
+ // them later since we don't yet know which new state this
+ // empty state will be mapped to.
+ empties.push((sid, next));
+ }
+ State::ByteRange { trans } => {
+ remap[sid] = nfa.add(nfa::State::ByteRange { trans });
+ }
+ State::Sparse { ref transitions } => {
+ remap[sid] = match transitions.len() {
+ 0 => nfa.add(nfa::State::Fail),
+ 1 => nfa.add(nfa::State::ByteRange {
+ trans: transitions[0],
+ }),
+ _ => {
+ let transitions =
+ transitions.to_vec().into_boxed_slice();
+ let sparse = SparseTransitions { transitions };
+ nfa.add(nfa::State::Sparse(sparse))
+ }
+ }
+ }
+ State::Look { look, next } => {
+ remap[sid] = nfa.add(nfa::State::Look { look, next });
+ }
+ State::CaptureStart { pattern_id, group_index, next } => {
+ // We can't remove this empty state because of the side
+ // effect of capturing an offset for this capture slot.
+ let slot = nfa
+ .group_info()
+ .slot(pattern_id, group_index.as_usize())
+ .expect("invalid capture index");
+ let slot =
+ SmallIndex::new(slot).expect("a small enough slot");
+ remap[sid] = nfa.add(nfa::State::Capture {
+ next,
+ pattern_id,
+ group_index,
+ slot,
+ });
+ }
+ State::CaptureEnd { pattern_id, group_index, next } => {
+ // We can't remove this empty state because of the side
+ // effect of capturing an offset for this capture slot.
+ // Also, this always succeeds because we check that all
+ // slot indices are valid for all capture indices when they
+ // are initially added.
+ let slot = nfa
+ .group_info()
+ .slot(pattern_id, group_index.as_usize())
+ .expect("invalid capture index")
+ .checked_add(1)
+ .unwrap();
+ let slot =
+ SmallIndex::new(slot).expect("a small enough slot");
+ remap[sid] = nfa.add(nfa::State::Capture {
+ next,
+ pattern_id,
+ group_index,
+ slot,
+ });
+ }
+ State::Union { ref alternates } => {
+ if alternates.is_empty() {
+ remap[sid] = nfa.add(nfa::State::Fail);
+ } else if alternates.len() == 1 {
+ empties.push((sid, alternates[0]));
+ remap[sid] = alternates[0];
+ } else if alternates.len() == 2 {
+ remap[sid] = nfa.add(nfa::State::BinaryUnion {
+ alt1: alternates[0],
+ alt2: alternates[1],
+ });
+ } else {
+ let alternates =
+ alternates.to_vec().into_boxed_slice();
+ remap[sid] = nfa.add(nfa::State::Union { alternates });
+ }
+ }
+ State::UnionReverse { ref alternates } => {
+ if alternates.is_empty() {
+ remap[sid] = nfa.add(nfa::State::Fail);
+ } else if alternates.len() == 1 {
+ empties.push((sid, alternates[0]));
+ remap[sid] = alternates[0];
+ } else if alternates.len() == 2 {
+ remap[sid] = nfa.add(nfa::State::BinaryUnion {
+ alt1: alternates[1],
+ alt2: alternates[0],
+ });
+ } else {
+ let mut alternates =
+ alternates.to_vec().into_boxed_slice();
+ alternates.reverse();
+ remap[sid] = nfa.add(nfa::State::Union { alternates });
+ }
+ }
+ State::Fail => {
+ remap[sid] = nfa.add(nfa::State::Fail);
+ }
+ State::Match { pattern_id } => {
+ remap[sid] = nfa.add(nfa::State::Match { pattern_id });
+ }
+ }
+ }
+ // Some of the new states still point to empty state IDs, so we need to
+ // follow each of them and remap the empty state IDs to their non-empty
+ // state IDs.
+ //
+ // We also keep track of which states we've already mapped. This helps
+ // avoid quadratic behavior in a long chain of empty states. For
+ // example, in 'a{0}{50000}'.
+ let mut remapped = vec![false; self.states.len()];
+ for &(empty_id, empty_next) in empties.iter() {
+ if remapped[empty_id] {
+ continue;
+ }
+ // empty states can point to other empty states, forming a chain.
+ // So we must follow the chain until the end, which must end at
+ // a non-empty state, and therefore, a state that is correctly
+ // remapped. We are guaranteed to terminate because our compiler
+ // never builds a loop among only empty states.
+ let mut new_next = empty_next;
+ while let Some(next) = self.states[new_next].goto() {
+ new_next = next;
+ }
+ remap[empty_id] = remap[new_next];
+ remapped[empty_id] = true;
+
+ // Now that we've remapped the main 'empty_id' above, we re-follow
+ // the chain from above and remap every empty state we found along
+ // the way to our ultimate non-empty target. We are careful to set
+ // 'remapped' to true for each such state. We thus will not need
+ // to re-compute this chain for any subsequent empty states in
+ // 'empties' that are part of this chain.
+ let mut next2 = empty_next;
+ while let Some(next) = self.states[next2].goto() {
+ remap[next2] = remap[new_next];
+ remapped[next2] = true;
+ next2 = next;
+ }
+ }
+ // Finally remap all of the state IDs.
+ nfa.remap(&remap);
+ let final_nfa = nfa.into_nfa();
+ debug!(
+ "NFA compilation via builder complete, \
+ final NFA size: {} states, {} bytes on heap, \
+ has empty? {:?}, utf8? {:?}",
+ final_nfa.states().len(),
+ final_nfa.memory_usage(),
+ final_nfa.has_empty(),
+ final_nfa.is_utf8(),
+ );
+ Ok(final_nfa)
+ }
+
+ /// Start the assembly of a pattern in this NFA.
+ ///
+ /// Upon success, this returns the identifier for the new pattern.
+ /// Identifiers start at `0` and are incremented by 1 for each new pattern.
+ ///
+ /// It is necessary to call this routine before adding capturing states.
+ /// Otherwise, any other NFA state may be added before starting a pattern.
+ ///
+ /// # Errors
+ ///
+ /// If the pattern identifier space is exhausted, then this returns an
+ /// error.
+ ///
+ /// # Panics
+ ///
+ /// If this is called while assembling another pattern (i.e., before
+ /// `finish_pattern` is called), then this panics.
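+    ///
+    /// # Example
+    ///
+    /// A minimal sketch showing that pattern identifiers are handed out in
+    /// order, starting at `0`:
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::Builder;
+    ///
+    /// let mut builder = Builder::new();
+    /// let pid = builder.start_pattern()?;
+    /// assert_eq!(0, pid.as_usize());
+    /// // ... add this pattern's states here, and then call
+    /// // 'finish_pattern' with the pattern's starting state ID.
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```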
+ pub fn start_pattern(&mut self) -> Result<PatternID, BuildError> {
+ assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");
+
+ let proposed = self.start_pattern.len();
+ let pid = PatternID::new(proposed)
+ .map_err(|_| BuildError::too_many_patterns(proposed))?;
+ self.pattern_id = Some(pid);
+ // This gets filled in when 'finish_pattern' is called.
+ self.start_pattern.push(StateID::ZERO);
+ Ok(pid)
+ }
+
+ /// Finish the assembly of a pattern in this NFA.
+ ///
+ /// Upon success, this returns the identifier for the new pattern.
+ /// Identifiers start at `0` and are incremented by 1 for each new
+ /// pattern. This is the same identifier returned by the corresponding
+ /// `start_pattern` call.
+ ///
+ /// Note that `start_pattern` and `finish_pattern` pairs cannot be
+ /// interleaved or nested. A correct `finish_pattern` call _always_
+ /// corresponds to the most recently called `start_pattern` routine.
+ ///
+ /// # Errors
+ ///
+ /// This currently never returns an error, but this is subject to change.
+ ///
+ /// # Panics
+ ///
+ /// If this is called without a corresponding `start_pattern` call, then
+ /// this panics.
+ pub fn finish_pattern(
+ &mut self,
+ start_id: StateID,
+ ) -> Result<PatternID, BuildError> {
+ let pid = self.current_pattern_id();
+ self.start_pattern[pid] = start_id;
+ self.pattern_id = None;
+ Ok(pid)
+ }
+
+ /// Returns the pattern identifier of the current pattern.
+ ///
+ /// # Panics
+ ///
+ /// If this doesn't occur after a `start_pattern` call and before the
+ /// corresponding `finish_pattern` call, then this panics.
+ pub fn current_pattern_id(&self) -> PatternID {
+ self.pattern_id.expect("must call 'start_pattern' first")
+ }
+
+ /// Returns the number of patterns added to this builder so far.
+ ///
+ /// This only includes patterns that have had `finish_pattern` called
+ /// for them.
+ pub fn pattern_len(&self) -> usize {
+ self.start_pattern.len()
+ }
+
+ /// Add an "empty" NFA state.
+ ///
+ /// An "empty" NFA state is a state with a single unconditional epsilon
+ /// transition to another NFA state. Such empty states are removed before
+ /// building the final [`NFA`] (which has no such "empty" states), but they
+ /// can be quite useful in the construction process of an NFA.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
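+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of adding an "empty" state and then patching it to
+    /// point at another state:
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::Builder;
+    ///
+    /// let mut builder = Builder::new();
+    /// let e1 = builder.add_empty()?;
+    /// let e2 = builder.add_empty()?;
+    /// // 'e1' now unconditionally transitions to 'e2'. Both of these
+    /// // states are removed when the final NFA is built.
+    /// builder.patch(e1, e2)?;
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```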
+ pub fn add_empty(&mut self) -> Result<StateID, BuildError> {
+ self.add(State::Empty { next: StateID::ZERO })
+ }
+
+ /// Add a "union" NFA state.
+ ///
+ /// A "union" NFA state that contains zero or more unconditional epsilon
+ /// transitions to other NFA states. The order of these transitions
+ /// reflects a priority order where earlier transitions are preferred over
+ /// later transitions.
+ ///
+ /// Callers may provide an empty set of alternates to this method call, and
+ /// then later add transitions via `patch`. At final build time, a "union"
+ /// state with no alternates is converted to a "fail" state, and a "union"
+ /// state with exactly one alternate is treated as if it were an "empty"
+ /// state.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
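+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of creating an empty "union" state and patching in
+    /// its alternates afterwards:
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson::{Builder, Transition},
+    ///     util::primitives::StateID,
+    /// };
+    ///
+    /// let mut builder = Builder::new();
+    /// let union = builder.add_union(vec![])?;
+    /// let r1 = builder.add_range(Transition {
+    ///     start: b'a', end: b'a', next: StateID::ZERO,
+    /// })?;
+    /// let r2 = builder.add_range(Transition {
+    ///     start: b'b', end: b'b', next: StateID::ZERO,
+    /// })?;
+    /// // 'r1' is preferred over 'r2' since it was patched in first.
+    /// builder.patch(union, r1)?;
+    /// builder.patch(union, r2)?;
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```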
+ pub fn add_union(
+ &mut self,
+ alternates: Vec<StateID>,
+ ) -> Result<StateID, BuildError> {
+ self.add(State::Union { alternates })
+ }
+
+ /// Add a "reverse union" NFA state.
+ ///
+ /// A "reverse union" NFA state contains zero or more unconditional epsilon
+ /// transitions to other NFA states. The order of these transitions
+ /// reflects a priority order where later transitions are preferred
+ /// over earlier transitions. This is an inverted priority order when
+ /// compared to `add_union`. This is useful, for example, for implementing
+ /// non-greedy repetition operators.
+ ///
+ /// Callers may provide an empty set of alternates to this method call, and
+ /// then later add transitions via `patch`. At final build time, a "reverse
+ /// union" state with no alternates is converted to a "fail" state, and a
+ /// "reverse union" state with exactly one alternate is treated as if it
+ /// were an "empty" state.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
+ pub fn add_union_reverse(
+ &mut self,
+ alternates: Vec<StateID>,
+ ) -> Result<StateID, BuildError> {
+ self.add(State::UnionReverse { alternates })
+ }
+
+ /// Add a "range" NFA state.
+ ///
+ /// A "range" NFA state is a state with one outgoing transition to another
+ /// state, where that transition may only be followed if the current input
+ /// byte falls between a range of bytes given.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
+ pub fn add_range(
+ &mut self,
+ trans: Transition,
+ ) -> Result<StateID, BuildError> {
+ self.add(State::ByteRange { trans })
+ }
+
+ /// Add a "sparse" NFA state.
+ ///
+ /// A "sparse" NFA state contains zero or more outgoing transitions, where
+ /// the transition to be followed (if any) is chosen based on whether the
+ /// current input byte falls in the range of one such transition. The
+ /// transitions given *must* be non-overlapping and in ascending order. (A
+ /// "sparse" state with no transitions is equivalent to a "fail" state.)
+ ///
+ /// A "sparse" state is like adding a "union" state and pointing it at a
+ /// bunch of "range" states, except that the different alternates have
+ /// equal priority.
+ ///
+ /// Note that a "sparse" state is the only state that cannot be patched.
+ /// This is because a "sparse" state has many transitions, each of which
+ /// may point to a different NFA state. Moreover, adding more such
+ /// transitions requires more than just an NFA state ID to point to. It
+ /// also requires a byte range. The `patch` routine does not support the
+ /// additional information required. Therefore, callers must ensure that
+ /// all outgoing transitions for this state are included when `add_sparse`
+ /// is called. There is no way to add more later.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
+ ///
+ /// # Panics
+ ///
+ /// This routine _may_ panic if the transitions given overlap or are not
+ /// in ascending order.
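+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of a sparse state with two non-overlapping ranges in
+    /// ascending order. The dummy `next` targets would normally point at
+    /// real states:
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson::{Builder, Transition},
+    ///     util::primitives::StateID,
+    /// };
+    ///
+    /// let mut builder = Builder::new();
+    /// let _sparse = builder.add_sparse(vec![
+    ///     Transition { start: b'0', end: b'9', next: StateID::ZERO },
+    ///     Transition { start: b'a', end: b'z', next: StateID::ZERO },
+    /// ])?;
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```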
+ pub fn add_sparse(
+ &mut self,
+ transitions: Vec<Transition>,
+ ) -> Result<StateID, BuildError> {
+ self.add(State::Sparse { transitions })
+ }
+
+ /// Add a "look" NFA state.
+ ///
+ /// A "look" NFA state corresponds to a state with exactly one
+ /// *conditional* epsilon transition to another NFA state. Namely, it
+ /// represents one of a small set of simplistic look-around operators.
+ ///
+ /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
+ /// and then change it later with [`patch`](Builder::patch).
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
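+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of adding a start-of-haystack assertion, with a
+    /// dummy `next` that would normally be patched later:
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson::Builder,
+    ///     util::{look::Look, primitives::StateID},
+    /// };
+    ///
+    /// let mut builder = Builder::new();
+    /// let _look = builder.add_look(StateID::ZERO, Look::Start)?;
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```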
+ pub fn add_look(
+ &mut self,
+ next: StateID,
+ look: Look,
+ ) -> Result<StateID, BuildError> {
+ self.add(State::Look { look, next })
+ }
+
+ /// Add a "start capture" NFA state.
+ ///
+ /// A "start capture" NFA state corresponds to a state with exactly one
+ /// outgoing unconditional epsilon transition to another state. Unlike
+ /// "empty" states, a "start capture" state also carries with it an
+ /// instruction for saving the current position of input to a particular
+ /// location in memory. NFA simulations, like the Pike VM, may use this
+ /// information to report the match locations of capturing groups in a
+ /// regex pattern.
+ ///
+ /// If the corresponding capturing group has a name, then callers should
+ /// include it here.
+ ///
+ /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
+ /// and then change it later with [`patch`](Builder::patch).
+ ///
+ /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
+ /// end states may be interleaved. Indeed, it is typical for many "start
+ /// capture" NFA states to appear before the first "end capture" state.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, if
+ /// the configured heap size limit has been exceeded, or if the given
+ /// capture index overflows `usize`.
+ ///
+ /// While the above are the only conditions in which this routine can
+ /// currently return an error, it is possible to call this method with
+ /// inputs that result in the final `build()` step failing to produce an
+ /// NFA. For example, if one adds two distinct capturing groups with the
+ /// same name, then that will result in `build()` failing with an error.
+ ///
+ /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
+ /// more information on what qualifies as valid capturing groups.
+ ///
+ /// # Example
+ ///
+ /// This example shows that an error occurs when one tries to add multiple
+ /// capturing groups with the same name to the same pattern.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::Builder,
+ /// util::primitives::StateID,
+ /// };
+ ///
+ /// let name = Some(std::sync::Arc::from("foo"));
+ /// let mut builder = Builder::new();
+ /// builder.start_pattern()?;
+ /// // 0th capture group should always be unnamed.
+ /// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
+ /// // OK
+ /// builder.add_capture_start(StateID::ZERO, 1, name.clone())?;
+ /// // This is not OK, but 'add_capture_start' still succeeds. We don't
+ /// // get an error until we call 'build' below. Without this call, the
+ /// // call to 'build' below would succeed.
+ /// builder.add_capture_start(StateID::ZERO, 2, name.clone())?;
+ /// // Finish our pattern so we can try to build the NFA.
+ /// builder.finish_pattern(start)?;
+ /// let result = builder.build(start, start);
+ /// assert!(result.is_err());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// However, adding multiple capturing groups with the same name to
+ /// distinct patterns is okay:
+ ///
+ /// ```
+ /// use std::sync::Arc;
+ ///
+ /// use regex_automata::{
+ /// nfa::thompson::{pikevm::PikeVM, Builder, Transition},
+ /// util::{
+ /// captures::Captures,
+ /// primitives::{PatternID, StateID},
+ /// },
+ /// Span,
+ /// };
+ ///
+ /// // Hand-compile the patterns '(?P<foo>[a-z])' and '(?P<foo>[A-Z])'.
+ /// let mut builder = Builder::new();
+ /// // We compile them to support an unanchored search, which requires
+ /// // adding an implicit '(?s-u:.)*?' prefix before adding either pattern.
+ /// let unanchored_prefix = builder.add_union_reverse(vec![])?;
+ /// let any = builder.add_range(Transition {
+ /// start: b'\x00', end: b'\xFF', next: StateID::ZERO,
+ /// })?;
+ /// builder.patch(unanchored_prefix, any)?;
+ /// builder.patch(any, unanchored_prefix)?;
+ ///
+ /// // Compile an alternation that permits matching multiple patterns.
+ /// let alt = builder.add_union(vec![])?;
+ /// builder.patch(unanchored_prefix, alt)?;
+ ///
+ /// // Compile '(?P<foo>[a-z])'.
+ /// builder.start_pattern()?;
+ /// let start0 = builder.add_capture_start(StateID::ZERO, 0, None)?;
+ /// // N.B. 0th capture group must always be unnamed.
+ /// let foo_start0 = builder.add_capture_start(
+ /// StateID::ZERO, 1, Some(Arc::from("foo")),
+ /// )?;
+ /// let lowercase = builder.add_range(Transition {
+ /// start: b'a', end: b'z', next: StateID::ZERO,
+ /// })?;
+ /// let foo_end0 = builder.add_capture_end(StateID::ZERO, 1)?;
+ /// let end0 = builder.add_capture_end(StateID::ZERO, 0)?;
+ /// let match0 = builder.add_match()?;
+ /// builder.patch(start0, foo_start0)?;
+ /// builder.patch(foo_start0, lowercase)?;
+ /// builder.patch(lowercase, foo_end0)?;
+ /// builder.patch(foo_end0, end0)?;
+ /// builder.patch(end0, match0)?;
+ /// builder.finish_pattern(start0)?;
+ ///
+ /// // Compile '(?P<foo>[A-Z])'.
+ /// builder.start_pattern()?;
+ /// let start1 = builder.add_capture_start(StateID::ZERO, 0, None)?;
+ /// // N.B. 0th capture group must always be unnamed.
+ /// let foo_start1 = builder.add_capture_start(
+ /// StateID::ZERO, 1, Some(Arc::from("foo")),
+ /// )?;
+ /// let uppercase = builder.add_range(Transition {
+ /// start: b'A', end: b'Z', next: StateID::ZERO,
+ /// })?;
+ /// let foo_end1 = builder.add_capture_end(StateID::ZERO, 1)?;
+ /// let end1 = builder.add_capture_end(StateID::ZERO, 0)?;
+ /// let match1 = builder.add_match()?;
+ /// builder.patch(start1, foo_start1)?;
+ /// builder.patch(foo_start1, uppercase)?;
+ /// builder.patch(uppercase, foo_end1)?;
+ /// builder.patch(foo_end1, end1)?;
+ /// builder.patch(end1, match1)?;
+ /// builder.finish_pattern(start1)?;
+ ///
+ /// // Now add the patterns to our alternation that we started above.
+ /// builder.patch(alt, start0)?;
+ /// builder.patch(alt, start1)?;
+ ///
+ /// // Finally build the NFA. The first argument is the anchored starting
+ /// // state (the pattern alternation), whereas the second is the
+ /// // unanchored starting state (the unanchored prefix).
+ /// let nfa = builder.build(alt, unanchored_prefix)?;
+ ///
+ /// // Now build a Pike VM from our NFA and access the 'foo' capture
+ /// // group regardless of which pattern matched, since it is defined
+ /// // for both patterns.
+ /// let vm = PikeVM::new_from_nfa(nfa)?;
+ /// let mut cache = vm.create_cache();
+ /// let caps: Vec<Captures> =
+ /// vm.captures_iter(&mut cache, "0123aAaAA").collect();
+ /// assert_eq!(5, caps.len());
+ ///
+ /// assert_eq!(Some(PatternID::must(0)), caps[0].pattern());
+ /// assert_eq!(Some(Span::from(4..5)), caps[0].get_group_by_name("foo"));
+ ///
+ /// assert_eq!(Some(PatternID::must(1)), caps[1].pattern());
+ /// assert_eq!(Some(Span::from(5..6)), caps[1].get_group_by_name("foo"));
+ ///
+ /// assert_eq!(Some(PatternID::must(0)), caps[2].pattern());
+ /// assert_eq!(Some(Span::from(6..7)), caps[2].get_group_by_name("foo"));
+ ///
+ /// assert_eq!(Some(PatternID::must(1)), caps[3].pattern());
+ /// assert_eq!(Some(Span::from(7..8)), caps[3].get_group_by_name("foo"));
+ ///
+ /// assert_eq!(Some(PatternID::must(1)), caps[4].pattern());
+ /// assert_eq!(Some(Span::from(8..9)), caps[4].get_group_by_name("foo"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn add_capture_start(
+ &mut self,
+ next: StateID,
+ group_index: u32,
+ name: Option<Arc<str>>,
+ ) -> Result<StateID, BuildError> {
+ let pid = self.current_pattern_id();
+ let group_index = match SmallIndex::try_from(group_index) {
+ Err(_) => {
+ return Err(BuildError::invalid_capture_index(group_index))
+ }
+ Ok(group_index) => group_index,
+ };
+ // Make sure we have space to insert our (pid,index)|-->name mapping.
+ if pid.as_usize() >= self.captures.len() {
+ for _ in 0..=(pid.as_usize() - self.captures.len()) {
+ self.captures.push(vec![]);
+ }
+ }
+ // In the case where 'group_index < self.captures[pid].len()', it means
+ // that we are adding a duplicate capture group. This is somewhat
+ // weird, but permissible because the capture group itself can be
+ // repeated in the syntax. For example, '([a-z]){4}' will produce 4
+ // capture groups. In practice, only the last will be set at search
+ // time when a match occurs. For duplicates, we don't need to push
+ // anything other than a CaptureStart NFA state.
+ if group_index.as_usize() >= self.captures[pid].len() {
+ // For discontiguous indices, push placeholders for earlier capture
+ // groups that weren't explicitly added.
+ for _ in 0..(group_index.as_usize() - self.captures[pid].len()) {
+ self.captures[pid].push(None);
+ }
+ self.captures[pid].push(name);
+ }
+ self.add(State::CaptureStart { pattern_id: pid, group_index, next })
+ }
+
+ /// Add a "end capture" NFA state.
+ ///
+ /// A "end capture" NFA state corresponds to a state with exactly one
+ /// outgoing unconditional epsilon transition to another state. Unlike
+ /// "empty" states, a "end capture" state also carries with it an
+ /// instruction for saving the current position of input to a particular
+ /// location in memory. NFA simulations, like the Pike VM, may use this
+ /// information to report the match locations of capturing groups in a
+ ///
+ /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
+ /// and then change it later with [`patch`](Builder::patch).
+ ///
+ /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
+ /// end states may be interleaved. Indeed, it is typical for many "start
+ /// capture" NFA states to appear before the first "end capture" state.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, if
+ /// the configured heap size limit has been exceeded, or if the given
+ /// capture index overflows `usize`.
+ ///
+ /// While the above are the only conditions in which this routine can
+ /// currently return an error, it is possible to call this method with
+ /// inputs that result in the final `build()` step failing to produce an
+ /// NFA. For example, if one adds two distinct capturing groups with the
+ /// same name, then that will result in `build()` failing with an error.
+ ///
+ /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
+ /// more information on what qualifies as valid capturing groups.
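+ ///
+ /// # Example
+ ///
+ /// A minimal sketch pairing a "start capture" and "end capture" state for
+ /// an explicit group, so that a Pike VM can report the group's span.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{pikevm::PikeVM, Builder, Transition},
+ /// util::primitives::StateID,
+ /// Span,
+ /// };
+ ///
+ /// let mut builder = Builder::new();
+ /// builder.start_pattern()?;
+ /// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
+ /// let g1_start = builder.add_capture_start(StateID::ZERO, 1, None)?;
+ /// let a = builder.add_range(Transition {
+ /// start: b'a', end: b'a', next: StateID::ZERO,
+ /// })?;
+ /// let g1_end = builder.add_capture_end(StateID::ZERO, 1)?;
+ /// let end = builder.add_capture_end(StateID::ZERO, 0)?;
+ /// let mat = builder.add_match()?;
+ /// builder.patch(start, g1_start)?;
+ /// builder.patch(g1_start, a)?;
+ /// builder.patch(a, g1_end)?;
+ /// builder.patch(g1_end, end)?;
+ /// builder.patch(end, mat)?;
+ /// builder.finish_pattern(start)?;
+ /// let nfa = builder.build(start, start)?;
+ ///
+ /// let re = PikeVM::new_from_nfa(nfa)?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// re.captures(&mut cache, "a", &mut caps);
+ /// assert_eq!(Some(Span::from(0..1)), caps.get_group(1));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```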
+ pub fn add_capture_end(
+ &mut self,
+ next: StateID,
+ group_index: u32,
+ ) -> Result<StateID, BuildError> {
+ let pid = self.current_pattern_id();
+ let group_index = match SmallIndex::try_from(group_index) {
+ Err(_) => {
+ return Err(BuildError::invalid_capture_index(group_index))
+ }
+ Ok(group_index) => group_index,
+ };
+ self.add(State::CaptureEnd { pattern_id: pid, group_index, next })
+ }
+
+ /// Adds a "fail" NFA state.
+ ///
+ /// A "fail" state is simply a state that has no outgoing transitions. It
+ /// acts as a way to cause a search to stop without reporting a match.
+ /// For example, one way to represent an NFA with zero patterns is with a
+ /// single "fail" state.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
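+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of the zero-pattern NFA mentioned above: a single
+ /// "fail" state serving as both start states.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::Builder;
+ ///
+ /// let mut builder = Builder::new();
+ /// let fail = builder.add_fail()?;
+ /// let nfa = builder.build(fail, fail)?;
+ /// assert_eq!(0, nfa.pattern_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```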
+ pub fn add_fail(&mut self) -> Result<StateID, BuildError> {
+ self.add(State::Fail)
+ }
+
+ /// Adds a "match" NFA state.
+ ///
+ /// A "match" state has no outgoing transitions (just like a "fail"
+ /// state), but it has special significance in that if a search enters
+ /// this state, then a match has been found. The match state that is added
+ /// automatically has the current pattern ID associated with it. This is
+ /// used to report the matching pattern ID at search time.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
+ ///
+ /// # Panics
+ ///
+ /// This must be called after a `start_pattern` call but before the
+ /// corresponding `finish_pattern` call. Otherwise, it panics.
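+ ///
+ /// # Example
+ ///
+ /// A minimal sketch showing that a match state records the pattern that
+ /// was active when it was added.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{pikevm::PikeVM, Builder, Transition},
+ /// util::primitives::StateID,
+ /// Match,
+ /// };
+ ///
+ /// let mut builder = Builder::new();
+ /// let alt = builder.add_union(vec![])?;
+ ///
+ /// // Pattern 0: 'a'
+ /// builder.start_pattern()?;
+ /// let start0 = builder.add_capture_start(StateID::ZERO, 0, None)?;
+ /// let a = builder.add_range(Transition {
+ /// start: b'a', end: b'a', next: StateID::ZERO,
+ /// })?;
+ /// let end0 = builder.add_capture_end(StateID::ZERO, 0)?;
+ /// let match0 = builder.add_match()?;
+ /// builder.patch(start0, a)?;
+ /// builder.patch(a, end0)?;
+ /// builder.patch(end0, match0)?;
+ /// builder.finish_pattern(start0)?;
+ ///
+ /// // Pattern 1: 'b'
+ /// builder.start_pattern()?;
+ /// let start1 = builder.add_capture_start(StateID::ZERO, 0, None)?;
+ /// let b = builder.add_range(Transition {
+ /// start: b'b', end: b'b', next: StateID::ZERO,
+ /// })?;
+ /// let end1 = builder.add_capture_end(StateID::ZERO, 0)?;
+ /// let match1 = builder.add_match()?;
+ /// builder.patch(start1, b)?;
+ /// builder.patch(b, end1)?;
+ /// builder.patch(end1, match1)?;
+ /// builder.finish_pattern(start1)?;
+ ///
+ /// builder.patch(alt, start0)?;
+ /// builder.patch(alt, start1)?;
+ /// let nfa = builder.build(alt, alt)?;
+ ///
+ /// let re = PikeVM::new_from_nfa(nfa)?;
+ /// let mut cache = re.create_cache();
+ /// // The match reports pattern ID 1 because 'b' was compiled second.
+ /// assert_eq!(Some(Match::must(1, 0..1)), re.find(&mut cache, "b"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```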
+ pub fn add_match(&mut self) -> Result<StateID, BuildError> {
+ let pattern_id = self.current_pattern_id();
+ let sid = self.add(State::Match { pattern_id })?;
+ Ok(sid)
+ }
+
+ /// The common implementation of "add a state." It handles the common
+ /// error cases of state ID exhausting (by owning state ID allocation) and
+ /// whether the size limit has been exceeded.
+ fn add(&mut self, state: State) -> Result<StateID, BuildError> {
+ let id = StateID::new(self.states.len())
+ .map_err(|_| BuildError::too_many_states(self.states.len()))?;
+ self.memory_states += state.memory_usage();
+ self.states.push(state);
+ self.check_size_limit()?;
+ Ok(id)
+ }
+
+ /// Add a transition from one state to another.
+ ///
+ /// This routine is called "patch" since it is very common to add the
+ /// states you want, typically with "dummy" state ID transitions, and then
+ /// "patch" in the real state IDs later. This is because you don't always
+ /// know all of the necessary state IDs to add because they might not
+ /// exist yet.
+ ///
+ /// # Errors
+ ///
+ /// This may error if patching leads to an increase in heap usage beyond
+ /// the configured size limit. Heap usage only grows when patching adds a
+ /// new transition (as in the case of a "union" state).
+ ///
+ /// # Panics
+ ///
+ /// This panics if `from` corresponds to a "sparse" state. When "sparse"
+ /// states are added, there is no way to patch them after-the-fact. (If you
+ /// have a use case where this would be helpful, please file an issue. It
+ /// will likely require a new API.)
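+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of the idiom described above: add states with dummy
+ /// `StateID::ZERO` transitions, then fill in the real targets once they
+ /// exist. The pattern built here is just the empty regex.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::Builder,
+ /// util::primitives::StateID,
+ /// };
+ ///
+ /// let mut builder = Builder::new();
+ /// builder.start_pattern()?;
+ /// // The downstream states don't exist yet, so point at dummies for now.
+ /// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
+ /// let end = builder.add_capture_end(StateID::ZERO, 0)?;
+ /// let mat = builder.add_match()?;
+ /// // Now patch in the real transitions.
+ /// builder.patch(start, end)?;
+ /// builder.patch(end, mat)?;
+ /// builder.finish_pattern(start)?;
+ /// let nfa = builder.build(start, start)?;
+ /// assert_eq!(1, nfa.pattern_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```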
+ pub fn patch(
+ &mut self,
+ from: StateID,
+ to: StateID,
+ ) -> Result<(), BuildError> {
+ let old_memory_states = self.memory_states;
+ match self.states[from] {
+ State::Empty { ref mut next } => {
+ *next = to;
+ }
+ State::ByteRange { ref mut trans } => {
+ trans.next = to;
+ }
+ State::Sparse { .. } => {
+ panic!("cannot patch from a sparse NFA state")
+ }
+ State::Look { ref mut next, .. } => {
+ *next = to;
+ }
+ State::Union { ref mut alternates } => {
+ alternates.push(to);
+ self.memory_states += mem::size_of::<StateID>();
+ }
+ State::UnionReverse { ref mut alternates } => {
+ alternates.push(to);
+ self.memory_states += mem::size_of::<StateID>();
+ }
+ State::CaptureStart { ref mut next, .. } => {
+ *next = to;
+ }
+ State::CaptureEnd { ref mut next, .. } => {
+ *next = to;
+ }
+ State::Fail => {}
+ State::Match { .. } => {}
+ }
+ if old_memory_states != self.memory_states {
+ self.check_size_limit()?;
+ }
+ Ok(())
+ }
+
+ /// Set whether the NFA produced by this builder should only match UTF-8.
+ ///
+ /// This should be set when both of the following are true:
+ ///
+ /// 1. The caller guarantees that the NFA created by this build will only
+ /// report non-empty matches with spans that are valid UTF-8.
+ /// 2. The caller desires regex engines using this NFA to avoid reporting
+ /// empty matches with a span that splits a valid UTF-8 encoded codepoint.
+ ///
+ /// Property (1) is not checked. Instead, this requires the caller to
+ /// promise that it is true. Property (2) corresponds to the behavior of
+ /// regex engines using the NFA created by this builder. Namely, there
+ /// is no way in the NFA's graph itself to say that empty matches found
+ /// by, for example, the regex `a*` will fall on valid UTF-8 boundaries.
+ /// Instead, this option is used to communicate the UTF-8 semantic to regex
+ /// engines that will typically implement it as a post-processing step by
+ /// filtering out empty matches that don't fall on UTF-8 boundaries.
+ ///
+ /// If you're building an NFA from an HIR (and not using a
+ /// [`thompson::Compiler`](crate::nfa::thompson::Compiler)), then you can
+ /// use the [`syntax::Config::utf8`](crate::util::syntax::Config::utf8)
+ /// option to guarantee that if the HIR detects a non-empty match, then it
+ /// is guaranteed to be valid UTF-8.
+ ///
+ /// Note that property (2) does *not* specify the behavior of executing
+ /// a search on a haystack that is not valid UTF-8. Therefore, if you're
+ /// *not* running this NFA on strings that are guaranteed to be valid
+ /// UTF-8, you almost certainly do not want to enable this option.
+ /// Similarly, if you are running the NFA on strings that *are* guaranteed
+ /// to be valid UTF-8, then you almost certainly want to enable this option
+ /// unless you can guarantee that your NFA will never produce a zero-width
+ /// match.
+ ///
+ /// It is disabled by default.
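+ ///
+ /// # Example
+ ///
+ /// A small sketch of toggling the flag; it only records the caller's
+ /// promise and instruction, and changes nothing in the NFA graph itself.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::Builder;
+ ///
+ /// let mut builder = Builder::new();
+ /// assert!(!builder.get_utf8());
+ /// builder.set_utf8(true);
+ /// assert!(builder.get_utf8());
+ /// ```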
+ pub fn set_utf8(&mut self, yes: bool) {
+ self.utf8 = yes;
+ }
+
+ /// Returns whether UTF-8 mode is enabled for this builder.
+ ///
+ /// See [`Builder::set_utf8`] for more details about what "UTF-8 mode" is.
+ pub fn get_utf8(&self) -> bool {
+ self.utf8
+ }
+
+ /// Sets whether the NFA produced by this builder should be matched in
+ /// reverse or not. Generally speaking, when enabled, the NFA produced
+ /// should be matched by moving backwards through a haystack, from a higher
+ /// memory address to a lower memory address.
+ ///
+ /// See also [`NFA::is_reverse`] for more details.
+ ///
+ /// This is disabled by default, which means NFAs are by default matched
+ /// in the forward direction.
+ pub fn set_reverse(&mut self, yes: bool) {
+ self.reverse = yes;
+ }
+
+ /// Returns whether reverse mode is enabled for this builder.
+ ///
+ /// See [`Builder::set_reverse`] for more details about what "reverse mode"
+ /// is.
+ pub fn get_reverse(&self) -> bool {
+ self.reverse
+ }
+
+ /// Sets the look-around matcher that should be used for the resulting NFA.
+ ///
+ /// A look-around matcher can be used to configure how look-around
+ /// assertions are matched. For example, a matcher might carry
+ /// configuration that changes the line terminator used for `(?m:^)` and
+ /// `(?m:$)` assertions.
+ pub fn set_look_matcher(&mut self, m: LookMatcher) {
+ self.look_matcher = m;
+ }
+
+ /// Returns the look-around matcher used for this builder.
+ ///
+ /// If a matcher was not explicitly set, then `LookMatcher::default()` is
+ /// returned.
+ pub fn get_look_matcher(&self) -> &LookMatcher {
+ &self.look_matcher
+ }
+
+ /// Set the size limit on this builder.
+ ///
+ /// Setting the size limit will also check whether the NFA built so far
+ /// fits within the given size limit. If it doesn't, then an error is
+ /// returned.
+ ///
+ /// By default, there is no configured size limit.
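+ ///
+ /// # Example
+ ///
+ /// A small sketch showing that shrinking the limit below what the builder
+ /// already uses is reported as an error. (The exact heap usage of a state
+ /// is an implementation detail; it is merely assumed here to exceed one
+ /// byte.)
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::Builder;
+ ///
+ /// let mut builder = Builder::new();
+ /// // A generous limit is fine.
+ /// builder.set_size_limit(Some(1024))?;
+ /// builder.add_empty()?;
+ /// // Shrinking the limit below the current usage is an error.
+ /// assert!(builder.set_size_limit(Some(1)).is_err());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```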
+ pub fn set_size_limit(
+ &mut self,
+ limit: Option<usize>,
+ ) -> Result<(), BuildError> {
+ self.size_limit = limit;
+ self.check_size_limit()
+ }
+
+ /// Return the currently configured size limit.
+ ///
+ /// By default, this returns `None`, which corresponds to no configured
+ /// size limit.
+ pub fn get_size_limit(&self) -> Option<usize> {
+ self.size_limit
+ }
+
+ /// Returns the heap memory usage, in bytes, used by the NFA states added
+ /// so far.
+ ///
+ /// Note that this is an approximation of how big the final NFA will be.
+ /// In practice, the final NFA will likely be a bit smaller because of
+ /// its simpler state representation. (For example, using things like
+ /// `Box<[StateID]>` instead of `Vec<StateID>`.)
+ pub fn memory_usage(&self) -> usize {
+ self.states.len() * mem::size_of::<State>() + self.memory_states
+ }
+
+ fn check_size_limit(&self) -> Result<(), BuildError> {
+ if let Some(limit) = self.size_limit {
+ if self.memory_usage() > limit {
+ return Err(BuildError::exceeded_size_limit(limit));
+ }
+ }
+ Ok(())
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // This asserts that a builder state doesn't have its size changed. It is
+ // *really* easy to accidentally increase the size, and thus potentially
+ // dramatically increase the memory usage of the NFA builder.
+ //
+ // This assert doesn't mean we absolutely cannot increase the size of a
+ // builder state. We can. It's just here to make sure we do it knowingly
+ // and intentionally.
+ //
+ // A builder state is unfortunately a little bigger than an NFA state,
+ // since we really want to support adding things to a pre-existing state.
+ // i.e., we use Vec<thing> instead of Box<[thing]>. So we end up using an
+ // extra 8 bytes per state. Sad, but at least it gets freed once the NFA
+ // is built.
+ #[test]
+ fn state_has_small_size() {
+ #[cfg(target_pointer_width = "64")]
+ assert_eq!(32, core::mem::size_of::<State>());
+ #[cfg(target_pointer_width = "32")]
+ assert_eq!(16, core::mem::size_of::<State>());
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/thompson/compiler.rs b/vendor/regex-automata/src/nfa/thompson/compiler.rs
index 301194005..065e9ef27 100644
--- a/vendor/regex-automata/src/nfa/thompson/compiler.rs
+++ b/vendor/regex-automata/src/nfa/thompson/compiler.rs
@@ -1,73 +1,37 @@
-/*
-This module provides an NFA compiler using Thompson's construction
-algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
-graph as output. The NFA graph is structured in a way that permits it to be
-executed by a virtual machine and also used to efficiently build a DFA.
-
-The compiler deals with a slightly expanded set of NFA states that notably
-includes an empty node that has exactly one epsilon transition to the next
-state. In other words, it's a "goto" instruction if one views Thompson's NFA
-as a set of bytecode instructions. These goto instructions are removed in
-a subsequent phase before returning the NFA to the caller. The purpose of
-these empty nodes is that they make the construction algorithm substantially
-simpler to implement. We remove them before returning to the caller because
-they can represent substantial overhead when traversing the NFA graph
-(either while searching using the NFA directly or while building a DFA).
-
-In the future, it would be nice to provide a Glushkov compiler as well,
-as it would work well as a bit-parallel NFA for smaller regexes. But
-the Thompson construction is one I'm more familiar with and seems more
-straight-forward to deal with when it comes to large Unicode character
-classes.
-
-Internally, the compiler uses interior mutability to improve composition
-in the face of the borrow checker. In particular, we'd really like to be
-able to write things like this:
-
- self.c_concat(exprs.iter().map(|e| self.c(e)))
-
-Which elegantly uses iterators to build up a sequence of compiled regex
-sub-expressions and then hands it off to the concatenating compiler
-routine. Without interior mutability, the borrow checker won't let us
-borrow `self` mutably both inside and outside the closure at the same
-time.
-*/
-
-use core::{
- borrow::Borrow,
- cell::{Cell, RefCell},
- mem,
-};
+use core::{borrow::Borrow, cell::RefCell};
use alloc::{sync::Arc, vec, vec::Vec};
use regex_syntax::{
- hir::{self, Anchor, Class, Hir, HirKind, Literal, WordBoundary},
+ hir::{self, Hir},
utf8::{Utf8Range, Utf8Sequences},
ParserBuilder,
};
use crate::{
nfa::thompson::{
- error::Error,
+ builder::Builder,
+ error::BuildError,
+ literal_trie::LiteralTrie,
map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
+ nfa::{Transition, NFA},
range_trie::RangeTrie,
- Look, SparseTransitions, State, Transition, NFA,
},
util::{
- alphabet::ByteClassSet,
- id::{IteratorIDExt, PatternID, StateID},
+ look::{Look, LookMatcher},
+ primitives::{PatternID, StateID},
},
};
-/// The configuration used for compiling a Thompson NFA from a regex pattern.
-#[derive(Clone, Copy, Debug, Default)]
+/// The configuration used for a Thompson NFA compiler.
+#[derive(Clone, Debug, Default)]
pub struct Config {
- reverse: Option<bool>,
utf8: Option<bool>,
+ reverse: Option<bool>,
nfa_size_limit: Option<Option<usize>>,
shrink: Option<bool>,
- captures: Option<bool>,
+ which_captures: Option<WhichCaptures>,
+ look_matcher: Option<LookMatcher>,
#[cfg(test)]
unanchored_prefix: Option<bool>,
}
@@ -78,42 +42,162 @@ impl Config {
Config::default()
}
+ /// Whether to enable UTF-8 mode during search or not.
+ ///
+ /// A regex engine is said to be in UTF-8 mode when it guarantees that
+ /// all matches returned by it have spans consisting of only valid UTF-8.
+ /// That is, it is impossible for a match span to be returned that
+ /// contains any invalid UTF-8.
+ ///
+ /// UTF-8 mode generally consists of two things:
+ ///
+ /// 1. Whether the NFA's states are constructed such that all paths to a
+ /// match state that consume at least one byte always correspond to valid
+ /// UTF-8.
+ /// 2. Whether all paths to a match state that do _not_ consume any bytes
+ /// should always correspond to valid UTF-8 boundaries.
+ ///
+ /// (1) is a guarantee made by whoever constructs the NFA.
+ /// If you're parsing a regex from its concrete syntax, then
+ /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make
+ /// this guarantee for you. It does so by returning an error if the regex
+ /// pattern could ever report a non-empty match span that contains invalid
+ /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex
+ /// successfully parses, then you're guaranteed that the corresponding NFA
+ /// will only ever report non-empty match spans containing valid UTF-8.
+ ///
+ /// (2) is a trickier guarantee because it cannot be enforced by the NFA
+ /// state graph itself. Consider, for example, the regex `a*`. It matches
+ /// the empty strings in `☃` at positions `0`, `1`, `2` and `3`, where
+ /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint,
+ /// and thus correspond to invalid UTF-8 boundaries. Therefore, this
+ /// guarantee must be made at a higher level than the NFA state graph
+ /// itself. This crate deals with this case in each regex engine. Namely,
+ /// when a zero-width match that splits a codepoint is found and UTF-8
+ /// mode enabled, then it is ignored and the engine moves on looking for
+ /// the next match.
+ ///
+ /// Thus, UTF-8 mode is both a promise that the NFA built only reports
+ /// non-empty matches that are valid UTF-8, and an *instruction* to regex
+ /// engines that empty matches that split codepoints should be banned.
+ ///
+ /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans,
+ /// it only makes sense to enable this option when you *know* your haystack
+ /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and
+ /// searching a haystack that contains invalid UTF-8 leads to **unspecified
+ /// behavior**.
+ ///
+ /// Therefore, it may make sense to enable `syntax::Config::utf8` while
+ /// simultaneously *disabling* this option. That would ensure all non-empty
+ /// match spans are valid UTF-8, but that empty match spans may still split
+ /// a codepoint or match at other places that aren't valid UTF-8.
+ ///
+ /// In general, this mode is only relevant if your regex can match the
+ /// empty string. Most regexes don't.
+ ///
+ /// This is enabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows how UTF-8 mode can impact the match spans that may
+ /// be reported in certain cases.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{self, pikevm::PikeVM},
+ /// Match, Input,
+ /// };
+ ///
+ /// let re = PikeVM::new("")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// // UTF-8 mode is enabled by default.
+ /// let mut input = Input::new("☃");
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
+ ///
+ /// // Even though an empty regex matches at 1..1, our next match is
+ /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
+ /// // three bytes long).
+ /// input.set_start(1);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
+ ///
+ /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
+ /// let re = PikeVM::builder()
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build("")?;
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
+ ///
+ /// input.set_start(2);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
+ ///
+ /// input.set_start(3);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
+ ///
+ /// input.set_start(4);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn utf8(mut self, yes: bool) -> Config {
+ self.utf8 = Some(yes);
+ self
+ }
+
/// Reverse the NFA.
///
/// A NFA reversal is performed by reversing all of the concatenated
- /// sub-expressions in the original pattern, recursively. The resulting
- /// NFA can be used to match the pattern starting from the end of a string
- /// instead of the beginning of a string.
+ /// sub-expressions in the original pattern, recursively. (Look around
+ /// operators are also inverted.) The resulting NFA can be used to match
+ /// the pattern starting from the end of a string instead of the beginning
+ /// of a string.
///
/// Reversing the NFA is useful for building a reverse DFA, which is most
/// useful for finding the start of a match after its ending position has
- /// been found.
+ /// been found. NFA execution engines typically do not work on reverse
+ /// NFAs. For example, currently, the Pike VM reports the starting location
+ /// of matches without a reverse NFA.
+ ///
+ /// Currently, enabling this setting requires disabling the
+ /// [`captures`](Config::captures) setting. If both are enabled, then the
+ /// compiler will return an error. It is expected that this limitation will
+ /// be lifted in the future.
///
/// This is disabled by default.
- pub fn reverse(mut self, yes: bool) -> Config {
- self.reverse = Some(yes);
- self
- }
-
- /// Whether to enable UTF-8 mode or not.
///
- /// When UTF-8 mode is enabled (which is the default), unanchored searches
- /// will only match through valid UTF-8. If invalid UTF-8 is seen, then
- /// an unanchored search will stop at that point. This is equivalent to
- /// putting a `(?s:.)*?` at the start of the regex.
+ /// # Example
///
- /// When UTF-8 mode is disabled, then unanchored searches will match
- /// through any arbitrary byte. This is equivalent to putting a
- /// `(?s-u:.)*?` at the start of the regex.
+ /// This example shows how to build a DFA from a reverse NFA, and then use
+ /// the DFA to search backwards.
///
- /// Generally speaking, UTF-8 mode should only be used when you know you
- /// are searching valid UTF-8, such as a Rust `&str`. If UTF-8 mode is used
- /// on input that is not valid UTF-8, then the regex is not likely to work
- /// as expected.
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{self, Automaton},
+ /// nfa::thompson::{NFA, WhichCaptures},
+ /// HalfMatch, Input,
+ /// };
///
- /// This is enabled by default.
- pub fn utf8(mut self, yes: bool) -> Config {
- self.utf8 = Some(yes);
+ /// let dfa = dfa::dense::Builder::new()
+ /// .thompson(NFA::config()
+ /// .which_captures(WhichCaptures::None)
+ /// .reverse(true)
+ /// )
+ /// .build("baz[0-9]+")?;
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// assert_eq!(
+ /// expected,
+ /// dfa.try_search_rev(&Input::new("foobaz12345bar"))?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reverse(mut self, yes: bool) -> Config {
+ self.reverse = Some(yes);
self
}
@@ -143,16 +227,17 @@ impl Config {
/// size of the NFA.
///
/// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::nfa::thompson::NFA;
///
/// // 300KB isn't enough!
- /// NFA::builder()
+ /// NFA::compiler()
/// .configure(NFA::config().nfa_size_limit(Some(300_000)))
/// .build(r"\w{20}")
/// .unwrap_err();
///
/// // ... but 400KB probably is.
- /// let nfa = NFA::builder()
+ /// let nfa = NFA::compiler()
/// .configure(NFA::config().nfa_size_limit(Some(400_000)))
/// .build(r"\w{20}")?;
///
@@ -168,17 +253,52 @@ impl Config {
/// Apply best effort heuristics to shrink the NFA at the expense of more
/// time/memory.
///
- /// This is enabled by default. Generally speaking, if one is using an NFA
- /// to compile a DFA, then the extra time used to shrink the NFA will be
- /// more than made up for during DFA construction (potentially by a lot).
- /// In other words, enabling this can substantially decrease the overall
- /// amount of time it takes to build a DFA.
+ /// Generally speaking, if one is using an NFA to compile a DFA, then the
+ /// extra time used to shrink the NFA will be more than made up for during
+ /// DFA construction (potentially by a lot). In other words, enabling this
+ /// can substantially decrease the overall amount of time it takes to build
+ /// a DFA.
///
- /// The only reason to disable this if you want to compile an NFA and start
- /// using it as quickly as possible without needing to build a DFA. e.g.,
- /// for an NFA simulation or for a lazy DFA.
+ /// A reason to keep this disabled is if you want to compile an NFA and
+ /// start using it as quickly as possible without needing to build a DFA,
+ /// and you don't mind using a bit of extra memory for the NFA. e.g., for
+ /// an NFA simulation or for a lazy DFA.
///
- /// This is enabled by default.
+ /// NFA shrinking is currently most useful when compiling a reverse
+ /// NFA with large Unicode character classes. In particular, it trades
+ /// additional CPU time during NFA compilation in favor of generating fewer
+ /// NFA states.
+ ///
+ /// This is disabled by default because it can increase compile times
+ /// quite a bit if you aren't building a full DFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows that NFA shrinking can lead to substantial space
+ /// savings in some cases. Notice that, as noted above, we build a reverse
+ /// DFA and use a pattern with a large Unicode character class.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
+ ///
+ /// // Currently we have to disable captures when enabling reverse NFA.
+ /// let config = NFA::config()
+ /// .which_captures(WhichCaptures::None)
+ /// .reverse(true);
+ /// let not_shrunk = NFA::compiler()
+ /// .configure(config.clone().shrink(false))
+ /// .build(r"\w")?;
+ /// let shrunk = NFA::compiler()
+ /// .configure(config.clone().shrink(true))
+ /// .build(r"\w")?;
+ ///
+ /// // While a specific shrink factor is not guaranteed, the savings can be
+ /// // considerable in some cases.
+ /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
pub fn shrink(mut self, yes: bool) -> Config {
self.shrink = Some(yes);
self
@@ -186,13 +306,153 @@ impl Config {
/// Whether to include 'Capture' states in the NFA.
///
- /// This can only be enabled when compiling a forward NFA. This is
- /// always disabled---with no way to override it---when the `reverse`
- /// configuration is enabled.
+ /// Currently, enabling this setting requires disabling the
+ /// [`reverse`](Config::reverse) setting. If both are enabled, then the
+ /// compiler will return an error. It is expected that this limitation will
+ /// be lifted in the future.
///
/// This is enabled by default.
- pub fn captures(mut self, yes: bool) -> Config {
- self.captures = Some(yes);
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates that some regex engines, like the Pike VM,
+ /// require capturing states to be present in the NFA to report match
+ /// offsets.
+ ///
+ /// (Note that since this method is deprecated, the example below uses
+ /// [`Config::which_captures`] to disable capture states.)
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{
+ /// pikevm::PikeVM,
+ /// NFA,
+ /// WhichCaptures,
+ /// };
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.is_match(&mut cache, "abc"));
+ /// assert_eq!(None, re.find(&mut cache, "abc"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[deprecated(since = "0.3.5", note = "use which_captures instead")]
+ pub fn captures(self, yes: bool) -> Config {
+ self.which_captures(if yes {
+ WhichCaptures::All
+ } else {
+ WhichCaptures::None
+ })
+ }
+
+ /// Configures what kinds of capture groups are compiled into
+ /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
+ /// Thompson NFA.
+ ///
+ /// Currently, using any option except for [`WhichCaptures::None`] requires
+ /// disabling the [`reverse`](Config::reverse) setting. If both are
+ /// enabled, then the compiler will return an error. It is expected that
+ /// this limitation will be lifted in the future.
+ ///
+ /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
+ /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the
+ /// overhead of capture states for explicit groups. Usually this occurs
+ /// when one wants to use the `PikeVM` only for determining the overall
+ /// match. Otherwise, the `PikeVM` could use much more memory than is
+ /// necessary.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates that some regex engines, like the Pike VM,
+ /// require capturing states to be present in the NFA to report match
+ /// offsets.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{
+ /// pikevm::PikeVM,
+ /// NFA,
+ /// WhichCaptures,
+ /// };
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.is_match(&mut cache, "abc"));
+ /// assert_eq!(None, re.find(&mut cache, "abc"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// The same applies to the bounded backtracker:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{
+ /// backtrack::BoundedBacktracker,
+ /// NFA,
+ /// WhichCaptures,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::builder()
+ /// .thompson(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.try_is_match(&mut cache, "abc")?);
+ /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
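+ ///
+ /// And the same applies again when only implicit groups are compiled. The
+ /// following is a sketch of the expected behavior: the overall match is
+ /// still reported, but no slots exist for the explicit group.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{pikevm::PikeVM, NFA, WhichCaptures},
+ /// Match,
+ /// };
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(NFA::config().which_captures(WhichCaptures::Implicit))
+ /// .build(r"([a-z]+)")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "abc", &mut caps);
+ /// // The implicit group (the overall match) is available...
+ /// assert_eq!(Some(Match::must(0, 0..3)), caps.get_match());
+ /// // ...but the explicit group was never compiled in.
+ /// assert_eq!(None, caps.get_group(1));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```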
+ pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
+ self.which_captures = Some(which_captures);
+ self
+ }
+
+ /// Sets the look-around matcher that should be used with this NFA.
+ ///
+ /// A look-around matcher determines how to match look-around assertions.
+ /// In particular, some assertions are configurable. For example, the
+ /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
+ /// from the default of `\n` to any other byte.
+ ///
+ /// # Example
+ ///
+ /// This shows how to change the line terminator for multi-line assertions.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{self, pikevm::PikeVM},
+ /// util::look::LookMatcher,
+ /// Match, Input,
+ /// };
+ ///
+ /// let mut lookm = LookMatcher::new();
+ /// lookm.set_line_terminator(b'\x00');
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(thompson::Config::new().look_matcher(lookm))
+ /// .build(r"(?m)^[a-z]+$")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// // Multi-line assertions now use NUL as a terminator.
+ /// assert_eq!(
+ /// Some(Match::must(0, 1..4)),
+ /// re.find(&mut cache, b"\x00abc\x00"),
+ /// );
+ /// // ... and \n is no longer recognized as a terminator.
+ /// assert_eq!(
+ /// None,
+ /// re.find(&mut cache, b"\nabc\n"),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn look_matcher(mut self, m: LookMatcher) -> Config {
+ self.look_matcher = Some(m);
self
}
@@ -206,26 +466,47 @@ impl Config {
self
}
- pub fn get_reverse(&self) -> bool {
- self.reverse.unwrap_or(false)
- }
-
+ /// Returns whether this configuration has enabled UTF-8 mode.
pub fn get_utf8(&self) -> bool {
self.utf8.unwrap_or(true)
}
+ /// Returns whether this configuration has enabled reverse NFA compilation.
+ pub fn get_reverse(&self) -> bool {
+ self.reverse.unwrap_or(false)
+ }
+
+ /// Return the configured NFA size limit, if it exists, in the number of
+ /// bytes of heap used.
pub fn get_nfa_size_limit(&self) -> Option<usize> {
self.nfa_size_limit.unwrap_or(None)
}
+ /// Return whether NFA shrinking is enabled.
pub fn get_shrink(&self) -> bool {
- self.shrink.unwrap_or(true)
+ self.shrink.unwrap_or(false)
}
+ /// Return whether NFA compilation is configured to produce capture states.
+ #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
pub fn get_captures(&self) -> bool {
- !self.get_reverse() && self.captures.unwrap_or(true)
+ self.get_which_captures().is_any()
+ }
+
+ /// Return what kinds of capture states will be compiled into an NFA.
+ pub fn get_which_captures(&self) -> WhichCaptures {
+ self.which_captures.unwrap_or(WhichCaptures::All)
+ }
+
+ /// Return the look-around matcher for this NFA.
+ pub fn get_look_matcher(&self) -> LookMatcher {
+ self.look_matcher.clone().unwrap_or(LookMatcher::default())
}
+ /// Return whether NFA compilation is configured to include an unanchored
+ /// prefix.
+ ///
+ /// This always returns true when not in test mode.
fn get_unanchored_prefix(&self) -> bool {
#[cfg(test)]
{
@@ -237,56 +518,283 @@ impl Config {
}
}
- pub(crate) fn overwrite(self, o: Config) -> Config {
+ /// Overwrite the default configuration such that the options in `o` are
+ /// always used. If an option in `o` is not set, then the corresponding
+ /// option in `self` is used. If it's not set in `self` either, then it
+ /// remains not set.
+ pub(crate) fn overwrite(&self, o: Config) -> Config {
Config {
- reverse: o.reverse.or(self.reverse),
utf8: o.utf8.or(self.utf8),
+ reverse: o.reverse.or(self.reverse),
nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
shrink: o.shrink.or(self.shrink),
- captures: o.captures.or(self.captures),
+ which_captures: o.which_captures.or(self.which_captures),
+ look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()),
#[cfg(test)]
unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
}
}
}
-/// A builder for compiling an NFA.
+/// A configuration indicating which kinds of
+/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
+///
+/// This configuration can be used with [`Config::which_captures`] to control
+/// which capture states are compiled into a Thompson NFA.
+///
+/// The default configuration is [`WhichCaptures::All`].
+#[derive(Clone, Copy, Debug)]
+pub enum WhichCaptures {
+ /// All capture states, including those corresponding to both implicit and
+ /// explicit capture groups, are included in the Thompson NFA.
+ All,
+ /// Only capture states corresponding to implicit capture groups are
+ /// included. Implicit capture groups appear in every pattern implicitly
+ /// and correspond to the overall match of a pattern.
+ ///
+ /// This is useful when one only cares about the overall match of a
+ /// pattern. By excluding capture states from explicit capture groups,
+ /// one might be able to reduce the memory usage of a multi-pattern regex
+ /// substantially if it was otherwise written to have many explicit capture
+ /// groups.
+ Implicit,
+ /// No capture states are compiled into the Thompson NFA.
+ ///
+ /// This is useful when capture states are either not needed (for example,
+ /// if one is only trying to build a DFA) or if they aren't supported (for
+ /// example, a reverse NFA).
+ None,
+}
+
+impl Default for WhichCaptures {
+ fn default() -> WhichCaptures {
+ WhichCaptures::All
+ }
+}
+
+impl WhichCaptures {
+ /// Returns true if this configuration indicates that no capture states
+ /// should be produced in an NFA.
+ pub fn is_none(&self) -> bool {
+ matches!(*self, WhichCaptures::None)
+ }
+
+ /// Returns true if this configuration indicates that some capture states
+ /// should be added to an NFA. Note that this might only include capture
+ /// states for implicit capture groups.
+ pub fn is_any(&self) -> bool {
+ !self.is_none()
+ }
+}
+
+/*
+This compiler below uses Thompson's construction algorithm. The compiler takes
+a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph
+is structured in a way that permits it to be executed by a virtual machine and
+also used to efficiently build a DFA.
+
+The compiler deals with a slightly expanded set of NFA states compared to what
+is in a final NFA (as exhibited by builder::State and nfa::State). Notably, a
+compiler state includes an empty node that has exactly one unconditional
+epsilon transition to the next state. In other words, it's a "goto" instruction
+if one views Thompson's NFA as a set of bytecode instructions. These goto
+instructions are removed in a subsequent phase before returning the NFA to the
+caller. The purpose of these empty nodes is that they make the construction
+algorithm substantially simpler to implement. We remove them before returning
+to the caller because they can represent substantial overhead when traversing
+the NFA graph (either while searching using the NFA directly or while building
+a DFA).
+
+In the future, it would be nice to provide a Glushkov compiler as well, as it
+would work well as a bit-parallel NFA for smaller regexes. But the Thompson
+construction is one I'm more familiar with and seems more straight-forward to
+deal with when it comes to large Unicode character classes.
+
+Internally, the compiler uses interior mutability to improve composition in the
+face of the borrow checker. In particular, we'd really like to be able to write
+things like this:
+
+ self.c_concat(exprs.iter().map(|e| self.c(e)))
+
+Which elegantly uses iterators to build up a sequence of compiled regex
+sub-expressions and then hands it off to the concatenating compiler routine.
+Without interior mutability, the borrow checker won't let us borrow `self`
+mutably both inside and outside the closure at the same time.
+*/
+
+/// A builder for compiling an NFA from a regex's high-level intermediate
+/// representation (HIR).
+///
+/// This compiler provides a way to translate a parsed regex pattern into an
+/// NFA state graph. The NFA state graph can either be used directly to execute
+/// a search (e.g., with a Pike VM), or it can be further used to build a DFA.
+///
+/// This compiler provides APIs for compiling regex patterns either directly
+/// from their concrete syntax or via a [`regex_syntax::hir::Hir`].
+///
+/// This compiler has various options that may be configured via
+/// [`thompson::Config`](Config).
+///
+/// Note that a compiler is not the same as a [`thompson::Builder`](Builder).
+/// A `Builder` provides a lower level API that is uncoupled from a regex
+/// pattern's concrete syntax or even its HIR. Instead, it permits stitching
+/// together an NFA by hand. See its docs for examples.
+///
+/// # Example: compilation from concrete syntax
+///
+/// This shows how to compile an NFA from a pattern string while setting a size
+/// limit on how big the NFA is allowed to be (in terms of bytes of heap used).
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::{NFA, pikevm::PikeVM},
+/// Match,
+/// };
+///
+/// let config = NFA::config().nfa_size_limit(Some(1_000));
+/// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
+///
+/// let re = PikeVM::new_from_nfa(nfa)?;
+/// let mut cache = re.create_cache();
+/// let mut caps = re.create_captures();
+/// let expected = Some(Match::must(0, 3..4));
+/// re.captures(&mut cache, "!@#A#@!", &mut caps);
+/// assert_eq!(expected, caps.get_match());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: compilation from HIR
+///
+/// This shows how to hand assemble a regular expression via its HIR, and then
+/// compile an NFA directly from it.
+///
+/// ```
+/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
+///
+/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
+/// ClassBytesRange::new(b'0', b'9'),
+/// ClassBytesRange::new(b'A', b'Z'),
+/// ClassBytesRange::new(b'_', b'_'),
+/// ClassBytesRange::new(b'a', b'z'),
+/// ])));
+///
+/// let config = NFA::config().nfa_size_limit(Some(1_000));
+/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
+///
+/// let re = PikeVM::new_from_nfa(nfa)?;
+/// let mut cache = re.create_cache();
+/// let mut caps = re.create_captures();
+/// let expected = Some(Match::must(0, 3..4));
+/// re.captures(&mut cache, "!@#A#@!", &mut caps);
+/// assert_eq!(expected, caps.get_match());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
#[derive(Clone, Debug)]
-pub struct Builder {
- config: Config,
+pub struct Compiler {
+ /// A regex parser, used when compiling an NFA directly from a pattern
+ /// string.
parser: ParserBuilder,
+ /// The compiler configuration.
+ config: Config,
+ /// The builder for actually constructing an NFA. This provides a
+ /// convenient abstraction for writing a compiler.
+ builder: RefCell<Builder>,
+ /// State used for compiling character classes to UTF-8 byte automata.
+ /// State is not retained between character class compilations. This just
+ /// serves to amortize allocation to the extent possible.
+ utf8_state: RefCell<Utf8State>,
+ /// State used for arranging character classes in reverse into a trie.
+ trie_state: RefCell<RangeTrie>,
+ /// State used for caching common suffixes when compiling reverse UTF-8
+ /// automata (for Unicode character classes).
+ utf8_suffix: RefCell<Utf8SuffixMap>,
}
-impl Builder {
+impl Compiler {
/// Create a new NFA builder with its default configuration.
- pub fn new() -> Builder {
- Builder { config: Config::default(), parser: ParserBuilder::new() }
+ pub fn new() -> Compiler {
+ Compiler {
+ parser: ParserBuilder::new(),
+ config: Config::default(),
+ builder: RefCell::new(Builder::new()),
+ utf8_state: RefCell::new(Utf8State::new()),
+ trie_state: RefCell::new(RangeTrie::new()),
+ utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
+ }
}
- /// Compile the given regular expression into an NFA.
+ /// Compile the given regular expression pattern into an NFA.
///
/// If there was a problem parsing the regex, then that error is returned.
///
/// Otherwise, if there was a problem building the NFA, then an error is
/// returned. The only error that can occur is if the compiled regex would
- /// exceed the size limits configured on this builder.
- pub fn build(&self, pattern: &str) -> Result<NFA, Error> {
+ /// exceed the size limits configured on this builder, or if any part of
+ /// the NFA would exceed the integer representations used. (For example,
+ /// too many states might plausibly occur on a 16-bit target.)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
+ ///
+ /// let re = PikeVM::new_from_nfa(nfa)?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ /// let expected = Some(Match::must(0, 3..4));
+ /// re.captures(&mut cache, "!@#A#@!", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> {
self.build_many(&[pattern])
}
+ /// Compile the given regular expression patterns into a single NFA.
+ ///
+ /// When matches are returned, the pattern ID corresponds to the index of
+ /// the pattern in the slice given.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build_many(&[
+ /// r"(?-u)\s",
+ /// r"(?-u)\w",
+ /// ])?;
+ ///
+ /// let re = PikeVM::new_from_nfa(nfa)?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ /// let expected = Some(Match::must(1, 1..2));
+ /// re.captures(&mut cache, "!A! !A!", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
pub fn build_many<P: AsRef<str>>(
&self,
patterns: &[P],
- ) -> Result<NFA, Error> {
+ ) -> Result<NFA, BuildError> {
let mut hirs = vec![];
for p in patterns {
hirs.push(
self.parser
.build()
.parse(p.as_ref())
- .map_err(Error::syntax)?,
+ .map_err(BuildError::syntax)?,
);
- log!(log::trace!("parsed: {:?}", p.as_ref()));
+ debug!("parsed: {:?}", p.as_ref());
}
self.build_many_from_hir(&hirs)
}
@@ -296,418 +804,219 @@ impl Builder {
///
/// If there was a problem building the NFA, then an error is returned. The
/// only error that can occur is if the compiled regex would exceed the
- /// size limits configured on this builder.
- pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, Error> {
- self.build_from_hir_with(&mut Compiler::new(), expr)
+ /// size limits configured on this builder, or if any part of the NFA would
+ /// exceed the integer representations used. (For example, too many states
+ /// might plausibly occur on a 16-bit target.)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+ /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
+ ///
+ /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
+ /// ClassBytesRange::new(b'0', b'9'),
+ /// ClassBytesRange::new(b'A', b'Z'),
+ /// ClassBytesRange::new(b'_', b'_'),
+ /// ClassBytesRange::new(b'a', b'z'),
+ /// ])));
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
+ ///
+ /// let re = PikeVM::new_from_nfa(nfa)?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ /// let expected = Some(Match::must(0, 3..4));
+ /// re.captures(&mut cache, "!@#A#@!", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> {
+ self.build_many_from_hir(&[expr])
}
+ /// Compile the given high level intermediate representations of regular
+ /// expressions into a single NFA.
+ ///
+ /// When matches are returned, the pattern ID corresponds to the index of
+ /// the pattern in the slice given.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+ /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
+ ///
+ /// let hirs = &[
+ /// Hir::class(Class::Bytes(ClassBytes::new(vec![
+ /// ClassBytesRange::new(b'\t', b'\r'),
+ /// ClassBytesRange::new(b' ', b' '),
+ /// ]))),
+ /// Hir::class(Class::Bytes(ClassBytes::new(vec![
+ /// ClassBytesRange::new(b'0', b'9'),
+ /// ClassBytesRange::new(b'A', b'Z'),
+ /// ClassBytesRange::new(b'_', b'_'),
+ /// ClassBytesRange::new(b'a', b'z'),
+ /// ]))),
+ /// ];
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
+ ///
+ /// let re = PikeVM::new_from_nfa(nfa)?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ /// let expected = Some(Match::must(1, 1..2));
+ /// re.captures(&mut cache, "!A! !A!", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
pub fn build_many_from_hir<H: Borrow<Hir>>(
&self,
exprs: &[H],
- ) -> Result<NFA, Error> {
- self.build_many_from_hir_with(&mut Compiler::new(), exprs)
- }
-
- /// Compile the given high level intermediate representation of a regular
- /// expression into the NFA given using the given compiler. Callers may
- /// prefer this over `build` if they would like to reuse allocations while
- /// compiling many regular expressions.
- ///
- /// On success, the given NFA is completely overwritten with the NFA
- /// produced by the compiler.
- ///
- /// If there was a problem building the NFA, then an error is returned.
- /// The only error that can occur is if the compiled regex would exceed
- /// the size limits configured on this builder. When an error is returned,
- /// the contents of `nfa` are unspecified and should not be relied upon.
- /// However, it can still be reused in subsequent calls to this method.
- fn build_from_hir_with(
- &self,
- compiler: &mut Compiler,
- expr: &Hir,
- ) -> Result<NFA, Error> {
- self.build_many_from_hir_with(compiler, &[expr])
- }
-
- fn build_many_from_hir_with<H: Borrow<Hir>>(
- &self,
- compiler: &mut Compiler,
- exprs: &[H],
- ) -> Result<NFA, Error> {
- compiler.configure(self.config);
- compiler.compile(exprs)
+ ) -> Result<NFA, BuildError> {
+ self.compile(exprs)
}
/// Apply the given NFA configuration options to this builder.
- pub fn configure(&mut self, config: Config) -> &mut Builder {
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
+ /// assert_eq!(nfa.pattern_len(), 1);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn configure(&mut self, config: Config) -> &mut Compiler {
self.config = self.config.overwrite(config);
self
}
/// Set the syntax configuration for this builder using
- /// [`SyntaxConfig`](../../struct.SyntaxConfig.html).
+ /// [`syntax::Config`](crate::util::syntax::Config).
///
/// This permits setting things like case insensitivity, Unicode and multi
/// line mode.
///
- /// This syntax configuration generally only applies when an NFA is built
- /// directly from a pattern string. If an NFA is built from an HIR, then
- /// all syntax settings are ignored.
+ /// This syntax configuration only applies when an NFA is built directly
+ /// from a pattern string. If an NFA is built from an HIR, then all syntax
+ /// settings are ignored.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::syntax};
+ ///
+ /// let syntax_config = syntax::Config::new().unicode(false);
+ /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
+ /// // If Unicode were enabled, the number of states would be much bigger.
+ /// assert!(nfa.states().len() < 15);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
pub fn syntax(
&mut self,
- config: crate::util::syntax::SyntaxConfig,
- ) -> &mut Builder {
+ config: crate::util::syntax::Config,
+ ) -> &mut Compiler {
config.apply(&mut self.parser);
self
}
}
-/// A compiler that converts a regex abstract syntax to an NFA via Thompson's
-/// construction. Namely, this compiler permits epsilon transitions between
-/// states.
-#[derive(Clone, Debug)]
-pub struct Compiler {
- /// The configuration from the builder.
- config: Config,
- /// The final NFA that is built.
- ///
- /// Parts of this NFA are constructed during compilation, but the actual
- /// states aren't added until a final "finish" step. This is because the
- /// states constructed during compilation have unconditional epsilon
- /// transitions, which makes the logic of compilation much simpler. The
- /// "finish" step removes these unconditional epsilon transitions and must
- /// therefore remap all of the transition state IDs.
- nfa: RefCell<NFA>,
- /// The set of compiled NFA states. Once a state is compiled, it is
- /// assigned a state ID equivalent to its index in this list. Subsequent
- /// compilation can modify previous states by adding new transitions.
- states: RefCell<Vec<CState>>,
- /// State used for compiling character classes to UTF-8 byte automata.
- /// State is not retained between character class compilations. This just
- /// serves to amortize allocation to the extent possible.
- utf8_state: RefCell<Utf8State>,
- /// State used for arranging character classes in reverse into a trie.
- trie_state: RefCell<RangeTrie>,
- /// State used for caching common suffixes when compiling reverse UTF-8
- /// automata (for Unicode character classes).
- utf8_suffix: RefCell<Utf8SuffixMap>,
- /// A map used to re-map state IDs when translating the compiler's internal
- /// NFA state representation to the external NFA representation.
- remap: RefCell<Vec<StateID>>,
- /// A set of compiler internal state IDs that correspond to states that are
- /// exclusively epsilon transitions, i.e., goto instructions, combined with
- /// the state that they point to. This is used to record said states while
- /// transforming the compiler's internal NFA representation to the external
- /// form.
- empties: RefCell<Vec<(StateID, StateID)>>,
- /// The total memory used by each of the 'CState's in 'states'. This only
- /// includes heap usage by each state, and not the size of the state
- /// itself.
- memory_cstates: Cell<usize>,
-}
-
-/// A compiler intermediate state representation for an NFA that is only used
-/// during compilation. Once compilation is done, `CState`s are converted
-/// to `State`s (defined in the parent module), which have a much simpler
-/// representation.
-#[derive(Clone, Debug, Eq, PartialEq)]
-enum CState {
- /// An empty state whose only purpose is to forward the automaton to
- /// another state via en epsilon transition. These are useful during
- /// compilation but are otherwise removed at the end.
- Empty {
- next: StateID,
- },
- /// An empty state that records a capture location.
- ///
- /// From the perspective of finite automata, this is precisely equivalent
- /// to 'Empty', but serves the purpose of instructing NFA simulations to
- /// record additional state when the finite state machine passes through
- /// this epsilon transition.
- ///
- /// These transitions are treated as epsilon transitions with no additional
- /// effects in DFAs.
- ///
- /// 'slot' in this context refers to the specific capture group offset that
- /// is being recorded. Each capturing group has two slots corresponding to
- /// the start and end of the matching portion of that group.
- CaptureStart {
- next: StateID,
- capture_index: u32,
- name: Option<Arc<str>>,
- },
- CaptureEnd {
- next: StateID,
- capture_index: u32,
- },
- /// A state that only transitions to `next` if the current input byte is
- /// in the range `[start, end]` (inclusive on both ends).
- Range {
- range: Transition,
- },
- /// A state with possibly many transitions, represented in a sparse
- /// fashion. Transitions are ordered lexicographically by input range.
- /// As such, this may only be used when every transition has equal
- /// priority. (In practice, this is only used for encoding large UTF-8
- /// automata.) In contrast, a `Union` state has each alternate in order
- /// of priority. Priority is used to implement greedy matching and also
- /// alternations themselves, e.g., `abc|a` where `abc` has priority over
- /// `a`.
- ///
- /// To clarify, it is possible to remove `Sparse` and represent all things
- /// that `Sparse` is used for via `Union`. But this creates a more bloated
- /// NFA with more epsilon transitions than is necessary in the special case
- /// of character classes.
- Sparse {
- ranges: Vec<Transition>,
- },
- /// A conditional epsilon transition satisfied via some sort of
- /// look-around.
- Look {
- look: Look,
- next: StateID,
- },
- /// An alternation such that there exists an epsilon transition to all
- /// states in `alternates`, where matches found via earlier transitions
- /// are preferred over later transitions.
- Union {
- alternates: Vec<StateID>,
- },
- /// An alternation such that there exists an epsilon transition to all
- /// states in `alternates`, where matches found via later transitions are
- /// preferred over earlier transitions.
- ///
- /// This "reverse" state exists for convenience during compilation that
- /// permits easy construction of non-greedy combinations of NFA states. At
- /// the end of compilation, Union and UnionReverse states are merged into
- /// one Union type of state, where the latter has its epsilon transitions
- /// reversed to reflect the priority inversion.
- ///
- /// The "convenience" here arises from the fact that as new states are
- /// added to the list of `alternates`, we would like that add operation
- /// to be amortized constant time. But if we used a `Union`, we'd need to
- /// prepend the state, which takes O(n) time. There are other approaches we
- /// could use to solve this, but this seems simple enough.
- UnionReverse {
- alternates: Vec<StateID>,
- },
- /// A match state. There is at most one such occurrence of this state in
- /// an NFA for each pattern compiled into the NFA. At time of writing, a
- /// match state is always produced for every pattern given, but in theory,
- /// if a pattern can never lead to a match, then the match state could be
- /// omitted.
- ///
- /// `id` refers to the ID of the pattern itself, which corresponds to the
- /// pattern's index (starting at 0). `start_id` refers to the anchored
- /// NFA starting state corresponding to this pattern.
- Match {
- pattern_id: PatternID,
- start_id: StateID,
- },
-}
-
-/// A value that represents the result of compiling a sub-expression of a
-/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
-/// has an initial state at `start` and a final state at `end`.
-#[derive(Clone, Copy, Debug)]
-pub struct ThompsonRef {
- start: StateID,
- end: StateID,
-}
-
impl Compiler {
- /// Create a new compiler.
- pub fn new() -> Compiler {
- Compiler {
- config: Config::default(),
- nfa: RefCell::new(NFA::empty()),
- states: RefCell::new(vec![]),
- utf8_state: RefCell::new(Utf8State::new()),
- trie_state: RefCell::new(RangeTrie::new()),
- utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
- remap: RefCell::new(vec![]),
- empties: RefCell::new(vec![]),
- memory_cstates: Cell::new(0),
- }
- }
-
- /// Configure and prepare this compiler from the builder's knobs.
+ /// Compile the sequence of HIR expressions given. Pattern IDs are
+ /// allocated starting from 0, in correspondence with the slice given.
///
- /// The compiler is must always reconfigured by the builder before using it
- /// to build an NFA. Namely, this will also clear any latent state in the
- /// compiler used during previous compilations.
- fn configure(&mut self, config: Config) {
- self.config = config;
- self.nfa.borrow_mut().clear();
- self.states.borrow_mut().clear();
- self.memory_cstates.set(0);
- // We don't need to clear anything else since they are cleared on
- // their own and only when they are used.
- }
-
- /// Convert the current intermediate NFA to its final compiled form.
- fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, Error> {
- if exprs.is_empty() {
- return Ok(NFA::never_match());
- }
+ /// It is legal to provide an empty slice. In that case, the NFA returned
+ /// has no patterns and will never match anything.
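+ ///
+ /// For example, here is a minimal sketch of the empty-slice case, driven
+ /// through the public `build_many_from_hir` entry point (which forwards
+ /// to this routine):
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ /// use regex_syntax::hir::Hir;
+ ///
+ /// let nfa = NFA::compiler().build_many_from_hir::<Hir>(&[])?;
+ /// assert_eq!(0, nfa.pattern_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```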
+ fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
if exprs.len() > PatternID::LIMIT {
- return Err(Error::too_many_patterns(exprs.len()));
+ return Err(BuildError::too_many_patterns(exprs.len()));
+ }
+ if self.config.get_reverse()
+ && self.config.get_which_captures().is_any()
+ {
+ return Err(BuildError::unsupported_captures());
}
+ self.builder.borrow_mut().clear();
+ self.builder.borrow_mut().set_utf8(self.config.get_utf8());
+ self.builder.borrow_mut().set_reverse(self.config.get_reverse());
+ self.builder
+ .borrow_mut()
+ .set_look_matcher(self.config.get_look_matcher());
+ self.builder
+ .borrow_mut()
+ .set_size_limit(self.config.get_nfa_size_limit())?;
+
// We always add an unanchored prefix unless we were specifically told
// not to (for tests only), or if we know that the regex is anchored
// for all matches. When an unanchored prefix is not added, then the
// NFA's anchored and unanchored start states are equivalent.
- let all_anchored =
- exprs.iter().all(|e| e.borrow().is_anchored_start());
+ let all_anchored = exprs.iter().all(|e| {
+ e.borrow()
+ .properties()
+ .look_set_prefix()
+ .contains(hir::Look::Start)
+ });
let anchored = !self.config.get_unanchored_prefix() || all_anchored;
let unanchored_prefix = if anchored {
self.c_empty()?
} else {
- if self.config.get_utf8() {
- self.c_unanchored_prefix_valid_utf8()?
- } else {
- self.c_unanchored_prefix_invalid_utf8()?
- }
+ self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
};
- let compiled = self.c_alternation(
- exprs.iter().with_pattern_ids().map(|(pid, e)| {
- let group_kind = hir::GroupKind::CaptureIndex(0);
- let one = self.c_group(&group_kind, e.borrow())?;
- let match_state_id = self.add_match(pid, one.start)?;
- self.patch(one.end, match_state_id)?;
- Ok(ThompsonRef { start: one.start, end: match_state_id })
- }),
- )?;
+ let compiled = self.c_alt_iter(exprs.iter().map(|e| {
+ let _ = self.start_pattern()?;
+ let one = self.c_cap(0, None, e.borrow())?;
+ let match_state_id = self.add_match()?;
+ self.patch(one.end, match_state_id)?;
+ let _ = self.finish_pattern(one.start)?;
+ Ok(ThompsonRef { start: one.start, end: match_state_id })
+ }))?;
self.patch(unanchored_prefix.end, compiled.start)?;
- self.finish(compiled.start, unanchored_prefix.start)?;
- Ok(self.nfa.replace(NFA::empty()))
- }
+ let nfa = self
+ .builder
+ .borrow_mut()
+ .build(compiled.start, unanchored_prefix.start)?;
- /// Finishes the compilation process and populates the NFA attached to this
- /// compiler with the final graph.
- fn finish(
- &self,
- start_anchored: StateID,
- start_unanchored: StateID,
- ) -> Result<(), Error> {
- trace!(
- "intermediate NFA compilation complete, \
- intermediate NFA size: {} states, {} bytes on heap",
- self.states.borrow().len(),
- self.nfa_memory_usage(),
- );
- let mut nfa = self.nfa.borrow_mut();
- let mut bstates = self.states.borrow_mut();
- let mut remap = self.remap.borrow_mut();
- let mut empties = self.empties.borrow_mut();
- remap.resize(bstates.len(), StateID::ZERO);
- empties.clear();
-
- // The idea here is to convert our intermediate states to their final
- // form. The only real complexity here is the process of converting
- // transitions, which are expressed in terms of state IDs. The new
- // set of states will be smaller because of partial epsilon removal,
- // so the state IDs will not be the same.
- for (sid, bstate) in bstates.iter_mut().with_state_ids() {
- match *bstate {
- CState::Empty { next } => {
- // Since we're removing empty states, we need to handle
- // them later since we don't yet know which new state this
- // empty state will be mapped to.
- empties.push((sid, next));
- }
- CState::CaptureStart { next, capture_index, ref name } => {
- // We can't remove this empty state because of the side
- // effect of capturing an offset for this capture slot.
- remap[sid] = nfa.add_capture_start(
- next,
- capture_index,
- name.clone(),
- )?;
- }
- CState::CaptureEnd { next, capture_index } => {
- // We can't remove this empty state because of the side
- // effect of capturing an offset for this capture slot.
- remap[sid] = nfa.add_capture_end(next, capture_index)?;
- }
- CState::Range { range } => {
- remap[sid] = nfa.add_range(range)?;
- }
- CState::Sparse { ref mut ranges } => {
- let ranges =
- mem::replace(ranges, vec![]).into_boxed_slice();
- remap[sid] =
- nfa.add_sparse(SparseTransitions { ranges })?;
- }
- CState::Look { look, next } => {
- remap[sid] = nfa.add_look(next, look)?;
- }
- CState::Union { ref mut alternates } => {
- let alternates =
- mem::replace(alternates, vec![]).into_boxed_slice();
- remap[sid] = nfa.add_union(alternates)?;
- }
- CState::UnionReverse { ref mut alternates } => {
- let mut alternates =
- mem::replace(alternates, vec![]).into_boxed_slice();
- alternates.reverse();
- remap[sid] = nfa.add_union(alternates)?;
- }
- CState::Match { start_id, .. } => {
- remap[sid] = nfa.add_match()?;
- nfa.finish_pattern(start_id)?;
- }
- }
- }
- for &(empty_id, mut empty_next) in empties.iter() {
- // empty states can point to other empty states, forming a chain.
- // So we must follow the chain until the end, which must end at
- // a non-empty state, and therefore, a state that is correctly
- // remapped. We are guaranteed to terminate because our compiler
- // never builds a loop among only empty states.
- while let CState::Empty { next } = bstates[empty_next] {
- empty_next = next;
- }
- remap[empty_id] = remap[empty_next];
- }
- nfa.set_start_anchored(start_anchored);
- nfa.set_start_unanchored(start_unanchored);
- nfa.remap(&remap);
- trace!(
- "final NFA (reverse? {:?}) compilation complete, \
- final NFA size: {} states, {} bytes on heap",
- self.config.get_reverse(),
- nfa.states().len(),
- nfa.memory_usage(),
- );
- Ok(())
+ debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
+ Ok(nfa)
}
- fn c(&self, expr: &Hir) -> Result<ThompsonRef, Error> {
+ /// Compile an arbitrary HIR expression.
+ fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> {
+ use regex_syntax::hir::{Class, HirKind::*};
+
match *expr.kind() {
- HirKind::Empty => self.c_empty(),
- HirKind::Literal(Literal::Unicode(ch)) => self.c_char(ch),
- HirKind::Literal(Literal::Byte(b)) => self.c_range(b, b),
- HirKind::Class(Class::Bytes(ref c)) => self.c_byte_class(c),
- HirKind::Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
- HirKind::Anchor(ref anchor) => self.c_anchor(anchor),
- HirKind::WordBoundary(ref wb) => self.c_word_boundary(wb),
- HirKind::Repetition(ref rep) => self.c_repetition(rep),
- HirKind::Group(ref group) => self.c_group(&group.kind, &group.hir),
- HirKind::Concat(ref es) => {
- self.c_concat(es.iter().map(|e| self.c(e)))
- }
- HirKind::Alternation(ref es) => {
- self.c_alternation(es.iter().map(|e| self.c(e)))
- }
+ Empty => self.c_empty(),
+ Literal(hir::Literal(ref bytes)) => self.c_literal(bytes),
+ Class(Class::Bytes(ref c)) => self.c_byte_class(c),
+ Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
+ Look(ref look) => self.c_look(look),
+ Repetition(ref rep) => self.c_repetition(rep),
+ Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
+ Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
+ Alternation(ref es) => self.c_alt_slice(es),
}
}
- fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
+ /// Compile a concatenation of the sub-expressions yielded by the given
+ /// iterator. If the iterator yields no elements, then this compiles down
+ /// to an "empty" state that always matches.
+ ///
+ /// If the compiler is in reverse mode, then the expressions given are
+ /// automatically compiled in reverse.
+ fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
where
- I: DoubleEndedIterator<Item = Result<ThompsonRef, Error>>,
+ I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
{
let first = if self.is_reverse() { it.next_back() } else { it.next() };
let ThompsonRef { start, mut end } = match first {
@@ -727,11 +1036,57 @@ impl Compiler {
Ok(ThompsonRef { start, end })
}
- fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
+ /// Compile an alternation of the given HIR values.
+ ///
+ /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead
+ /// of an iterator of compiled NFA subgraphs. The point of accepting a
+ /// slice here is that it opens up some optimization opportunities. For
+ /// example, if all of the HIR values are literals, then this routine might
+ /// re-shuffle them to make NFA epsilon closures substantially faster.
+ fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> {
+ // self.c_alt_iter(exprs.iter().map(|e| self.c(e)))
+ let literal_count = exprs
+ .iter()
+ .filter(|e| {
+ matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_)))
+ })
+ .count();
+ if literal_count <= 1 || literal_count < exprs.len() {
+ return self.c_alt_iter(exprs.iter().map(|e| self.c(e)));
+ }
+
+ let mut trie = if self.is_reverse() {
+ LiteralTrie::reverse()
+ } else {
+ LiteralTrie::forward()
+ };
+ for expr in exprs.iter() {
+ let literal = match *expr.kind() {
+ hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes,
+ _ => unreachable!(),
+ };
+ trie.add(literal)?;
+ }
+ trie.compile(&mut self.builder.borrow_mut())
+ }
+
+ /// Compile an alternation, where each element yielded by the given
+ /// iterator represents an item in the alternation. If the iterator yields
+ /// no elements, then this compiles down to a "fail" state.
+ ///
+ /// In an alternation, expressions appearing earlier are "preferred" at
+ /// match time over expressions appearing later. At least, this is true
+ /// when using "leftmost first" match semantics. (If "leftmost longest" are
+ /// ever added in the future, then this preference order of priority would
+ /// not apply in that mode.)
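+ ///
+ /// As a sketch of that preference order, observed through the public
+ /// `PikeVM` API rather than this private routine:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::new(r"a+|ab")?;
+ /// let mut cache = re.create_cache();
+ /// // 'ab' could match the whole haystack, but the earlier 'a+' branch
+ /// // takes priority under leftmost-first semantics.
+ /// let expected = Some(Match::must(0, 0..1));
+ /// assert_eq!(expected, re.find(&mut cache, "ab"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```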
+ fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
where
- I: Iterator<Item = Result<ThompsonRef, Error>>,
+ I: Iterator<Item = Result<ThompsonRef, BuildError>>,
{
- let first = it.next().expect("alternations must be non-empty")?;
+ let first = match it.next() {
+ None => return self.c_fail(),
+ Some(result) => result?,
+ };
let second = match it.next() {
None => return Ok(first),
Some(result) => result?,
@@ -751,66 +1106,64 @@ impl Compiler {
Ok(ThompsonRef { start: union, end })
}
- fn c_group(
+ /// Compile the given capture sub-expression. `expr` should be the
+ /// sub-expression contained inside the capture. If "capture" states are
+ /// enabled, then they are added as appropriate.
+ ///
+ /// This accepts the pieces of a capture instead of a `hir::Capture` so
+ /// that it's easy to manufacture a "fake" group when necessary, e.g., for
+ /// adding the entire pattern as if it were a group in order to create
+ /// appropriate "capture" states in the NFA.
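+ ///
+ /// A sketch of the visible effect of these capture states, via the
+ /// public `PikeVM` API:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
+ ///
+ /// let re = PikeVM::new(r"a(b)c")?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ /// re.captures(&mut cache, "abc", &mut caps);
+ /// // Group 0 is the implicit group corresponding to the whole pattern.
+ /// assert_eq!(Some(Span::from(0..3)), caps.get_group(0));
+ /// assert_eq!(Some(Span::from(1..2)), caps.get_group(1));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```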
+ fn c_cap(
&self,
- kind: &hir::GroupKind,
+ index: u32,
+ name: Option<&str>,
expr: &Hir,
- ) -> Result<ThompsonRef, Error> {
- if !self.config.get_captures() {
- return self.c(expr);
+ ) -> Result<ThompsonRef, BuildError> {
+ match self.config.get_which_captures() {
+ // No capture states means we always skip them.
+ WhichCaptures::None => return self.c(expr),
+ // Implicit captures states means we only add when index==0 since
+ // index==0 implies the group is implicit.
+ WhichCaptures::Implicit if index > 0 => return self.c(expr),
+ _ => {}
}
- let (capi, name) = match *kind {
- hir::GroupKind::NonCapturing => return self.c(expr),
- hir::GroupKind::CaptureIndex(index) => (index, None),
- hir::GroupKind::CaptureName { ref name, index } => {
- (index, Some(Arc::from(&**name)))
- }
- };
- let start = self.add_capture_start(capi, name)?;
+ let start = self.add_capture_start(index, name)?;
let inner = self.c(expr)?;
- let end = self.add_capture_end(capi)?;
-
+ let end = self.add_capture_end(index)?;
self.patch(start, inner.start)?;
self.patch(inner.end, end)?;
Ok(ThompsonRef { start, end })
}
+ /// Compile the given repetition expression. This handles all types of
+ /// repetitions and greediness.
fn c_repetition(
&self,
rep: &hir::Repetition,
- ) -> Result<ThompsonRef, Error> {
- match rep.kind {
- hir::RepetitionKind::ZeroOrOne => {
- self.c_zero_or_one(&rep.hir, rep.greedy)
- }
- hir::RepetitionKind::ZeroOrMore => {
- self.c_at_least(&rep.hir, rep.greedy, 0)
- }
- hir::RepetitionKind::OneOrMore => {
- self.c_at_least(&rep.hir, rep.greedy, 1)
- }
- hir::RepetitionKind::Range(ref rng) => match *rng {
- hir::RepetitionRange::Exactly(count) => {
- self.c_exactly(&rep.hir, count)
- }
- hir::RepetitionRange::AtLeast(m) => {
- self.c_at_least(&rep.hir, rep.greedy, m)
- }
- hir::RepetitionRange::Bounded(min, max) => {
- self.c_bounded(&rep.hir, rep.greedy, min, max)
- }
- },
+ ) -> Result<ThompsonRef, BuildError> {
+ match (rep.min, rep.max) {
+ (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
+ (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
+ (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
+ (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
}
}
+ /// Compile the given expression such that it matches at least `min` times,
+ /// but no more than `max` times.
+ ///
+ /// When `greedy` is true, then the preference is for the expression to
+ /// match as much as possible. Otherwise, it will match as little as
+ /// possible.
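+ ///
+ /// For instance, a sketch of how greediness changes what a bounded
+ /// repetition consumes, via the public `PikeVM` API:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::new(r"a{2,4}")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(Some(Match::must(0, 0..4)), re.find(&mut cache, "aaaaa"));
+ ///
+ /// // The non-greedy variant stops as soon as the minimum is satisfied.
+ /// let re = PikeVM::new(r"a{2,4}?")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(Some(Match::must(0, 0..2)), re.find(&mut cache, "aaaaa"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```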
fn c_bounded(
&self,
expr: &Hir,
greedy: bool,
min: u32,
max: u32,
- ) -> Result<ThompsonRef, Error> {
+ ) -> Result<ThompsonRef, BuildError> {
let prefix = self.c_exactly(expr, min)?;
if min == max {
return Ok(prefix);
@@ -851,7 +1204,7 @@ impl Compiler {
let union = if greedy {
self.add_union()
} else {
- self.add_reverse_union()
+ self.add_union_reverse()
}?;
let compiled = self.c(expr)?;
self.patch(prev_end, union)?;
@@ -863,22 +1216,29 @@ impl Compiler {
Ok(ThompsonRef { start: prefix.start, end: empty })
}
+ /// Compile the given expression such that it may be matched `n` or more
+ /// times, where `n` can be any integer. (Although a particularly large
+ /// integer is likely to run afoul of any configured size limits.)
+ ///
+ /// When `greedy` is true, then the preference is for the expression to
+ /// match as much as possible. Otherwise, it will match as little as
+ /// possible.
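+ ///
+ /// For instance, a sketch of the `{n,}` behavior for both greediness
+ /// settings, via the public `PikeVM` API:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::new(r"a{3,}")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(Some(Match::must(0, 0..5)), re.find(&mut cache, "aaaaa"));
+ ///
+ /// let re = PikeVM::new(r"a{3,}?")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "aaaaa"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```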
fn c_at_least(
&self,
expr: &Hir,
greedy: bool,
n: u32,
- ) -> Result<ThompsonRef, Error> {
+ ) -> Result<ThompsonRef, BuildError> {
if n == 0 {
// When the expression cannot match the empty string, then we
// can get away with something much simpler: just one 'alt'
// instruction that optionally repeats itself. But if the expr
// can match the empty string... see below.
- if !expr.is_match_empty() {
+ if expr.properties().minimum_len().map_or(false, |len| len > 0) {
let union = if greedy {
self.add_union()
} else {
- self.add_reverse_union()
+ self.add_union_reverse()
}?;
let compiled = self.c(expr)?;
self.patch(union, compiled.start)?;
@@ -898,7 +1258,7 @@ impl Compiler {
let plus = if greedy {
self.add_union()
} else {
- self.add_reverse_union()
+ self.add_union_reverse()
}?;
self.patch(compiled.end, plus)?;
self.patch(plus, compiled.start)?;
@@ -906,7 +1266,7 @@ impl Compiler {
let question = if greedy {
self.add_union()
} else {
- self.add_reverse_union()
+ self.add_union_reverse()
}?;
let empty = self.add_empty()?;
self.patch(question, compiled.start)?;
@@ -918,7 +1278,7 @@ impl Compiler {
let union = if greedy {
self.add_union()
} else {
- self.add_reverse_union()
+ self.add_union_reverse()
}?;
self.patch(compiled.end, union)?;
self.patch(union, compiled.start)?;
@@ -929,7 +1289,7 @@ impl Compiler {
let union = if greedy {
self.add_union()
} else {
- self.add_reverse_union()
+ self.add_union_reverse()
}?;
self.patch(prefix.end, last.start)?;
self.patch(last.end, union)?;
@@ -938,13 +1298,19 @@ impl Compiler {
}
}
+ /// Compile the given expression such that it may be matched zero or one
+ /// times.
+ ///
+ /// When `greedy` is true, then the preference is for the expression to
+ /// match as much as possible. Otherwise, it will match as little as
+ /// possible.
fn c_zero_or_one(
&self,
expr: &Hir,
greedy: bool,
- ) -> Result<ThompsonRef, Error> {
+ ) -> Result<ThompsonRef, BuildError> {
let union =
- if greedy { self.add_union() } else { self.add_reverse_union() }?;
+ if greedy { self.add_union() } else { self.add_union_reverse() }?;
let compiled = self.c(expr)?;
let empty = self.add_empty()?;
self.patch(union, compiled.start)?;
@@ -953,15 +1319,30 @@ impl Compiler {
Ok(ThompsonRef { start: union, end: empty })
}
- fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef, Error> {
+ /// Compile the given HIR expression exactly `n` times.
+ fn c_exactly(
+ &self,
+ expr: &Hir,
+ n: u32,
+ ) -> Result<ThompsonRef, BuildError> {
let it = (0..n).map(|_| self.c(expr));
self.c_concat(it)
}
+ /// Compile the given byte oriented character class.
+ ///
+ /// This uses "sparse" states to represent an alternation between ranges in
+ /// this character class. We can use "sparse" states instead of stitching
+ /// together a "union" state because all ranges in a character class have
+ /// equal priority *and* are non-overlapping (thus, only one can match, so
+ /// there's never a question of priority in the first place). This saves a
+ /// fair bit of overhead when traversing an NFA.
+ ///
+ /// This routine compiles an empty character class into a "fail" state.
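+ ///
+ /// A sketch of the observable effect (the exact state count is an
+ /// implementation detail, so the assertion below is deliberately loose):
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// // Four byte ranges, but the class itself compiles to a single
+ /// // "sparse" state rather than a union of four alternatives.
+ /// let nfa = NFA::compiler().build(r"(?-u)[0-9A-Z_a-z]")?;
+ /// assert!(nfa.states().len() < 10);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```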
fn c_byte_class(
&self,
cls: &hir::ClassBytes,
- ) -> Result<ThompsonRef, Error> {
+ ) -> Result<ThompsonRef, BuildError> {
let end = self.add_empty()?;
let mut trans = Vec::with_capacity(cls.ranges().len());
for r in cls.iter() {
@@ -974,22 +1355,36 @@ impl Compiler {
Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
}
+ /// Compile the given Unicode character class.
+ ///
+ /// This routine specifically tries to use various types of compression,
+ /// since UTF-8 automata for large classes can get quite big. The specific
+ /// type of compression used depends on forward vs reverse compilation, and
+ /// whether NFA shrinking is enabled or not.
+ ///
+ /// Aside from repetitions producing lots of repeated group compilations,
+ /// this is likely the single most expensive part of regex compilation.
+ /// Therefore, a large part of the expense of compilation may be reduced by
+ /// disabling Unicode in the pattern.
+ ///
+ /// This routine compiles an empty character class into a "fail" state.
fn c_unicode_class(
&self,
cls: &hir::ClassUnicode,
- ) -> Result<ThompsonRef, Error> {
+ ) -> Result<ThompsonRef, BuildError> {
// If all we have are ASCII ranges wrapped in a Unicode package, then
// there is zero reason to bring out the big guns. We can fit all ASCII
// ranges within a single sparse state.
- if cls.is_all_ascii() {
+ if cls.is_ascii() {
let end = self.add_empty()?;
let mut trans = Vec::with_capacity(cls.ranges().len());
for r in cls.iter() {
- assert!(r.start() <= '\x7F');
- assert!(r.end() <= '\x7F');
+ // The unwraps below are OK because we've verified that this
+ // class only contains ASCII codepoints.
trans.push(Transition {
- start: r.start() as u8,
- end: r.end() as u8,
+ // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
+ start: u8::try_from(u32::from(r.start())).unwrap(),
+ end: u8::try_from(u32::from(r.end())).unwrap(),
next: end,
});
}
@@ -1022,8 +1417,10 @@ impl Compiler {
trie.insert(seq.as_slice());
}
}
+ let mut builder = self.builder.borrow_mut();
let mut utf8_state = self.utf8_state.borrow_mut();
- let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
+ let mut utf8c =
+ Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
trie.iter(|seq| {
utf8c.add(&seq)?;
Ok(())
@@ -1035,8 +1432,10 @@ impl Compiler {
// because we can stream it right into the UTF-8 compiler. There
// is almost no downside (in either memory or time) to using this
// approach.
+ let mut builder = self.builder.borrow_mut();
let mut utf8_state = self.utf8_state.borrow_mut();
- let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
+ let mut utf8c =
+ Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
for rng in cls.iter() {
for seq in Utf8Sequences::new(rng.start(), rng.end()) {
utf8c.add(seq.as_slice())?;
@@ -1058,7 +1457,23 @@ impl Compiler {
//
// The code below is kept as a reference point in order to make it
// easier to understand the higher level goal here. Although, it will
- // almost certainly bit-rot, so keep that in mind.
+ // almost certainly bit-rot, so keep that in mind. Also, if you try to
+ // use it, some of the tests in this module will fail because they look
+ // for terser byte code produced by the more optimized handling above.
+ // But the integration test suite should still pass.
+ //
+ // One good example of the substantial difference this can make is to
+ // compare and contrast performance of the Pike VM when the code below
+ // is active vs the code above. Here's an example to try:
+ //
+ // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru'
+ //
+ // With Unicode classes generated below, this search takes about 45s on
+ // my machine. But with the compressed version above, the search takes
+ // only around 1.4s. The NFA is also 20% smaller. This is in part due
+ // to the compression, but also because of the utilization of 'sparse'
+ // NFA states. They lead to much less state shuffling during the NFA
+ // search.
/*
let it = cls
.iter()
@@ -1070,14 +1485,29 @@ impl Compiler {
.map(|rng| self.c_range(rng.start, rng.end));
self.c_concat(it)
});
- self.c_alternation(it)
+ self.c_alt_iter(it)
*/
}
+ /// Compile the given Unicode character class in reverse with suffix
+ /// caching.
+ ///
+ /// This is a "quick" way to compile large Unicode classes into reverse
+ /// UTF-8 automata while doing a small amount of compression on those
+ /// automata by reusing common suffixes.
+ ///
+ /// A more comprehensive compression scheme can be accomplished by using
+ /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
+ /// ranges, and then use Daciuk's algorithm via `Utf8Compiler`.
+ ///
+ /// This is the technique used when "NFA shrinking" is disabled.
+ ///
+ /// (This also tries to use "sparse" states where possible, just like
+ /// `c_byte_class` does.)
fn c_unicode_class_reverse_with_suffix(
&self,
cls: &hir::ClassUnicode,
- ) -> Result<ThompsonRef, Error> {
+ ) -> Result<ThompsonRef, BuildError> {
// N.B. It would likely be better to cache common *prefixes* in the
// reverse direction, but it's not quite clear how to do that. The
// advantage of caching suffixes is that it does give us a win, and
@@ -1113,229 +1543,178 @@ impl Compiler {
Ok(ThompsonRef { start: union, end: alt_end })
}
- fn c_anchor(&self, anchor: &Anchor) -> Result<ThompsonRef, Error> {
+ /// Compile the given HIR look-around assertion to an NFA look-around
+ /// assertion.
+ fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
let look = match *anchor {
- Anchor::StartLine => Look::StartLine,
- Anchor::EndLine => Look::EndLine,
- Anchor::StartText => Look::StartText,
- Anchor::EndText => Look::EndText,
+ hir::Look::Start => Look::Start,
+ hir::Look::End => Look::End,
+ hir::Look::StartLF => Look::StartLF,
+ hir::Look::EndLF => Look::EndLF,
+ hir::Look::StartCRLF => Look::StartCRLF,
+ hir::Look::EndCRLF => Look::EndCRLF,
+ hir::Look::WordAscii => Look::WordAscii,
+ hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
+ hir::Look::WordUnicode => Look::WordUnicode,
+ hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
};
let id = self.add_look(look)?;
Ok(ThompsonRef { start: id, end: id })
}
- fn c_word_boundary(
- &self,
- wb: &WordBoundary,
- ) -> Result<ThompsonRef, Error> {
- let look = match *wb {
- WordBoundary::Unicode => Look::WordBoundaryUnicode,
- WordBoundary::UnicodeNegate => Look::WordBoundaryUnicodeNegate,
- WordBoundary::Ascii => Look::WordBoundaryAscii,
- WordBoundary::AsciiNegate => Look::WordBoundaryAsciiNegate,
- };
- let id = self.add_look(look)?;
- Ok(ThompsonRef { start: id, end: id })
- }
-
- fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> {
- let mut buf = [0; 4];
- let it = ch
- .encode_utf8(&mut buf)
- .as_bytes()
- .iter()
- .map(|&b| self.c_range(b, b));
- self.c_concat(it)
+ /// Compile the given byte string to a concatenation of bytes.
+ fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
+ self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
}
- fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, Error> {
+ /// Compile a "range" state with one transition that may only be followed
+ /// if the input byte is in the (inclusive) range given.
+ ///
+ /// Both the `start` and `end` locations point to the state created.
+ /// Callers will likely want to keep the `start`, but patch the `end` to
+ /// point to some other state.
+ fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
let id = self.add_range(start, end)?;
Ok(ThompsonRef { start: id, end: id })
}
- fn c_empty(&self) -> Result<ThompsonRef, Error> {
+ /// Compile an "empty" state with one unconditional epsilon transition.
+ ///
+ /// Both the `start` and `end` locations point to the state created.
+ /// Callers will likely want to keep the `start`, but patch the `end` to
+ /// point to some other state.
+ fn c_empty(&self) -> Result<ThompsonRef, BuildError> {
let id = self.add_empty()?;
Ok(ThompsonRef { start: id, end: id })
}
- fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef, Error> {
- self.c_at_least(&Hir::any(false), false, 0)
+ /// Compile a "fail" state that can never have any outgoing transitions.
+ fn c_fail(&self) -> Result<ThompsonRef, BuildError> {
+ let id = self.add_fail()?;
+ Ok(ThompsonRef { start: id, end: id })
}
- fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef, Error> {
- self.c_at_least(&Hir::any(true), false, 0)
- }
+ // The below helpers are meant to be simple wrappers around the
+ // corresponding Builder methods. For the most part, they let us write
+ // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where
+ // the latter is a mouthful. Some of the methods do inject a little bit
+ // of extra logic, e.g., flipping look-around operators when compiling in
+ // reverse mode.
- fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> {
- let old_memory_cstates = self.memory_cstates.get();
- match self.states.borrow_mut()[from] {
- CState::Empty { ref mut next } => {
- *next = to;
- }
- CState::Range { ref mut range } => {
- range.next = to;
- }
- CState::Sparse { .. } => {
- panic!("cannot patch from a sparse NFA state")
- }
- CState::Look { ref mut next, .. } => {
- *next = to;
- }
- CState::Union { ref mut alternates } => {
- alternates.push(to);
- self.memory_cstates
- .set(old_memory_cstates + mem::size_of::<StateID>());
- }
- CState::UnionReverse { ref mut alternates } => {
- alternates.push(to);
- self.memory_cstates
- .set(old_memory_cstates + mem::size_of::<StateID>());
- }
- CState::CaptureStart { ref mut next, .. } => {
- *next = to;
- }
- CState::CaptureEnd { ref mut next, .. } => {
- *next = to;
- }
- CState::Match { .. } => {}
- }
- if old_memory_cstates != self.memory_cstates.get() {
- self.check_nfa_size_limit()?;
- }
- Ok(())
+ fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
+ self.builder.borrow_mut().patch(from, to)
}
- fn add_empty(&self) -> Result<StateID, Error> {
- self.add_state(CState::Empty { next: StateID::ZERO })
+ fn start_pattern(&self) -> Result<PatternID, BuildError> {
+ self.builder.borrow_mut().start_pattern()
}
- fn add_capture_start(
+ fn finish_pattern(
&self,
- capture_index: u32,
- name: Option<Arc<str>>,
- ) -> Result<StateID, Error> {
- self.add_state(CState::CaptureStart {
- next: StateID::ZERO,
- capture_index,
- name,
- })
+ start_id: StateID,
+ ) -> Result<PatternID, BuildError> {
+ self.builder.borrow_mut().finish_pattern(start_id)
}
- fn add_capture_end(&self, capture_index: u32) -> Result<StateID, Error> {
- self.add_state(CState::CaptureEnd {
- next: StateID::ZERO,
- capture_index,
- })
+ fn add_empty(&self) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_empty()
}
- fn add_range(&self, start: u8, end: u8) -> Result<StateID, Error> {
- let trans = Transition { start, end, next: StateID::ZERO };
- self.add_state(CState::Range { range: trans })
+ fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_range(Transition {
+ start,
+ end,
+ next: StateID::ZERO,
+ })
}
- fn add_sparse(&self, ranges: Vec<Transition>) -> Result<StateID, Error> {
- if ranges.len() == 1 {
- self.add_state(CState::Range { range: ranges[0] })
- } else {
- self.add_state(CState::Sparse { ranges })
- }
+ fn add_sparse(
+ &self,
+ ranges: Vec<Transition>,
+ ) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_sparse(ranges)
}
- fn add_look(&self, mut look: Look) -> Result<StateID, Error> {
+ fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> {
if self.is_reverse() {
look = look.reversed();
}
- self.add_state(CState::Look { look, next: StateID::ZERO })
+ self.builder.borrow_mut().add_look(StateID::ZERO, look)
}
- fn add_union(&self) -> Result<StateID, Error> {
- self.add_state(CState::Union { alternates: vec![] })
+ fn add_union(&self) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_union(vec![])
}
- fn add_reverse_union(&self) -> Result<StateID, Error> {
- self.add_state(CState::UnionReverse { alternates: vec![] })
+ fn add_union_reverse(&self) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_union_reverse(vec![])
}
- fn add_match(
+ fn add_capture_start(
&self,
- pattern_id: PatternID,
- start_id: StateID,
- ) -> Result<StateID, Error> {
- self.add_state(CState::Match { pattern_id, start_id })
- }
-
- fn add_state(&self, state: CState) -> Result<StateID, Error> {
- let mut states = self.states.borrow_mut();
- let id = StateID::new(states.len())
- .map_err(|_| Error::too_many_states(states.len()))?;
- self.memory_cstates
- .set(self.memory_cstates.get() + state.memory_usage());
- states.push(state);
- // If we don't explicitly drop this, then 'nfa_memory_usage' will also
- // try to borrow it when we check the size limit and hit an error.
- drop(states);
- self.check_nfa_size_limit()?;
- Ok(id)
+ capture_index: u32,
+ name: Option<&str>,
+ ) -> Result<StateID, BuildError> {
+ let name = name.map(|n| Arc::from(n));
+ self.builder.borrow_mut().add_capture_start(
+ StateID::ZERO,
+ capture_index,
+ name,
+ )
}
- fn is_reverse(&self) -> bool {
- self.config.get_reverse()
+ fn add_capture_end(
+ &self,
+ capture_index: u32,
+ ) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
}
- /// If an NFA size limit was set, this checks that the NFA compiled so far
- /// fits within that limit. If so, then nothing is returned. Otherwise, an
- /// error is returned.
- ///
- /// This should be called after increasing the heap usage of the
- /// intermediate NFA.
- ///
- /// Note that this borrows 'self.states', so callers should ensure there is
- /// no mutable borrow of it outstanding.
- fn check_nfa_size_limit(&self) -> Result<(), Error> {
- if let Some(limit) = self.config.get_nfa_size_limit() {
- if self.nfa_memory_usage() > limit {
- return Err(Error::exceeded_size_limit(limit));
- }
- }
- Ok(())
+ fn add_fail(&self) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_fail()
}
- /// Returns the heap memory usage, in bytes, of the NFA compiled so far.
- ///
- /// Note that this is an approximation of how big the final NFA will be.
- /// In practice, the final NFA will likely be a bit smaller since it uses
- /// things like `Box<[T]>` instead of `Vec<T>`.
- fn nfa_memory_usage(&self) -> usize {
- self.states.borrow().len() * mem::size_of::<CState>()
- + self.memory_cstates.get()
+ fn add_match(&self) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_match()
}
-}
-impl CState {
- fn memory_usage(&self) -> usize {
- match *self {
- CState::Empty { .. }
- | CState::Range { .. }
- | CState::Look { .. }
- | CState::CaptureStart { .. }
- | CState::CaptureEnd { .. }
- | CState::Match { .. } => 0,
- CState::Sparse { ref ranges } => {
- ranges.len() * mem::size_of::<Transition>()
- }
- CState::Union { ref alternates } => {
- alternates.len() * mem::size_of::<StateID>()
- }
- CState::UnionReverse { ref alternates } => {
- alternates.len() * mem::size_of::<StateID>()
- }
- }
+ fn is_reverse(&self) -> bool {
+ self.config.get_reverse()
}
}
+/// A value that represents the result of compiling a sub-expression of a
+/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
+/// has an initial state at `start` and a final state at `end`.
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct ThompsonRef {
+ pub(crate) start: StateID,
+ pub(crate) end: StateID,
+}
+
+/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
+/// from a lexicographically sorted sequence of strings in linear time.
+///
+/// The trick here is that any Unicode codepoint range can be converted to
+/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
+/// together via an alternation is trivial, and indeed, it works. However,
+/// there is a lot of redundant structure in many UTF-8 automatons. Since our
+/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
+/// to build nearly minimal DFAs in linear time. (They are not guaranteed to
+/// be minimal because we use a bounded cache of previously built DFA states.)
+///
+/// The drawback is that this sadly doesn't work for reverse automata, since
+/// the ranges are no longer in lexicographic order. For that, we invented the
+/// range trie (which gets its own module). Once a range trie is built, we then
+/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
+///
+/// The high level idea is described here:
+/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
+///
+/// There is also another implementation of this in the `fst` crate.
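+///
+/// As a sketch of the kind of input this compiler consumes, the
+/// `regex-syntax` crate can decompose a codepoint range into the
+/// lexicographically sorted UTF-8 byte-range sequences described above:
+///
+/// ```
+/// use regex_syntax::utf8::Utf8Sequences;
+///
+/// for seq in Utf8Sequences::new('\u{0}', '\u{FFFF}') {
+///     println!("{:?}", seq);
+/// }
+/// // Prints one "path" through the UTF-8 automaton per line:
+/// //
+/// // [0-7F]
+/// // [C2-DF][80-BF]
+/// // [E0][A0-BF][80-BF]
+/// // [E1-EC][80-BF][80-BF]
+/// // [ED][80-9F][80-BF]
+/// // [EE-EF][80-BF][80-BF]
+/// ```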
#[derive(Debug)]
struct Utf8Compiler<'a> {
- nfac: &'a Compiler,
+ builder: &'a mut Builder,
state: &'a mut Utf8State,
target: StateID,
}
@@ -1371,24 +1750,24 @@ impl Utf8State {
impl<'a> Utf8Compiler<'a> {
fn new(
- nfac: &'a Compiler,
+ builder: &'a mut Builder,
state: &'a mut Utf8State,
- ) -> Result<Utf8Compiler<'a>, Error> {
- let target = nfac.add_empty()?;
+ ) -> Result<Utf8Compiler<'a>, BuildError> {
+ let target = builder.add_empty()?;
state.clear();
- let mut utf8c = Utf8Compiler { nfac, state, target };
+ let mut utf8c = Utf8Compiler { builder, state, target };
utf8c.add_empty();
Ok(utf8c)
}
- fn finish(&mut self) -> Result<ThompsonRef, Error> {
+ fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
self.compile_from(0)?;
let node = self.pop_root();
let start = self.compile(node)?;
Ok(ThompsonRef { start, end: self.target })
}
- fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), Error> {
+ fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
let prefix_len = ranges
.iter()
.zip(&self.state.uncompiled)
@@ -1404,7 +1783,7 @@ impl<'a> Utf8Compiler<'a> {
Ok(())
}
- fn compile_from(&mut self, from: usize) -> Result<(), Error> {
+ fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
let mut next = self.target;
while from + 1 < self.state.uncompiled.len() {
let node = self.pop_freeze(next);
@@ -1414,12 +1793,15 @@ impl<'a> Utf8Compiler<'a> {
Ok(())
}
- fn compile(&mut self, node: Vec<Transition>) -> Result<StateID, Error> {
+ fn compile(
+ &mut self,
+ node: Vec<Transition>,
+ ) -> Result<StateID, BuildError> {
let hash = self.state.compiled.hash(&node);
if let Some(id) = self.state.compiled.get(&node, hash) {
return Ok(id);
}
- let id = self.nfac.add_sparse(node.clone())?;
+ let id = self.builder.add_sparse(node.clone())?;
self.state.compiled.set(node, hash, id);
Ok(id)
}
@@ -1486,16 +1868,22 @@ impl Utf8Node {
#[cfg(test)]
mod tests {
- use alloc::vec::Vec;
+ use alloc::{vec, vec::Vec};
- use super::{
- Builder, Config, PatternID, SparseTransitions, State, StateID,
- Transition, NFA,
+ use crate::{
+ nfa::thompson::{SparseTransitions, State, Transition, NFA},
+ util::primitives::{PatternID, SmallIndex, StateID},
};
+ use super::*;
+
fn build(pattern: &str) -> NFA {
- Builder::new()
- .configure(Config::new().captures(false).unanchored_prefix(false))
+ NFA::compiler()
+ .configure(
+ NFA::config()
+ .which_captures(WhichCaptures::None)
+ .unanchored_prefix(false),
+ )
.build(pattern)
.unwrap()
}
@@ -1511,17 +1899,17 @@ mod tests {
fn s_byte(byte: u8, next: usize) -> State {
let next = sid(next);
let trans = Transition { start: byte, end: byte, next };
- State::Range { range: trans }
+ State::ByteRange { trans }
}
fn s_range(start: u8, end: u8, next: usize) -> State {
let next = sid(next);
let trans = Transition { start, end, next };
- State::Range { range: trans }
+ State::ByteRange { trans }
}
- fn s_sparse(ranges: &[(u8, u8, usize)]) -> State {
- let ranges = ranges
+ fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
+ let transitions = transitions
.iter()
.map(|&(start, end, next)| Transition {
start,
@@ -1529,7 +1917,11 @@ mod tests {
next: sid(next),
})
.collect();
- State::Sparse(SparseTransitions { ranges })
+ State::Sparse(SparseTransitions { transitions })
+ }
+
+ fn s_bin_union(alt1: usize, alt2: usize) -> State {
+ State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
}
fn s_union(alts: &[usize]) -> State {
@@ -1542,34 +1934,35 @@ mod tests {
}
}
+ fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
+ State::Capture {
+ next: sid(next),
+ pattern_id: pid(pattern),
+ group_index: SmallIndex::new(index).unwrap(),
+ slot: SmallIndex::new(slot).unwrap(),
+ }
+ }
+
+ fn s_fail() -> State {
+ State::Fail
+ }
+
fn s_match(id: usize) -> State {
- State::Match { id: pid(id) }
+ State::Match { pattern_id: pid(id) }
}
// Test that building an unanchored NFA has an appropriate `(?s:.)*?`
// prefix.
#[test]
fn compile_unanchored_prefix() {
- // When the machine can only match valid UTF-8.
- let nfa = Builder::new()
- .configure(Config::new().captures(false))
- .build(r"a")
- .unwrap();
- // There should be many states since the `.` in `(?s:.)*?` matches any
- // Unicode scalar value.
- assert_eq!(11, nfa.len());
- assert_eq!(nfa.states[10], s_match(0));
- assert_eq!(nfa.states[9], s_byte(b'a', 10));
-
- // When the machine can match through invalid UTF-8.
- let nfa = Builder::new()
- .configure(Config::new().captures(false).utf8(false))
+ let nfa = NFA::compiler()
+ .configure(NFA::config().which_captures(WhichCaptures::None))
.build(r"a")
.unwrap();
assert_eq!(
- nfa.states,
+ nfa.states(),
&[
- s_union(&[2, 1]),
+ s_bin_union(2, 1),
s_range(0, 255, 0),
s_byte(b'a', 3),
s_match(0),
@@ -1579,51 +1972,55 @@ mod tests {
#[test]
fn compile_empty() {
- assert_eq!(build("").states, &[s_match(0),]);
+ assert_eq!(build("").states(), &[s_match(0),]);
}
#[test]
fn compile_literal() {
- assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(0),]);
+ assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]);
assert_eq!(
- build("ab").states,
+ build("ab").states(),
&[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
);
assert_eq!(
- build("☃").states,
+ build("☃").states(),
&[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
);
// Check that non-UTF-8 literals work.
- let nfa = Builder::new()
+ let nfa = NFA::compiler()
.configure(
- Config::new()
- .captures(false)
- .utf8(false)
+ NFA::config()
+ .which_captures(WhichCaptures::None)
.unanchored_prefix(false),
)
- .syntax(crate::SyntaxConfig::new().utf8(false))
+ .syntax(crate::util::syntax::Config::new().utf8(false))
.build(r"(?-u)\xFF")
.unwrap();
- assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(0),]);
+ assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]);
}
#[test]
- fn compile_class() {
+ fn compile_class_ascii() {
assert_eq!(
- build(r"[a-z]").states,
+ build(r"[a-z]").states(),
&[s_range(b'a', b'z', 1), s_match(0),]
);
assert_eq!(
- build(r"[x-za-c]").states,
+ build(r"[x-za-c]").states(),
&[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
);
+ }
+
+ #[test]
+ #[cfg(not(miri))]
+ fn compile_class_unicode() {
assert_eq!(
- build(r"[\u03B1-\u03B4]").states,
+ build(r"[\u03B1-\u03B4]").states(),
&[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
);
assert_eq!(
- build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
+ build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(),
&[
s_range(0xB1, 0xB4, 5),
s_range(0x99, 0x9E, 5),
@@ -1634,7 +2031,7 @@ mod tests {
]
);
assert_eq!(
- build(r"[a-z☃]").states,
+ build(r"[a-z☃]").states(),
&[
s_byte(0x83, 3),
s_byte(0x98, 0),
@@ -1647,67 +2044,214 @@ mod tests {
#[test]
fn compile_repetition() {
assert_eq!(
- build(r"a?").states,
- &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(0),]
+ build(r"a?").states(),
+ &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
);
assert_eq!(
- build(r"a??").states,
- &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(0),]
+ build(r"a??").states(),
+ &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
);
}
#[test]
fn compile_group() {
assert_eq!(
- build(r"ab+").states,
- &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(0)]
+ build(r"ab+").states(),
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
);
assert_eq!(
- build(r"(ab)").states,
+ build(r"(ab)").states(),
&[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
);
assert_eq!(
- build(r"(ab)+").states,
- &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(0)]
+ build(r"(ab)+").states(),
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
);
}
#[test]
fn compile_alternation() {
assert_eq!(
- build(r"a|b").states,
- &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(0)]
+ build(r"a|b").states(),
+ &[s_range(b'a', b'b', 1), s_match(0)]
+ );
+ assert_eq!(
+ build(r"ab|cd").states(),
+ &[
+ s_byte(b'b', 3),
+ s_byte(b'd', 3),
+ s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
+ s_match(0)
+ ],
);
assert_eq!(
- build(r"|b").states,
- &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(0)]
+ build(r"|b").states(),
+ &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
);
assert_eq!(
- build(r"a|").states,
- &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(0)]
+ build(r"a|").states(),
+ &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
);
}
+ // This tests the use of a non-binary union, i.e., a state with more than
+ // 2 unconditional epsilon transitions. The only place they tend to appear
+ // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
+ // and 'sparse' tend to cover all other cases of alternation.
#[test]
- fn many_start_pattern() {
- let nfa = Builder::new()
- .configure(Config::new().captures(false).unanchored_prefix(false))
+ fn compile_non_binary_union() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .which_captures(WhichCaptures::None)
+ .reverse(true)
+ .shrink(false)
+ .unanchored_prefix(false),
+ )
+ .build(r"[\u1000\u2000\u3000]")
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[
+ s_union(&[3, 6, 9]),
+ s_byte(0xE1, 10),
+ s_byte(0x80, 1),
+ s_byte(0x80, 2),
+ s_byte(0xE2, 10),
+ s_byte(0x80, 4),
+ s_byte(0x80, 5),
+ s_byte(0xE3, 10),
+ s_byte(0x80, 7),
+ s_byte(0x80, 8),
+ s_match(0),
+ ]
+ );
+ }
+
+ #[test]
+ fn compile_many_start_pattern() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .which_captures(WhichCaptures::None)
+ .unanchored_prefix(false),
+ )
.build_many(&["a", "b"])
.unwrap();
assert_eq!(
- nfa.states,
+ nfa.states(),
&[
s_byte(b'a', 1),
s_match(0),
s_byte(b'b', 3),
s_match(1),
- s_union(&[0, 2]),
+ s_bin_union(0, 2),
]
);
assert_eq!(nfa.start_anchored().as_usize(), 4);
assert_eq!(nfa.start_unanchored().as_usize(), 4);
// Test that the start states for each individual pattern are correct.
- assert_eq!(nfa.start_pattern(pid(0)), sid(0));
- assert_eq!(nfa.start_pattern(pid(1)), sid(2));
+ assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
+ assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
+ }
+
+ // This tests that our compiler can handle an empty character class. At the
+ // time of writing, the regex parser forbids it, so the only way to test it
+ // is to provide a hand written HIR.
+ #[test]
+ fn empty_class_bytes() {
+ use regex_syntax::hir::{Class, ClassBytes, Hir};
+
+ let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
+ let config = NFA::config()
+ .which_captures(WhichCaptures::None)
+ .unanchored_prefix(false);
+ let nfa =
+ NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
+ assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
+ }
+
+ // Like empty_class_bytes, but for a Unicode class.
+ #[test]
+ fn empty_class_unicode() {
+ use regex_syntax::hir::{Class, ClassUnicode, Hir};
+
+ let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
+ let config = NFA::config()
+ .which_captures(WhichCaptures::None)
+ .unanchored_prefix(false);
+ let nfa =
+ NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
+ assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
+ }
+
+ #[test]
+ fn compile_captures_all() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .unanchored_prefix(false)
+ .which_captures(WhichCaptures::All),
+ )
+ .build("a(b)c")
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[
+ s_cap(1, 0, 0, 0),
+ s_byte(b'a', 2),
+ s_cap(3, 0, 1, 2),
+ s_byte(b'b', 4),
+ s_cap(5, 0, 1, 3),
+ s_byte(b'c', 6),
+ s_cap(7, 0, 0, 1),
+ s_match(0)
+ ]
+ );
+ let ginfo = nfa.group_info();
+ assert_eq!(2, ginfo.all_group_len());
+ }
+
+ #[test]
+ fn compile_captures_implicit() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .unanchored_prefix(false)
+ .which_captures(WhichCaptures::Implicit),
+ )
+ .build("a(b)c")
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[
+ s_cap(1, 0, 0, 0),
+ s_byte(b'a', 2),
+ s_byte(b'b', 3),
+ s_byte(b'c', 4),
+ s_cap(5, 0, 0, 1),
+ s_match(0)
+ ]
+ );
+ let ginfo = nfa.group_info();
+ assert_eq!(1, ginfo.all_group_len());
+ }
+
+ #[test]
+ fn compile_captures_none() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .unanchored_prefix(false)
+ .which_captures(WhichCaptures::None),
+ )
+ .build("a(b)c")
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
+ );
+ let ginfo = nfa.group_info();
+ assert_eq!(0, ginfo.all_group_len());
}
}
diff --git a/vendor/regex-automata/src/nfa/thompson/error.rs b/vendor/regex-automata/src/nfa/thompson/error.rs
index 52f02e888..3c2fa8a21 100644
--- a/vendor/regex-automata/src/nfa/thompson/error.rs
+++ b/vendor/regex-automata/src/nfa/thompson/error.rs
@@ -1,6 +1,9 @@
-use crate::util::id::{PatternID, StateID};
+use crate::util::{
+ captures, look,
+ primitives::{PatternID, StateID},
+};
-/// An error that can occured during the construction of a thompson NFA.
+/// An error that can occur during the construction of a Thompson NFA.
///
/// This error does not provide many introspection capabilities. There are
/// generally only two things you can do with it:
@@ -15,17 +18,27 @@ use crate::util::id::{PatternID, StateID};
/// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then
/// building the NFA will fail.
#[derive(Clone, Debug)]
-pub struct Error {
- kind: ErrorKind,
+pub struct BuildError {
+ kind: BuildErrorKind,
}
/// The kind of error that occurred during the construction of a thompson NFA.
#[derive(Clone, Debug)]
-enum ErrorKind {
+enum BuildErrorKind {
/// An error that occurred while parsing a regular expression. Note that
/// this error may be printed over multiple lines, and is generally
/// intended to be end user readable on its own.
+ #[cfg(feature = "syntax")]
Syntax(regex_syntax::Error),
+ /// An error that occurs if the capturing groups provided to an NFA builder
+ /// do not satisfy the documented invariants. For example, things like
+ /// too many groups, missing groups, having the first (zeroth) group be
+ /// named, or duplicate group names within the same pattern.
+ Captures(captures::GroupInfoError),
+ /// An error that occurs when an NFA contains a Unicode word boundary, but
+ /// where the crate was compiled without the necessary data for dealing
+ /// with Unicode word boundaries.
+ Word(look::UnicodeWordBoundaryError),
/// An error that occurs if too many patterns were given to the NFA
/// compiler.
TooManyPatterns {
@@ -49,96 +62,123 @@ enum ErrorKind {
limit: usize,
},
/// An error that occurs when an invalid capture group index is added to
- /// the NFA. An "invalid" index can be one that is too big (e.g., results
- /// in an integer overflow) or one that is discontinuous from previous
- /// capture group indices added.
+ /// the NFA. An "invalid" index can be one that would otherwise overflow
+ /// a `usize` on the current target.
InvalidCaptureIndex {
/// The invalid index that was given.
- index: usize,
+ index: u32,
},
- /// An error that occurs when an NFA contains a Unicode word boundary, but
- /// where the crate was compiled without the necessary data for dealing
- /// with Unicode word boundaries.
- UnicodeWordUnavailable,
+ /// An error that occurs when one tries to build a reverse NFA with
+ /// captures enabled. Currently, this isn't supported, but we probably
+ /// should support it at some point.
+ #[cfg(feature = "syntax")]
+ UnsupportedCaptures,
}
-impl Error {
- fn kind(&self) -> &ErrorKind {
+impl BuildError {
+ /// If this error occurred because the NFA exceeded the configured size
+ /// limit before being built, then this returns the configured size limit.
+ ///
+ /// The limit returned is what was configured, and corresponds to the
+ /// maximum amount of heap usage in bytes.
+ pub fn size_limit(&self) -> Option<usize> {
+ match self.kind {
+ BuildErrorKind::ExceededSizeLimit { limit } => Some(limit),
+ _ => None,
+ }
+ }
+
+ fn kind(&self) -> &BuildErrorKind {
&self.kind
}
- pub(crate) fn syntax(err: regex_syntax::Error) -> Error {
- Error { kind: ErrorKind::Syntax(err) }
+ #[cfg(feature = "syntax")]
+ pub(crate) fn syntax(err: regex_syntax::Error) -> BuildError {
+ BuildError { kind: BuildErrorKind::Syntax(err) }
+ }
+
+ pub(crate) fn captures(err: captures::GroupInfoError) -> BuildError {
+ BuildError { kind: BuildErrorKind::Captures(err) }
+ }
+
+ pub(crate) fn word(err: look::UnicodeWordBoundaryError) -> BuildError {
+ BuildError { kind: BuildErrorKind::Word(err) }
}
- pub(crate) fn too_many_patterns(given: usize) -> Error {
+ pub(crate) fn too_many_patterns(given: usize) -> BuildError {
let limit = PatternID::LIMIT;
- Error { kind: ErrorKind::TooManyPatterns { given, limit } }
+ BuildError { kind: BuildErrorKind::TooManyPatterns { given, limit } }
}
- pub(crate) fn too_many_states(given: usize) -> Error {
+ pub(crate) fn too_many_states(given: usize) -> BuildError {
let limit = StateID::LIMIT;
- Error { kind: ErrorKind::TooManyStates { given, limit } }
+ BuildError { kind: BuildErrorKind::TooManyStates { given, limit } }
}
- pub(crate) fn exceeded_size_limit(limit: usize) -> Error {
- Error { kind: ErrorKind::ExceededSizeLimit { limit } }
+ pub(crate) fn exceeded_size_limit(limit: usize) -> BuildError {
+ BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } }
}
- pub(crate) fn invalid_capture_index(index: usize) -> Error {
- Error { kind: ErrorKind::InvalidCaptureIndex { index } }
+ pub(crate) fn invalid_capture_index(index: u32) -> BuildError {
+ BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } }
}
- pub(crate) fn unicode_word_unavailable() -> Error {
- Error { kind: ErrorKind::UnicodeWordUnavailable }
+ #[cfg(feature = "syntax")]
+ pub(crate) fn unsupported_captures() -> BuildError {
+ BuildError { kind: BuildErrorKind::UnsupportedCaptures }
}
}
#[cfg(feature = "std")]
-impl std::error::Error for Error {
+impl std::error::Error for BuildError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self.kind() {
- ErrorKind::Syntax(ref err) => Some(err),
- ErrorKind::TooManyPatterns { .. } => None,
- ErrorKind::TooManyStates { .. } => None,
- ErrorKind::ExceededSizeLimit { .. } => None,
- ErrorKind::InvalidCaptureIndex { .. } => None,
- ErrorKind::UnicodeWordUnavailable => None,
+ #[cfg(feature = "syntax")]
+ BuildErrorKind::Syntax(ref err) => Some(err),
+ BuildErrorKind::Captures(ref err) => Some(err),
+ _ => None,
}
}
}
-impl core::fmt::Display for Error {
+impl core::fmt::Display for BuildError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self.kind() {
- ErrorKind::Syntax(_) => write!(f, "error parsing regex"),
- ErrorKind::TooManyPatterns { given, limit } => write!(
+ #[cfg(feature = "syntax")]
+ BuildErrorKind::Syntax(_) => write!(f, "error parsing regex"),
+ BuildErrorKind::Captures(_) => {
+ write!(f, "error with capture groups")
+ }
+ BuildErrorKind::Word(_) => {
+ write!(f, "NFA contains Unicode word boundary")
+ }
+ BuildErrorKind::TooManyPatterns { given, limit } => write!(
f,
- "attemped to compile {} patterns, \
+ "attempted to compile {} patterns, \
which exceeds the limit of {}",
given, limit,
),
- ErrorKind::TooManyStates { given, limit } => write!(
+ BuildErrorKind::TooManyStates { given, limit } => write!(
f,
- "attemped to compile {} NFA states, \
+ "attempted to compile {} NFA states, \
which exceeds the limit of {}",
given, limit,
),
- ErrorKind::ExceededSizeLimit { limit } => write!(
+ BuildErrorKind::ExceededSizeLimit { limit } => write!(
f,
"heap usage during NFA compilation exceeded limit of {}",
limit,
),
- ErrorKind::InvalidCaptureIndex { index } => write!(
+ BuildErrorKind::InvalidCaptureIndex { index } => write!(
f,
"capture group index {} is invalid (too big or discontinuous)",
index,
),
- ErrorKind::UnicodeWordUnavailable => write!(
+ #[cfg(feature = "syntax")]
+ BuildErrorKind::UnsupportedCaptures => write!(
f,
- "crate has been compiled without Unicode word boundary \
- support, but the NFA contains Unicode word boundary \
- assertions",
+ "currently captures must be disabled when compiling \
+ a reverse NFA",
),
}
}
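`BuildError` is deliberately opaque, so `size_limit` is the one introspection hook worth demonstrating. A minimal sketch, assuming the public builder API (the limit and pattern are arbitrary):

```rust
use regex_automata::nfa::thompson::NFA;

fn main() {
    // With a deliberately tiny heap budget, compilation fails, and the
    // resulting BuildError reports the configured limit back.
    let result = NFA::compiler()
        .configure(NFA::config().nfa_size_limit(Some(128)))
        .build(r"\w{50}");
    let err = result.unwrap_err();
    assert_eq!(Some(128), err.size_limit());
}
```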
diff --git a/vendor/regex-automata/src/nfa/thompson/literal_trie.rs b/vendor/regex-automata/src/nfa/thompson/literal_trie.rs
new file mode 100644
index 000000000..7ed129afd
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/thompson/literal_trie.rs
@@ -0,0 +1,528 @@
+use core::mem;
+
+use alloc::{vec, vec::Vec};
+
+use crate::{
+ nfa::thompson::{self, compiler::ThompsonRef, BuildError, Builder},
+ util::primitives::{IteratorIndexExt, StateID},
+};
+
+/// A trie that preserves leftmost-first match semantics.
+///
+/// This is a purpose-built data structure for optimizing 'lit1|lit2|..|litN'
+/// patterns. It can *only* handle alternations of literals, which makes it
+/// somewhat restricted in its scope, but literal alternations are fairly
+/// common.
+///
+/// At a 5,000 foot level, the main idea of this trie is to make an alternation of
+/// literals look more like a DFA than an NFA via epsilon removal.
+///
+/// More precisely, the main issue is in how alternations are compiled into
+/// a Thompson NFA. Namely, each alternation gets a single NFA "union" state
+/// with an epsilon transition for every branch of the alternation pointing to
+/// an NFA state corresponding to the start of that branch. The main problem
+/// with this representation is the cost of computing an epsilon closure. Once
+/// you hit the alternation's start state, it acts as a sort of "clog" that
+/// requires you to traverse all of the epsilon transitions to compute the full
+/// closure.
+///
+/// Fixing such clogs in the general case is pretty tricky without going
+/// to a DFA (or perhaps a Glushkov NFA, but that comes with other problems).
+/// But at least in the case of an alternation of literals, we can convert
+/// that to a prefix trie without too much cost. In theory, that's all you
+/// really need to do: build the trie and then compile it to a Thompson NFA.
+/// For example, if you have the pattern 'bar|baz|foo', then using a trie, it
+/// is transformed to something like 'b(a(r|z))|f(o(o))'. This reduces the clog by
+/// reducing the number of epsilon transitions out of the alternation's start
+/// state from 3 to 2 (it actually gets down to 1 when you use a sparse state,
+/// which we do below). It's a small effect here, but when your alternation is
+/// huge, the savings is also huge.
+///
+/// And that is... essentially what a LiteralTrie does. But there is one
+/// hiccup. Consider a regex like 'sam|samwise'. How does a prefix trie compile
+/// that when leftmost-first semantics are used? If 'sam|samwise' was the
+/// entire regex, then you could just drop the 'samwise' branch entirely since
+/// it is impossible to match ('sam' will always take priority, and since it
+/// is a prefix of 'samwise', 'samwise' will never match). But what about the
+/// regex '\b(sam|samwise)\b'? In that case, you can't remove 'samwise' because
+/// it might match when 'sam' doesn't fall on a word boundary.
+///
+/// The main idea is that 'sam|samwise' can be translated to 'sam(?:|wise)',
+/// which is a precisely equivalent regex that also gets rid of the clog.
+///
+/// Another example is 'zapper|z|zap'. That gets translated to
+/// 'z(?:apper||ap)'.
+///
+/// We accomplish this by giving each state in the trie multiple "chunks" of
+/// transitions. Each chunk barrier represents a match. The idea is that once
+/// you know a match occurs, none of the transitions after the match can be
+/// re-ordered and mixed in with the transitions before the match. Otherwise,
+/// the match semantics could be changed.
+///
+/// See the 'State' data type for a bit more detail.
+///
+/// Future work:
+///
+/// * In theory, it would be nice to generalize the idea of removing clogs and
+/// apply it to the NFA graph itself. Then this could also work for
+/// case insensitive alternations of literals, or even just alternations where
+/// each branch starts with a non-epsilon transition.
+/// * Could we instead use the Aho-Corasick algorithm here? The aho-corasick
+/// crate deals with leftmost-first matches correctly, but I think this implies
+/// encoding failure transitions into a Thompson NFA somehow. Which seems fine,
+/// because failure transitions are just unconditional epsilon transitions?
+/// * Or perhaps even better, could we use an aho_corasick::AhoCorasick
+/// directly? At time of writing, 0.7 is the current version of the
+/// aho-corasick crate, and that definitely cannot be used as-is. But if we
+/// expose the underlying finite state machine API, then could we use it? That
+/// would be super. If we could figure that out, it might also lend itself to
+/// more general composition of finite state machines.
+#[derive(Clone)]
+pub(crate) struct LiteralTrie {
+ /// The set of trie states. Each state contains one or more chunks, where
+ /// each chunk is a sparse set of transitions to other states. A leaf state
+ /// is always a match state that contains only empty chunks (i.e., no
+ /// transitions).
+ states: Vec<State>,
+ /// Whether to add literals in reverse to the trie. Useful when building
+ /// a reverse NFA automaton.
+ rev: bool,
+}
+
+impl LiteralTrie {
+ /// Create a new literal trie that adds literals in the forward direction.
+ pub(crate) fn forward() -> LiteralTrie {
+ let root = State::default();
+ LiteralTrie { states: vec![root], rev: false }
+ }
+
+ /// Create a new literal trie that adds literals in reverse.
+ pub(crate) fn reverse() -> LiteralTrie {
+ let root = State::default();
+ LiteralTrie { states: vec![root], rev: true }
+ }
+
+ /// Add the given literal to this trie.
+ ///
+ /// If the literal could not be added because the `StateID` space was
+ /// exhausted, then an error is returned. If an error is returned, the trie
+ /// is left in an unspecified state.
+ pub(crate) fn add(&mut self, bytes: &[u8]) -> Result<(), BuildError> {
+ let mut prev = StateID::ZERO;
+ let mut it = bytes.iter().copied();
+ while let Some(b) = if self.rev { it.next_back() } else { it.next() } {
+ prev = self.get_or_add_state(prev, b)?;
+ }
+ self.states[prev].add_match();
+ Ok(())
+ }
+
+ /// If the given transition is defined, then return the next state ID.
+ /// Otherwise, add the transition to `from` and point it to a new state.
+ ///
+ /// If a new state ID could not be allocated, then an error is returned.
+ fn get_or_add_state(
+ &mut self,
+ from: StateID,
+ byte: u8,
+ ) -> Result<StateID, BuildError> {
+ let active = self.states[from].active_chunk();
+ match active.binary_search_by_key(&byte, |t| t.byte) {
+ Ok(i) => Ok(active[i].next),
+ Err(i) => {
+ // Add a new state and get its ID.
+ let next = StateID::new(self.states.len()).map_err(|_| {
+ BuildError::too_many_states(self.states.len())
+ })?;
+ self.states.push(State::default());
+ // Offset our position to account for all transitions and not
+ // just the ones in the active chunk.
+ let i = self.states[from].active_chunk_start() + i;
+ let t = Transition { byte, next };
+ self.states[from].transitions.insert(i, t);
+ Ok(next)
+ }
+ }
+ }
+
+ /// Compile this literal trie to the NFA builder given.
+ ///
+ /// This forwards any errors that may occur while using the given builder.
+ pub(crate) fn compile(
+ &self,
+ builder: &mut Builder,
+ ) -> Result<ThompsonRef, BuildError> {
+ // Compilation proceeds via depth-first traversal of the trie.
+ //
+ // This is overall pretty brutal. The recursive version of this is
+ // deliciously simple. (See 'compile_to_hir' below for what it might
+ // look like.) But recursion on a trie means your call stack grows
+ // in accordance with the longest literal, which just does not seem
+ // appropriate. So we push the call stack to the heap. But as a result,
+ // the trie traversal becomes pretty brutal because we essentially
+ // have to encode the state of a double for-loop into an explicit call
+ // frame. If someone can simplify this without using recursion, that'd
+ // be great.
+
+ // 'end' is our match state for this trie, but represented in the
+ // NFA. Any time we see a match in the trie, we insert a transition
+ // from the current state we're in to 'end'.
+ let end = builder.add_empty()?;
+ let mut stack = vec![];
+ let mut f = Frame::new(&self.states[StateID::ZERO]);
+ loop {
+ if let Some(t) = f.transitions.next() {
+ if self.states[t.next].is_leaf() {
+ f.sparse.push(thompson::Transition {
+ start: t.byte,
+ end: t.byte,
+ next: end,
+ });
+ } else {
+ f.sparse.push(thompson::Transition {
+ start: t.byte,
+ end: t.byte,
+ // This is a little funny, but when the frame we create
+ // below completes, it will pop this parent frame off
+ // and modify this transition to point to the correct
+ // state.
+ next: StateID::ZERO,
+ });
+ stack.push(f);
+ f = Frame::new(&self.states[t.next]);
+ }
+ continue;
+ }
+ // At this point, we have visited all transitions in the current chunk, so
+ // add it as a sparse NFA state. Unless the chunk was empty, in
+ // which case, we don't do anything.
+ if !f.sparse.is_empty() {
+ let chunk_id = if f.sparse.len() == 1 {
+ builder.add_range(f.sparse.pop().unwrap())?
+ } else {
+ let sparse = mem::replace(&mut f.sparse, vec![]);
+ builder.add_sparse(sparse)?
+ };
+ f.union.push(chunk_id);
+ }
+ // Now we need to look to see if there are other chunks to visit.
+ if let Some(chunk) = f.chunks.next() {
+ // If we're here, it means we're on the second (or greater)
+ // chunk, which implies there is a match at this point. So
+ // connect this state to the final end state.
+ f.union.push(end);
+ // Advance to the next chunk.
+ f.transitions = chunk.iter();
+ continue;
+ }
+ // Now that we are out of chunks, we have completely visited
+ // this state. So turn our union of chunks into an NFA union
+ // state, and add that union state to the parent state's current
+ // sparse state. (If there is no parent, we're done.)
+ let start = builder.add_union(f.union)?;
+ match stack.pop() {
+ None => {
+ return Ok(ThompsonRef { start, end });
+ }
+ Some(mut parent) => {
+ // OK because the only way a frame gets pushed on to the
+ // stack (aside from the root) is when a transition has
+ // been added to 'sparse'.
+ parent.sparse.last_mut().unwrap().next = start;
+ f = parent;
+ }
+ }
+ }
+ }
+
+ /// Converts this trie to an equivalent HIR expression.
+ ///
+ /// We don't actually use this, but it's useful for tests. In particular,
+ /// it provides a (somewhat) human readable representation of the trie
+ /// itself.
+ #[cfg(test)]
+ fn compile_to_hir(&self) -> regex_syntax::hir::Hir {
+ self.compile_state_to_hir(StateID::ZERO)
+ }
+
+ /// The recursive implementation of 'compile_to_hir'.
+ ///
+ /// Notice how simple this is compared to 'compile' above. 'compile' could
+ /// be similarly simple, but we opt to not use recursion in order to avoid
+ /// overflowing the stack in the case of a longer literal.
+ #[cfg(test)]
+ fn compile_state_to_hir(&self, sid: StateID) -> regex_syntax::hir::Hir {
+ use regex_syntax::hir::Hir;
+
+ let mut alt = vec![];
+ for (i, chunk) in self.states[sid].chunks().enumerate() {
+ if i > 0 {
+ alt.push(Hir::empty());
+ }
+ if chunk.is_empty() {
+ continue;
+ }
+ let mut chunk_alt = vec![];
+ for t in chunk.iter() {
+ chunk_alt.push(Hir::concat(vec![
+ Hir::literal(vec![t.byte]),
+ self.compile_state_to_hir(t.next),
+ ]));
+ }
+ alt.push(Hir::alternation(chunk_alt));
+ }
+ Hir::alternation(alt)
+ }
+}
+
+impl core::fmt::Debug for LiteralTrie {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ writeln!(f, "LiteralTrie(")?;
+ for (sid, state) in self.states.iter().with_state_ids() {
+ writeln!(f, "{:06?}: {:?}", sid.as_usize(), state)?;
+ }
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// An explicit stack frame used for traversing the trie without using
+/// recursion.
+///
+/// Each frame is tied to the traversal of a single trie state. The frame is
+/// dropped once the entire state (and all of its children) have been visited.
+/// The "output" of compiling a state is the 'union' vector, which is turn
+/// converted to a NFA union state. Each branch of the union corresponds to a
+/// chunk in the trie state.
+///
+/// 'sparse' corresponds to the set of transitions for a particular chunk in a
+/// trie state. It is ultimately converted to an NFA sparse state. The 'sparse'
+/// field, after being converted to a sparse NFA state, is reused for any
+/// subsequent chunks in the trie state, if any exist.
+#[derive(Debug)]
+struct Frame<'a> {
+ /// The remaining chunks to visit for a trie state.
+ chunks: StateChunksIter<'a>,
+ /// The transitions of the current chunk that we're iterating over. Since
+ /// every trie state has at least one chunk, every frame is initialized
+ /// with the first chunk's transitions ready to be consumed.
+ transitions: core::slice::Iter<'a, Transition>,
+ /// The NFA state IDs pointing to the start of each chunk compiled by
+ /// this trie state. This ultimately gets converted to an NFA union once
+ /// the entire trie state (and all of its children) have been compiled.
+ /// The order of these matters for leftmost-first match semantics, since
+ /// earlier matches in the union are preferred over later ones.
+ union: Vec<StateID>,
+ /// The actual NFA transitions for a single chunk in a trie state. This
+ /// gets converted to an NFA sparse state, and its corresponding NFA state
+ /// ID should get added to 'union'.
+ sparse: Vec<thompson::Transition>,
+}
+
+impl<'a> Frame<'a> {
+ /// Create a new stack frame for trie traversal. This initializes the
+ /// 'transitions' iterator to the transitions for the first chunk, with the
+ /// 'chunks' iterator being every chunk after the first one.
+ fn new(state: &'a State) -> Frame<'a> {
+ let mut chunks = state.chunks();
+ // every state has at least 1 chunk
+ let chunk = chunks.next().unwrap();
+ let transitions = chunk.iter();
+ Frame { chunks, transitions, union: vec![], sparse: vec![] }
+ }
+}
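The explicit-stack traversal that `Frame` enables is the standard trick for bounding recursion depth. A minimal sketch of the same pattern on a toy tree type (all names here are illustrative, not part of the crate):

```rust
// Toy tree type standing in for trie states.
struct Node {
    children: Vec<Node>,
}

// Count nodes without recursion: the Vec plays the role of the call
// stack, just as 'stack: Vec<Frame>' does in 'compile' above.
fn count(root: &Node) -> usize {
    let mut total = 0;
    let mut stack = vec![root];
    while let Some(node) = stack.pop() {
        total += 1;
        stack.extend(node.children.iter());
    }
    total
}

fn main() {
    let leaf = || Node { children: vec![] };
    let tree = Node { children: vec![leaf(), leaf()] };
    assert_eq!(3, count(&tree));
}
```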
+
+/// A state in a trie.
+///
+/// This uses a sparse representation for transitions. Since we don't use
+/// literal tries for searching, and since compilation requires visiting every
+/// transition anyway, a sparse representation is a natural fit. This
+/// means we save on memory, at the expense of 'LiteralTrie::add' being perhaps
+/// a bit slower.
+///
+/// While 'transitions' is pretty standard as far as tries go, the 'chunks'
+/// piece here is more unusual. In effect, 'chunks' defines a partitioning
+/// of 'transitions', where each chunk corresponds to a distinct set of
+/// transitions. The key invariant is that a transition in one chunk cannot
+/// be moved to another chunk. This is the secret sauce that preserves
+/// leftmost-first match semantics.
+///
+/// A new chunk is added whenever we mark a state as a match state. Once a
+/// new chunk is added, the old active chunk is frozen and is never mutated
+/// again. The new chunk becomes the active chunk, which is defined as
+/// '&transitions[chunks.last().map_or(0, |c| c.1)..]'. Thus, a state where
+/// 'chunks' is empty actually contains one chunk. In other words, every state
+/// contains at least one (possibly empty) chunk.
+///
+/// A "leaf" state is a state that has no outgoing transitions (so
+/// 'transitions' is empty). Note that there is no way for a leaf state to be a
+/// non-matching state. (Although while building the trie, within 'add', a leaf
+/// state may exist while not containing any matches. But this invariant is
+/// only broken within 'add'. Once 'add' returns, the invariant is upheld.)
+#[derive(Clone, Default)]
+struct State {
+ transitions: Vec<Transition>,
+ chunks: Vec<(usize, usize)>,
+}
+
+impl State {
+ /// Mark this state as a match state and freeze the active chunk such that
+ /// it cannot be further mutated.
+ fn add_match(&mut self) {
+ // This is not strictly necessary, but there's no point in recording
+ // another match by adding another chunk if the state has no
+ // transitions. Note though that we only skip this if we already know
+ // this is a match state, which is only true if 'chunks' is not empty.
+ // Basically, if we didn't do this, nothing semantically would change,
+ // but we'd end up pushing another chunk and potentially triggering an
+ // alloc.
+ if self.transitions.is_empty() && !self.chunks.is_empty() {
+ return;
+ }
+ let chunk_start = self.active_chunk_start();
+ let chunk_end = self.transitions.len();
+ self.chunks.push((chunk_start, chunk_end));
+ }
+
+ /// Returns true if and only if this state is a leaf state. That is, a
+ /// state that has no outgoing transitions.
+ fn is_leaf(&self) -> bool {
+ self.transitions.is_empty()
+ }
+
+ /// Returns an iterator over all of the chunks (including the currently
+ /// active chunk) in this state. Since the active chunk is included, the
+ /// iterator is guaranteed to always yield at least one chunk (although the
+ /// chunk may be empty).
+ fn chunks(&self) -> StateChunksIter<'_> {
+ StateChunksIter {
+ transitions: &*self.transitions,
+ chunks: self.chunks.iter(),
+ active: Some(self.active_chunk()),
+ }
+ }
+
+ /// Returns the active chunk as a slice of transitions.
+ fn active_chunk(&self) -> &[Transition] {
+ let start = self.active_chunk_start();
+ &self.transitions[start..]
+ }
+
+ /// Returns the index into 'transitions' where the active chunk starts.
+ fn active_chunk_start(&self) -> usize {
+ self.chunks.last().map_or(0, |&(_, end)| end)
+ }
+}
+
+impl core::fmt::Debug for State {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut spacing = " ";
+ for (i, chunk) in self.chunks().enumerate() {
+ if i > 0 {
+ write!(f, "{}MATCH", spacing)?;
+ }
+ spacing = "";
+ for (j, t) in chunk.iter().enumerate() {
+ spacing = " ";
+ if j == 0 && i > 0 {
+ write!(f, " ")?;
+ } else if j > 0 {
+ write!(f, ", ")?;
+ }
+ write!(f, "{:?}", t)?;
+ }
+ }
+ Ok(())
+ }
+}
+
+/// An iterator over all of the chunks in a state, including the active chunk.
+///
+/// This iterator is created by `State::chunks`. We name this iterator so that
+/// we can include it in the `Frame` type for non-recursive trie traversal.
+#[derive(Debug)]
+struct StateChunksIter<'a> {
+ transitions: &'a [Transition],
+ chunks: core::slice::Iter<'a, (usize, usize)>,
+ active: Option<&'a [Transition]>,
+}
+
+impl<'a> Iterator for StateChunksIter<'a> {
+ type Item = &'a [Transition];
+
+ fn next(&mut self) -> Option<&'a [Transition]> {
+ if let Some(&(start, end)) = self.chunks.next() {
+ return Some(&self.transitions[start..end]);
+ }
+ if let Some(chunk) = self.active.take() {
+ return Some(chunk);
+ }
+ None
+ }
+}
+
+/// A single transition in a trie to another state.
+#[derive(Clone, Copy)]
+struct Transition {
+ byte: u8,
+ next: StateID,
+}
+
+impl core::fmt::Debug for Transition {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "{:?} => {}",
+ crate::util::escape::DebugByte(self.byte),
+ self.next.as_usize()
+ )
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use bstr::B;
+ use regex_syntax::hir::Hir;
+
+ use super::*;
+
+ #[test]
+ fn zap() {
+ let mut trie = LiteralTrie::forward();
+ trie.add(b"zapper").unwrap();
+ trie.add(b"z").unwrap();
+ trie.add(b"zap").unwrap();
+
+ let got = trie.compile_to_hir();
+ let expected = Hir::concat(vec![
+ Hir::literal(B("z")),
+ Hir::alternation(vec![
+ Hir::literal(B("apper")),
+ Hir::empty(),
+ Hir::literal(B("ap")),
+ ]),
+ ]);
+ assert_eq!(expected, got);
+ }
+
+ #[test]
+ fn maker() {
+ let mut trie = LiteralTrie::forward();
+ trie.add(b"make").unwrap();
+ trie.add(b"maple").unwrap();
+ trie.add(b"maker").unwrap();
+
+ let got = trie.compile_to_hir();
+ let expected = Hir::concat(vec![
+ Hir::literal(B("ma")),
+ Hir::alternation(vec![
+ Hir::concat(vec![
+ Hir::literal(B("ke")),
+ Hir::alternation(vec![Hir::empty(), Hir::literal(B("r"))]),
+ ]),
+ Hir::literal(B("ple")),
+ ]),
+ ]);
+ assert_eq!(expected, got);
+ }
+}
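The leftmost-first subtlety that this trie must preserve is observable from the outside. A minimal sketch using the crate's public `meta::Regex` (whose underlying Thompson compilation is what a trie like this feeds into); the haystacks are arbitrary:

```rust
use regex_automata::meta::Regex;

fn main() {
    // Leftmost-first: 'sam' always wins, so the 'samwise' branch can
    // never match on its own...
    let re = Regex::new(r"sam|samwise").unwrap();
    let m = re.find("samwise").unwrap();
    assert_eq!("sam", &"samwise"[m.range()]);

    // ...but wrapped in word boundaries, 'samwise' matches where 'sam'
    // cannot, which is why the branch cannot simply be dropped.
    let re = Regex::new(r"\b(?:sam|samwise)\b").unwrap();
    let m = re.find("samwise").unwrap();
    assert_eq!("samwise", &"samwise"[m.range()]);
}
```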
diff --git a/vendor/regex-automata/src/nfa/thompson/map.rs b/vendor/regex-automata/src/nfa/thompson/map.rs
index 79ff63ca3..c36ce5386 100644
--- a/vendor/regex-automata/src/nfa/thompson/map.rs
+++ b/vendor/regex-automata/src/nfa/thompson/map.rs
@@ -25,17 +25,23 @@
// fast as the naive approach and typically winds up using less memory (since
// it generates smaller NFAs) despite the presence of the cache.
//
-// These maps effectively represent caching mechanisms for CState::Sparse and
-// CState::Range, respectively. The former represents a single NFA state with
-// many transitions of equivalent priority while the latter represents a single
-// NFA state with a single transition. (Neither state ever has or is an
-// epsilon transition.) Thus, they have different key types. It's likely we
-// could make one generic map, but the machinery didn't seem worth it. They
-// are simple enough.
+// These maps effectively represent caching mechanisms for sparse and
+// byte-range NFA states, respectively. The former represents a single NFA
+// state with many transitions of equivalent priority while the latter
+// represents a single NFA state with a single transition. (Neither state ever
+// has or is an epsilon transition.) Thus, they have different key types. It's
+// likely we could make one generic map, but the machinery didn't seem worth
+// it. They are simple enough.
use alloc::{vec, vec::Vec};
-use crate::{nfa::thompson::Transition, util::id::StateID};
+use crate::{
+ nfa::thompson::Transition,
+ util::{
+ int::{Usize, U64},
+ primitives::StateID,
+ },
+};
// Basic FNV-1a hash constants as described in:
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
@@ -137,11 +143,11 @@ impl Utf8BoundedMap {
pub fn hash(&self, key: &[Transition]) -> usize {
let mut h = INIT;
for t in key {
- h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
- h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
- h = (h ^ (t.next.as_usize() as u64)).wrapping_mul(PRIME);
+ h = (h ^ u64::from(t.start)).wrapping_mul(PRIME);
+ h = (h ^ u64::from(t.end)).wrapping_mul(PRIME);
+ h = (h ^ t.next.as_u64()).wrapping_mul(PRIME);
}
- (h as usize) % self.map.len()
+ (h % self.map.len().as_u64()).as_usize()
}
/// Retrieve the cached state ID corresponding to the given key. The hash
@@ -252,10 +258,10 @@ impl Utf8SuffixMap {
const INIT: u64 = 14695981039346656037;
let mut h = INIT;
- h = (h ^ (key.from.as_usize() as u64)).wrapping_mul(PRIME);
- h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
- h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
- (h as usize) % self.map.len()
+ h = (h ^ key.from.as_u64()).wrapping_mul(PRIME);
+ h = (h ^ u64::from(key.start)).wrapping_mul(PRIME);
+ h = (h ^ u64::from(key.end)).wrapping_mul(PRIME);
+ (h % self.map.len().as_u64()).as_usize()
}
/// Retrieve the cached state ID corresponding to the given key. The hash
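Both hash functions above are plain FNV-1a folds over the key's components. A minimal standalone sketch with the same constants (`fnv1a` is a made-up helper, not part of the crate):

```rust
/// FNV-1a over raw bytes, using the same offset basis and prime as the
/// constants referenced above.
fn fnv1a(bytes: &[u8]) -> u64 {
    const INIT: u64 = 14695981039346656037;
    const PRIME: u64 = 1099511628211;
    let mut h = INIT;
    for &b in bytes {
        h = (h ^ u64::from(b)).wrapping_mul(PRIME);
    }
    h
}

fn main() {
    // Bucketing works as in the maps above: hash, then reduce modulo
    // the table length.
    let table_len = 1024u64;
    let bucket = fnv1a(b"some transition key") % table_len;
    assert!(bucket < table_len);
}
```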
diff --git a/vendor/regex-automata/src/nfa/thompson/mod.rs b/vendor/regex-automata/src/nfa/thompson/mod.rs
index 88a438e8e..cf426736d 100644
--- a/vendor/regex-automata/src/nfa/thompson/mod.rs
+++ b/vendor/regex-automata/src/nfa/thompson/mod.rs
@@ -1,1555 +1,81 @@
-use core::{convert::TryFrom, fmt, mem, ops::Range};
-
-use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec};
-
-use crate::util::{
- alphabet::{self, ByteClassSet},
- decode_last_utf8, decode_utf8,
- id::{IteratorIDExt, PatternID, PatternIDIter, StateID},
- is_word_byte, is_word_char_fwd, is_word_char_rev,
-};
-
-pub use self::{
- compiler::{Builder, Config},
- error::Error,
-};
-
+/*!
+Defines a Thompson NFA and provides the [`PikeVM`](pikevm::PikeVM) and
+[`BoundedBacktracker`](backtrack::BoundedBacktracker) regex engines.
+
+A Thompson NFA (non-deterministic finite automaton) is arguably _the_ central
+data type in this library. It is the result of what is commonly referred to as
+"regex compilation." That is, turning a regex pattern from its concrete syntax
+string into something that can run a search looks roughly like this:
+
+* A `&str` is parsed into a [`regex-syntax::ast::Ast`](regex_syntax::ast::Ast).
+* An `Ast` is translated into a [`regex-syntax::hir::Hir`](regex_syntax::hir::Hir).
+* An `Hir` is compiled into an [`NFA`].
+* The `NFA` is then used to build one of a few different regex engines:
+ * An `NFA` is used directly in the `PikeVM` and `BoundedBacktracker` engines.
+ * An `NFA` is used by a [hybrid NFA/DFA](crate::hybrid) to build out a DFA's
+ transition table at search time.
+ * An `NFA`, assuming it is one-pass, is used to build a full
+ [one-pass DFA](crate::dfa::onepass) ahead of time.
+ * An `NFA` is used to build a [full DFA](crate::dfa) ahead of time.
+
+The [`meta`](crate::meta) regex engine makes all of these choices for you based
+on various criteria. However, if you have a lower level use case, _you_ can
+build any of the above regex engines and use them directly. But you must start
+here by building an `NFA`.
+
+# Details
+
+It is perhaps worth expanding a bit more on what it means to go through the
+`&str`->`Ast`->`Hir`->`NFA` process.
+
+* Parsing a string into an `Ast` gives it a structured representation.
+Crucially, the size and amount of work done in this step is proportional to the
+size of the original string. No optimization or Unicode handling is done at
+this point. This means that parsing into an `Ast` has very predictable costs.
+Moreover, an `Ast` can be roundtripped back to its original pattern string as
+written.
+* Translating an `Ast` into an `Hir` is a process by which the structured
+representation is simplified down to its most fundamental components.
+Translation deals with flags such as case insensitivity by converting things
+like `(?i:a)` to `[Aa]`. Translation is also where Unicode tables are consulted
+to resolve things like `\p{Emoji}` and `\p{Greek}`. It also flattens each
+character class, regardless of how deeply nested it is, into a single sequence
+of non-overlapping ranges. All the various literal forms are thrown out in
+favor of one common representation. Overall, the `Hir` is small enough to fit
+into your head and makes analysis and other tasks much simpler.
+* Compiling an `Hir` into an `NFA` formulates the regex into a finite state
+machine whose transitions are defined over bytes. For example, an `Hir` might
+have a Unicode character class corresponding to a sequence of ranges defined
+in terms of `char`. Compilation is then responsible for turning those ranges
+into a UTF-8 automaton. That is, an automaton that matches the UTF-8 encoding
+of just the codepoints specified by those ranges. Otherwise, the main job of
+an `NFA` is to serve as a byte-code of sorts for a virtual machine. It can be
+seen as a sequence of instructions for how to match a regex.
+*/
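A minimal sketch of that pipeline through the public API, assuming the 0.4-era crate layout (the parse and translate steps happen inside `NFA::new`; the pattern and haystack are arbitrary):

```rust
use regex_automata::{
    nfa::thompson::{pikevm::PikeVM, NFA},
    Match,
};

fn main() {
    // &str -> Ast -> Hir -> NFA, all handled by NFA::new...
    let nfa = NFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
    // ...and the NFA is then used directly by the PikeVM engine.
    let re = PikeVM::new_from_nfa(nfa).unwrap();
    let mut cache = re.create_cache();
    let m = re.find(&mut cache, "launch date: 2010-03-14");
    assert_eq!(Some(Match::must(0, 13..23)), m);
}
```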
+
+#[cfg(feature = "nfa-backtrack")]
+pub mod backtrack;
+mod builder;
+#[cfg(feature = "syntax")]
mod compiler;
mod error;
+#[cfg(feature = "syntax")]
+mod literal_trie;
+#[cfg(feature = "syntax")]
mod map;
+mod nfa;
+#[cfg(feature = "nfa-pikevm")]
pub mod pikevm;
+#[cfg(feature = "syntax")]
mod range_trie;
-/// A map from capture group name to its corresponding capture index.
-///
-/// Since there are always two slots for each capture index, the pair of slots
-/// corresponding to the capture index for a pattern ID of 0 are indexed at
-/// `map["<name>"] * 2` and `map["<name>"] * 2 + 1`.
-///
-/// This type is actually wrapped inside a Vec indexed by pattern ID on the
-/// NFA, since multiple patterns may have the same capture group name.
-///
-/// Note that this is somewhat of a sub-optimal representation, since it
-/// requires a hashmap for each pattern. A better representation would be
-/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look
-/// up a capture index by name without producing a `Arc<str>`, which requires
-/// an allocation. To fix this, I think we'd need to define our own unsized
-/// type or something?
-#[cfg(feature = "std")]
-type CaptureNameMap = std::collections::HashMap<Arc<str>, usize>;
-#[cfg(not(feature = "std"))]
-type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, usize>;
-
-// The NFA API below is not something I'm terribly proud of at the moment. In
-// particular, it supports both mutating the NFA and actually using the NFA to
-// perform a search. I think combining these two things muddies the waters a
-// bit too much.
-//
-// I think the issue is that I saw the compiler as the 'builder,' and where
-// the compiler had the ability to manipulate the internal state of the NFA.
-// However, one of my goals was to make it possible for others to build their
-// own NFAs in a way that is *not* couple to the regex-syntax crate.
-//
-// So I think really, there should be an NFA, a NFABuilder and then the
-// internal compiler which uses the NFABuilder API to build an NFA. Alas, at
-// the time of writing, I kind of ran out of steam.
-
-/// A fully compiled Thompson NFA.
-///
-/// The states of the NFA are indexed by state IDs, which are how transitions
-/// are expressed.
-#[derive(Clone)]
-pub struct NFA {
- /// The state list. This list is guaranteed to be indexable by all starting
- /// state IDs, and it is also guaranteed to contain at most one `Match`
- /// state for each pattern compiled into this NFA. (A pattern may not have
- /// a corresponding `Match` state if a `Match` state is impossible to
- /// reach.)
- states: Vec<State>,
- /// The anchored starting state of this NFA.
- start_anchored: StateID,
- /// The unanchored starting state of this NFA.
- start_unanchored: StateID,
- /// The starting states for each individual pattern. Starting at any
- /// of these states will result in only an anchored search for the
- /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
- /// contains a single regex, then `start_pattern[0]` and `start_anchored`
- /// are always equivalent.
- start_pattern: Vec<StateID>,
- /// A map from PatternID to its corresponding range of capture slots. Each
- /// range is guaranteed to be contiguous with the previous range. The
- /// end of the last range corresponds to the total number of slots needed
- /// for this NFA.
- patterns_to_slots: Vec<Range<usize>>,
- /// A map from capture name to its corresponding index. So e.g., given
- /// a single regex like '(\w+) (\w+) (?P<word>\w+)', the capture name
- /// 'word' for pattern ID=0 would corresponding to the index '3'. Its
- /// corresponding slots would then be '3 * 2 = 6' and '3 * 2 + 1 = 7'.
- capture_name_to_index: Vec<CaptureNameMap>,
- /// A map from pattern ID to capture group index to name, if one exists.
- /// This is effectively the inverse of 'capture_name_to_index'. The outer
- /// vec is indexed by pattern ID, while the inner vec is index by capture
- /// index offset for the corresponding pattern.
- ///
- /// The first capture group for each pattern is always unnamed and is thus
- /// always None.
- capture_index_to_name: Vec<Vec<Option<Arc<str>>>>,
- /// A representation of equivalence classes over the transitions in this
- /// NFA. Two bytes in the same equivalence class must not discriminate
- /// between a match or a non-match. This map can be used to shrink the
- /// total size of a DFA's transition table with a small match-time cost.
- ///
- /// Note that the NFA's transitions are *not* defined in terms of these
- /// equivalence classes. The NFA's transitions are defined on the original
- /// byte values. For the most part, this is because they wouldn't really
- /// help the NFA much since the NFA already uses a sparse representation
- /// to represent transitions. Byte classes are most effective in a dense
- /// representation.
- byte_class_set: ByteClassSet,
- /// Various facts about this NFA, which can be used to improve failure
- /// modes (e.g., rejecting DFA construction if an NFA has Unicode word
- /// boundaries) or for performing optimizations (avoiding an increase in
- /// states if there are no look-around states).
- facts: Facts,
- /// Heap memory used indirectly by NFA states. Since each state might use a
- /// different amount of heap, we need to keep track of this incrementally.
- memory_states: usize,
-}
-
-impl NFA {
- pub fn config() -> Config {
- Config::new()
- }
-
- pub fn builder() -> Builder {
- Builder::new()
- }
-
- /// Returns an NFA with no states. Its match semantics are unspecified.
- ///
- /// An empty NFA is useful as a starting point for building one. It is
- /// itself not intended to be used for matching. For example, its starting
- /// state identifiers are configured to be `0`, but since it has no states,
- /// the identifiers are invalid.
- ///
- /// If you need an NFA that never matches is anything and can be correctly
- /// used for matching, use [`NFA::never_match`].
- #[inline]
- pub fn empty() -> NFA {
- NFA {
- states: vec![],
- start_anchored: StateID::ZERO,
- start_unanchored: StateID::ZERO,
- start_pattern: vec![],
- patterns_to_slots: vec![],
- capture_name_to_index: vec![],
- capture_index_to_name: vec![],
- byte_class_set: ByteClassSet::empty(),
- facts: Facts::default(),
- memory_states: 0,
- }
- }
-
- /// Returns an NFA with a single regex that always matches at every
- /// position.
- #[inline]
- pub fn always_match() -> NFA {
- let mut nfa = NFA::empty();
- // Since we're only adding one pattern, these are guaranteed to work.
- let start = nfa.add_match().unwrap();
- assert_eq!(start.as_usize(), 0);
- let pid = nfa.finish_pattern(start).unwrap();
- assert_eq!(pid.as_usize(), 0);
- nfa
- }
-
- /// Returns an NFA that never matches at any position. It contains no
- /// regexes.
- #[inline]
- pub fn never_match() -> NFA {
- let mut nfa = NFA::empty();
- // Since we're only adding one state, this can never fail.
- nfa.add_fail().unwrap();
- nfa
- }
-
- /// Return the number of states in this NFA.
- ///
- /// This is guaranteed to be no bigger than [`StateID::LIMIT`].
- #[inline]
- pub fn len(&self) -> usize {
- self.states.len()
- }
-
- /// Returns the total number of distinct match states in this NFA.
- /// Stated differently, this returns the total number of regex patterns
- /// used to build this NFA.
- ///
- /// This may return zero if the NFA was constructed with no patterns. In
- /// this case, and only this case, the NFA can never produce a match for
- /// any input.
- ///
- /// This is guaranteed to be no bigger than [`PatternID::LIMIT`].
- #[inline]
- pub fn pattern_len(&self) -> usize {
- self.start_pattern.len()
- }
-
- /// Returns the pattern ID of the pattern currently being compiled by this
- /// NFA.
- fn current_pattern_id(&self) -> PatternID {
- // This always works because we never permit more patterns in
- // 'start_pattern' than can be addressed by PatternID. Also, we only
- // add a new entry to 'start_pattern' once we finish compiling a
- // pattern. Thus, the length refers to the ID of the current pattern
- // being compiled.
- PatternID::new(self.start_pattern.len()).unwrap()
- }
-
- /// Returns the total number of capturing groups in this NFA.
- ///
- /// This includes the special 0th capture group that is always present and
- /// captures the start and end offset of the entire match.
- ///
- /// This is a convenience routine for `nfa.capture_slot_len() / 2`.
- #[inline]
- pub fn capture_len(&self) -> usize {
- let slots = self.capture_slot_len();
- // This assert is guaranteed to pass since the NFA construction process
- // guarantees that it is always true.
- assert_eq!(slots % 2, 0, "capture slots must be divisible by 2");
- slots / 2
- }
-
- /// Returns the total number of capturing slots in this NFA.
- ///
- /// This value is guaranteed to be a multiple of 2. (Where each capturing
- /// group has precisely two capturing slots in the NFA.)
- #[inline]
- pub fn capture_slot_len(&self) -> usize {
- self.patterns_to_slots.last().map_or(0, |r| r.end)
- }
-
- /// Return a range of capture slots for the given pattern.
- ///
- /// The range returned is guaranteed to be contiguous with ranges for
- /// adjacent patterns.
- ///
- /// This panics if the given pattern ID is greater than or equal to the
- /// number of patterns in this NFA.
- #[inline]
- pub fn pattern_slots(&self, pid: PatternID) -> Range<usize> {
- self.patterns_to_slots[pid].clone()
- }
-
- /// Return the capture group index corresponding to the given name in the
- /// given pattern. If no such capture group name exists in the given
- /// pattern, then this returns `None`.
- ///
- /// If the given pattern ID is invalid, then this panics.
- #[inline]
- pub fn capture_name_to_index(
- &self,
- pid: PatternID,
- name: &str,
- ) -> Option<usize> {
- assert!(pid.as_usize() < self.pattern_len(), "invalid pattern ID");
- self.capture_name_to_index[pid].get(name).cloned()
- }
-
- // TODO: add iterators over capture group names.
- // Do we also permit indexing?
-
- /// Returns an iterator over all pattern IDs in this NFA.
- #[inline]
- pub fn patterns(&self) -> PatternIter {
- PatternIter {
- it: PatternID::iter(self.pattern_len()),
- _marker: core::marker::PhantomData,
- }
- }
-
- /// Return the ID of the initial anchored state of this NFA.
- #[inline]
- pub fn start_anchored(&self) -> StateID {
- self.start_anchored
- }
-
- /// Set the anchored starting state ID for this NFA.
- #[inline]
- pub fn set_start_anchored(&mut self, id: StateID) {
- self.start_anchored = id;
- }
-
- /// Return the ID of the initial unanchored state of this NFA.
- #[inline]
- pub fn start_unanchored(&self) -> StateID {
- self.start_unanchored
- }
-
- /// Set the unanchored starting state ID for this NFA.
- #[inline]
- pub fn set_start_unanchored(&mut self, id: StateID) {
- self.start_unanchored = id;
- }
-
- /// Return the ID of the initial anchored state for the given pattern.
- ///
- /// If the pattern doesn't exist in this NFA, then this panics.
- #[inline]
- pub fn start_pattern(&self, pid: PatternID) -> StateID {
- self.start_pattern[pid]
- }
-
- /// Get the byte class set for this NFA.
- #[inline]
- pub fn byte_class_set(&self) -> &ByteClassSet {
- &self.byte_class_set
- }
-
- /// Return a reference to the NFA state corresponding to the given ID.
- #[inline]
- pub fn state(&self, id: StateID) -> &State {
- &self.states[id]
- }
-
- /// Returns a slice of all states in this NFA.
- ///
- /// The slice returned may be indexed by a `StateID` generated by `add`.
- #[inline]
- pub fn states(&self) -> &[State] {
- &self.states
- }
-
- #[inline]
- pub fn is_always_start_anchored(&self) -> bool {
- self.start_anchored() == self.start_unanchored()
- }
-
- #[inline]
- pub fn has_any_look(&self) -> bool {
- self.facts.has_any_look()
- }
-
- #[inline]
- pub fn has_any_anchor(&self) -> bool {
- self.facts.has_any_anchor()
- }
-
- #[inline]
- pub fn has_word_boundary(&self) -> bool {
- self.has_word_boundary_unicode() || self.has_word_boundary_ascii()
- }
-
- #[inline]
- pub fn has_word_boundary_unicode(&self) -> bool {
- self.facts.has_word_boundary_unicode()
- }
-
- #[inline]
- pub fn has_word_boundary_ascii(&self) -> bool {
- self.facts.has_word_boundary_ascii()
- }
-
- /// Returns the memory usage, in bytes, of this NFA.
- ///
- /// This does **not** include the stack size used up by this NFA. To
- /// compute that, use `std::mem::size_of::<NFA>()`.
- #[inline]
- pub fn memory_usage(&self) -> usize {
- self.states.len() * mem::size_of::<State>()
- + self.memory_states
- + self.start_pattern.len() * mem::size_of::<StateID>()
- }
-
- // Why do we define a bunch of 'add_*' routines below instead of just
- // defining a single 'add' routine that accepts a 'State'? Indeed, for most
- // of the 'add_*' routines below, such a simple API would be more than
- // appropriate. Unfortunately, adding capture states and, to a lesser
- // extent, match states, is a bit more complex. Namely, when we add a
- // capture state, we *really* want to know the corresponding capture
- // group's name and index and what not, so that we can update other state
- // inside this NFA. But, e.g., the capture group name is not and should
- // not be included in 'State::Capture'. So what are our choices?
- //
- // 1) Define one 'add' and require some additional optional parameters.
- // This feels quite ugly, and adds unnecessary complexity to more common
- // and simpler cases.
- //
- // 2) Do what we do below. The sad thing is that our API is bigger with
- // more methods. But each method is very specific and hopefully simple.
- //
- // 3) Define a new enum, say, 'StateWithInfo', or something that permits
- // providing both a State and some extra ancillary info in some cases. This
- // doesn't seem too bad to me, but seems slightly worse than (2) because of
- // the additional type required.
- //
- // 4) Abandon the idea that we have to specify things like the capture
- // group name when we add the Capture state to the NFA. We would then need
- // to add other methods that permit the caller to add this additional state
- // "out of band." Other than it introducing some additional complexity, I
- // decided against this because I wanted the NFA builder API to make it
- // as hard as possible to build a bad or invalid NFA. Using the approach
- // below, as you'll see, permits us to do a lot of strict checking of our
- // inputs and return an error if we see something we don't expect.
-
- pub fn add_range(&mut self, range: Transition) -> Result<StateID, Error> {
- self.byte_class_set.set_range(range.start, range.end);
- self.add_state(State::Range { range })
- }
-
- pub fn add_sparse(
- &mut self,
- sparse: SparseTransitions,
- ) -> Result<StateID, Error> {
- for range in sparse.ranges.iter() {
- self.byte_class_set.set_range(range.start, range.end);
- }
- self.add_state(State::Sparse(sparse))
- }
-
- pub fn add_look(
- &mut self,
- next: StateID,
- look: Look,
- ) -> Result<StateID, Error> {
- self.facts.set_has_any_look(true);
- look.add_to_byteset(&mut self.byte_class_set);
- match look {
- Look::StartLine
- | Look::EndLine
- | Look::StartText
- | Look::EndText => {
- self.facts.set_has_any_anchor(true);
- }
- Look::WordBoundaryUnicode | Look::WordBoundaryUnicodeNegate => {
- self.facts.set_has_word_boundary_unicode(true);
- }
- Look::WordBoundaryAscii | Look::WordBoundaryAsciiNegate => {
- self.facts.set_has_word_boundary_ascii(true);
- }
- }
- self.add_state(State::Look { look, next })
- }
-
- pub fn add_union(
- &mut self,
- alternates: Box<[StateID]>,
- ) -> Result<StateID, Error> {
- self.add_state(State::Union { alternates })
- }
-
- pub fn add_capture_start(
- &mut self,
- next_id: StateID,
- capture_index: u32,
- name: Option<Arc<str>>,
- ) -> Result<StateID, Error> {
- let pid = self.current_pattern_id();
- let capture_index = match usize::try_from(capture_index) {
- Err(_) => {
- return Err(Error::invalid_capture_index(core::usize::MAX))
- }
- Ok(capture_index) => capture_index,
- };
- // Do arithmetic to find our absolute slot index first, to make sure
- // the index is at least possibly valid (doesn't overflow).
- let relative_slot = match capture_index.checked_mul(2) {
- Some(relative_slot) => relative_slot,
- None => return Err(Error::invalid_capture_index(capture_index)),
- };
- let slot = match relative_slot.checked_add(self.capture_slot_len()) {
- Some(slot) => slot,
- None => return Err(Error::invalid_capture_index(capture_index)),
- };
- // Make sure we have space to insert our (pid,index)|-->name mapping.
- if pid.as_usize() >= self.capture_index_to_name.len() {
- // Note that we require that if you're adding capturing groups,
- // then there must be at least one capturing group per pattern.
- // Moreover, whenever we expand our space here, it should always
- // first be for the first capture group (at index==0).
- if pid.as_usize() > self.capture_index_to_name.len()
- || capture_index > 0
- {
- return Err(Error::invalid_capture_index(capture_index));
- }
- self.capture_name_to_index.push(CaptureNameMap::new());
- self.capture_index_to_name.push(vec![]);
- }
- if capture_index >= self.capture_index_to_name[pid].len() {
- // We require that capturing groups are added in correspondence
- // to their index. So no discontinuous indices. This is likely
- // overly strict, but also makes it simpler to provide guarantees
- // about our capturing group data.
- if capture_index > self.capture_index_to_name[pid].len() {
- return Err(Error::invalid_capture_index(capture_index));
- }
- self.capture_index_to_name[pid].push(None);
- }
- if let Some(ref name) = name {
- self.capture_name_to_index[pid]
- .insert(Arc::clone(name), capture_index);
- }
- self.capture_index_to_name[pid][capture_index] = name;
- self.add_state(State::Capture { next: next_id, slot })
- }
-
- pub fn add_capture_end(
- &mut self,
- next_id: StateID,
- capture_index: u32,
- ) -> Result<StateID, Error> {
- let pid = self.current_pattern_id();
- let capture_index = match usize::try_from(capture_index) {
- Err(_) => {
- return Err(Error::invalid_capture_index(core::usize::MAX))
- }
- Ok(capture_index) => capture_index,
- };
- // If we haven't already added this capture group via a corresponding
- // 'add_capture_start' call, then we consider the index given to be
- // invalid.
- if pid.as_usize() >= self.capture_index_to_name.len()
- || capture_index >= self.capture_index_to_name[pid].len()
- {
- return Err(Error::invalid_capture_index(capture_index));
- }
- // Since we've already confirmed that this capture index is invalid
- // and has a corresponding starting slot, we know the multiplcation
- // has already been done and succeeded.
- let relative_slot_start = capture_index.checked_mul(2).unwrap();
- let relative_slot = match relative_slot_start.checked_add(1) {
- Some(relative_slot) => relative_slot,
- None => return Err(Error::invalid_capture_index(capture_index)),
- };
- let slot = match relative_slot.checked_add(self.capture_slot_len()) {
- Some(slot) => slot,
- None => return Err(Error::invalid_capture_index(capture_index)),
- };
- self.add_state(State::Capture { next: next_id, slot })
- }
-
- pub fn add_fail(&mut self) -> Result<StateID, Error> {
- self.add_state(State::Fail)
- }
-
- /// Add a new match state to this NFA and return its state ID.
- pub fn add_match(&mut self) -> Result<StateID, Error> {
- let pattern_id = self.current_pattern_id();
- let sid = self.add_state(State::Match { id: pattern_id })?;
- Ok(sid)
- }
-
- /// Finish compiling the current pattern and return its identifier. The
- /// given ID should be the state ID corresponding to the anchored starting
- /// state for matching this pattern.
- pub fn finish_pattern(
- &mut self,
- start_id: StateID,
- ) -> Result<PatternID, Error> {
- // We've gotta make sure that we never permit the user to add more
- // patterns than we can identify. So if we're already at the limit,
- // then return an error. This is somewhat non-ideal since this won't
- // result in an error until trying to complete the compilation of a
- // pattern instead of starting it.
- if self.start_pattern.len() >= PatternID::LIMIT {
- return Err(Error::too_many_patterns(
- self.start_pattern.len().saturating_add(1),
- ));
- }
- let pid = self.current_pattern_id();
- self.start_pattern.push(start_id);
- // Add the number of new slots created by this pattern. This is always
- // equivalent to '2 * caps.len()', where 'caps.len()' is the number of
- // new capturing groups introduced by the pattern we're finishing.
- let new_cap_groups = self
- .capture_index_to_name
- .get(pid.as_usize())
- .map_or(0, |caps| caps.len());
- let new_slots = match new_cap_groups.checked_mul(2) {
- Some(new_slots) => new_slots,
- None => {
- // Just return the biggest index that we know exists.
- let index = new_cap_groups.saturating_sub(1);
- return Err(Error::invalid_capture_index(index));
- }
- };
- let slot_start = self.capture_slot_len();
- self.patterns_to_slots.push(slot_start..(slot_start + new_slots));
- Ok(pid)
- }
-
- fn add_state(&mut self, state: State) -> Result<StateID, Error> {
- let id = StateID::new(self.states.len())
- .map_err(|_| Error::too_many_states(self.states.len()))?;
- self.memory_states += state.memory_usage();
- self.states.push(state);
- Ok(id)
- }
-
- /// Remap the transitions in every state of this NFA using the given map.
- /// The given map should be indexed according to state ID namespace used by
- /// the transitions of the states currently in this NFA.
- ///
- /// This may be used during the final phases of an NFA compiler, which
- /// turns its intermediate NFA into the final NFA. Remapping may be
- /// required to bring the state pointers from the intermediate NFA to the
- /// final NFA.
- pub fn remap(&mut self, old_to_new: &[StateID]) {
- for state in &mut self.states {
- state.remap(old_to_new);
- }
- self.start_anchored = old_to_new[self.start_anchored];
- self.start_unanchored = old_to_new[self.start_unanchored];
- for (pid, id) in self.start_pattern.iter_mut().with_pattern_ids() {
- *id = old_to_new[*id];
- }
- }
-
- /// Clear this NFA such that it has zero states and is otherwise "empty."
- ///
- /// An empty NFA is useful as a starting point for building one. It is
- /// itself not intended to be used for matching. For example, its starting
- /// state identifiers are configured to be `0`, but since it has no states,
- /// the identifiers are invalid.
- pub fn clear(&mut self) {
- self.states.clear();
- self.start_anchored = StateID::ZERO;
- self.start_unanchored = StateID::ZERO;
- self.start_pattern.clear();
- self.patterns_to_slots.clear();
- self.capture_name_to_index.clear();
- self.capture_index_to_name.clear();
- self.byte_class_set = ByteClassSet::empty();
- self.facts = Facts::default();
- self.memory_states = 0;
- }
-}
-
-impl fmt::Debug for NFA {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- writeln!(f, "thompson::NFA(")?;
- for (sid, state) in self.states.iter().with_state_ids() {
- let status = if sid == self.start_anchored {
- '^'
- } else if sid == self.start_unanchored {
- '>'
- } else {
- ' '
- };
- writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?;
- }
- if self.pattern_len() > 1 {
- writeln!(f, "")?;
- for pid in self.patterns() {
- let sid = self.start_pattern(pid);
- writeln!(
- f,
- "START({:06?}): {:?}",
- pid.as_usize(),
- sid.as_usize()
- )?;
- }
- }
- writeln!(f, "")?;
- writeln!(
- f,
- "transition equivalence classes: {:?}",
- self.byte_class_set().byte_classes()
- )?;
- writeln!(f, ")")?;
- Ok(())
- }
-}
-
-/// A state in a final compiled NFA.
-#[derive(Clone, Eq, PartialEq)]
-pub enum State {
- /// A state that transitions to `next` if and only if the current input
- /// byte is in the range `[start, end]` (inclusive).
- ///
- /// This is a special case of Sparse in that it encodes only one transition
- /// (and therefore avoids the allocation).
- Range { range: Transition },
- /// A state with possibly many transitions, represented in a sparse
- /// fashion. Transitions are ordered lexicographically by input range. As
- /// such, this may only be used when every transition has equal priority.
- /// (In practice, this is only used for encoding UTF-8 automata.)
- Sparse(SparseTransitions),
- /// A conditional epsilon transition satisfied via some sort of
- /// look-around.
- Look { look: Look, next: StateID },
- /// An alternation such that there exists an epsilon transition to all
- /// states in `alternates`, where matches found via earlier transitions
- /// are preferred over later transitions.
- Union { alternates: Box<[StateID]> },
- /// An empty state that records a capture location.
- ///
- /// From the perspective of finite automata, this is precisely equivalent
- /// to an epsilon transition, but serves the purpose of instructing NFA
- /// simulations to record additional state when the finite state machine
- /// passes through this epsilon transition.
- ///
- /// These transitions are treated as epsilon transitions with no additional
- /// effects in DFAs.
- ///
- /// 'slot' in this context refers to the specific capture group offset that
- /// is being recorded. Each capturing group has two slots corresponding to
- /// the start and end of the matching portion of that group.
- Capture { next: StateID, slot: usize },
- /// A state that cannot be transitioned out of. If a search reaches this
- /// state, then no match is possible and the search should terminate.
- Fail,
- /// A match state. There is exactly one such occurrence of this state for
- /// each regex compiled into the NFA.
- Match { id: PatternID },
-}
-
-impl State {
- /// Returns true if and only if this state contains one or more epsilon
- /// transitions.
- #[inline]
- pub fn is_epsilon(&self) -> bool {
- match *self {
- State::Range { .. }
- | State::Sparse { .. }
- | State::Fail
- | State::Match { .. } => false,
- State::Look { .. }
- | State::Union { .. }
- | State::Capture { .. } => true,
- }
- }
-
- /// Returns the heap memory usage of this NFA state in bytes.
- fn memory_usage(&self) -> usize {
- match *self {
- State::Range { .. }
- | State::Look { .. }
- | State::Capture { .. }
- | State::Match { .. }
- | State::Fail => 0,
- State::Sparse(SparseTransitions { ref ranges }) => {
- ranges.len() * mem::size_of::<Transition>()
- }
- State::Union { ref alternates } => {
- alternates.len() * mem::size_of::<StateID>()
- }
- }
- }
-
- /// Remap the transitions in this state using the given map. Namely, the
- /// given map should be indexed according to the transitions currently
- /// in this state.
- ///
- /// This is used during the final phase of the NFA compiler, which turns
- /// its intermediate NFA into the final NFA.
- fn remap(&mut self, remap: &[StateID]) {
- match *self {
- State::Range { ref mut range } => range.next = remap[range.next],
- State::Sparse(SparseTransitions { ref mut ranges }) => {
- for r in ranges.iter_mut() {
- r.next = remap[r.next];
- }
- }
- State::Look { ref mut next, .. } => *next = remap[*next],
- State::Union { ref mut alternates } => {
- for alt in alternates.iter_mut() {
- *alt = remap[*alt];
- }
- }
- State::Capture { ref mut next, .. } => *next = remap[*next],
- State::Fail => {}
- State::Match { .. } => {}
- }
- }
-}
-
-impl fmt::Debug for State {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- match *self {
- State::Range { ref range } => range.fmt(f),
- State::Sparse(SparseTransitions { ref ranges }) => {
- let rs = ranges
- .iter()
- .map(|t| format!("{:?}", t))
- .collect::<Vec<String>>()
- .join(", ");
- write!(f, "sparse({})", rs)
- }
- State::Look { ref look, next } => {
- write!(f, "{:?} => {:?}", look, next.as_usize())
- }
- State::Union { ref alternates } => {
- let alts = alternates
- .iter()
- .map(|id| format!("{:?}", id.as_usize()))
- .collect::<Vec<String>>()
- .join(", ");
- write!(f, "alt({})", alts)
- }
- State::Capture { next, slot } => {
- write!(f, "capture({:?}) => {:?}", slot, next.as_usize())
- }
- State::Fail => write!(f, "FAIL"),
- State::Match { id } => write!(f, "MATCH({:?})", id.as_usize()),
- }
- }
-}
-
-/// A collection of facts about an NFA.
-///
-/// There are no real cohesive principles behind what gets put in here. For
-/// the most part, it is implementation driven.
-#[derive(Clone, Copy, Debug, Default)]
-struct Facts {
- /// Various yes/no facts about this NFA.
- bools: u16,
-}
-
-impl Facts {
- define_bool!(0, has_any_look, set_has_any_look);
- define_bool!(1, has_any_anchor, set_has_any_anchor);
- define_bool!(2, has_word_boundary_unicode, set_has_word_boundary_unicode);
- define_bool!(3, has_word_boundary_ascii, set_has_word_boundary_ascii);
-}
-
-/// A sequence of transitions used to represent a sparse state.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub struct SparseTransitions {
- pub ranges: Box<[Transition]>,
-}
-
-impl SparseTransitions {
- pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
- haystack.get(at).and_then(|&b| self.matches_byte(b))
- }
-
- pub fn matches_unit(&self, unit: alphabet::Unit) -> Option<StateID> {
- unit.as_u8().and_then(|byte| self.matches_byte(byte))
- }
-
- pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
- for t in self.ranges.iter() {
- if t.start > byte {
- break;
- } else if t.matches_byte(byte) {
- return Some(t.next);
- }
- }
- None
-
- /*
- // This is an alternative implementation that uses binary search. In
- // some ad hoc experiments, like
- //
- // smallishru=OpenSubtitles2018.raw.sample.smallish.ru
- // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b'
- //
- // I could not observe any improvement, and in fact, things seemed to
- // be a bit slower.
- self.ranges
- .binary_search_by(|t| {
- if t.end < byte {
- core::cmp::Ordering::Less
- } else if t.start > byte {
- core::cmp::Ordering::Greater
- } else {
- core::cmp::Ordering::Equal
- }
- })
- .ok()
- .map(|i| self.ranges[i].next)
- */
- }
-}
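The lookup above relies on the ranges being sorted by start (and, in practice, non-overlapping): the scan can stop as soon as a range begins past the probe byte. A standalone model of the same logic (tuples of `(start, end, next)` stand in for `Transition`):

```rust
/// Linear scan over sorted, non-overlapping inclusive byte ranges,
/// mirroring `SparseTransitions::matches_byte`.
fn lookup(ranges: &[(u8, u8, usize)], byte: u8) -> Option<usize> {
    for &(start, end, next) in ranges {
        if start > byte {
            // Ranges are sorted by start, so no later range can match.
            break;
        }
        if byte <= end {
            return Some(next);
        }
    }
    None
}

fn main() {
    let ranges = [(b'0', b'9', 1), (b'a', b'z', 2)];
    assert_eq!(lookup(&ranges, b'5'), Some(1));
    assert_eq!(lookup(&ranges, b'q'), Some(2));
    assert_eq!(lookup(&ranges, b'!'), None);
}
```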
-
-/// A transition to another state, only if the given byte falls in the
-/// inclusive range specified.
-#[derive(Clone, Copy, Eq, Hash, PartialEq)]
-pub struct Transition {
- pub start: u8,
- pub end: u8,
- pub next: StateID,
-}
-
-impl Transition {
- pub fn matches(&self, haystack: &[u8], at: usize) -> bool {
- haystack.get(at).map_or(false, |&b| self.matches_byte(b))
- }
-
- pub fn matches_unit(&self, unit: alphabet::Unit) -> bool {
- unit.as_u8().map_or(false, |byte| self.matches_byte(byte))
- }
-
- pub fn matches_byte(&self, byte: u8) -> bool {
- self.start <= byte && byte <= self.end
- }
-}
-
-impl fmt::Debug for Transition {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- use crate::util::DebugByte;
-
- let Transition { start, end, next } = *self;
- if self.start == self.end {
- write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())
- } else {
- write!(
- f,
- "{:?}-{:?} => {:?}",
- DebugByte(start),
- DebugByte(end),
- next.as_usize(),
- )
- }
- }
-}
-
-/// A conditional NFA epsilon transition.
-///
-/// A simulation of the NFA can only move through this epsilon transition if
-/// the current position satisfies some look-around property. Some assertions
- /// are look-behind (StartLine, StartText), some are look-ahead (EndLine,
- /// EndText), while others are both look-behind and look-ahead
- /// (WordBoundary*).
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-pub enum Look {
- /// The previous position is either `\n` or the current position is the
- /// beginning of the haystack (i.e., at position `0`).
- StartLine = 1 << 0,
- /// The next position is either `\n` or the current position is the end of
- /// the haystack (i.e., at position `haystack.len()`).
- EndLine = 1 << 1,
- /// The current position is the beginning of the haystack (i.e., at
- /// position `0`).
- StartText = 1 << 2,
- /// The current position is the end of the haystack (i.e., at position
- /// `haystack.len()`).
- EndText = 1 << 3,
- /// When tested at position `i`, where `p=decode_utf8_rev(&haystack[..i])`
- /// and `n=decode_utf8(&haystack[i..])`, this assertion passes if and only
- /// if `is_word(p) != is_word(n)`. If `i=0`, then `is_word(p)=false` and if
- /// `i=haystack.len()`, then `is_word(n)=false`.
- WordBoundaryUnicode = 1 << 4,
- /// Same as for `WordBoundaryUnicode`, but requires that
- /// `is_word(p) == is_word(n)`.
- WordBoundaryUnicodeNegate = 1 << 5,
- /// When tested at position `i`, where `p=haystack[i-1]` and
- /// `n=haystack[i]`, this assertion passes if and only if `is_word(p)
- /// != is_word(n)`. If `i=0`, then `is_word(p)=false` and if
- /// `i=haystack.len()`, then `is_word(n)=false`.
- WordBoundaryAscii = 1 << 6,
- /// Same as for `WordBoundaryAscii`, but requires that
- /// `is_word(p) == is_word(n)`.
- ///
- /// Note that it is possible for this assertion to match at positions that
- /// split the UTF-8 encoding of a codepoint. For this reason, this may only
- /// be used when UTF-8 mode is disabled in the regex syntax.
- WordBoundaryAsciiNegate = 1 << 7,
-}
-
-impl Look {
- #[inline(always)]
- pub fn matches(&self, bytes: &[u8], at: usize) -> bool {
- match *self {
- Look::StartLine => at == 0 || bytes[at - 1] == b'\n',
- Look::EndLine => at == bytes.len() || bytes[at] == b'\n',
- Look::StartText => at == 0,
- Look::EndText => at == bytes.len(),
- Look::WordBoundaryUnicode => {
- let word_before = is_word_char_rev(bytes, at);
- let word_after = is_word_char_fwd(bytes, at);
- word_before != word_after
- }
- Look::WordBoundaryUnicodeNegate => {
- // This is pretty subtle. Why do we need to do UTF-8 decoding
- // here? Well... at time of writing, the is_word_char_{fwd,rev}
- // routines will only return true if there is a valid UTF-8
- // encoding of a "word" codepoint, and false in every other
- // case (including invalid UTF-8). This means that in regions
- // of invalid UTF-8 (which might be a subset of valid UTF-8!),
- // it would result in \B matching. While this would be
- // questionable in the context of truly invalid UTF-8, it is
- // *certainly* wrong to report match boundaries that split the
- // encoding of a codepoint. So to work around this, we ensure
- // that we can decode a codepoint on either side of `at`. If
- // either direction fails, then we don't permit \B to match at
- // all.
- //
- // Now, this isn't exactly optimal from a perf perspective. We
- // could try and detect this in is_word_char_{fwd,rev}, but
- // it's not clear if it's worth it. \B is, after all, rarely
- // used.
- //
- // And in particular, we do *not* have to do this with \b,
- // because \b *requires* that at least one side of `at` be a
- // "word" codepoint, which in turn implies one side of `at`
- // must be valid UTF-8. This in turn implies that \b can never
- // split a valid UTF-8 encoding of a codepoint. In the case
- // where one side of `at` is truly invalid UTF-8 and the other
- // side IS a word codepoint, then we want \b to match since it
- // represents a valid UTF-8 boundary. It also makes sense. For
- // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'.
- let word_before = at > 0
- && match decode_last_utf8(&bytes[..at]) {
- None | Some(Err(_)) => return false,
- Some(Ok(_)) => is_word_char_rev(bytes, at),
- };
- let word_after = at < bytes.len()
- && match decode_utf8(&bytes[at..]) {
- None | Some(Err(_)) => return false,
- Some(Ok(_)) => is_word_char_fwd(bytes, at),
- };
- word_before == word_after
- }
- Look::WordBoundaryAscii => {
- let word_before = at > 0 && is_word_byte(bytes[at - 1]);
- let word_after = at < bytes.len() && is_word_byte(bytes[at]);
- word_before != word_after
- }
- Look::WordBoundaryAsciiNegate => {
- let word_before = at > 0 && is_word_byte(bytes[at - 1]);
- let word_after = at < bytes.len() && is_word_byte(bytes[at]);
- word_before == word_after
- }
- }
- }
-
- /// Create a look-around assertion from its corresponding integer (as
- /// defined in `Look`). If the given integer does not correspond to any
- /// assertion, then None is returned.
- fn from_int(n: u8) -> Option<Look> {
- match n {
- 0b0000_0001 => Some(Look::StartLine),
- 0b0000_0010 => Some(Look::EndLine),
- 0b0000_0100 => Some(Look::StartText),
- 0b0000_1000 => Some(Look::EndText),
- 0b0001_0000 => Some(Look::WordBoundaryUnicode),
- 0b0010_0000 => Some(Look::WordBoundaryUnicodeNegate),
- 0b0100_0000 => Some(Look::WordBoundaryAscii),
- 0b1000_0000 => Some(Look::WordBoundaryAsciiNegate),
- _ => None,
- }
- }
-
- /// Flip the look-around assertion to its equivalent for reverse searches.
- fn reversed(&self) -> Look {
- match *self {
- Look::StartLine => Look::EndLine,
- Look::EndLine => Look::StartLine,
- Look::StartText => Look::EndText,
- Look::EndText => Look::StartText,
- Look::WordBoundaryUnicode => Look::WordBoundaryUnicode,
- Look::WordBoundaryUnicodeNegate => Look::WordBoundaryUnicodeNegate,
- Look::WordBoundaryAscii => Look::WordBoundaryAscii,
- Look::WordBoundaryAsciiNegate => Look::WordBoundaryAsciiNegate,
- }
- }
-
- /// Split up the given byte classes into equivalence classes in a way that
- /// is consistent with this look-around assertion.
- fn add_to_byteset(&self, set: &mut ByteClassSet) {
- match *self {
- Look::StartText | Look::EndText => {}
- Look::StartLine | Look::EndLine => {
- set.set_range(b'\n', b'\n');
- }
- Look::WordBoundaryUnicode
- | Look::WordBoundaryUnicodeNegate
- | Look::WordBoundaryAscii
- | Look::WordBoundaryAsciiNegate => {
- // We need to mark all ranges of bytes whose pairs result in
- // evaluating \b differently. This isn't technically correct
- // for Unicode word boundaries, but DFAs can't handle those
- // anyway, and thus, the byte classes don't need to either
- // since they are themselves only used in DFAs.
- let iswb = regex_syntax::is_word_byte;
- let mut b1: u16 = 0;
- let mut b2: u16;
- while b1 <= 255 {
- b2 = b1 + 1;
- while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
- b2 += 1;
- }
- set.set_range(b1 as u8, (b2 - 1) as u8);
- b1 = b2;
- }
- }
- }
- }
-}
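The ASCII word boundary arms above reduce to a two-sided test that treats positions outside the haystack as non-word. A self-contained sketch of that semantics (this is an illustrative ASCII equivalent, not the crate's `is_word_byte`):

```rust
/// True for ASCII "word" bytes: [0-9A-Za-z_].
fn is_word_byte(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}

/// ASCII \b: holds at `at` when exactly one side of the position is a word
/// byte. Out-of-bounds sides count as non-word. \B is the same computation
/// with `==` instead of `!=`.
fn ascii_word_boundary(haystack: &[u8], at: usize) -> bool {
    let word_before = at > 0 && is_word_byte(haystack[at - 1]);
    let word_after = at < haystack.len() && is_word_byte(haystack[at]);
    word_before != word_after
}

fn main() {
    assert!(ascii_word_boundary(b"ab cd", 2)); // between 'b' and ' '
    assert!(ascii_word_boundary(b"ab cd", 0)); // start of haystack
    assert!(!ascii_word_boundary(b"ab cd", 1)); // between 'a' and 'b'
}
```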
-
-/// LookSet is a memory-efficient set of look-around assertions. Callers may
-/// idempotently insert or remove any look-around assertion from a set.
-#[repr(transparent)]
-#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
-pub(crate) struct LookSet {
- set: u8,
-}
-
-impl LookSet {
- /// Return a LookSet from its representation.
- pub(crate) fn from_repr(repr: u8) -> LookSet {
- LookSet { set: repr }
- }
-
- /// Return a mutable LookSet from a mutable pointer to its representation.
- pub(crate) fn from_repr_mut(repr: &mut u8) -> &mut LookSet {
- // SAFETY: This is safe since a LookSet is repr(transparent) where its
- // repr is a u8.
- unsafe { core::mem::transmute::<&mut u8, &mut LookSet>(repr) }
- }
-
- /// Return true if and only if this set is empty.
- pub(crate) fn is_empty(&self) -> bool {
- self.set == 0
- }
-
- /// Clears this set such that it has no assertions in it.
- pub(crate) fn clear(&mut self) {
- self.set = 0;
- }
-
- /// Insert the given look-around assertion into this set. If the assertion
- /// already exists, then this is a no-op.
- pub(crate) fn insert(&mut self, look: Look) {
- self.set |= look as u8;
- }
-
- /// Remove the given look-around assertion from this set. If the assertion
- /// is not in this set, then this is a no-op.
- #[cfg(test)]
- pub(crate) fn remove(&mut self, look: Look) {
- self.set &= !(look as u8);
- }
-
- /// Return true if and only if the given assertion is in this set.
- pub(crate) fn contains(&self, look: Look) -> bool {
- (look as u8) & self.set != 0
- }
-
- /// Subtract the given `other` set from the `self` set and return a new
- /// set.
- pub(crate) fn subtract(&self, other: LookSet) -> LookSet {
- LookSet { set: self.set & !other.set }
- }
-
- /// Return the intersection of the given `other` set with the `self` set
- /// and return the resulting set.
- pub(crate) fn intersect(&self, other: LookSet) -> LookSet {
- LookSet { set: self.set & other.set }
- }
-}
-
-impl core::fmt::Debug for LookSet {
- fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
- let mut members = vec![];
- for i in 0..8 {
- let look = match Look::from_int(1 << i) {
- None => continue,
- Some(look) => look,
- };
- if self.contains(look) {
- members.push(look);
- }
- }
- f.debug_tuple("LookSet").field(&members).finish()
- }
-}
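Since each `Look` variant is a distinct power of two, `LookSet` can represent any combination of the eight assertions in a single byte, and all of its operations compile down to single bitwise instructions. A standalone model of the same idea (`TinySet` is illustrative, not the crate's type):

```rust
/// One bit per assertion, as in `LookSet`.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
struct TinySet(u8);

impl TinySet {
    fn insert(&mut self, bit: u8) { self.0 |= bit; }
    fn remove(&mut self, bit: u8) { self.0 &= !bit; }
    fn contains(self, bit: u8) -> bool { self.0 & bit != 0 }
    fn subtract(self, other: TinySet) -> TinySet { TinySet(self.0 & !other.0) }
}

fn main() {
    const START_LINE: u8 = 1 << 0; // Look::StartLine
    const END_LINE: u8 = 1 << 1; // Look::EndLine
    let mut set = TinySet::default();
    set.insert(START_LINE);
    set.insert(END_LINE);
    assert!(set.contains(START_LINE) && set.contains(END_LINE));
    set.remove(START_LINE);
    assert!(!set.contains(START_LINE));
    assert_eq!(set.subtract(TinySet(END_LINE)), TinySet(0));
}
```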
-
-/// An iterator over all pattern IDs in an NFA.
-pub struct PatternIter<'a> {
- it: PatternIDIter,
- /// We explicitly associate a lifetime with this iterator even though we
- /// don't actually borrow anything from the NFA. We do this for backward
- /// compatibility purposes. If we ever do need to borrow something from
- /// the NFA, then we can and just get rid of this marker without breaking
- /// the public API.
- _marker: core::marker::PhantomData<&'a ()>,
-}
-
-impl<'a> Iterator for PatternIter<'a> {
- type Item = PatternID;
-
- fn next(&mut self) -> Option<PatternID> {
- self.it.next()
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- // TODO: Replace tests using DFA with NFA matching engine once implemented.
- use crate::dfa::{dense, Automaton};
-
- #[test]
- fn always_match() {
- let nfa = NFA::always_match();
- let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap();
- let find = |input, start, end| {
- dfa.find_leftmost_fwd_at(None, None, input, start, end)
- .unwrap()
- .map(|m| m.offset())
- };
-
- assert_eq!(Some(0), find(b"", 0, 0));
- assert_eq!(Some(0), find(b"a", 0, 1));
- assert_eq!(Some(1), find(b"a", 1, 1));
- assert_eq!(Some(0), find(b"ab", 0, 2));
- assert_eq!(Some(1), find(b"ab", 1, 2));
- assert_eq!(Some(2), find(b"ab", 2, 2));
- }
-
- #[test]
- fn never_match() {
- let nfa = NFA::never_match();
- let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap();
- let find = |input, start, end| {
- dfa.find_leftmost_fwd_at(None, None, input, start, end)
- .unwrap()
- .map(|m| m.offset())
- };
-
- assert_eq!(None, find(b"", 0, 0));
- assert_eq!(None, find(b"a", 0, 1));
- assert_eq!(None, find(b"a", 1, 1));
- assert_eq!(None, find(b"ab", 0, 2));
- assert_eq!(None, find(b"ab", 1, 2));
- assert_eq!(None, find(b"ab", 2, 2));
- }
-
- #[test]
- fn look_set() {
- let mut f = LookSet::default();
- assert!(!f.contains(Look::StartText));
- assert!(!f.contains(Look::EndText));
- assert!(!f.contains(Look::StartLine));
- assert!(!f.contains(Look::EndLine));
- assert!(!f.contains(Look::WordBoundaryUnicode));
- assert!(!f.contains(Look::WordBoundaryUnicodeNegate));
- assert!(!f.contains(Look::WordBoundaryAscii));
- assert!(!f.contains(Look::WordBoundaryAsciiNegate));
-
- f.insert(Look::StartText);
- assert!(f.contains(Look::StartText));
- f.remove(Look::StartText);
- assert!(!f.contains(Look::StartText));
-
- f.insert(Look::EndText);
- assert!(f.contains(Look::EndText));
- f.remove(Look::EndText);
- assert!(!f.contains(Look::EndText));
-
- f.insert(Look::StartLine);
- assert!(f.contains(Look::StartLine));
- f.remove(Look::StartLine);
- assert!(!f.contains(Look::StartLine));
-
- f.insert(Look::EndLine);
- assert!(f.contains(Look::EndLine));
- f.remove(Look::EndLine);
- assert!(!f.contains(Look::EndLine));
-
- f.insert(Look::WordBoundaryUnicode);
- assert!(f.contains(Look::WordBoundaryUnicode));
- f.remove(Look::WordBoundaryUnicode);
- assert!(!f.contains(Look::WordBoundaryUnicode));
-
- f.insert(Look::WordBoundaryUnicodeNegate);
- assert!(f.contains(Look::WordBoundaryUnicodeNegate));
- f.remove(Look::WordBoundaryUnicodeNegate);
- assert!(!f.contains(Look::WordBoundaryUnicodeNegate));
-
- f.insert(Look::WordBoundaryAscii);
- assert!(f.contains(Look::WordBoundaryAscii));
- f.remove(Look::WordBoundaryAscii);
- assert!(!f.contains(Look::WordBoundaryAscii));
-
- f.insert(Look::WordBoundaryAsciiNegate);
- assert!(f.contains(Look::WordBoundaryAsciiNegate));
- f.remove(Look::WordBoundaryAsciiNegate);
- assert!(!f.contains(Look::WordBoundaryAsciiNegate));
- }
-
- #[test]
- fn look_matches_start_line() {
- let look = Look::StartLine;
-
- assert!(look.matches(B(""), 0));
- assert!(look.matches(B("\n"), 0));
- assert!(look.matches(B("\n"), 1));
- assert!(look.matches(B("a"), 0));
- assert!(look.matches(B("\na"), 1));
-
- assert!(!look.matches(B("a"), 1));
- assert!(!look.matches(B("a\na"), 1));
- }
-
- #[test]
- fn look_matches_end_line() {
- let look = Look::EndLine;
-
- assert!(look.matches(B(""), 0));
- assert!(look.matches(B("\n"), 1));
- assert!(look.matches(B("\na"), 0));
- assert!(look.matches(B("\na"), 2));
- assert!(look.matches(B("a\na"), 1));
-
- assert!(!look.matches(B("a"), 0));
- assert!(!look.matches(B("\na"), 1));
- assert!(!look.matches(B("a\na"), 0));
- assert!(!look.matches(B("a\na"), 2));
- }
-
- #[test]
- fn look_matches_start_text() {
- let look = Look::StartText;
-
- assert!(look.matches(B(""), 0));
- assert!(look.matches(B("\n"), 0));
- assert!(look.matches(B("a"), 0));
-
- assert!(!look.matches(B("\n"), 1));
- assert!(!look.matches(B("\na"), 1));
- assert!(!look.matches(B("a"), 1));
- assert!(!look.matches(B("a\na"), 1));
- }
-
- #[test]
- fn look_matches_end_text() {
- let look = Look::EndText;
-
- assert!(look.matches(B(""), 0));
- assert!(look.matches(B("\n"), 1));
- assert!(look.matches(B("\na"), 2));
-
- assert!(!look.matches(B("\na"), 0));
- assert!(!look.matches(B("a\na"), 1));
- assert!(!look.matches(B("a"), 0));
- assert!(!look.matches(B("\na"), 1));
- assert!(!look.matches(B("a\na"), 0));
- assert!(!look.matches(B("a\na"), 2));
- }
-
- #[test]
- fn look_matches_word_unicode() {
- let look = Look::WordBoundaryUnicode;
-
- // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
- // \xF0\x90\x86\x80 = 𐆀 (not in \w)
-
- // Simple ASCII word boundaries.
- assert!(look.matches(B("a"), 0));
- assert!(look.matches(B("a"), 1));
- assert!(look.matches(B("a "), 1));
- assert!(look.matches(B(" a "), 1));
- assert!(look.matches(B(" a "), 2));
-
- // Unicode word boundaries with a non-ASCII codepoint.
- assert!(look.matches(B("𝛃"), 0));
- assert!(look.matches(B("𝛃"), 4));
- assert!(look.matches(B("𝛃 "), 4));
- assert!(look.matches(B(" 𝛃 "), 1));
- assert!(look.matches(B(" 𝛃 "), 5));
-
- // Unicode word boundaries between non-ASCII codepoints.
- assert!(look.matches(B("𝛃𐆀"), 0));
- assert!(look.matches(B("𝛃𐆀"), 4));
-
- // Non word boundaries for ASCII.
- assert!(!look.matches(B(""), 0));
- assert!(!look.matches(B("ab"), 1));
- assert!(!look.matches(B("a "), 2));
- assert!(!look.matches(B(" a "), 0));
- assert!(!look.matches(B(" a "), 3));
-
- // Non word boundaries with a non-ASCII codepoint.
- assert!(!look.matches(B("𝛃b"), 4));
- assert!(!look.matches(B("𝛃 "), 5));
- assert!(!look.matches(B(" 𝛃 "), 0));
- assert!(!look.matches(B(" 𝛃 "), 6));
- assert!(!look.matches(B("𝛃"), 1));
- assert!(!look.matches(B("𝛃"), 2));
- assert!(!look.matches(B("𝛃"), 3));
-
- // Non word boundaries with non-ASCII codepoints.
- assert!(!look.matches(B("𝛃𐆀"), 1));
- assert!(!look.matches(B("𝛃𐆀"), 2));
- assert!(!look.matches(B("𝛃𐆀"), 3));
- assert!(!look.matches(B("𝛃𐆀"), 5));
- assert!(!look.matches(B("𝛃𐆀"), 6));
- assert!(!look.matches(B("𝛃𐆀"), 7));
- assert!(!look.matches(B("𝛃𐆀"), 8));
- }
-
- #[test]
- fn look_matches_word_ascii() {
- let look = Look::WordBoundaryAscii;
-
- // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
- // \xF0\x90\x86\x80 = 𐆀 (not in \w)
-
- // Simple ASCII word boundaries.
- assert!(look.matches(B("a"), 0));
- assert!(look.matches(B("a"), 1));
- assert!(look.matches(B("a "), 1));
- assert!(look.matches(B(" a "), 1));
- assert!(look.matches(B(" a "), 2));
-
- // Unicode word boundaries with a non-ASCII codepoint. Since this is
- // an ASCII word boundary, none of these match.
- assert!(!look.matches(B("𝛃"), 0));
- assert!(!look.matches(B("𝛃"), 4));
- assert!(!look.matches(B("𝛃 "), 4));
- assert!(!look.matches(B(" 𝛃 "), 1));
- assert!(!look.matches(B(" 𝛃 "), 5));
-
- // Unicode word boundaries between non-ASCII codepoints. Again, since
- // this is an ASCII word boundary, none of these match.
- assert!(!look.matches(B("𝛃𐆀"), 0));
- assert!(!look.matches(B("𝛃𐆀"), 4));
-
- // Non word boundaries for ASCII.
- assert!(!look.matches(B(""), 0));
- assert!(!look.matches(B("ab"), 1));
- assert!(!look.matches(B("a "), 2));
- assert!(!look.matches(B(" a "), 0));
- assert!(!look.matches(B(" a "), 3));
-
- // Non word boundaries with a non-ASCII codepoint.
- assert!(look.matches(B("𝛃b"), 4));
- assert!(!look.matches(B("𝛃 "), 5));
- assert!(!look.matches(B(" 𝛃 "), 0));
- assert!(!look.matches(B(" 𝛃 "), 6));
- assert!(!look.matches(B("𝛃"), 1));
- assert!(!look.matches(B("𝛃"), 2));
- assert!(!look.matches(B("𝛃"), 3));
-
- // Non word boundaries with non-ASCII codepoints.
- assert!(!look.matches(B("𝛃𐆀"), 1));
- assert!(!look.matches(B("𝛃𐆀"), 2));
- assert!(!look.matches(B("𝛃𐆀"), 3));
- assert!(!look.matches(B("𝛃𐆀"), 5));
- assert!(!look.matches(B("𝛃𐆀"), 6));
- assert!(!look.matches(B("𝛃𐆀"), 7));
- assert!(!look.matches(B("𝛃𐆀"), 8));
- }
-
- #[test]
- fn look_matches_word_unicode_negate() {
- let look = Look::WordBoundaryUnicodeNegate;
-
- // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
- // \xF0\x90\x86\x80 = 𐆀 (not in \w)
-
- // Simple ASCII word boundaries.
- assert!(!look.matches(B("a"), 0));
- assert!(!look.matches(B("a"), 1));
- assert!(!look.matches(B("a "), 1));
- assert!(!look.matches(B(" a "), 1));
- assert!(!look.matches(B(" a "), 2));
-
- // Unicode word boundaries with a non-ASCII codepoint.
- assert!(!look.matches(B("𝛃"), 0));
- assert!(!look.matches(B("𝛃"), 4));
- assert!(!look.matches(B("𝛃 "), 4));
- assert!(!look.matches(B(" 𝛃 "), 1));
- assert!(!look.matches(B(" 𝛃 "), 5));
-
- // Unicode word boundaries between non-ASCII codepoints.
- assert!(!look.matches(B("𝛃𐆀"), 0));
- assert!(!look.matches(B("𝛃𐆀"), 4));
-
- // Non word boundaries for ASCII.
- assert!(look.matches(B(""), 0));
- assert!(look.matches(B("ab"), 1));
- assert!(look.matches(B("a "), 2));
- assert!(look.matches(B(" a "), 0));
- assert!(look.matches(B(" a "), 3));
-
- // Non word boundaries with a non-ASCII codepoint.
- assert!(look.matches(B("𝛃b"), 4));
- assert!(look.matches(B("𝛃 "), 5));
- assert!(look.matches(B(" 𝛃 "), 0));
- assert!(look.matches(B(" 𝛃 "), 6));
- // These don't match because they could otherwise return an offset that
- // splits the UTF-8 encoding of a codepoint.
- assert!(!look.matches(B("𝛃"), 1));
- assert!(!look.matches(B("𝛃"), 2));
- assert!(!look.matches(B("𝛃"), 3));
-
- // Non word boundaries with non-ASCII codepoints. These also don't
- // match because they could otherwise return an offset that splits the
- // UTF-8 encoding of a codepoint.
- assert!(!look.matches(B("𝛃𐆀"), 1));
- assert!(!look.matches(B("𝛃𐆀"), 2));
- assert!(!look.matches(B("𝛃𐆀"), 3));
- assert!(!look.matches(B("𝛃𐆀"), 5));
- assert!(!look.matches(B("𝛃𐆀"), 6));
- assert!(!look.matches(B("𝛃𐆀"), 7));
- // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end
- // of the haystack. So the "end" of the haystack isn't a word and 𐆀
- // isn't a word, thus, \B matches.
- assert!(look.matches(B("𝛃𐆀"), 8));
- }
-
- #[test]
- fn look_matches_word_ascii_negate() {
- let look = Look::WordBoundaryAsciiNegate;
-
- // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
- // \xF0\x90\x86\x80 = 𐆀 (not in \w)
-
- // Simple ASCII word boundaries.
- assert!(!look.matches(B("a"), 0));
- assert!(!look.matches(B("a"), 1));
- assert!(!look.matches(B("a "), 1));
- assert!(!look.matches(B(" a "), 1));
- assert!(!look.matches(B(" a "), 2));
-
- // Unicode word boundaries with a non-ASCII codepoint. Since this is
- // an ASCII word boundary, none of these match.
- assert!(look.matches(B("𝛃"), 0));
- assert!(look.matches(B("𝛃"), 4));
- assert!(look.matches(B("𝛃 "), 4));
- assert!(look.matches(B(" 𝛃 "), 1));
- assert!(look.matches(B(" 𝛃 "), 5));
-
- // Unicode word boundaries between non-ASCII codepoints. Again, since
- // this is an ASCII word boundary, none of these match.
- assert!(look.matches(B("𝛃𐆀"), 0));
- assert!(look.matches(B("𝛃𐆀"), 4));
-
- // Non word boundaries for ASCII.
- assert!(look.matches(B(""), 0));
- assert!(look.matches(B("ab"), 1));
- assert!(look.matches(B("a "), 2));
- assert!(look.matches(B(" a "), 0));
- assert!(look.matches(B(" a "), 3));
-
- // Non word boundaries with a non-ASCII codepoint.
- assert!(!look.matches(B("𝛃b"), 4));
- assert!(look.matches(B("𝛃 "), 5));
- assert!(look.matches(B(" 𝛃 "), 0));
- assert!(look.matches(B(" 𝛃 "), 6));
- assert!(look.matches(B("𝛃"), 1));
- assert!(look.matches(B("𝛃"), 2));
- assert!(look.matches(B("𝛃"), 3));
-
- // Non word boundaries with non-ASCII codepoints.
- assert!(look.matches(B("𝛃𐆀"), 1));
- assert!(look.matches(B("𝛃𐆀"), 2));
- assert!(look.matches(B("𝛃𐆀"), 3));
- assert!(look.matches(B("𝛃𐆀"), 5));
- assert!(look.matches(B("𝛃𐆀"), 6));
- assert!(look.matches(B("𝛃𐆀"), 7));
- assert!(look.matches(B("𝛃𐆀"), 8));
- }
-
- fn B<'a, T: 'a + ?Sized + AsRef<[u8]>>(string: &'a T) -> &'a [u8] {
- string.as_ref()
- }
-}
+pub use self::{
+ builder::Builder,
+ error::BuildError,
+ nfa::{
+ DenseTransitions, PatternIter, SparseTransitions, State, Transition,
+ NFA,
+ },
+};
+#[cfg(feature = "syntax")]
+pub use compiler::{Compiler, Config, WhichCaptures};
diff --git a/vendor/regex-automata/src/nfa/thompson/nfa.rs b/vendor/regex-automata/src/nfa/thompson/nfa.rs
new file mode 100644
index 000000000..2108fa338
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/thompson/nfa.rs
@@ -0,0 +1,2101 @@
+use core::{fmt, mem};
+
+use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec};
+
+#[cfg(feature = "syntax")]
+use crate::nfa::thompson::{
+ compiler::{Compiler, Config},
+ error::BuildError,
+};
+use crate::{
+ nfa::thompson::builder::Builder,
+ util::{
+ alphabet::{self, ByteClassSet, ByteClasses},
+ captures::{GroupInfo, GroupInfoError},
+ look::{Look, LookMatcher, LookSet},
+ primitives::{
+ IteratorIndexExt, PatternID, PatternIDIter, SmallIndex, StateID,
+ },
+ sparse_set::SparseSet,
+ },
+};
+
+/// A byte oriented Thompson non-deterministic finite automaton (NFA).
+///
+/// A Thompson NFA is a finite state machine that permits unconditional epsilon
+/// transitions, but guarantees that there exists at most one non-epsilon
+/// transition for each element in the alphabet for each state.
+///
+/// An NFA may be used directly for searching, for analysis or to build
+/// a deterministic finite automaton (DFA).
+///
+/// # Cheap clones
+///
+/// Since an NFA is a core data type in this crate that many other regex
+/// engines are built on top of, it is convenient to give ownership of an NFA
+/// to said regex engines. Because of this, an NFA uses reference counting
+/// internally. Therefore, it is cheap to clone and it is encouraged to do so.
+///
+/// # Capabilities
+///
+/// Using an NFA for searching via the
+/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) provides the most
+/// "power" of any regex engine in this crate. Namely, it supports the
+/// following in all cases:
+///
+/// 1. Detection of a match.
+/// 2. Location of a match, including both the start and end offset, in a
+/// single pass of the haystack.
+/// 3. Location of matching capturing groups.
+/// 4. Handles multiple patterns, including (1)-(3) when multiple patterns are
+/// present.
+///
+/// # Capturing Groups
+///
+/// Groups refer to parenthesized expressions inside a regex pattern. They look
+/// like this, where `exp` is an arbitrary regex:
+///
+/// * `(exp)` - An unnamed capturing group.
+/// * `(?P<name>exp)` or `(?<name>exp)` - A named capturing group.
+/// * `(?:exp)` - A non-capturing group.
+/// * `(?i:exp)` - A non-capturing group that sets flags.
+///
+/// Only the first two forms are said to be _capturing_. Capturing
+/// means that the last position at which they match is reportable. The
+/// [`Captures`](crate::util::captures::Captures) type provides convenient
+/// access to the match positions of capturing groups, which includes looking
+/// up capturing groups by their name.
+///
+/// # Byte oriented
+///
+/// This NFA is byte oriented, which means that all of its transitions are
+/// defined on bytes. In other words, the alphabet of an NFA consists of the
+/// 256 different byte values.
+///
+/// While DFAs nearly demand that they be byte oriented for performance
+/// reasons, an NFA could conceivably be *Unicode codepoint* oriented. Indeed,
+/// a previous version of this NFA supported both byte and codepoint oriented
+/// modes. A codepoint oriented mode can work because an NFA fundamentally uses
+/// a sparse representation of transitions, which works well with the large
+/// sparse space of Unicode codepoints.
+///
+/// Nevertheless, this NFA is only byte oriented. This choice is primarily
+/// driven by implementation simplicity, and also in part memory usage. In
+/// practice, performance between the two is roughly comparable. However,
+/// building a DFA (including a hybrid DFA) really wants a byte oriented NFA.
+/// So if we do have a codepoint oriented NFA, then we also need to generate
+/// a byte oriented NFA in order to build a hybrid NFA/DFA. Thus, by only
+/// generating byte oriented NFAs, we can produce one less NFA. In other words,
+/// if we made our NFA codepoint oriented, we'd need to *also* make it support
+/// a byte oriented mode, which is more complicated. But a byte oriented mode
+/// can support everything.
+///
+/// # Differences with DFAs
+///
+/// At the theoretical level, the precise difference between an NFA and a DFA
+/// is that, in a DFA, for every state, an input symbol unambiguously refers
+/// to a single transition _and_ that an input symbol is required for each
+/// transition. At a practical level, this permits DFA implementations to be
+/// implemented at their core with a small constant number of CPU instructions
+/// for each byte of input searched. In practice, this makes them quite a bit
+/// faster than NFAs _in general_. Namely, in order to execute a search for any
+/// Thompson NFA, one needs to keep track of a _set_ of states, and execute
+/// the possible transitions on all of those states for each input symbol.
+/// Overall, this results in much more overhead. To a first approximation, one
+/// can expect DFA searches to be about an order of magnitude faster.
+///
+/// So why use an NFA at all? The main advantage of an NFA is that it takes
+/// linear time (in the size of the pattern string after repetitions have been
+/// expanded) to build and linear memory usage. A DFA, on the other hand, may
+/// take exponential time and/or space to build. Even in non-pathological
+/// cases, DFAs often take quite a bit more memory than their NFA counterparts,
+/// _especially_ if large Unicode character classes are involved. Of course,
+/// an NFA also provides additional capabilities. For example, it can match
+/// Unicode word boundaries on non-ASCII text and resolve the positions of
+/// capturing groups.
+///
+/// Note that a [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) strikes a
+/// good balance between an NFA and a DFA. It avoids the exponential build time
+/// of a DFA while maintaining its fast search time. The downside of a hybrid
+/// NFA/DFA is that in some cases it can be slower at search time than the NFA.
+/// (It also has less functionality than a pure NFA. It cannot handle Unicode
+/// word boundaries on non-ASCII text and cannot resolve capturing groups.)
+///
+/// # Example
+///
+/// This shows how to build an NFA with the default configuration and execute a
+/// search using the Pike VM.
+///
+/// ```
+/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+///
+/// let re = PikeVM::new(r"foo[0-9]+")?;
+/// let mut cache = re.create_cache();
+/// let mut caps = re.create_captures();
+///
+/// let expected = Some(Match::must(0, 0..8));
+/// re.captures(&mut cache, b"foo12345", &mut caps);
+/// assert_eq!(expected, caps.get_match());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: resolving capturing groups
+///
+/// This example shows how to parse some simple dates and extract the
+/// components of each date via capturing groups.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{
+/// nfa::thompson::pikevm::PikeVM,
+/// util::captures::Captures,
+/// };
+///
+/// let vm = PikeVM::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})")?;
+/// let mut cache = vm.create_cache();
+///
+/// let haystack = "2012-03-14, 2013-01-01 and 2014-07-05";
+/// let all: Vec<Captures> = vm.captures_iter(
+/// &mut cache, haystack.as_bytes()
+/// ).collect();
+/// // There should be a total of 3 matches.
+/// assert_eq!(3, all.len());
+/// // The year from the second match is '2013'.
+/// let span = all[1].get_group_by_name("y").unwrap();
+/// assert_eq!("2013", &haystack[span]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// This example shows that only the last match of a capturing group is
+/// reported, even if it had to match multiple times for an overall match
+/// to occur.
+///
+/// ```
+/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
+///
+/// let re = PikeVM::new(r"([a-z]){4}")?;
+/// let mut cache = re.create_cache();
+/// let mut caps = re.create_captures();
+///
+/// let haystack = b"quux";
+/// re.captures(&mut cache, haystack, &mut caps);
+/// assert!(caps.is_match());
+/// assert_eq!(Some(Span::from(3..4)), caps.get_group(1));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone)]
+pub struct NFA(
+ // We make NFAs reference counted primarily for two reasons. First is that
+ // the NFA type itself is quite large (at least 0.5KB), and so it makes
+ // sense to put it on the heap by default anyway. Second is that, for Arc
+ // specifically, this enables cheap clones. This tends to be useful because
+ // several structures (the backtracker, the Pike VM, the hybrid NFA/DFA)
+ // all want to hang on to an NFA for use during search time. We could
+ // provide the NFA at search time via a function argument, but this makes
+ // for an unnecessarily annoying API. Instead, we just let each structure
+ // share ownership of the NFA. Using a deep clone would not be smart, since
+ // the NFA can use quite a bit of heap space.
+ Arc<Inner>,
+);
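In practice, the cheap-clone property means one compiled NFA can back several engines at once. A minimal sketch, assuming only the `PikeVM` APIs already shown in the docs above (`new_from_nfa`, `create_cache`, `is_match`):

```rust
use regex_automata::nfa::thompson::{pikevm::PikeVM, NFA};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let nfa = NFA::new(r"[a-z]+")?;
    // Cloning only bumps an internal reference count; no states are
    // copied, so handing the same NFA to several engines is cheap.
    let vm1 = PikeVM::new_from_nfa(nfa.clone())?;
    let vm2 = PikeVM::new_from_nfa(nfa)?;
    let mut cache1 = vm1.create_cache();
    let mut cache2 = vm2.create_cache();
    assert!(vm1.is_match(&mut cache1, "abc"));
    assert!(vm2.is_match(&mut cache2, "xyz"));
    Ok(())
}
```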
+
+impl NFA {
+ /// Parse the given regular expression using a default configuration and
+ /// build an NFA from it.
+ ///
+ /// If you want a non-default configuration, then use the NFA
+ /// [`Compiler`] with a [`Config`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::new(r"foo[0-9]+")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let expected = Some(Match::must(0, 0..8));
+ /// re.captures(&mut cache, b"foo12345", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn new(pattern: &str) -> Result<NFA, BuildError> {
+ NFA::compiler().build(pattern)
+ }
+
+ /// Parse the given regular expressions using a default configuration and
+ /// build a multi-NFA from them.
+ ///
+ /// If you want a non-default configuration, then use the NFA
+ /// [`Compiler`] with a [`Config`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+"])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let expected = Some(Match::must(1, 0..3));
+ /// re.captures(&mut cache, b"foo12345bar", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<NFA, BuildError> {
+ NFA::compiler().build_many(patterns)
+ }
+
+ /// Returns an NFA with a single regex pattern that always matches at every
+ /// position.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+ ///
+ /// let re = PikeVM::new_from_nfa(NFA::always_match())?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let expected = Some(Match::must(0, 0..0));
+ /// re.captures(&mut cache, b"", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ /// re.captures(&mut cache, b"foo", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn always_match() -> NFA {
+ // We could use NFA::new("") here and we'd get the same semantics, but
+ // hand-assembling the NFA (as below) does the same thing with fewer
+ // states. It also avoids needing the 'syntax' feature
+ // enabled.
+ //
+ // Technically all we need is the "match" state, but we add the
+ // "capture" states so that the PikeVM can use this NFA.
+ //
+ // The unwraps below are OK because we add so few states that they will
+ // never exhaust any default limits in any environment.
+ let mut builder = Builder::new();
+ let pid = builder.start_pattern().unwrap();
+ assert_eq!(pid.as_usize(), 0);
+ let start_id =
+ builder.add_capture_start(StateID::ZERO, 0, None).unwrap();
+ let end_id = builder.add_capture_end(StateID::ZERO, 0).unwrap();
+ let match_id = builder.add_match().unwrap();
+ builder.patch(start_id, end_id).unwrap();
+ builder.patch(end_id, match_id).unwrap();
+ let pid = builder.finish_pattern(start_id).unwrap();
+ assert_eq!(pid.as_usize(), 0);
+ builder.build(start_id, start_id).unwrap()
+ }
+
+ /// Returns an NFA that never matches at any position.
+ ///
+ /// This is a convenience routine for creating an NFA with zero patterns.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
+ ///
+ /// let re = PikeVM::new_from_nfa(NFA::never_match())?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, b"", &mut caps);
+ /// assert!(!caps.is_match());
+ /// re.captures(&mut cache, b"foo", &mut caps);
+ /// assert!(!caps.is_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn never_match() -> NFA {
+ // This always succeeds because it only requires one NFA state, which
+ // will never exhaust any (default) limits.
+ let mut builder = Builder::new();
+ let sid = builder.add_fail().unwrap();
+ builder.build(sid, sid).unwrap()
+ }
+
+ /// Return a default configuration for an `NFA`.
+ ///
+ /// This is a convenience routine to avoid needing to import the `Config`
+ /// type when customizing the construction of an NFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build an NFA with a small size limit that
+ /// results in a compilation error for any regex that tries to use more
+ /// heap memory than the configured limit.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
+ ///
+ /// let result = PikeVM::builder()
+ /// .thompson(NFA::config().nfa_size_limit(Some(1_000)))
+ /// // Remember, \w is Unicode-aware by default and thus huge.
+ /// .build(r"\w+");
+ /// assert!(result.is_err());
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Return a compiler for configuring the construction of an `NFA`.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Compiler`] type in common cases.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build an NFA that is permitted to match invalid
+ /// UTF-8. Without the additional syntax configuration here, compilation of
+ /// `(?-u:.)` would fail because it is permitted to match invalid UTF-8.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// util::syntax,
+ /// Match,
+ /// };
+ ///
+ /// let re = PikeVM::builder()
+ /// .syntax(syntax::Config::new().utf8(false))
+ /// .build(r"[a-z]+(?-u:.)")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let expected = Some(Match::must(0, 1..5));
+ /// re.captures(&mut cache, b"\xFFabc\xFF", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn compiler() -> Compiler {
+ Compiler::new()
+ }
+
+ /// Returns an iterator over all pattern identifiers in this NFA.
+ ///
+ /// Pattern IDs are allocated in sequential order starting from zero,
+ /// where the order corresponds to the order of patterns provided to the
+ /// [`NFA::new_many`] constructor.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+ /// let pids: Vec<PatternID> = nfa.patterns().collect();
+ /// assert_eq!(pids, vec![
+ /// PatternID::must(0),
+ /// PatternID::must(1),
+ /// PatternID::must(2),
+ /// ]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn patterns(&self) -> PatternIter<'_> {
+ PatternIter {
+ it: PatternID::iter(self.pattern_len()),
+ _marker: core::marker::PhantomData,
+ }
+ }
+
+ /// Returns the total number of regex patterns in this NFA.
+ ///
+ /// This may return zero if the NFA was constructed with no patterns. In
+ /// this case, the NFA can never produce a match for any input.
+ ///
+ /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because
+ /// NFA construction will fail if too many patterns are added.
+ ///
+ /// It is always true that `nfa.patterns().count() == nfa.pattern_len()`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+ /// assert_eq!(3, nfa.pattern_len());
+ ///
+ /// let nfa = NFA::never_match();
+ /// assert_eq!(0, nfa.pattern_len());
+ ///
+ /// let nfa = NFA::always_match();
+ /// assert_eq!(1, nfa.pattern_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn pattern_len(&self) -> usize {
+ self.0.start_pattern.len()
+ }
+
+ /// Return the state identifier of the initial anchored state of this NFA.
+ ///
+ /// The returned identifier is guaranteed to be a valid index into the
+ /// slice returned by [`NFA::states`], and is also a valid argument to
+ /// [`NFA::state`].
+ ///
+ /// # Example
+ ///
+ /// This example shows a somewhat contrived example where we can easily
+ /// predict the anchored starting state.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures};
+ ///
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build("a")?;
+ /// let state = nfa.state(nfa.start_anchored());
+ /// match *state {
+ /// State::ByteRange { trans } => {
+ /// assert_eq!(b'a', trans.start);
+ /// assert_eq!(b'a', trans.end);
+ /// }
+ /// _ => unreachable!("unexpected state"),
+ /// }
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn start_anchored(&self) -> StateID {
+ self.0.start_anchored
+ }
+
+ /// Return the state identifier of the initial unanchored state of this
+ /// NFA.
+ ///
+ /// This is equivalent to the identifier returned by
+ /// [`NFA::start_anchored`] when the NFA has no unanchored starting state.
+ ///
+ /// The returned identifier is guaranteed to be a valid index into the
+ /// slice returned by [`NFA::states`], and is also a valid argument to
+ /// [`NFA::state`].
+ ///
+ /// # Example
+ ///
+ /// This example shows that the anchored and unanchored starting states
+ /// are equivalent when an anchored NFA is built.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa = NFA::new("^a")?;
+ /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn start_unanchored(&self) -> StateID {
+ self.0.start_unanchored
+ }
+
+ /// Return the state identifier of the initial anchored state for the given
+ /// pattern, or `None` if there is no pattern corresponding to the given
+ /// identifier.
+ ///
+ /// If one uses the starting state for a particular pattern, then the only
+ /// match that can be returned is for the corresponding pattern.
+ ///
+ /// The returned identifier is guaranteed to be a valid index into the
+ /// slice returned by [`NFA::states`], and is also a valid argument to
+ /// [`NFA::state`].
+ ///
+ /// If the pattern doesn't exist in this NFA, that is, when
+ /// `pid.as_usize() >= nfa.pattern_len()`, then this returns `None`.
+ ///
+ /// # Example
+ ///
+ /// This example shows that each pattern gets its own anchored starting
+ /// state, distinct from the NFA's overall starting states.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let nfa = NFA::new_many(&["^a", "^b"])?;
+ /// // The anchored and unanchored states for the entire NFA are the same,
+ /// // since all of the patterns are anchored.
+ /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
+ /// // But the anchored starting states for each pattern are distinct,
+ /// // because these starting states can only lead to matches for the
+ /// // corresponding pattern.
+ /// let anchored = Some(nfa.start_anchored());
+ /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(0)));
+ /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(1)));
+ /// // Requesting a pattern not in the NFA will result in None:
+ /// assert_eq!(None, nfa.start_pattern(PatternID::must(2)));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn start_pattern(&self, pid: PatternID) -> Option<StateID> {
+ self.0.start_pattern.get(pid.as_usize()).copied()
+ }
+
+ /// Get the byte class set for this NFA.
+ ///
+ /// A byte class set is a partitioning of this NFA's alphabet into
+ /// equivalence classes. Any two bytes in the same equivalence class are
+ /// guaranteed to never discriminate between a match or a non-match. (The
+ /// partitioning may not be minimal.)
+ ///
+ /// Byte classes are used internally by this crate when building DFAs.
+ /// Namely, among other optimizations, they enable a space optimization
+ /// where the DFA's internal alphabet is defined over the equivalence
+ /// classes of bytes instead of all possible byte values. The former is
+ /// often quite a bit smaller than the latter, which permits the DFA to use
+ /// less space for its transition table.
+ #[inline]
+ pub(crate) fn byte_class_set(&self) -> &ByteClassSet {
+ &self.0.byte_class_set
+ }
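The space savings from byte classes come from collapsing the DFA's alphabet. A toy illustration for a regex like `[a-z]+` (the `toy_classes` helper is hypothetical, not the crate's `ByteClasses`): all lowercase letters share one class and every other byte shares another, so a transition table needs 2 columns instead of 256.

```rust
/// Assign every byte to one of two equivalence classes for `[a-z]+`.
fn toy_classes() -> [u8; 256] {
    let mut classes = [0u8; 256];
    for b in 0..=255u8 {
        if b.is_ascii_lowercase() {
            classes[b as usize] = 1;
        }
    }
    classes
}

fn main() {
    let classes = toy_classes();
    // 'a' and 'z' can never discriminate a match for `[a-z]+`...
    assert_eq!(classes[b'a' as usize], classes[b'z' as usize]);
    // ...but 'a' and 'A' can.
    assert_ne!(classes[b'a' as usize], classes[b'A' as usize]);
}
```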
+
+ /// Get the byte classes for this NFA.
+ ///
+ /// Byte classes represent a partitioning of this NFA's alphabet into
+ /// equivalence classes. Any two bytes in the same equivalence class are
+ /// guaranteed to never discriminate between a match or a non-match. (The
+ /// partitioning may not be minimal.)
+ ///
+ /// Byte classes are used internally by this crate when building DFAs.
+ /// Namely, among other optimizations, they enable a space optimization
+ /// where the DFA's internal alphabet is defined over the equivalence
+ /// classes of bytes instead of all possible byte values. The former is
+ /// often quite a bit smaller than the latter, which permits the DFA to use
+ /// less space for its transition table.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to query the class of various bytes.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa = NFA::new("[a-z]+")?;
+ /// let classes = nfa.byte_classes();
+ /// // 'a' and 'z' are in the same class for this regex.
+ /// assert_eq!(classes.get(b'a'), classes.get(b'z'));
+ /// // But 'a' and 'A' are not.
+ /// assert_ne!(classes.get(b'a'), classes.get(b'A'));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn byte_classes(&self) -> &ByteClasses {
+ &self.0.byte_classes
+ }
+
+ /// Return a reference to the NFA state corresponding to the given ID.
+ ///
+ /// This is a convenience routine for `nfa.states()[id]`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when the given identifier does not reference a valid state.
+ /// That is, when `id.as_usize() >= nfa.states().len()`.
+ ///
+ /// # Example
+ ///
+ /// The anchored state for a pattern will typically correspond to a
+ /// capturing state for that pattern. (Although, this is not an API
+ /// guarantee!)
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, State}, PatternID};
+ ///
+ /// let nfa = NFA::new("a")?;
+ /// let state = nfa.state(nfa.start_pattern(PatternID::ZERO).unwrap());
+ /// match *state {
+ /// State::Capture { slot, .. } => {
+ /// assert_eq!(0, slot.as_usize());
+ /// }
+ /// _ => unreachable!("unexpected state"),
+ /// }
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn state(&self, id: StateID) -> &State {
+ &self.states()[id]
+ }
+
+ /// Returns a slice of all states in this NFA.
+ ///
+ /// The slice returned is indexed by `StateID`. This provides a convenient
+ /// way to access states while following transitions among those states.
+ ///
+ /// # Example
+ ///
+ /// This demonstrates that disabling UTF-8 mode can shrink the size of the
+ /// NFA considerably in some cases, especially when using Unicode character
+ /// classes.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa_unicode = NFA::new(r"\w")?;
+ /// let nfa_ascii = NFA::new(r"(?-u)\w")?;
+ /// // Yes, a factor of 45 difference. No lie.
+ /// assert!(40 * nfa_ascii.states().len() < nfa_unicode.states().len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn states(&self) -> &[State] {
+ &self.0.states
+ }
+
+ /// Returns the capturing group info for this NFA.
+ ///
+ /// The [`GroupInfo`] provides a way to map to and from capture index
+ /// and capture name for each pattern. It also provides a mapping from
+ /// each of the capturing groups in every pattern to their corresponding
+ /// slot offsets encoded in [`State::Capture`] states.
+ ///
+ /// Note that `GroupInfo` uses reference counting internally, such that
+ /// cloning a `GroupInfo` is very cheap.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to get a list of all capture group names for
+ /// a particular pattern.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?;
+ /// // The first is the implicit group that is always unnamed. The next
+ /// // 5 groups are the explicit groups found in the concrete syntax above.
+ /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")];
+ /// let got: Vec<Option<&str>> =
+ /// nfa.group_info().pattern_names(PatternID::ZERO).collect();
+ /// assert_eq!(expected, got);
+ ///
+ /// // Using an invalid pattern ID will result in nothing yielded.
+ /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count();
+ /// assert_eq!(0, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn group_info(&self) -> &GroupInfo {
+ &self.0.group_info()
+ }
+
+ /// Returns true if and only if this NFA has at least one
+ /// [`Capture`](State::Capture) in its sequence of states.
+ ///
+ /// This is useful as a way to perform a quick test before attempting
+ /// something that does or does not require capture states. For example,
+ /// some regex engines (like the PikeVM) require capture states in order to
+ /// work at all.
+ ///
+ /// # Example
+ ///
+ /// This example shows a few different NFAs and whether they have captures
+ /// or not.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
+ ///
+ /// // Obviously has capture states.
+ /// let nfa = NFA::new("(a)")?;
+ /// assert!(nfa.has_capture());
+ ///
+ /// // Less obviously has capture states, because every pattern has at
+ /// // least one anonymous capture group corresponding to the match for the
+ /// // entire pattern.
+ /// let nfa = NFA::new("a")?;
+ /// assert!(nfa.has_capture());
+ ///
+ /// // Other than hand building your own NFA, this is the only way to build
+ /// // an NFA without capturing groups. In general, you should only do this
+ /// // if you don't intend to use any of the NFA-oriented regex engines.
+ /// // Overall, capturing groups don't have many downsides, although they
+ /// // can add a bit of noise to simple NFAs, so it can be nice to disable
+ /// // them for debugging purposes.
+ /// //
+ /// // Notice that 'has_capture' is false here even when we have an
+ /// // explicit capture group in the pattern.
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build("(a)")?;
+ /// assert!(!nfa.has_capture());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn has_capture(&self) -> bool {
+ self.0.has_capture
+ }
+
+ /// Returns true if and only if this NFA can match the empty string.
+ /// When it returns false, all possible matches are guaranteed to have a
+ /// non-zero length.
+ ///
+ /// This is useful as a cheap way to know whether code needs to handle the
+ /// case of a zero length match. This is particularly important when UTF-8
+ /// mode is enabled, since empty matches that split a codepoint must never
+ /// be reported. This extra handling can
+ /// sometimes be costly, and since regexes matching an empty string are
+ /// somewhat rare, it can be beneficial to treat such regexes specially.
+ ///
+ /// # Example
+ ///
+ /// This example shows a few different NFAs and whether they match the
+ /// empty string or not. Notice that matching the empty string isn't
+ /// merely a matter of matching a string whose length is literally `0`,
+ /// but rather, whether a match can occur between specific pairs of bytes.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::syntax};
+ ///
+ /// // The empty regex matches the empty string.
+ /// let nfa = NFA::new("")?;
+ /// assert!(nfa.has_empty(), "empty matches empty");
+ /// // The '+' repetition operator requires at least one match, and so
+ /// // does not match the empty string.
+ /// let nfa = NFA::new("a+")?;
+ /// assert!(!nfa.has_empty(), "+ does not match empty");
+ /// // But the '*' repetition operator does.
+ /// let nfa = NFA::new("a*")?;
+ /// assert!(nfa.has_empty(), "* does match empty");
+ /// // And wrapping '+' in an operator that can match an empty string also
+ /// // causes it to match the empty string too.
+ /// let nfa = NFA::new("(a+)*")?;
+ /// assert!(nfa.has_empty(), "+ inside of * matches empty");
+ ///
+ /// // If a regex is just made of a look-around assertion, even if the
+ /// // assertion requires some kind of non-empty string around it (such as
+ /// // \b), then it is still treated as if it matches the empty string.
+ /// // Namely, if a match occurs of just a look-around assertion, then the
+ /// // match returned is empty.
+ /// let nfa = NFA::compiler()
+ /// .syntax(syntax::Config::new().utf8(false))
+ /// .build(r"^$\A\z\b\B(?-u:\b\B)")?;
+ /// assert!(nfa.has_empty(), "assertions match empty");
+ /// // Even when an assertion is wrapped in a '+', it still matches the
+ /// // empty string.
+ /// let nfa = NFA::new(r"\b+")?;
+ /// assert!(nfa.has_empty(), "+ of an assertion matches empty");
+ ///
+ /// // An alternation with even one branch that can match the empty string
+ /// // is also said to match the empty string overall.
+ /// let nfa = NFA::new("foo|(bar)?|quux")?;
+ /// assert!(nfa.has_empty(), "alternations can match empty");
+ ///
+ /// // An NFA that matches nothing does not match the empty string.
+ /// let nfa = NFA::new("[a&&b]")?;
+ /// assert!(!nfa.has_empty(), "never matching means not matching empty");
+ /// // But if it's wrapped in something that doesn't require a match at
+ /// // all, then it can match the empty string!
+ /// let nfa = NFA::new("[a&&b]*")?;
+ /// assert!(nfa.has_empty(), "* on never-match still matches empty");
+ /// // Since a '+' requires a match, using it on something that can never
+ /// // match will itself produce a regex that can never match anything,
+ /// // and thus does not match the empty string.
+ /// let nfa = NFA::new("[a&&b]+")?;
+ /// assert!(!nfa.has_empty(), "+ on never-match still matches nothing");
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn has_empty(&self) -> bool {
+ self.0.has_empty
+ }
+
+ /// Whether UTF-8 mode is enabled for this NFA or not.
+ ///
+ /// When UTF-8 mode is enabled, all matches reported by a regex engine
+ /// derived from this NFA are guaranteed to correspond to spans of valid
+ /// UTF-8. This includes zero-width matches. For example, the regex engine
+ /// must guarantee that the empty regex will not match at the positions
+ /// between code units in the UTF-8 encoding of a single codepoint.
+ ///
+ /// See [`Config::utf8`] for more information.
+ ///
+ /// This is enabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows how UTF-8 mode can impact the match spans that may
+ /// be reported in certain cases.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{self, pikevm::PikeVM},
+ /// Match, Input,
+ /// };
+ ///
+ /// let re = PikeVM::new("")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// // UTF-8 mode is enabled by default.
+ /// let mut input = Input::new("☃");
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
+ ///
+ /// // Even though an empty regex matches at 1..1, our next match is
+ /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
+ /// // three bytes long).
+ /// input.set_start(1);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
+ ///
+ /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
+ /// let re = PikeVM::builder()
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build("")?;
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
+ ///
+ /// input.set_start(2);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
+ ///
+ /// input.set_start(3);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
+ ///
+ /// input.set_start(4);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_utf8(&self) -> bool {
+ self.0.utf8
+ }
+
+ /// Returns true when this NFA is meant to be matched in reverse.
+ ///
+ /// Generally speaking, when this is true, it means the NFA is supposed to
+ /// be used in conjunction with moving backwards through the haystack. That
+ /// is, from a higher memory address to a lower memory address.
+ ///
+ /// It is often the case that lower level routines dealing with an NFA
+ /// don't need to care about whether it is "meant" to be matched in reverse
+ /// or not. However, there are some specific cases where it matters. For
+ /// example, the implementation of CRLF-aware `^` and `$` line anchors
+ /// needs to know whether the search is in the forward or reverse
+ /// direction. In the forward direction, neither `^` nor `$` should match
+ /// when a `\r` has been seen previously and a `\n` is next. However, in
+ /// the reverse direction, neither `^` nor `$` should match when a `\n`
+ /// has been seen previously and a `\r` is next. This fundamentally changes
+ /// how the state machine is constructed, and thus needs to be altered
+ /// based on the direction of the search.
+ ///
+ /// This is automatically set when using a [`Compiler`] with a configuration
+ /// where [`Config::reverse`] is enabled. If you're building your own NFA
+ /// by hand via a [`Builder`], then it is set via [`Builder::set_reverse`].
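+ ///
+ /// # Example
+ ///
+ /// A brief sketch of how reverse mode is reflected by this method:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa = NFA::new("a")?;
+ /// assert!(!nfa.is_reverse());
+ ///
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().reverse(true))
+ /// .build("a")?;
+ /// assert!(nfa.is_reverse());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```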
+ #[inline]
+ pub fn is_reverse(&self) -> bool {
+ self.0.reverse
+ }
+
+ /// Returns true if and only if all starting states for this NFA correspond
+ /// to the beginning of an anchored search.
+ ///
+ /// Typically, an NFA will have both an anchored and an unanchored starting
+ /// state. Namely, it tends to be useful to have both, and the cost of
+ /// having an unanchored starting state is almost zero (for an NFA).
+ /// However, if all patterns in the NFA are themselves anchored, then even
+ /// the unanchored starting state will correspond to an anchored search
+ /// since the pattern doesn't permit anything else.
+ ///
+ /// # Example
+ ///
+ /// This example shows a few different scenarios where this method's
+ /// return value varies.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// // The unanchored starting state permits matching this pattern anywhere
+ /// // in a haystack, instead of just at the beginning.
+ /// let nfa = NFA::new("a")?;
+ /// assert!(!nfa.is_always_start_anchored());
+ ///
+ /// // In this case, the pattern is itself anchored, so there is no way
+ /// // to run an unanchored search.
+ /// let nfa = NFA::new("^a")?;
+ /// assert!(nfa.is_always_start_anchored());
+ ///
+ /// // When multiline mode is enabled, '^' can match at the start of a line
+ /// // in addition to the start of a haystack, so an unanchored search is
+ /// // actually possible.
+ /// let nfa = NFA::new("(?m)^a")?;
+ /// assert!(!nfa.is_always_start_anchored());
+ ///
+ /// // Weird cases also work. A pattern is only considered anchored if all
+ /// // matches may only occur at the start of a haystack.
+ /// let nfa = NFA::new("(^a)|a")?;
+ /// assert!(!nfa.is_always_start_anchored());
+ ///
+ /// // When multiple patterns are present, if they are all anchored, then
+ /// // the NFA is always anchored too.
+ /// let nfa = NFA::new_many(&["^a", "^b", "^c"])?;
+ /// assert!(nfa.is_always_start_anchored());
+ ///
+ /// // But if one pattern is unanchored, then the NFA must permit an
+ /// // unanchored search.
+ /// let nfa = NFA::new_many(&["^a", "b", "^c"])?;
+ /// assert!(!nfa.is_always_start_anchored());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_always_start_anchored(&self) -> bool {
+ self.start_anchored() == self.start_unanchored()
+ }
+
+ /// Returns the look-around matcher associated with this NFA.
+ ///
+ /// A look-around matcher determines how to match look-around assertions.
+ /// In particular, some assertions are configurable. For example, the
+ /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
+ /// from the default of `\n` to any other byte.
+ ///
+ /// If the NFA was built using a [`Compiler`], then this matcher
+ /// can be set via the [`Config::look_matcher`] configuration
+ /// knob. Otherwise, if you've built an NFA by hand, it is set via
+ /// [`Builder::set_look_matcher`].
+ ///
+ /// # Example
+ ///
+ /// This shows how to change the line terminator for multi-line assertions.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{self, pikevm::PikeVM},
+ /// util::look::LookMatcher,
+ /// Match, Input,
+ /// };
+ ///
+ /// let mut lookm = LookMatcher::new();
+ /// lookm.set_line_terminator(b'\x00');
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(thompson::Config::new().look_matcher(lookm))
+ /// .build(r"(?m)^[a-z]+$")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// // Multi-line assertions now use NUL as a terminator.
+ /// assert_eq!(
+ /// Some(Match::must(0, 1..4)),
+ /// re.find(&mut cache, b"\x00abc\x00"),
+ /// );
+ /// // ... and \n is no longer recognized as a terminator.
+ /// assert_eq!(
+ /// None,
+ /// re.find(&mut cache, b"\nabc\n"),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn look_matcher(&self) -> &LookMatcher {
+ &self.0.look_matcher
+ }
+
+ /// Returns the union of all look-around assertions used throughout this
+ /// NFA. When the returned set is empty, it implies that the NFA has no
+ /// look-around assertions and thus zero conditional epsilon transitions.
+ ///
+ /// This is useful in some cases for enabling optimizations. It is not
+ /// unusual, for example, for optimizations to be of the form, "for any
+ /// regex with zero conditional epsilon transitions, do ..." where "..."
+ /// is some kind of optimization.
+ ///
+ /// This isn't only helpful for optimizations either. Sometimes look-around
+ /// assertions are difficult to support. For example, many of the DFAs in
+ /// this crate either don't support Unicode word boundaries or only handle
+ /// them using heuristics. Handling that correctly typically requires some
+ /// kind of cheap check of whether the NFA has a Unicode word boundary in
+ /// the first place.
+ ///
+ /// # Example
+ ///
+ /// This example shows how this routine varies based on the regex pattern:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::look::Look};
+ ///
+ /// // No look-around at all.
+ /// let nfa = NFA::new("a")?;
+ /// assert!(nfa.look_set_any().is_empty());
+ ///
+ /// // When multiple patterns are present, since this returns the union,
+ /// // it will include look-around assertions that only appear in one
+ /// // pattern.
+ /// let nfa = NFA::new_many(&["a", "b", "a^b", "c"])?;
+ /// assert!(nfa.look_set_any().contains(Look::Start));
+ ///
+ /// // Some groups of assertions have various shortcuts. For example:
+ /// let nfa = NFA::new(r"(?-u:\b)")?;
+ /// assert!(nfa.look_set_any().contains_word());
+ /// assert!(!nfa.look_set_any().contains_word_unicode());
+ /// assert!(nfa.look_set_any().contains_word_ascii());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn look_set_any(&self) -> LookSet {
+ self.0.look_set_any
+ }
+
+ /// Returns the union of all prefix look-around assertions for every
+ /// pattern in this NFA. When the returned set is empty, it implies none of
+ /// the patterns require moving through a conditional epsilon transition
+ /// before inspecting the first byte in the haystack.
+ ///
+ /// This can be useful for determining what kinds of assertions need to be
+ /// satisfied at the beginning of a search. For example, typically DFAs
+ /// in this crate will build a distinct starting state for each possible
+ /// starting configuration that might result in look-around assertions
+ /// being satisfied differently. However, if the set returned here is
+ /// empty, then you know that the start state is invariant because there
+ /// are no conditional epsilon transitions to consider.
+ ///
+ /// # Example
+ ///
+ /// This example shows how this routine varies based on the regex pattern:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::look::Look};
+ ///
+ /// // No look-around at all.
+ /// let nfa = NFA::new("a")?;
+ /// assert!(nfa.look_set_prefix_any().is_empty());
+ ///
+ /// // When multiple patterns are present, since this returns the union,
+ /// // it will include look-around assertions that only appear in one
+ /// // pattern. But it will only include assertions that are in the prefix
+ /// // of a pattern. For example, this includes '^' but not '$' even though
+ /// // '$' does appear.
+ /// let nfa = NFA::new_many(&["a", "b", "^ab$", "c"])?;
+ /// assert!(nfa.look_set_prefix_any().contains(Look::Start));
+ /// assert!(!nfa.look_set_prefix_any().contains(Look::End));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn look_set_prefix_any(&self) -> LookSet {
+ self.0.look_set_prefix_any
+ }
+
+ // FIXME: The `look_set_prefix_all` computation was not correct, and it
+ // seemed a little tricky to fix it. Since I wasn't actually using it for
+ // anything, I just decided to remove it in the run up to the regex 1.9
+ // release. If you need this, please file an issue.
+ /*
+ /// Returns the intersection of all prefix look-around assertions for every
+ /// pattern in this NFA. When the returned set is empty, it implies at
+ /// least one of the patterns does not require moving through a conditional
+ /// epsilon transition before inspecting the first byte in the haystack.
+ /// Conversely, when the set contains an assertion, it implies that every
+ /// pattern in the NFA also contains that assertion in its prefix.
+ ///
+ /// This can be useful for determining what kinds of assertions need to be
+ /// satisfied at the beginning of a search. For example, if you know that
+ /// [`Look::Start`] is in the prefix intersection set returned here, then
+ /// you know that all searches, regardless of input configuration, will be
+ /// anchored.
+ ///
+ /// # Example
+ ///
+ /// This example shows how this routine varies based on the regex pattern:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::look::Look};
+ ///
+ /// // No look-around at all.
+ /// let nfa = NFA::new("a")?;
+ /// assert!(nfa.look_set_prefix_all().is_empty());
+ ///
+ /// // When multiple patterns are present, since this returns the
+ /// // intersection, it will only include assertions present in every
+ /// // prefix, and only the prefix.
+ /// let nfa = NFA::new_many(&["^a$", "^b$", "$^ab$", "^c$"])?;
+ /// assert!(nfa.look_set_prefix_all().contains(Look::Start));
+ /// assert!(!nfa.look_set_prefix_all().contains(Look::End));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn look_set_prefix_all(&self) -> LookSet {
+ self.0.look_set_prefix_all
+ }
+ */
+
+ /// Returns the memory usage, in bytes, of this NFA.
+ ///
+ /// This does **not** include the stack size used up by this NFA. To
+ /// compute that, use `std::mem::size_of::<NFA>()`.
+ ///
+ /// # Example
+ ///
+ /// This example shows that large Unicode character classes can use quite
+ /// a bit of memory.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa_unicode = NFA::new(r"\w")?;
+ /// let nfa_ascii = NFA::new(r"(?-u:\w)")?;
+ ///
+ /// assert!(10 * nfa_ascii.memory_usage() < nfa_unicode.memory_usage());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn memory_usage(&self) -> usize {
+ use core::mem::size_of;
+
+ size_of::<Inner>() // allocated on the heap via Arc
+ + self.0.states.len() * size_of::<State>()
+ + self.0.start_pattern.len() * size_of::<StateID>()
+ + self.0.group_info.memory_usage()
+ + self.0.memory_extra
+ }
+}
+
+impl fmt::Debug for NFA {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ self.0.fmt(f)
+ }
+}
+
+/// The "inner" part of the NFA. We split this part out so that we can easily
+/// wrap it in an `Arc` above in the definition of `NFA`.
+///
+/// See builder.rs for the code that actually builds this type. This module
+/// does provide (internal) mutable methods for adding things to this
+/// NFA before finalizing it, but the high level construction process is
+/// controlled by the builder abstraction. (Which is complicated enough to
+/// get its own module.)
+#[derive(Default)]
+pub(super) struct Inner {
+ /// The state sequence. This sequence is guaranteed to be indexable by all
+ /// starting state IDs, and it is also guaranteed to contain at most one
+ /// `Match` state for each pattern compiled into this NFA. (A pattern may
+ /// not have a corresponding `Match` state if a `Match` state is impossible
+ /// to reach.)
+ states: Vec<State>,
+ /// The anchored starting state of this NFA.
+ start_anchored: StateID,
+ /// The unanchored starting state of this NFA.
+ start_unanchored: StateID,
+ /// The starting states for each individual pattern. Starting at any
+ /// of these states will result in only an anchored search for the
+ /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
+ /// contains a single regex, then `start_pattern[0]` and `start_anchored`
+ /// are always equivalent.
+ start_pattern: Vec<StateID>,
+ /// Info about the capturing groups in this NFA. This is responsible for
+ /// mapping groups to slots, mapping groups to names and names to groups.
+ group_info: GroupInfo,
+ /// A representation of equivalence classes over the transitions in this
+ /// NFA. Two bytes in the same equivalence class must not discriminate
+ /// between a match or a non-match. This map can be used to shrink the
+ /// total size of a DFA's transition table with a small match-time cost.
+ ///
+ /// Note that the NFA's transitions are *not* defined in terms of these
+ /// equivalence classes. The NFA's transitions are defined on the original
+ /// byte values. For the most part, this is because they wouldn't really
+ /// help the NFA much since the NFA already uses a sparse representation
+ /// to represent transitions. Byte classes are most effective in a dense
+ /// representation.
+ byte_class_set: ByteClassSet,
+ /// This is generated from `byte_class_set`, and essentially represents the
+ /// same thing but supports different access patterns. Namely, this permits
+ /// looking up the equivalence class of a byte very cheaply.
+ ///
+ /// Ideally we would just store this, but because of annoying code
+ /// structure reasons, we keep both this and `byte_class_set` around for
+ /// now. I think I would prefer that `byte_class_set` were computed in the
+ /// `Builder`, but right now, we compute it as states are added to the
+ /// `NFA`.
+ byte_classes: ByteClasses,
+ /// Whether this NFA has a `Capture` state anywhere.
+ has_capture: bool,
+ /// Whether the empty string is in the language matched by this NFA.
+ has_empty: bool,
+ /// Whether UTF-8 mode is enabled for this NFA. Briefly, this means that
+ /// all non-empty matches produced by this NFA correspond to spans of valid
+ /// UTF-8, and any empty matches produced by this NFA that split a UTF-8
+ /// encoded codepoint should be filtered out by the corresponding regex
+ /// engine.
+ utf8: bool,
+ /// Whether this NFA is meant to be matched in reverse or not.
+ reverse: bool,
+ /// The matcher to be used for look-around assertions.
+ look_matcher: LookMatcher,
+ /// The union of all look-around assertions that occur anywhere within
+ /// this NFA. If this set is empty, then it means there are precisely zero
+ /// conditional epsilon transitions in the NFA.
+ look_set_any: LookSet,
+ /// The union of all look-around assertions that occur as a zero-length
+ /// prefix for any of the patterns in this NFA.
+ look_set_prefix_any: LookSet,
+ /*
+ /// The intersection of all look-around assertions that occur as a
+ /// zero-length prefix for any of the patterns in this NFA.
+ look_set_prefix_all: LookSet,
+ */
+ /// Heap memory used indirectly by NFA states and other things (like the
+ /// various capturing group representations above). Since each state
+ /// might use a different amount of heap, we need to keep track of this
+ /// incrementally.
+ memory_extra: usize,
+}
+
+impl Inner {
+ /// Runs any last finalization bits and turns this into a full NFA.
+ pub(super) fn into_nfa(mut self) -> NFA {
+ self.byte_classes = self.byte_class_set.byte_classes();
+ // Do epsilon closure from the start state of every pattern in order
+ // to compute various properties such as look-around assertions and
+ // whether the empty string can be matched.
+ let mut stack = vec![];
+ let mut seen = SparseSet::new(self.states.len());
+ for &start_id in self.start_pattern.iter() {
+ stack.push(start_id);
+ seen.clear();
+ // let mut prefix_all = LookSet::full();
+ let mut prefix_any = LookSet::empty();
+ while let Some(sid) = stack.pop() {
+ if !seen.insert(sid) {
+ continue;
+ }
+ match self.states[sid] {
+ State::ByteRange { .. }
+ | State::Dense { .. }
+ | State::Fail => continue,
+ State::Sparse(_) => {
+ // This snippet below will rewrite this sparse state
+ // as a dense state. By doing it here, we apply this
+ // optimization to all hot "sparse" states since these
+ // are the states that are reachable from the start
+ // state via an epsilon closure.
+ //
+ // Unfortunately, this optimization did not seem to
+ // help much in some very limited ad hoc benchmarking.
+ //
+ // I left the 'Dense' state type in place in case we
+ // want to revisit this, but I suspect the real way
+ // to make forward progress is a more fundamental
+ // rearchitecting of how data in the NFA is laid out.
+ // I think we should consider a single contiguous
+ // allocation instead of all this indirection and
+ // potential heap allocations for every state. But this
+ // is a large re-design and will require API breaking
+ // changes.
+ // self.memory_extra -= self.states[sid].memory_usage();
+ // let trans = DenseTransitions::from_sparse(sparse);
+ // self.states[sid] = State::Dense(trans);
+ // self.memory_extra += self.states[sid].memory_usage();
+ continue;
+ }
+ State::Match { .. } => self.has_empty = true,
+ State::Look { look, next } => {
+ prefix_any = prefix_any.insert(look);
+ stack.push(next);
+ }
+ State::Union { ref alternates } => {
+ // Order doesn't matter here, since we're just dealing
+ // with look-around sets. But if we do richer analysis
+ // here that needs to care about preference order, then
+ // this should be done in reverse.
+ stack.extend(alternates.iter());
+ }
+ State::BinaryUnion { alt1, alt2 } => {
+ stack.push(alt2);
+ stack.push(alt1);
+ }
+ State::Capture { next, .. } => {
+ stack.push(next);
+ }
+ }
+ }
+ self.look_set_prefix_any =
+ self.look_set_prefix_any.union(prefix_any);
+ }
+ NFA(Arc::new(self))
+ }
+
+ /// Returns the capturing group info for this NFA.
+ pub(super) fn group_info(&self) -> &GroupInfo {
+ &self.group_info
+ }
+
+ /// Add the given state to this NFA after allocating a fresh identifier for
+ /// it.
+ ///
+ /// This panics if too many states are added such that a fresh identifier
+ /// could not be created. (Currently, the only caller of this routine is
+ /// a `Builder`, and it upholds this invariant.)
+ pub(super) fn add(&mut self, state: State) -> StateID {
+ match state {
+ State::ByteRange { ref trans } => {
+ self.byte_class_set.set_range(trans.start, trans.end);
+ }
+ State::Sparse(ref sparse) => {
+ for trans in sparse.transitions.iter() {
+ self.byte_class_set.set_range(trans.start, trans.end);
+ }
+ }
+ State::Dense { .. } => unreachable!(),
+ State::Look { look, .. } => {
+ self.look_matcher
+ .add_to_byteset(look, &mut self.byte_class_set);
+ self.look_set_any = self.look_set_any.insert(look);
+ }
+ State::Capture { .. } => {
+ self.has_capture = true;
+ }
+ State::Union { .. }
+ | State::BinaryUnion { .. }
+ | State::Fail
+ | State::Match { .. } => {}
+ }
+
+ let id = StateID::new(self.states.len()).unwrap();
+ self.memory_extra += state.memory_usage();
+ self.states.push(state);
+ id
+ }
+
+ /// Set the starting state identifiers for this NFA.
+ ///
+ /// `start_anchored` and `start_unanchored` may be equivalent. When they
+ /// are, then the NFA can only execute anchored searches. This might
+ /// occur, for example, for patterns that are unconditionally anchored,
+ /// e.g., `^foo`.
+ pub(super) fn set_starts(
+ &mut self,
+ start_anchored: StateID,
+ start_unanchored: StateID,
+ start_pattern: &[StateID],
+ ) {
+ self.start_anchored = start_anchored;
+ self.start_unanchored = start_unanchored;
+ self.start_pattern = start_pattern.to_vec();
+ }
+
+ /// Sets the UTF-8 mode of this NFA.
+ pub(super) fn set_utf8(&mut self, yes: bool) {
+ self.utf8 = yes;
+ }
+
+ /// Sets the reverse mode of this NFA.
+ pub(super) fn set_reverse(&mut self, yes: bool) {
+ self.reverse = yes;
+ }
+
+ /// Sets the look-around assertion matcher for this NFA.
+ pub(super) fn set_look_matcher(&mut self, m: LookMatcher) {
+ self.look_matcher = m;
+ }
+
+ /// Set the capturing groups for this NFA.
+ ///
+ /// The given slice should contain the capturing groups for each pattern.
+ /// The capturing groups in turn should correspond to the total number of
+ /// capturing groups in the pattern, including the anonymous first capture
+ /// group for each pattern. If a capturing group does have a name, then it
+ /// should be provided as an `Arc<str>`.
+ ///
+ /// This returns an error if a corresponding `GroupInfo` could not be
+ /// built.
+ pub(super) fn set_captures(
+ &mut self,
+ captures: &[Vec<Option<Arc<str>>>],
+ ) -> Result<(), GroupInfoError> {
+ self.group_info = GroupInfo::new(
+ captures.iter().map(|x| x.iter().map(|y| y.as_ref())),
+ )?;
+ Ok(())
+ }
+
+ /// Remap the transitions in every state of this NFA using the given map.
+ /// The given map should be indexed according to state ID namespace used by
+ /// the transitions of the states currently in this NFA.
+ ///
+ /// This is particularly useful to the NFA builder, since it is convenient
+ /// to add NFA states in order to produce their final IDs. Then, after all
+ /// of the intermediate "empty" states (unconditional epsilon transitions)
+ /// have been removed from the builder's representation, we can re-map all
+ /// of the transitions in the states already added to their final IDs.
+ pub(super) fn remap(&mut self, old_to_new: &[StateID]) {
+ for state in &mut self.states {
+ state.remap(old_to_new);
+ }
+ self.start_anchored = old_to_new[self.start_anchored];
+ self.start_unanchored = old_to_new[self.start_unanchored];
+ for id in self.start_pattern.iter_mut() {
+ *id = old_to_new[*id];
+ }
+ }
+}
+
+impl fmt::Debug for Inner {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ writeln!(f, "thompson::NFA(")?;
+ for (sid, state) in self.states.iter().with_state_ids() {
+ let status = if sid == self.start_anchored {
+ '^'
+ } else if sid == self.start_unanchored {
+ '>'
+ } else {
+ ' '
+ };
+ writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?;
+ }
+ let pattern_len = self.start_pattern.len();
+ if pattern_len > 1 {
+ writeln!(f, "")?;
+ for pid in 0..pattern_len {
+ let sid = self.start_pattern[pid];
+ writeln!(f, "START({:06?}): {:?}", pid, sid.as_usize())?;
+ }
+ }
+ writeln!(f, "")?;
+ writeln!(
+ f,
+ "transition equivalence classes: {:?}",
+ self.byte_classes,
+ )?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// A state in an NFA.
+///
+/// In theory, it can help to conceptualize an `NFA` as a graph consisting of
+/// `State`s. Each `State` contains its complete set of outgoing transitions.
+///
+/// In practice, it can help to conceptualize an `NFA` as a sequence of
+/// instructions for a virtual machine. Each `State` says what to do and where
+/// to go next.
+///
+/// Strictly speaking, the practical interpretation is the most correct one,
+/// because of the [`Capture`](State::Capture) state. Namely, a `Capture`
+/// state always forwards execution to another state unconditionally. Its only
+/// purpose is to cause a side effect: the recording of the current input
+/// position at a particular location in memory. In this sense, an `NFA`
+/// has more power than a theoretical non-deterministic finite automaton.
+///
+/// For most uses of this crate, it is likely that one may never even need to
+/// be aware of this type at all. The main use cases for looking at `State`s
+/// directly are if you need to write your own search implementation or if you
+/// need to do some kind of analysis on the NFA.
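+///
+/// # Example
+///
+/// A small sketch of the analysis use case: counting the capture states in
+/// an NFA. (The exact set of states produced for a pattern is an
+/// implementation detail, so only a weak property is asserted here.)
+///
+/// ```
+/// use regex_automata::nfa::thompson::{NFA, State};
+///
+/// let nfa = NFA::new("(a)|b")?;
+/// let captures = nfa
+/// .states()
+/// .iter()
+/// .filter(|s| matches!(s, State::Capture { .. }))
+/// .count();
+/// // Both the implicit group 0 and the explicit group contribute capture
+/// // states, so there is at least one.
+/// assert!(captures >= 1);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```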
+#[derive(Clone, Eq, PartialEq)]
+pub enum State {
+ /// A state with a single transition that can only be taken if the current
+ /// input symbol is in a particular range of bytes.
+ ByteRange {
+ /// The transition from this state to the next.
+ trans: Transition,
+ },
+ /// A state with possibly many transitions represented in a sparse fashion.
+ /// Transitions are non-overlapping and ordered lexicographically by input
+ /// range.
+ ///
+ /// In practice, this is used for encoding UTF-8 automata. Its presence is
+ /// primarily an optimization that avoids many additional unconditional
+ /// epsilon transitions (via [`Union`](State::Union) states), and thus
+ /// decreases the overhead of traversing the NFA. This can improve both
+ /// matching time and DFA construction time.
+ Sparse(SparseTransitions),
+ /// A dense representation of a state with multiple transitions.
+ Dense(DenseTransitions),
+ /// A conditional epsilon transition satisfied via some sort of
+ /// look-around. Look-around is limited to anchor and word boundary
+ /// assertions.
+ ///
+ /// Look-around states are meant to be evaluated while performing epsilon
+ /// closure (computing the set of states reachable from a particular state
+ /// via only epsilon transitions). If the current position in the haystack
+ /// satisfies the look-around assertion, then you're permitted to follow
+ /// that epsilon transition.
+ Look {
+ /// The look-around assertion that must be satisfied before moving
+ /// to `next`.
+ look: Look,
+ /// The state to transition to if the look-around assertion is
+ /// satisfied.
+ next: StateID,
+ },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via earlier transitions
+ /// are preferred over later transitions.
+ Union {
+ /// An ordered sequence of unconditional epsilon transitions to other
+ /// states. Transitions earlier in the sequence are preferred over
+ /// transitions later in the sequence.
+ alternates: Box<[StateID]>,
+ },
+ /// An alternation such that there exists precisely two unconditional
+ /// epsilon transitions, where matches found via `alt1` are preferred over
+ /// matches found via `alt2`.
+ ///
+ /// This state exists as a common special case of Union where there are
+ /// only two alternates. In this case, we don't need any allocations to
+ /// represent the state. This saves a bit of memory and also saves an
+ /// additional memory access when traversing the NFA.
+ BinaryUnion {
+ /// An unconditional epsilon transition to another NFA state. This
+ /// is preferred over `alt2`.
+ alt1: StateID,
+ /// An unconditional epsilon transition to another NFA state. Matches
+ /// reported via this transition should only be reported if no matches
+ /// were found by following `alt1`.
+ alt2: StateID,
+ },
+ /// An empty state that records a capture location.
+ ///
+ /// From the perspective of finite automata, this is precisely equivalent
+ /// to an unconditional epsilon transition, but serves the purpose of
+ /// instructing NFA simulations to record additional state when the finite
+ /// state machine passes through this epsilon transition.
+ ///
+ /// `slot` in this context refers to the specific capture group slot
+ /// offset that is being recorded. Each capturing group has two slots
+ /// corresponding to the start and end of the matching portion of that
+ /// group.
+ ///
+ /// The pattern ID and capture group index are also included in this state
+ /// in case they are useful. But mostly, all you'll need is `next` and
+ /// `slot`.
+ Capture {
+ /// The state to transition to, unconditionally.
+ next: StateID,
+ /// The pattern ID that this capture belongs to.
+ pattern_id: PatternID,
+ /// The capture group index that this capture belongs to. Capture group
+ /// indices are local to each pattern. For example, when capturing
+ /// groups are enabled, every pattern has a capture group at index
+ /// `0`.
+ group_index: SmallIndex,
+ /// The slot index for this capture. Every capturing group has two
+ /// slots: one for the start haystack offset and one for the end
+ /// haystack offset. Unlike capture group indices, slot indices are
+ /// global across all patterns in this NFA. That is, each slot belongs
+ /// to a single pattern, but there is only one slot at index `i`.
+ slot: SmallIndex,
+ },
+ /// A state that cannot be transitioned out of. This is useful for cases
+ /// where you want to prevent matching from occurring. For example, if your
+ /// regex parser permits empty character classes, then one could choose
+ /// a `Fail` state to represent them. (An empty character class can be
+ /// thought of as an empty set. Since nothing is in an empty set, they can
+ /// never match anything.)
+ Fail,
+ /// A match state. There is at least one such state in this NFA for each
+ /// pattern that can match.
+ Match {
+ /// The matching pattern ID.
+ pattern_id: PatternID,
+ },
+}
+
+impl State {
+ /// Returns true if and only if this state contains one or more epsilon
+ /// transitions.
+ ///
+ /// In practice, a state has no outgoing transitions (like `Match`), has
+ /// only non-epsilon transitions (like `ByteRange`) or has only epsilon
+ /// transitions (like `Union`).
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{State, Transition},
+ /// util::primitives::{PatternID, StateID, SmallIndex},
+ /// };
+ ///
+ /// // Capture states are epsilon transitions.
+ /// let state = State::Capture {
+ /// next: StateID::ZERO,
+ /// pattern_id: PatternID::ZERO,
+ /// group_index: SmallIndex::ZERO,
+ /// slot: SmallIndex::ZERO,
+ /// };
+ /// assert!(state.is_epsilon());
+ ///
+ /// // ByteRange states are not.
+ /// let state = State::ByteRange {
+ /// trans: Transition { start: b'a', end: b'z', next: StateID::ZERO },
+ /// };
+ /// assert!(!state.is_epsilon());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_epsilon(&self) -> bool {
+ match *self {
+ State::ByteRange { .. }
+ | State::Sparse { .. }
+ | State::Dense { .. }
+ | State::Fail
+ | State::Match { .. } => false,
+ State::Look { .. }
+ | State::Union { .. }
+ | State::BinaryUnion { .. }
+ | State::Capture { .. } => true,
+ }
+ }
+
+ /// Returns the heap memory usage of this NFA state in bytes.
+ fn memory_usage(&self) -> usize {
+ match *self {
+ State::ByteRange { .. }
+ | State::Look { .. }
+ | State::BinaryUnion { .. }
+ | State::Capture { .. }
+ | State::Match { .. }
+ | State::Fail => 0,
+ State::Sparse(SparseTransitions { ref transitions }) => {
+ transitions.len() * mem::size_of::<Transition>()
+ }
+ State::Dense { .. } => 256 * mem::size_of::<StateID>(),
+ State::Union { ref alternates } => {
+ alternates.len() * mem::size_of::<StateID>()
+ }
+ }
+ }
+
+ /// Remap the transitions in this state using the given map. Namely, the
+ /// given map should be indexed according to the transitions currently
+ /// in this state.
+ ///
+ /// This is used during the final phase of the NFA compiler, which turns
+ /// its intermediate NFA into the final NFA.
+ fn remap(&mut self, remap: &[StateID]) {
+ match *self {
+ State::ByteRange { ref mut trans } => {
+ trans.next = remap[trans.next]
+ }
+ State::Sparse(SparseTransitions { ref mut transitions }) => {
+ for t in transitions.iter_mut() {
+ t.next = remap[t.next];
+ }
+ }
+ State::Dense(DenseTransitions { ref mut transitions }) => {
+ for sid in transitions.iter_mut() {
+ *sid = remap[*sid];
+ }
+ }
+ State::Look { ref mut next, .. } => *next = remap[*next],
+ State::Union { ref mut alternates } => {
+ for alt in alternates.iter_mut() {
+ *alt = remap[*alt];
+ }
+ }
+ State::BinaryUnion { ref mut alt1, ref mut alt2 } => {
+ *alt1 = remap[*alt1];
+ *alt2 = remap[*alt2];
+ }
+ State::Capture { ref mut next, .. } => *next = remap[*next],
+ State::Fail => {}
+ State::Match { .. } => {}
+ }
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match *self {
+ State::ByteRange { ref trans } => trans.fmt(f),
+ State::Sparse(SparseTransitions { ref transitions }) => {
+ let rs = transitions
+ .iter()
+ .map(|t| format!("{:?}", t))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "sparse({})", rs)
+ }
+ State::Dense(ref dense) => {
+ write!(f, "dense(")?;
+ for (i, t) in dense.iter().enumerate() {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ write!(f, "{:?}", t)?;
+ }
+ write!(f, ")")
+ }
+ State::Look { ref look, next } => {
+ write!(f, "{:?} => {:?}", look, next.as_usize())
+ }
+ State::Union { ref alternates } => {
+ let alts = alternates
+ .iter()
+ .map(|id| format!("{:?}", id.as_usize()))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "union({})", alts)
+ }
+ State::BinaryUnion { alt1, alt2 } => {
+ write!(
+ f,
+ "binary-union({}, {})",
+ alt1.as_usize(),
+ alt2.as_usize()
+ )
+ }
+ State::Capture { next, pattern_id, group_index, slot } => {
+ write!(
+ f,
+ "capture(pid={:?}, group={:?}, slot={:?}) => {:?}",
+ pattern_id.as_usize(),
+ group_index.as_usize(),
+ slot.as_usize(),
+ next.as_usize(),
+ )
+ }
+ State::Fail => write!(f, "FAIL"),
+ State::Match { pattern_id } => {
+ write!(f, "MATCH({:?})", pattern_id.as_usize())
+ }
+ }
+ }
+}
+
+/// A sequence of transitions used to represent a sparse state.
+///
+/// This is the primary representation of a [`Sparse`](State::Sparse) state.
+/// It corresponds to a sorted sequence of transitions with non-overlapping
+/// byte ranges. If the byte at the current position in the haystack matches
+/// one of the byte ranges, then the finite state machine should take the
+/// corresponding transition.
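+///
+/// # Example
+///
+/// A sketch of constructing a `SparseTransitions` directly and probing it.
+/// (The field is public, but note that the transitions must be sorted and
+/// non-overlapping for lookups to behave as documented.)
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::{SparseTransitions, Transition},
+/// util::primitives::StateID,
+/// };
+///
+/// let sparse = SparseTransitions {
+/// transitions: vec![
+/// Transition { start: b'a', end: b'f', next: StateID::ZERO },
+/// Transition { start: b'x', end: b'z', next: StateID::ZERO },
+/// ].into_boxed_slice(),
+/// };
+/// assert_eq!(Some(StateID::ZERO), sparse.matches_byte(b'c'));
+/// assert_eq!(None, sparse.matches_byte(b'm'));
+/// ```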
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct SparseTransitions {
+ /// The sorted sequence of non-overlapping transitions.
+ pub transitions: Box<[Transition]>,
+}
+
+impl SparseTransitions {
+ /// This follows the matching transition for a particular byte.
+ ///
+ /// The matching transition is found by looking for a matching byte
+ /// range (there is at most one) corresponding to the position `at` in
+ /// `haystack`.
+ ///
+ /// If `at >= haystack.len()`, then this returns `None`.
+ #[inline]
+ pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
+ haystack.get(at).and_then(|&b| self.matches_byte(b))
+ }
+
+ /// This follows the matching transition for any member of the alphabet.
+ ///
+ /// The matching transition is found by looking for a matching byte range
+ /// (there is at most one) corresponding to the byte represented by the
+ /// given alphabet unit. If the given alphabet unit is
+ /// [`EOI`](alphabet::Unit::eoi), then this always returns `None`.
+ #[inline]
+ pub(crate) fn matches_unit(
+ &self,
+ unit: alphabet::Unit,
+ ) -> Option<StateID> {
+ unit.as_u8().map_or(None, |byte| self.matches_byte(byte))
+ }
+
+ /// This follows the matching transition for a particular byte.
+ ///
+ /// The matching transition is found by looking for a matching byte range
+ /// (there is at most one) corresponding to the byte given.
+ #[inline]
+ pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
+ for t in self.transitions.iter() {
+ if t.start > byte {
+ break;
+ } else if t.matches_byte(byte) {
+ return Some(t.next);
+ }
+ }
+ None
+
+ /*
+ // This is an alternative implementation that uses binary search. In
+ // some ad hoc experiments, like
+ //
+ // smallishru=OpenSubtitles2018.raw.sample.smallish.ru
+ // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b'
+ //
+ // I could not observe any improvement, and in fact, things seemed to
+ // be a bit slower. I can see an improvement in at least one benchmark:
+ //
+ // allcpssmall=all-codepoints-utf8-10x
+ // regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}'
+ //
+ // Where total search time goes from 3.2s to 2.4s when using binary
+ // search.
+ self.transitions
+ .binary_search_by(|t| {
+ if t.end < byte {
+ core::cmp::Ordering::Less
+ } else if t.start > byte {
+ core::cmp::Ordering::Greater
+ } else {
+ core::cmp::Ordering::Equal
+ }
+ })
+ .ok()
+ .map(|i| self.transitions[i].next)
+ */
+ }
+}
+
+/// A sequence of transitions used to represent a dense state.
+///
+/// This is the primary representation of a [`Dense`](State::Dense) state. It
+/// provides constant time matching. That is, given a byte in a haystack and
+/// a `DenseTransitions`, one can determine if the state matches in constant
+/// time.
+///
+/// This is in contrast to `SparseTransitions`, whose time complexity is
+/// necessarily bigger than constant time. Also in contrast, `DenseTransitions`
+/// usually requires (much) more heap memory.
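+///
+/// # Example
+///
+/// A sketch of direct construction. (The `transitions` box must have length
+/// 256, with `StateID::ZERO` marking the absence of a transition.)
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::DenseTransitions,
+/// util::primitives::StateID,
+/// };
+///
+/// let mut table = vec![StateID::ZERO; 256];
+/// table[usize::from(b'a')] = StateID::must(1);
+/// let dense = DenseTransitions { transitions: table.into_boxed_slice() };
+/// assert_eq!(Some(StateID::must(1)), dense.matches_byte(b'a'));
+/// assert_eq!(None, dense.matches_byte(b'b'));
+/// ```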
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct DenseTransitions {
+ /// A dense representation of this state's transitions on the heap. This
+ /// always has length 256.
+ pub transitions: Box<[StateID]>,
+}
+
+impl DenseTransitions {
+ /// This follows the matching transition for a particular byte.
+ ///
+ /// The matching transition is found by looking for a transition that
+ /// doesn't correspond to `StateID::ZERO` for the byte at position `at`
+ /// in `haystack`.
+ ///
+ /// If `at >= haystack.len()`, then this returns `None`.
+ #[inline]
+ pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
+ haystack.get(at).and_then(|&b| self.matches_byte(b))
+ }
+
+ /// This follows the matching transition for any member of the alphabet.
+ ///
+ /// The matching transition is found by looking for a transition that
+ /// doesn't correspond to `StateID::ZERO` for the byte represented by the
+ /// given alphabet unit.
+ ///
+ /// If the given alphabet unit is [`EOI`](alphabet::Unit::eoi), then this
+ /// returns `None`.
+ #[inline]
+ pub(crate) fn matches_unit(
+ &self,
+ unit: alphabet::Unit,
+ ) -> Option<StateID> {
+ unit.as_u8().map_or(None, |byte| self.matches_byte(byte))
+ }
+
+ /// This follows the matching transition for a particular byte.
+ ///
+ /// The matching transition is found by looking for a transition that
+ /// doesn't correspond to `StateID::ZERO` for the given `byte`.
+ ///
+ /// If the transition for the given `byte` is `StateID::ZERO`, then this
+ /// returns `None`.
+ #[inline]
+ pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
+ let next = self.transitions[usize::from(byte)];
+ if next == StateID::ZERO {
+ None
+ } else {
+ Some(next)
+ }
+ }
+
+ /*
+ /// The dense state optimization isn't currently enabled, so permit a
+ /// little bit of dead code.
+ pub(crate) fn from_sparse(sparse: &SparseTransitions) -> DenseTransitions {
+ let mut dense = vec![StateID::ZERO; 256];
+ for t in sparse.transitions.iter() {
+ for b in t.start..=t.end {
+ dense[usize::from(b)] = t.next;
+ }
+ }
+ DenseTransitions { transitions: dense.into_boxed_slice() }
+ }
+ */
+
+ /// Returns an iterator over all transitions that don't point to
+ /// `StateID::ZERO`.
+ pub(crate) fn iter(&self) -> impl Iterator<Item = Transition> + '_ {
+ use crate::util::int::Usize;
+ self.transitions
+ .iter()
+ .enumerate()
+ .filter(|&(_, &sid)| sid != StateID::ZERO)
+ .map(|(byte, &next)| Transition {
+ start: byte.as_u8(),
+ end: byte.as_u8(),
+ next,
+ })
+ }
+}
+
+/// A single transition to another state.
+///
+/// This transition may only be followed if the current byte in the haystack
+/// falls in the inclusive range of bytes specified.
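+///
+/// # Example
+///
+/// A quick sketch of the matching predicates on a transition:
+///
+/// ```
+/// use regex_automata::{nfa::thompson::Transition, util::primitives::StateID};
+///
+/// let t = Transition { start: b'a', end: b'z', next: StateID::ZERO };
+/// assert!(t.matches_byte(b'q'));
+/// assert!(!t.matches_byte(b'A'));
+/// assert!(t.matches(b"xyz", 1));
+/// assert!(!t.matches(b"xyz", 5));
+/// ```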
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Transition {
+ /// The inclusive start of the byte range.
+ pub start: u8,
+ /// The inclusive end of the byte range.
+ pub end: u8,
+ /// The identifier of the state to transition to.
+ pub next: StateID,
+}
+
+impl Transition {
+ /// Returns true if the position `at` in `haystack` falls in this
+ /// transition's range of bytes.
+ ///
+ /// If `at >= haystack.len()`, then this returns `false`.
+ pub fn matches(&self, haystack: &[u8], at: usize) -> bool {
+ haystack.get(at).map_or(false, |&b| self.matches_byte(b))
+ }
+
+ /// Returns true if the given alphabet unit falls in this transition's
+ /// range of bytes. If the given unit is [`EOI`](alphabet::Unit::eoi), then
+ /// this returns `false`.
+ pub fn matches_unit(&self, unit: alphabet::Unit) -> bool {
+ unit.as_u8().map_or(false, |byte| self.matches_byte(byte))
+ }
+
+ /// Returns true if the given byte falls in this transition's range of
+ /// bytes.
+ pub fn matches_byte(&self, byte: u8) -> bool {
+ self.start <= byte && byte <= self.end
+ }
+}
+
+impl fmt::Debug for Transition {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ use crate::util::escape::DebugByte;
+
+ let Transition { start, end, next } = *self;
+ if self.start == self.end {
+ write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())
+ } else {
+ write!(
+ f,
+ "{:?}-{:?} => {:?}",
+ DebugByte(start),
+ DebugByte(end),
+ next.as_usize(),
+ )
+ }
+ }
+}
+
+/// An iterator over all pattern IDs in an NFA.
+///
+/// This iterator is created by [`NFA::patterns`].
+///
+/// The lifetime parameter `'a` refers to the lifetime of the NFA from which
+/// this pattern iterator was created.
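+///
+/// # Example
+///
+/// A short sketch of iterating over the pattern IDs in a multi-pattern NFA:
+///
+/// ```
+/// use regex_automata::nfa::thompson::NFA;
+///
+/// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+"])?;
+/// let pids: Vec<_> = nfa.patterns().collect();
+/// assert_eq!(2, pids.len());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```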
+#[derive(Debug)]
+pub struct PatternIter<'a> {
+ it: PatternIDIter,
+ /// We explicitly associate a lifetime with this iterator even though we
+ /// don't actually borrow anything from the NFA. We do this for backward
+ /// compatibility purposes. If we ever do need to borrow something from
+ /// the NFA, then we can and just get rid of this marker without breaking
+ /// the public API.
+ _marker: core::marker::PhantomData<&'a ()>,
+}
+
+impl<'a> Iterator for PatternIter<'a> {
+ type Item = PatternID;
+
+ fn next(&mut self) -> Option<PatternID> {
+ self.it.next()
+ }
+}
+
+#[cfg(all(test, feature = "nfa-pikevm"))]
+mod tests {
+ use super::*;
+ use crate::{nfa::thompson::pikevm::PikeVM, Input};
+
+ // This asserts that an NFA state doesn't have its size changed. It is
+ // *really* easy to accidentally increase the size, and thus potentially
+ // dramatically increase the memory usage of every NFA.
+ //
+ // This assert doesn't mean we absolutely cannot increase the size of an
+ // NFA state. We can. It's just here to make sure we do it knowingly and
+ // intentionally.
+ #[test]
+ fn state_has_small_size() {
+ #[cfg(target_pointer_width = "64")]
+ assert_eq!(24, core::mem::size_of::<State>());
+ #[cfg(target_pointer_width = "32")]
+ assert_eq!(20, core::mem::size_of::<State>());
+ }
+
+ #[test]
+ fn always_match() {
+ let re = PikeVM::new_from_nfa(NFA::always_match()).unwrap();
+ let mut cache = re.create_cache();
+ let mut caps = re.create_captures();
+ let mut find = |haystack, start, end| {
+ let input = Input::new(haystack).range(start..end);
+ re.search(&mut cache, &input, &mut caps);
+ caps.get_match().map(|m| m.end())
+ };
+
+ assert_eq!(Some(0), find("", 0, 0));
+ assert_eq!(Some(0), find("a", 0, 1));
+ assert_eq!(Some(1), find("a", 1, 1));
+ assert_eq!(Some(0), find("ab", 0, 2));
+ assert_eq!(Some(1), find("ab", 1, 2));
+ assert_eq!(Some(2), find("ab", 2, 2));
+ }
+
+ #[test]
+ fn never_match() {
+ let re = PikeVM::new_from_nfa(NFA::never_match()).unwrap();
+ let mut cache = re.create_cache();
+ let mut caps = re.create_captures();
+ let mut find = |haystack, start, end| {
+ let input = Input::new(haystack).range(start..end);
+ re.search(&mut cache, &input, &mut caps);
+ caps.get_match().map(|m| m.end())
+ };
+
+ assert_eq!(None, find("", 0, 0));
+ assert_eq!(None, find("a", 0, 1));
+ assert_eq!(None, find("a", 1, 1));
+ assert_eq!(None, find("ab", 0, 2));
+ assert_eq!(None, find("ab", 1, 2));
+ assert_eq!(None, find("ab", 2, 2));
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/thompson/pikevm.rs b/vendor/regex-automata/src/nfa/thompson/pikevm.rs
index 7572f9f10..0128c151a 100644
--- a/vendor/regex-automata/src/nfa/thompson/pikevm.rs
+++ b/vendor/regex-automata/src/nfa/thompson/pikevm.rs
@@ -1,18 +1,71 @@
-use alloc::{sync::Arc, vec, vec::Vec};
+/*!
+An NFA-backed Pike VM for executing regex searches with capturing groups.
+
+This module provides a [`PikeVM`] that works by simulating an NFA and
+resolving all spans of capturing groups that participate in a match.
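+
+A quick sketch of typical usage:
+
+```
+use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+
+let re = PikeVM::new(r"(\w+)\s+(\w+)")?;
+let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+re.captures(&mut cache, "hello world", &mut caps);
+assert_eq!(Some(Match::must(0, 0..11)), caps.get_match());
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```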
+*/
+
+#[cfg(feature = "internal-instrument-pikevm")]
+use core::cell::RefCell;
+
+use alloc::{vec, vec::Vec};
use crate::{
- nfa::thompson::{self, Error, State, NFA},
+ nfa::thompson::{self, BuildError, State, NFA},
util::{
- id::{PatternID, StateID},
- matchtypes::MultiMatch,
+ captures::Captures,
+ empty, iter,
+ prefilter::Prefilter,
+ primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
+ search::{
+ Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span,
+ },
sparse_set::SparseSet,
},
};
-#[derive(Clone, Copy, Debug, Default)]
+/// A simple macro for conditionally executing instrumentation logic when
+/// the 'trace' log level is enabled. This is a compile-time no-op when the
+/// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that
+/// this makes it easier to avoid doing extra work when instrumentation isn't
+/// enabled.
+///
+/// This macro accepts a closure of type `|&mut Counters|`. The closure can
+/// then increment counters (or whatever) in accordance with what one wants
+/// to track.
+macro_rules! instrument {
+ ($fun:expr) => {
+ #[cfg(feature = "internal-instrument-pikevm")]
+ {
+ let fun: &mut dyn FnMut(&mut Counters) = &mut $fun;
+ COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut()));
+ }
+ };
+}
+
+#[cfg(feature = "internal-instrument-pikevm")]
+std::thread_local! {
+ /// Effectively global state used to keep track of instrumentation
+ /// counters. The "proper" way to do this is to thread it through the
+ /// PikeVM, but it makes the code quite icky. Since this is just a
+ /// debugging feature, we're content to relegate it to thread local
+ /// state. When instrumentation is enabled, the counters are reset at the
+ /// beginning of every search and printed (with the 'trace' log level) at
+ /// the end of every search.
+ static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty());
+}
+
+/// The configuration used for building a [`PikeVM`].
+///
+/// A PikeVM configuration is a simple data object that is typically used with
+/// [`Builder::configure`]. It can be cheaply cloned.
+///
+/// A default configuration can be created either with `Config::new`, or
+/// perhaps more conveniently, with [`PikeVM::config`].
+#[derive(Clone, Debug, Default)]
pub struct Config {
- anchored: Option<bool>,
- utf8: Option<bool>,
+ match_kind: Option<MatchKind>,
+ pre: Option<Option<Prefilter>>,
}
impl Config {
@@ -21,37 +74,172 @@ impl Config {
Config::default()
}
- pub fn anchored(mut self, yes: bool) -> Config {
- self.anchored = Some(yes);
+ /// Set the desired match semantics.
+ ///
+ /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
+ /// match semantics of Perl-like regex engines. That is, when multiple
+ /// patterns would match at the same leftmost position, the pattern that
+ /// appears first in the concrete syntax is chosen.
+ ///
+ /// Currently, the only other kind of match semantics supported is
+ /// [`MatchKind::All`]. This corresponds to "classical DFA" construction
+ /// where all possible matches are visited in the NFA by the `PikeVM`.
+ ///
+ /// Typically, `All` is used when one wants to execute an overlapping
+ /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
+ /// sense to use `All` with the various "leftmost" find routines, since the
+ /// leftmost routines depend on the `LeftmostFirst` automata construction
+ /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM`
+ /// simulating dead states as a way to terminate the search and report a
+ /// match. `LeftmostFirst` also supports non-greedy matches using this
+ /// strategy, whereas `All` does not.
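+ ///
+ /// # Example
+ ///
+ /// A sketch of pairing `All` with an overlapping search to ask which
+ /// patterns match, rather than where:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// Input, MatchKind, PatternSet,
+ /// };
+ ///
+ /// let re = PikeVM::builder()
+ /// .configure(PikeVM::config().match_kind(MatchKind::All))
+ /// .build_many(&["[a-z]+", "foo"])?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("foo");
+ /// let mut patset = PatternSet::new(re.get_nfa().pattern_len());
+ /// re.which_overlapping_matches(&mut cache, &input, &mut patset);
+ /// // Under 'All' semantics, both patterns are found to match.
+ /// assert_eq!(2, patset.len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```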
+ pub fn match_kind(mut self, kind: MatchKind) -> Config {
+ self.match_kind = Some(kind);
self
}
- pub fn utf8(mut self, yes: bool) -> Config {
- self.utf8 = Some(yes);
+ /// Set a prefilter to be used whenever a start state is entered.
+ ///
+ /// A [`Prefilter`] in this context is meant to accelerate searches by
+ /// looking for literal prefixes that every match for the corresponding
+ /// pattern (or patterns) must start with. Once a prefilter produces a
+ /// match, the underlying search routine continues on to try and confirm
+ /// the match.
+ ///
+ /// Be warned that setting a prefilter does not guarantee that the search
+ /// will be faster. While it's usually a good bet, if the prefilter
+ /// produces a lot of false positive candidates (i.e., positions matched
+ /// by the prefilter but not by the regex), then the overall result can
+ /// be slower than if you had just executed the regex engine without any
+ /// prefilters.
+ ///
+ /// By default no prefilter is set.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// util::prefilter::Prefilter,
+ /// Input, Match, MatchKind,
+ /// };
+ ///
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
+ /// let re = PikeVM::builder()
+ /// .configure(PikeVM::config().prefilter(pre))
+ /// .build(r"(foo|bar)[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("foo1 barfox bar");
+ /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Be warned though that an incorrect prefilter can lead to incorrect
+ /// results!
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// util::prefilter::Prefilter,
+ /// Input, HalfMatch, MatchKind,
+ /// };
+ ///
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
+ /// let re = PikeVM::builder()
+ /// .configure(PikeVM::config().prefilter(pre))
+ /// .build(r"(foo|bar)[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("foo1 barfox bar");
+ /// // No match reported even though there clearly is one!
+ /// assert_eq!(None, re.find(&mut cache, input));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
+ self.pre = Some(pre);
self
}
- pub fn get_anchored(&self) -> bool {
- self.anchored.unwrap_or(false)
+ /// Returns the match semantics set in this configuration.
+ pub fn get_match_kind(&self) -> MatchKind {
+ self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
}
- pub fn get_utf8(&self) -> bool {
- self.utf8.unwrap_or(true)
+ /// Returns the prefilter set in this configuration, if one is set.
+ pub fn get_prefilter(&self) -> Option<&Prefilter> {
+ self.pre.as_ref().unwrap_or(&None).as_ref()
}
- pub(crate) fn overwrite(self, o: Config) -> Config {
+ /// Overwrite the default configuration such that the options in `o` are
+ /// always used. If an option in `o` is not set, then the corresponding
+ /// option in `self` is used. If it's not set in `self` either, then it
+ /// remains not set.
+ pub(crate) fn overwrite(&self, o: Config) -> Config {
Config {
- anchored: o.anchored.or(self.anchored),
- utf8: o.utf8.or(self.utf8),
+ match_kind: o.match_kind.or(self.match_kind),
+ pre: o.pre.or_else(|| self.pre.clone()),
}
}
}
-/// A builder for a PikeVM.
+/// A builder for a `PikeVM`.
+///
+/// This builder permits configuring options for the syntax of a pattern,
+/// the NFA construction and the `PikeVM` construction. This builder is
+/// different from a general-purpose regex builder in that it permits
+/// fine-grained configuration of the construction process. The trade-off for
+/// this is complexity, and the possibility of setting a configuration that
+/// might not make sense. For example, there are two different UTF-8 modes:
+///
+/// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8)
+/// controls whether the pattern itself can contain sub-expressions that match
+/// invalid UTF-8.
+/// * [`thompson::Config::utf8`] controls whether empty matches that split a
+/// Unicode codepoint are reported or not.
+///
+/// Generally speaking, callers will want to either enable all of these or
+/// disable all of these.
+///
+/// # Example
+///
+/// This example shows how to disable UTF-8 mode in the syntax and the regex
+/// itself. This is generally what you want for matching on arbitrary bytes.
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::{self, pikevm::PikeVM},
+/// util::syntax,
+/// Match,
+/// };
+///
+/// let re = PikeVM::builder()
+/// .syntax(syntax::Config::new().utf8(false))
+/// .thompson(thompson::Config::new().utf8(false))
+/// .build(r"foo(?-u:[^b])ar.*")?;
+/// let mut cache = re.create_cache();
+///
+/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+/// let expected = Some(Match::must(0, 1..9));
+/// let got = re.find_iter(&mut cache, haystack).next();
+/// assert_eq!(expected, got);
+/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
+/// // but the subsequent `.*` does not! Disabling UTF-8
+/// // on the syntax permits this.
+/// //
+/// // N.B. This example does not show the impact of
+/// // disabling UTF-8 mode on a PikeVM Config, since that
+/// // only impacts regexes that can produce matches of
+/// // length 0.
+/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
#[derive(Clone, Debug)]
pub struct Builder {
config: Config,
- thompson: thompson::Builder,
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler,
}
impl Builder {
@@ -59,53 +247,58 @@ impl Builder {
pub fn new() -> Builder {
Builder {
config: Config::default(),
- thompson: thompson::Builder::new(),
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler::new(),
}
}
- pub fn build(&self, pattern: &str) -> Result<PikeVM, Error> {
+ /// Build a `PikeVM` from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ #[cfg(feature = "syntax")]
+ pub fn build(&self, pattern: &str) -> Result<PikeVM, BuildError> {
self.build_many(&[pattern])
}
+ /// Build a `PikeVM` from the given patterns.
+ #[cfg(feature = "syntax")]
pub fn build_many<P: AsRef<str>>(
&self,
patterns: &[P],
- ) -> Result<PikeVM, Error> {
+ ) -> Result<PikeVM, BuildError> {
let nfa = self.thompson.build_many(patterns)?;
- self.build_from_nfa(Arc::new(nfa))
- }
-
- pub fn build_from_nfa(&self, nfa: Arc<NFA>) -> Result<PikeVM, Error> {
- // TODO: Check that this is correct.
- // if !cfg!(all(
- // feature = "dfa",
- // feature = "syntax",
- // feature = "unicode-perl"
- // )) {
- if !cfg!(feature = "syntax") {
- if nfa.has_word_boundary_unicode() {
- return Err(Error::unicode_word_unavailable());
- }
- }
- Ok(PikeVM { config: self.config, nfa })
+ self.build_from_nfa(nfa)
+ }
+
+ /// Build a `PikeVM` directly from its NFA.
+ ///
+ /// Note that when using this method, any configuration that applies to the
+ /// construction of the NFA itself will of course be ignored, since the NFA
+ /// given here is already built.
+ pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> {
+ nfa.look_set_any().available().map_err(BuildError::word)?;
+ Ok(PikeVM { config: self.config.clone(), nfa })
}
+ /// Apply the given `PikeVM` configuration options to this builder.
pub fn configure(&mut self, config: Config) -> &mut Builder {
self.config = self.config.overwrite(config);
self
}
/// Set the syntax configuration for this builder using
- /// [`SyntaxConfig`](crate::SyntaxConfig).
+ /// [`syntax::Config`](crate::util::syntax::Config).
///
/// This permits setting things like case insensitivity, Unicode and multi
/// line mode.
///
/// These settings only apply when constructing a PikeVM directly from a
/// pattern.
+ #[cfg(feature = "syntax")]
pub fn syntax(
&mut self,
- config: crate::util::syntax::SyntaxConfig,
+ config: crate::util::syntax::Config,
) -> &mut Builder {
self.thompson.syntax(config);
self
@@ -119,259 +312,1395 @@ impl Builder {
///
/// These settings only apply when constructing a PikeVM directly from a
/// pattern.
+ #[cfg(feature = "syntax")]
pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
self.thompson.configure(config);
self
}
}
+/// A virtual machine for executing regex searches with capturing groups.
+///
+/// # Infallible APIs
+///
+/// Unlike most other regex engines in this crate, a `PikeVM` never returns an
+/// error at search time. It supports all [`Anchored`] configurations, never
+/// quits and works on haystacks of arbitrary length.
+///
+/// There are two caveats to mention though:
+///
+/// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`],
+/// then the PikeVM will report "no match." This is consistent with all other
+/// regex engines in this crate. (A brief sketch follows this list.)
+/// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`]
+/// that has insufficient capacity to store all valid pattern IDs, then if a
+/// match occurs for a `PatternID` that cannot be inserted, it is silently
+/// dropped as if it did not match.
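+///
+/// A sketch of the first caveat (not part of the original documentation):
+///
+/// ```
+/// use regex_automata::{
+///     nfa::thompson::pikevm::PikeVM,
+///     Anchored, Input, PatternID,
+/// };
+///
+/// let re = PikeVM::new(r"a")?;
+/// let mut cache = re.create_cache();
+/// // Pattern ID 5 doesn't exist in this PikeVM, so the search simply
+/// // reports no match rather than returning an error.
+/// let input = Input::new("a")
+///     .anchored(Anchored::Pattern(PatternID::must(5)));
+/// assert_eq!(None, re.find(&mut cache, input));
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```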
+///
+/// # Advice
+///
+/// The `PikeVM` is generally the most "powerful" regex engine in this crate.
+/// "Powerful" in this context means that it can handle any regular expression
+/// that is parseable by `regex-syntax` and any size haystack. Regrettably,
+/// the `PikeVM` is also simultaneously often the _slowest_ regex engine in
+/// practice. This results in an annoying situation where one generally tries
+/// to pick any other regex engine (or perhaps none at all) before being
+/// forced to fall back to a `PikeVM`.
+///
+/// For example, a common strategy for dealing with capturing groups is to
+/// actually look for the overall match of the regex using a faster regex
+/// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall
+/// match is found, one can then run the `PikeVM` on just the match span to
+/// find the spans of the capturing groups. In this way, the faster regex
+/// engine does the majority of the work, while the `PikeVM` only lends its
+/// power in a more limited role.
+///
+/// Unfortunately, this isn't always possible because the faster regex engines
+/// don't support all of the regex features in `regex-syntax`. This notably
+/// includes (and is currently limited to) Unicode word boundaries. So if
+/// your pattern has Unicode word boundaries, you typically can't use a
+/// DFA-based regex engine at all (unless you [enable heuristic support for
+/// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass
+/// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for
+/// anchored searches only, but in a cruel sort of joke, many Unicode features
+/// tend to result in making the regex _not_ one-pass.)
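+///
+/// As a rough sketch of that strategy (illustrative only, not a prescribed
+/// API), one might find the overall match with the [lazy
+/// DFA](crate::hybrid::regex::Regex) and then resolve capturing groups by
+/// running the `PikeVM` over just the matched span:
+///
+/// ```
+/// use regex_automata::{
+///     hybrid::regex::Regex, nfa::thompson::pikevm::PikeVM,
+///     Anchored, Input, Span,
+/// };
+///
+/// let pattern = r"([0-9]{4})-([0-9]{2})";
+/// let finder = Regex::new(pattern)?;
+/// let mut finder_cache = finder.create_cache();
+/// let vm = PikeVM::new(pattern)?;
+/// let (mut vm_cache, mut caps) = (vm.create_cache(), vm.create_captures());
+///
+/// let haystack = "date: 2010-03";
+/// // The faster engine finds the overall match span...
+/// let m = finder.find(&mut finder_cache, haystack).unwrap();
+/// // ...and the PikeVM resolves group spans within that span only. The
+/// // anchored search guarantees the match starts where the DFA said it did.
+/// let input = Input::new(haystack).span(m.range()).anchored(Anchored::Yes);
+/// vm.captures(&mut vm_cache, input, &mut caps);
+/// assert_eq!(Some(Span::from(6..10)), caps.get_group(1));
+/// assert_eq!(Some(Span::from(11..13)), caps.get_group(2));
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```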
+///
+/// # Example
+///
+/// This example shows that the `PikeVM` implements Unicode word boundaries
+/// correctly by default.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+///
+/// let re = PikeVM::new(r"\b\w+\b")?;
+/// let mut cache = re.create_cache();
+///
+/// let mut it = re.find_iter(&mut cache, "Шерлок Холмс");
+/// assert_eq!(Some(Match::must(0, 0..12)), it.next());
+/// assert_eq!(Some(Match::must(0, 13..23)), it.next());
+/// assert_eq!(None, it.next());
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
#[derive(Clone, Debug)]
pub struct PikeVM {
config: Config,
- nfa: Arc<NFA>,
+ nfa: NFA,
}
impl PikeVM {
- pub fn new(pattern: &str) -> Result<PikeVM, Error> {
+ /// Parse the given regular expression using the default configuration and
+ /// return the corresponding `PikeVM`.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::new("foo[0-9]+bar")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(
+ /// Some(Match::must(0, 3..14)),
+ /// re.find_iter(&mut cache, "zzzfoo12345barzzz").next(),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn new(pattern: &str) -> Result<PikeVM, BuildError> {
PikeVM::builder().build(pattern)
}
- pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<PikeVM, Error> {
+ /// Like `new`, but parses multiple patterns into a single "multi regex."
+ /// This similarly uses the default regex configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
+ /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
+ /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
+ /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
+ /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
+ /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
+ /// assert_eq!(None, it.next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn new_many<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<PikeVM, BuildError> {
PikeVM::builder().build_many(patterns)
}
+ /// Like `new`, but builds a PikeVM directly from an NFA. This is useful
+ /// if you already have an NFA, or even if you hand-assembled the NFA.
+ ///
+ /// # Example
+ ///
+ /// This shows how to hand-assemble a regular expression via its HIR,
+ /// compile an NFA from it and build a PikeVM from the NFA.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+ /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
+ ///
+ /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
+ /// ClassBytesRange::new(b'0', b'9'),
+ /// ClassBytesRange::new(b'A', b'Z'),
+ /// ClassBytesRange::new(b'_', b'_'),
+ /// ClassBytesRange::new(b'a', b'z'),
+ /// ])));
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
+ ///
+ /// let re = PikeVM::new_from_nfa(nfa)?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let expected = Some(Match::must(0, 3..4));
+ /// re.captures(&mut cache, "!@#A#@!", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError> {
+ PikeVM::builder().build_from_nfa(nfa)
+ }
+
+ /// Create a new `PikeVM` that matches every input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::always_match()?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let expected = Match::must(0, 0..0);
+ /// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next());
+ /// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn always_match() -> Result<PikeVM, BuildError> {
+ let nfa = thompson::NFA::always_match();
+ PikeVM::new_from_nfa(nfa)
+ }
+
+ /// Create a new `PikeVM` that never matches any input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::pikevm::PikeVM;
+ ///
+ /// let re = PikeVM::never_match()?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert_eq!(None, re.find_iter(&mut cache, "").next());
+ /// assert_eq!(None, re.find_iter(&mut cache, "foo").next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn never_match() -> Result<PikeVM, BuildError> {
+ let nfa = thompson::NFA::never_match();
+ PikeVM::new_from_nfa(nfa)
+ }
+
+ /// Return a default configuration for a `PikeVM`.
+ ///
+ /// This is a convenience routine to avoid needing to import the `Config`
+ /// type when customizing the construction of a `PikeVM`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to disable UTF-8 mode. When UTF-8 mode is
+ /// disabled, zero-width matches that split a codepoint are allowed.
+ /// Otherwise they are never reported.
+ ///
+ /// In the code below, notice that `""` is permitted to match positions
+ /// that split the encoding of a codepoint.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match};
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = "a☃z";
+ /// let mut it = re.find_iter(&mut cache, haystack);
+ /// assert_eq!(Some(Match::must(0, 0..0)), it.next());
+ /// assert_eq!(Some(Match::must(0, 1..1)), it.next());
+ /// assert_eq!(Some(Match::must(0, 2..2)), it.next());
+ /// assert_eq!(Some(Match::must(0, 3..3)), it.next());
+ /// assert_eq!(Some(Match::must(0, 4..4)), it.next());
+ /// assert_eq!(Some(Match::must(0, 5..5)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
pub fn config() -> Config {
Config::new()
}
+ /// Return a builder for configuring the construction of a `PikeVM`.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use the builder to disable UTF-8 mode
+ /// everywhere.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{self, pikevm::PikeVM},
+ /// util::syntax,
+ /// Match,
+ /// };
+ ///
+ /// let re = PikeVM::builder()
+ /// .syntax(syntax::Config::new().utf8(false))
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"foo(?-u:[^b])ar.*")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+ /// let expected = Some(Match::must(0, 1..9));
+ /// re.captures(&mut cache, haystack, &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
pub fn builder() -> Builder {
Builder::new()
}
+ /// Create a new empty set of capturing groups that is guaranteed to be
+ /// valid for the search APIs on this `PikeVM`.
+ ///
+ /// A `Captures` value created for a specific `PikeVM` cannot be used with
+ /// any other `PikeVM`.
+ ///
+ /// This is a convenience function for [`Captures::all`]. See the
+ /// [`Captures`] documentation for an explanation of its alternative
+ /// constructors that permit the `PikeVM` to do less work during a search,
+ /// and thus might make it faster.
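+ ///
+ /// # Example
+ ///
+ /// A small sketch (not part of the original documentation) of one such
+ /// alternative constructor, [`Captures::matches`], which records only the
+ /// overall match and so gives the `PikeVM` less to do:
+ ///
+ /// ```
+ /// use regex_automata::{
+ ///     nfa::thompson::pikevm::PikeVM, util::captures::Captures, Match,
+ /// };
+ ///
+ /// let re = PikeVM::new(r"(a)(b)")?;
+ /// let mut cache = re.create_cache();
+ /// // Track only the overall match span, not any explicit group spans.
+ /// let mut caps = Captures::matches(re.get_nfa().group_info().clone());
+ /// re.captures(&mut cache, "ab", &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match());
+ /// // Explicit group spans are unavailable with this constructor.
+ /// assert_eq!(None, caps.get_group(1));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```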
+ pub fn create_captures(&self) -> Captures {
+ Captures::all(self.get_nfa().group_info().clone())
+ }
+
+ /// Create a new cache for this `PikeVM`.
+ ///
+ /// The cache returned should only be used for searches for this
+ /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then
+ /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently,
+ /// [`PikeVM::reset_cache`]).
pub fn create_cache(&self) -> Cache {
- Cache::new(self.nfa())
+ Cache::new(self)
}
- pub fn create_captures(&self) -> Captures {
- Captures::new(self.nfa())
+ /// Reset the given cache such that it can be used for searching with
+ /// this `PikeVM` (and only this `PikeVM`).
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different `PikeVM`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different `PikeVM`.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re1 = PikeVM::new(r"\w")?;
+ /// let re2 = PikeVM::new(r"\W")?;
+ ///
+ /// let mut cache = re1.create_cache();
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..2)),
+ /// re1.find_iter(&mut cache, "Δ").next(),
+ /// );
+ ///
+ /// // Using 'cache' with re2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the PikeVM we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 're1' is also not
+ /// // allowed.
+ /// re2.reset_cache(&mut cache);
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..3)),
+ /// re2.find_iter(&mut cache, "☃").next(),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset_cache(&self, cache: &mut Cache) {
+ cache.reset(self);
}
- pub fn nfa(&self) -> &Arc<NFA> {
+ /// Returns the total number of patterns compiled into this `PikeVM`.
+ ///
+ /// In the case of a `PikeVM` that contains no patterns, this returns `0`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the pattern length for a `PikeVM` that never
+ /// matches:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::pikevm::PikeVM;
+ ///
+ /// let re = PikeVM::never_match()?;
+ /// assert_eq!(re.pattern_len(), 0);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And another example for a `PikeVM` that matches at every position:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::pikevm::PikeVM;
+ ///
+ /// let re = PikeVM::always_match()?;
+ /// assert_eq!(re.pattern_len(), 1);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And finally, a `PikeVM` that was constructed from multiple patterns:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::pikevm::PikeVM;
+ ///
+ /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+ /// assert_eq!(re.pattern_len(), 3);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn pattern_len(&self) -> usize {
+ self.nfa.pattern_len()
+ }
+
+ /// Return the config for this `PikeVM`.
+ #[inline]
+ pub fn get_config(&self) -> &Config {
+ &self.config
+ }
+
+ /// Returns a reference to the underlying NFA.
+ #[inline]
+ pub fn get_nfa(&self) -> &NFA {
&self.nfa
}
+}
- pub fn find_leftmost_iter<'r, 'c, 't>(
+impl PikeVM {
+ /// Returns true if and only if this `PikeVM` matches the given haystack.
+ ///
+ /// This routine may short-circuit if it knows that scanning future
+ /// input will never lead to a different result. In particular, if the
+ /// underlying NFA enters a match state, then this routine will return
+ /// `true` immediately without inspecting any future input. (Consider how
+ /// this might make a difference given the regex `a+` on the haystack
+ /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`,
+ /// but routines like `find` need to continue searching because `+` is
+ /// greedy by default.)
+ ///
+ /// # Example
+ ///
+ /// This shows basic usage:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::pikevm::PikeVM;
+ ///
+ /// let re = PikeVM::new("foo[0-9]+bar")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.is_match(&mut cache, "foo12345bar"));
+ /// assert!(!re.is_match(&mut cache, "foobar"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: consistency with search APIs
+ ///
+ /// `is_match` is guaranteed to return `true` whenever `find` returns a
+ /// match. This includes searches that are executed entirely within a
+ /// codepoint:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input};
+ ///
+ /// let re = PikeVM::new("a*")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Notice that when UTF-8 mode is disabled, the above reports a
+ /// match because the restriction against zero-width matches that split a
+ /// codepoint has been lifted:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input};
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(NFA::config().utf8(false))
+ /// .build("a*")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_match<'h, I: Into<Input<'h>>>(
+ &self,
+ cache: &mut Cache,
+ input: I,
+ ) -> bool {
+ let input = input.into().earliest(true);
+ self.search_slots(cache, &input, &mut []).is_some()
+ }
+
+ /// Executes a leftmost forward search and returns a `Match` if one exists.
+ ///
+ /// This routine only includes the overall match span. To get access to the
+ /// individual spans of each capturing group, use [`PikeVM::captures`].
+ ///
+ /// # Example
+ ///
+ /// Leftmost first match semantics corresponds to the match with the
+ /// smallest starting offset, but where the end offset is determined by
+ /// preferring earlier branches in the original regular expression. For
+ /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
+ /// will match `Samwise` in `Samwise`.
+ ///
+ /// Generally speaking, the "leftmost first" match is how most backtracking
+ /// regular expressions tend to work. This is in contrast to POSIX-style
+ /// regular expressions that yield "leftmost longest" matches. Namely,
+ /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
+ /// leftmost longest semantics. (This crate does not currently support
+ /// leftmost longest semantics.)
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::new("foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ /// let expected = Match::must(0, 0..8);
+ /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345"));
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over later parts.
+ /// let re = PikeVM::new("abc|a")?;
+ /// let mut cache = re.create_cache();
+ /// let expected = Match::must(0, 0..3);
+ /// assert_eq!(Some(expected), re.find(&mut cache, "abc"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find<'h, I: Into<Input<'h>>>(
+ &self,
+ cache: &mut Cache,
+ input: I,
+ ) -> Option<Match> {
+ let input = input.into();
+ if self.get_nfa().pattern_len() == 1 {
+ let mut slots = [None, None];
+ let pid = self.search_slots(cache, &input, &mut slots)?;
+ let start = slots[0]?.get();
+ let end = slots[1]?.get();
+ return Some(Match::new(pid, Span { start, end }));
+ }
+ let ginfo = self.get_nfa().group_info();
+ let slots_len = ginfo.implicit_slot_len();
+ let mut slots = vec![None; slots_len];
+ let pid = self.search_slots(cache, &input, &mut slots)?;
+ let start = slots[pid.as_usize() * 2]?.get();
+ let end = slots[pid.as_usize() * 2 + 1]?.get();
+ Some(Match::new(pid, Span { start, end }))
+ }
+
+ /// Executes a leftmost forward search and writes the spans of capturing
+ /// groups that participated in a match into the provided [`Captures`]
+ /// value. If no match was found, then [`Captures::is_match`] is guaranteed
+ /// to return `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
+ ///
+ /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "2010-03-14", &mut caps);
+ /// assert!(caps.is_match());
+ /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
+ /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
+ /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn captures<'h, I: Into<Input<'h>>>(
+ &self,
+ cache: &mut Cache,
+ input: I,
+ caps: &mut Captures,
+ ) {
+ self.search(cache, &input.into(), caps)
+ }
+
+ /// Returns an iterator over all non-overlapping leftmost matches in the
+ /// given bytes. If no match exists, then the iterator yields no elements.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::new("foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let text = "foo1 foo12 foo123";
+ /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
+ /// assert_eq!(matches, vec![
+ /// Match::must(0, 0..4),
+ /// Match::must(0, 5..10),
+ /// Match::must(0, 11..17),
+ /// ]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
+ &'r self,
+ cache: &'c mut Cache,
+ input: I,
+ ) -> FindMatches<'r, 'c, 'h> {
+ let caps = Captures::matches(self.get_nfa().group_info().clone());
+ let it = iter::Searcher::new(input.into());
+ FindMatches { re: self, cache, caps, it }
+ }
+
+ /// Returns an iterator over all non-overlapping `Captures` values. If no
+ /// match exists, then the iterator yields no elements.
+ ///
+ /// This yields the same matches as [`PikeVM::find_iter`], but it includes
+ /// the spans of all capturing groups that participate in each match.
+ ///
+ /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
+ /// how to correctly iterate over all matches in a haystack while avoiding
+ /// the creation of a new `Captures` value for every match. (Which you are
+ /// forced to do with an `Iterator`.)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
+ ///
+ /// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let text = "foo1 foo12 foo123";
+ /// let matches: Vec<Span> = re
+ /// .captures_iter(&mut cache, text)
+ /// // The unwrap is OK since 'numbers' matches if the pattern matches.
+ /// .map(|caps| caps.get_group_by_name("numbers").unwrap())
+ /// .collect();
+ /// assert_eq!(matches, vec![
+ /// Span::from(3..4),
+ /// Span::from(8..10),
+ /// Span::from(14..17),
+ /// ]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
&'r self,
cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> FindLeftmostMatches<'r, 'c, 't> {
- FindLeftmostMatches::new(self, cache, haystack)
- }
-
- // BREADCRUMBS:
- //
- // 1) Don't forget about prefilters.
- //
- // 2) Consider the case of using a PikeVM with an NFA that has Capture
- // states, but where we don't want to track capturing groups (other than
- // group 0). This potentially saves a lot of copying around and what not. I
- // believe the current regex crate does this, for example. The interesting
- // bit here is how to handle the case of multiple patterns...
- //
- // 3) Permit the caller to specify a pattern ID to run an anchored-only
- // search on.
- //
- // 4) How to do overlapping? The way multi-regex support works in the regex
- // crate currently is to run the PikeVM until either we reach the end of
- // the haystack or when we know all regexes have matched. The latter case
- // is probably quite rare, so the common case is likely that we're always
- // searching the entire input. The question is: can we emulate that with
- // our typical 'overlapping' APIs on DFAs? I believe we can. If so, then
- // all we need to do is provide an overlapping API on the PikeVM that
- // roughly matches the ones we provide on DFAs. For those APIs, the only
- // thing they need over non-overlapping APIs is "caller state." For DFAs,
- // the caller state is simple: it contains the last state visited and the
- // last match reported. For the PikeVM (and NFAs in general), the "last
- // state" is actually a *set* of NFA states. So I think what happens here
- // is that we can just force the `Cache` to subsume this role. We'll still
- // need some additional state to track the last match reported though.
- // Because when two or more patterns match at the same location, we need a
- // way to know to iterate over them. Although maybe it's not match index we
- // need, but the state index of the last NFA state processed in the cache.
- // Then we just pick up where we left off. There might be another match
- // state, in which case, we report it.
-
- pub fn find_leftmost_at(
+ input: I,
+ ) -> CapturesMatches<'r, 'c, 'h> {
+ let caps = self.create_captures();
+ let it = iter::Searcher::new(input.into());
+ CapturesMatches { re: self, cache, caps, it }
+ }
+}
+
+impl PikeVM {
+ /// Executes a leftmost forward search and writes the spans of capturing
+ /// groups that participated in a match into the provided [`Captures`]
+ /// value. If no match was found, then [`Captures::is_match`] is guaranteed
+ /// to return `false`.
+ ///
+ /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input`
+ /// instead of an `Into<Input>`.
+ ///
+ /// # Example: specific pattern search
+ ///
+ /// This example shows how to build a multi-PikeVM that permits searching
+ /// for specific patterns.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// Anchored, Match, PatternID, Input,
+ /// };
+ ///
+ /// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "foo123";
+ ///
+ /// // Since we are using the default leftmost-first match and both
+ /// // patterns match at the same starting position, only the first pattern
+ /// // will be returned in this case when doing a search for any of the
+ /// // patterns.
+ /// let expected = Some(Match::must(0, 0..6));
+ /// re.search(&mut cache, &Input::new(haystack), &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// // But if we want to check whether some other pattern matches, then we
+ /// // can provide its pattern ID.
+ /// let expected = Some(Match::must(1, 0..6));
+ /// let input = Input::new(haystack)
+ /// .anchored(Anchored::Pattern(PatternID::must(1)));
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specifying the bounds of a search
+ ///
+ /// This example shows how providing the bounds of a search can produce
+ /// different results than simply sub-slicing the haystack.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
+ ///
+ /// let re = PikeVM::new(r"\b[0-9]{3}\b")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "foo123bar";
+ ///
+ /// // Since we sub-slice the haystack, the search doesn't know about
+ /// // the larger context and assumes that `123` is surrounded by word
+ /// // boundaries. And of course, the match position is reported relative
+ /// // to the sub-slice as well, which means we get `0..3` instead of
+ /// // `3..6`.
+ /// let expected = Some(Match::must(0, 0..3));
+ /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// // But if we provide the bounds of the search within the context of the
+ /// // entire haystack, then the search can take the surrounding context
+ /// // into account. (And if we did find a match, it would be reported
+ /// // as a valid offset into `haystack` instead of its sub-slice.)
+ /// let expected = None;
+ /// let input = Input::new(haystack).range(3..6);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn search(
&self,
cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
+ input: &Input<'_>,
caps: &mut Captures,
- ) -> Option<MultiMatch> {
- let anchored =
- self.config.get_anchored() || self.nfa.is_always_start_anchored();
- let mut at = start;
- let mut matched_pid = None;
- cache.clear();
- 'LOOP: loop {
- if cache.clist.set.is_empty() {
- if matched_pid.is_some() || (anchored && at > start) {
- break 'LOOP;
+ ) {
+ caps.set_pattern(None);
+ let pid = self.search_slots(cache, input, caps.slots_mut());
+ caps.set_pattern(pid);
+ }
+
+ /// Executes a leftmost forward search and writes the spans of capturing
+ /// groups that participated in a match into the provided `slots`, and
+ /// returns the matching pattern ID. The contents of the slots for patterns
+ /// other than the matching pattern are unspecified. If no match was found,
+ /// then `None` is returned and the contents of `slots` is unspecified.
+ ///
+ /// This is like [`PikeVM::search`], but it accepts a raw slots slice
+ /// instead of a `Captures` value. This is useful in contexts where you
+ /// don't want or need to allocate a `Captures`.
+ ///
+ /// It is legal to pass _any_ number of slots to this routine. If the regex
+ /// engine would otherwise write a slot offset that doesn't fit in the
+ /// provided slice, then it is simply skipped. In general though, there are
+ /// usually three slice lengths you might want to use:
+ ///
+ /// * An empty slice, if you only care about which pattern matched (see
+ ///   the second example below).
+ /// * A slice with
+ /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len)
+ /// slots, if you only care about the overall match spans for each matching
+ /// pattern.
+ /// * A slice with
+ /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
+ /// permits recording match offsets for every capturing group in every
+ /// pattern.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find the overall match offsets in a
+ /// multi-pattern search without allocating a `Captures` value. Indeed, we
+ /// can put our slots right on the stack.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input};
+ ///
+ /// let re = PikeVM::new_many(&[
+ /// r"\pL+",
+ /// r"\d+",
+ /// ])?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("!@#123");
+ ///
+ /// // We only care about the overall match offsets here, so we just
+ /// // allocate two slots for each pattern. Each slot records the start
+ /// // and end of the match.
+ /// let mut slots = [None; 4];
+ /// let pid = re.search_slots(&mut cache, &input, &mut slots);
+ /// assert_eq!(Some(PatternID::must(1)), pid);
+ ///
+ /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
+ /// // See 'GroupInfo' for more details on the mapping between groups and
+ /// // slot indices.
+ /// let slot_start = pid.unwrap().as_usize() * 2;
+ /// let slot_end = slot_start + 1;
+ /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
+ /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
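+ ///
+ /// # Example: no slots at all
+ ///
+ /// A companion sketch (not part of the original documentation) for the
+ /// first case in the list above: passing an empty slice still reports
+ /// which pattern matched, just without any offsets.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input, PatternID};
+ ///
+ /// let re = PikeVM::new_many(&[r"[a-z]+", r"[0-9]+"])?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// // An empty slice of slots is perfectly legal.
+ /// let pid = re.search_slots(&mut cache, &Input::new("123"), &mut []);
+ /// assert_eq!(Some(PatternID::must(1)), pid);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```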
+ #[inline]
+ pub fn search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ if !utf8empty {
+ let hm = self.search_slots_imp(cache, input, slots)?;
+ return Some(hm.pattern());
+ }
+ // There is an unfortunate special case where if the regex can
+ // match the empty string and UTF-8 mode is enabled, the search
+ // implementation requires that the slots have at least enough space
+ // to report the bounds of any match. This is so zero-width matches
+ // that split a codepoint can be filtered out.
+ //
+ // Note that if utf8empty is true, we specialize the case for when
+ // the number of patterns is 1. In that case, we can just use a stack
+ // allocation. Otherwise we resort to a heap allocation, which we
+ // convince ourselves we're fine with due to the pathological nature of
+ // this case.
+ let min = self.get_nfa().group_info().implicit_slot_len();
+ if slots.len() >= min {
+ let hm = self.search_slots_imp(cache, input, slots)?;
+ return Some(hm.pattern());
+ }
+ if self.get_nfa().pattern_len() == 1 {
+ let mut enough = [None, None];
+ let got = self.search_slots_imp(cache, input, &mut enough);
+ // This is OK because we know `enough` is strictly bigger than
+ // `slots`, otherwise this special case isn't reached.
+ slots.copy_from_slice(&enough[..slots.len()]);
+ return got.map(|hm| hm.pattern());
+ }
+ let mut enough = vec![None; min];
+ let got = self.search_slots_imp(cache, input, &mut enough);
+ // This is OK because we know `enough` is strictly bigger than `slots`,
+ // otherwise this special case isn't reached.
+ slots.copy_from_slice(&enough[..slots.len()]);
+ got.map(|hm| hm.pattern())
+ }
+
+ /// This is the actual implementation of `search_slots` that
+ /// doesn't account for the special case when 1) the NFA has UTF-8 mode
+ /// enabled, 2) the NFA can match the empty string and 3) the caller has
+ /// provided an insufficient number of slots to record match offsets.
+ #[inline(never)]
+ fn search_slots_imp(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<HalfMatch> {
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ let hm = match self.search_imp(cache, input, slots) {
+ None => return None,
+ Some(hm) if !utf8empty => return Some(hm),
+ Some(hm) => hm,
+ };
+ empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
+ Ok(self
+ .search_imp(cache, input, slots)
+ .map(|hm| (hm, hm.offset())))
+ })
+ // OK because the PikeVM never errors.
+ .unwrap()
+ }
+
+ /// Writes the set of patterns that match anywhere in the given search
+ /// configuration to `patset`. If multiple patterns match at the same
+ /// position and this `PikeVM` was configured with [`MatchKind::All`]
+ /// semantics, then all matching patterns are written to the given set.
+ ///
+ /// Unless all of the patterns in this `PikeVM` are anchored, this will,
+ /// generally speaking, visit every byte in the haystack.
+ ///
+ /// This search routine *does not* clear the pattern set. This gives some
+ /// flexibility to the caller (e.g., running multiple searches with the
+ /// same pattern set), but does make the API bug-prone if you're reusing
+ /// the same pattern set for multiple searches but intended them to be
+ /// independent.
+ ///
+ /// If a pattern ID matched but the given `PatternSet` does not have
+ /// sufficient capacity to store it, then it is not inserted and silently
+ /// dropped.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find all matching patterns in a haystack,
+ /// even when some patterns match at the same position as other patterns.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// Input, MatchKind, PatternSet,
+ /// };
+ ///
+ /// let patterns = &[
+ /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
+ /// ];
+ /// let re = PikeVM::builder()
+ /// .configure(PikeVM::config().match_kind(MatchKind::All))
+ /// .build_many(patterns)?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let input = Input::new("foobar");
+ /// let mut patset = PatternSet::new(re.pattern_len());
+ /// re.which_overlapping_matches(&mut cache, &input, &mut patset);
+ /// let expected = vec![0, 2, 3, 4, 6];
+ /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
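+ ///
+ /// # Example: insufficient capacity
+ ///
+ /// A minimal sketch (not part of the original documentation) of the
+ /// capacity caveat above: a `PatternSet` that cannot hold a matching
+ /// pattern ID silently drops that match.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input, PatternSet};
+ ///
+ /// let re = PikeVM::new_many(&[r"foo", r"bar"])?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// // Capacity 1 means only pattern ID 0 can be inserted, but only
+ /// // pattern 1 matches here. Its match is dropped and the set stays
+ /// // empty.
+ /// let mut patset = PatternSet::new(1);
+ /// re.which_overlapping_matches(&mut cache, &Input::new("bar"), &mut patset);
+ /// assert!(patset.is_empty());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```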
+ #[inline]
+ pub fn which_overlapping_matches(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ self.which_overlapping_imp(cache, input, patset)
+ }
+}
+
+impl PikeVM {
+ /// The implementation of standard leftmost search.
+ ///
+ /// Capturing group spans are written to `slots`, but only if requested.
+ /// `slots` can be any length. Any slot in the NFA that is activated but
+ /// which is out of bounds for the given `slots` is ignored.
+ fn search_imp(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<HalfMatch> {
+ cache.setup_search(slots.len());
+ if input.is_done() {
+ return None;
+ }
+ // Why do we even care about this? Well, in our 'Captures'
+ // representation, we use usize::MAX as a sentinel to indicate "no
+ // match." This isn't problematic so long as our haystack doesn't have
+ // a maximal length. Byte slices are guaranteed by Rust to have a
+ // length that fits into isize, and so this assert should always pass.
+ // But we put it here to make our assumption explicit.
+ assert!(
+ input.haystack().len() < core::usize::MAX,
+ "byte slice lengths must be less than usize MAX",
+ );
+ instrument!(|c| c.reset(&self.nfa));
+
+ // Whether we want to visit all match states instead of emulating the
+ // 'leftmost' semantics of typical backtracking regex engines.
+ let allmatches =
+ self.config.get_match_kind().continue_past_first_match();
+ let (anchored, start_id) = match self.start_config(input) {
+ None => return None,
+ Some(config) => config,
+ };
+
+ let pre =
+ if anchored { None } else { self.get_config().get_prefilter() };
+ let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
+ let mut hm = None;
+ // Yes, our search doesn't end at input.end(), but includes it. This
+ // is necessary because matches are delayed by one byte, just like
+ // how the DFA engines work. The delay is used to handle look-behind
+ // assertions. In the case of the PikeVM, the delay is implemented
+ // by not considering a match to exist until it is visited in
+ // 'steps'. Technically, we know a match exists in the previous
+ // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA
+ // determinization. We don't mark a DFA state as a match state if it
+ // contains an NFA match state, but rather, whether the DFA state was
+ // generated by a transition from a DFA state that contains an NFA
+ // match state.)
+ let mut at = input.start();
+ while at <= input.end() {
+ // If we have no states left to visit, then there are some cases
+ // where we know we can quit early or even skip ahead.
+ if curr.set.is_empty() {
+ // We have a match and we haven't been instructed to continue
+ // on even after finding a match, so we can quit.
+ if hm.is_some() && !allmatches {
+ break;
+ }
+ // If we're running an anchored search and we've advanced
+ // beyond the start position with no other states to try, then
+ // we will never observe a match and thus can stop.
+ if anchored && at > input.start() {
+ break;
+ }
+ // If there no states left to explore at this position and we
+ // know we can't terminate early, then we are effectively at
+ // the starting state of the NFA. If we fell through here,
+ // we'd end up adding our '(?s-u:.)*?' prefix and it would be
+ // the only thing in 'curr'. So we might as well just skip
+ // ahead until we find something that we know might advance us
+ // forward.
+ if let Some(ref pre) = pre {
+ let span = Span::from(at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => break,
+ Some(ref span) => at = span.start,
+ }
}
- // TODO: prefilter
}
- if (!anchored && matched_pid.is_none())
- || cache.clist.set.is_empty()
+ // Instead of using the NFA's unanchored start state, we actually
+ // always use its anchored starting state. As a result, when doing
+ // an unanchored search, we need to simulate our own '(?s-u:.)*?'
+ // prefix, to permit a match to appear anywhere.
+ //
+ // Now, we don't *have* to do things this way. We could use the
+ // NFA's unanchored starting state and do one 'epsilon_closure'
+ // call from that starting state before the main loop here. And
+ // that is just as correct. However, it turns out to be slower
+ // than our approach here because it slightly increases the cost
+ // of processing each byte by requiring us to visit more NFA
+ // states to deal with the additional NFA states in the unanchored
+ // prefix. By simulating it explicitly here, we lower those costs
+ // substantially. The cost is itself small, but it adds up for
+ // large haystacks.
+ //
+ // In order to simulate the '(?s-u:.)*?' prefix---which is not
+ // greedy---we are careful not to perform an epsilon closure on
+ // the start state if we already have a match. Namely, if we
+ // did otherwise, we would never reach a terminating condition
+ // because there would always be additional states to process.
+ // In effect, the exclusion of running 'epsilon_closure' when
+ // we have a match corresponds to the "dead" states we have in
+ // our DFA regex engines. Namely, in a DFA, match states merely
+ // instruct the search execution to record the current offset as
+ // the most recently seen match. It is the dead state that actually
+ // indicates when to stop the search (other than EOF or quit
+ // states).
+ //
+ // However, when 'allmatches' is true, the caller has asked us to
+ // leave in every possible match state. This tends not to make a
+ // whole lot of sense in unanchored searches, because it means the
+ // search really cannot terminate until EOF. And often, in that
+ // case, you wind up skipping over a bunch of matches and are left
+ // with the "last" match. Arguably, it just doesn't make a lot of
+ // sense to run a 'leftmost' search (which is what this routine is)
+ // with 'allmatches' set to true. But the DFAs support it and this
+ // matches their behavior. (Generally, 'allmatches' is useful for
+ // overlapping searches or leftmost anchored searches to find the
+ // longest possible match by ignoring match priority.)
+ //
+ // Additionally, when we're running an anchored search, this
+ // epsilon closure should only be computed at the beginning of the
+ // search. If we re-computed it at every position, we would be
+ // simulating an unanchored search when we were tasked to perform
+ // an anchored search.
+ if (!hm.is_some() || allmatches)
+ && (!anchored || at == input.start())
{
- self.epsilon_closure(
- &mut cache.clist,
- &mut caps.slots,
- &mut cache.stack,
- self.nfa.start_anchored(),
- haystack,
- at,
- );
+ // Since we are adding to the 'curr' active states and since
+ // this is for the start ID, we use a slots slice that is
+ // guaranteed to have the right length but where every element
+ // is absent. This is exactly what we want, because this
+ // epsilon closure is responsible for simulating an unanchored
+ // '(?s:.)*?' prefix. It is specifically outside of any
+ // capturing groups, and thus, using slots that are always
+ // absent is correct.
+ //
+ // Note though that we can't just use '&mut []' here, since
+ // this epsilon closure may traverse through 'Captures' epsilon
+ // transitions, and thus must be able to write offsets to the
+ // slots given which are later copied to slot values in 'curr'.
+ let slots = next.slot_table.all_absent();
+ self.epsilon_closure(stack, slots, curr, input, at, start_id);
}
- for i in 0..cache.clist.set.len() {
- let sid = cache.clist.set.get(i);
- let pid = match self.step(
- &mut cache.nlist,
- &mut caps.slots,
- cache.clist.caps(sid),
- &mut cache.stack,
- sid,
- haystack,
- at,
- ) {
- None => continue,
- Some(pid) => pid,
- };
- matched_pid = Some(pid);
- break;
+ if let Some(pid) = self.nexts(stack, curr, next, input, at, slots)
+ {
+ hm = Some(HalfMatch::new(pid, at));
}
- if at >= end {
+ // Unless the caller asked us to return early, we need to mush on
+ // to see if we can extend our match. (But note that 'nexts' will
+ // quit right after seeing a match when match_kind==LeftmostFirst,
+ // as is consistent with leftmost-first match priority.)
+ if input.get_earliest() && hm.is_some() {
break;
}
+ core::mem::swap(curr, next);
+ next.set.clear();
at += 1;
- cache.swap();
- cache.nlist.set.clear();
}
- matched_pid.map(|pid| {
- let slots = self.nfa.pattern_slots(pid);
- let (start, end) = (slots.start, slots.start + 1);
- MultiMatch::new(
- pid,
- caps.slots[start].unwrap(),
- caps.slots[end].unwrap(),
- )
- })
+ instrument!(|c| c.eprint(&self.nfa));
+ hm
+ }
+
+ /// The implementation for the 'which_overlapping_matches' API. Basically,
+ /// we do a single scan through the entire haystack (unless our regex
+ /// or search is anchored) and record every pattern that matched. In
+ /// particular, when MatchKind::All is used, this supports overlapping
+ /// matches. So if we have the regexes 'sam' and 'samwise', they will
+ /// *both* be reported in the pattern set when searching the haystack
+ /// 'samwise'.
+ fn which_overlapping_imp(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) {
+ // NOTE: This is effectively a copy of 'search_imp' above, but with no
+ // captures support and instead writes patterns that matched directly
+ // to 'patset'. See that routine for better commentary about what's
+ // going on in this routine. We probably could unify the routines using
+ // generics or more helper routines, but I'm not sure it's worth it.
+ //
+ // NOTE: We somewhat go out of our way here to support things like
+ // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither
+ // of those seem particularly relevant to this routine, but they are
+ // both supported by the DFA analogs of this routine by construction
+ // and composition, so it seems like good sense to have the PikeVM
+ // match that behavior.
+
+ cache.setup_search(0);
+ if input.is_done() {
+ return;
+ }
+ assert!(
+ input.haystack().len() < core::usize::MAX,
+ "byte slice lengths must be less than usize MAX",
+ );
+ instrument!(|c| c.reset(&self.nfa));
+
+ let allmatches =
+ self.config.get_match_kind().continue_past_first_match();
+ let (anchored, start_id) = match self.start_config(input) {
+ None => return,
+ Some(config) => config,
+ };
+
+ let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
+ for at in input.start()..=input.end() {
+ let any_matches = !patset.is_empty();
+ if curr.set.is_empty() {
+ if any_matches && !allmatches {
+ break;
+ }
+ if anchored && at > input.start() {
+ break;
+ }
+ }
+ if !any_matches || allmatches {
+ let slots = &mut [];
+ self.epsilon_closure(stack, slots, curr, input, at, start_id);
+ }
+ self.nexts_overlapping(stack, curr, next, input, at, patset);
+ // If we found a match and filled our set, then there is no more
+ // additional info that we can provide. Thus, we can quit. We also
+ // quit if the caller asked us to stop at the earliest point that
+ // we know a match exists.
+ if patset.is_full() || input.get_earliest() {
+ break;
+ }
+ core::mem::swap(curr, next);
+ next.set.clear();
+ }
+ instrument!(|c| c.eprint(&self.nfa));
+ }
+
+ /// Process the active states in 'curr' to find the states (written to
+ /// 'next') we should process for the next byte in the haystack.
+ ///
+ /// 'stack' is used to perform a depth first traversal of the NFA when
+ /// computing an epsilon closure.
+ ///
+ /// When a match is found, the slots for that match state (in 'curr') are
+ /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr'
+ /// stops (unless the PikeVM was configured with MatchKind::All semantics).
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn nexts(
+ &self,
+ stack: &mut Vec<FollowEpsilon>,
+ curr: &mut ActiveStates,
+ next: &mut ActiveStates,
+ input: &Input<'_>,
+ at: usize,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Option<PatternID> {
+ instrument!(|c| c.record_state_set(&curr.set));
+ let mut pid = None;
+ let ActiveStates { ref set, ref mut slot_table } = *curr;
+ for sid in set.iter() {
+ pid = match self.next(stack, slot_table, next, input, at, sid) {
+ None => continue,
+ Some(pid) => Some(pid),
+ };
+ slots.copy_from_slice(slot_table.for_state(sid));
+ if !self.config.get_match_kind().continue_past_first_match() {
+ break;
+ }
+ }
+ pid
}
- #[inline(always)]
- fn step(
+ /// Like 'nexts', but for the overlapping case. This doesn't write any
+ /// slots, and instead just writes which pattern matched in 'patset'.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn nexts_overlapping(
&self,
- nlist: &mut Threads,
- slots: &mut [Slot],
- thread_caps: &mut [Slot],
stack: &mut Vec<FollowEpsilon>,
- sid: StateID,
- haystack: &[u8],
+ curr: &mut ActiveStates,
+ next: &mut ActiveStates,
+ input: &Input<'_>,
at: usize,
+ patset: &mut PatternSet,
+ ) {
+ instrument!(|c| c.record_state_set(&curr.set));
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ let ActiveStates { ref set, ref mut slot_table } = *curr;
+ for sid in set.iter() {
+ let pid = match self.next(stack, slot_table, next, input, at, sid)
+ {
+ None => continue,
+ Some(pid) => pid,
+ };
+ // This handles the case of finding a zero-width match that splits
+ // a codepoint. Namely, if we're in UTF-8 mode AND we know we can
+ // match the empty string, then the only valid way of getting to
+ // this point with an offset that splits a codepoint is when we
+ // have an empty match. Such matches, in UTF-8 mode, must not be
+ // reported. So we just skip them here and pretend as if we did
+ // not see a match.
+ if utf8empty && !input.is_char_boundary(at) {
+ continue;
+ }
+ let _ = patset.try_insert(pid);
+ if !self.config.get_match_kind().continue_past_first_match() {
+ break;
+ }
+ }
+ }
+
+ /// Starting from 'sid', if the position 'at' in the 'input' haystack has a
+ /// transition defined out of 'sid', then add the state transitioned to and
+ /// its epsilon closure to the 'next' set of states to explore.
+ ///
+ /// 'stack' is used by the epsilon closure computation to perform a depth
+ /// first traversal of the NFA.
+ ///
+ /// 'curr_slot_table' should be the table of slots for the current set of
+ /// states being explored. If there is a transition out of 'sid', then
+ /// sid's row in the slot table is used to perform the epsilon closure.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn next(
+ &self,
+ stack: &mut Vec<FollowEpsilon>,
+ curr_slot_table: &mut SlotTable,
+ next: &mut ActiveStates,
+ input: &Input<'_>,
+ at: usize,
+ sid: StateID,
) -> Option<PatternID> {
+ instrument!(|c| c.record_step(sid));
match *self.nfa.state(sid) {
State::Fail
| State::Look { .. }
| State::Union { .. }
+ | State::BinaryUnion { .. }
| State::Capture { .. } => None,
- State::Range { ref range } => {
- if range.matches(haystack, at) {
+ State::ByteRange { ref trans } => {
+ if trans.matches(input.haystack(), at) {
+ let slots = curr_slot_table.for_state(sid);
+ // OK because 'at <= haystack.len() < usize::MAX', so
+ // adding 1 will never wrap.
+ let at = at.wrapping_add(1);
self.epsilon_closure(
- nlist,
- thread_caps,
- stack,
- range.next,
- haystack,
- at + 1,
+ stack, slots, next, input, at, trans.next,
);
}
None
}
State::Sparse(ref sparse) => {
- if let Some(next) = sparse.matches(haystack, at) {
+ if let Some(next_sid) = sparse.matches(input.haystack(), at) {
+ let slots = curr_slot_table.for_state(sid);
+ // OK because 'at <= haystack.len() < usize::MAX', so
+ // adding 1 will never wrap.
+ let at = at.wrapping_add(1);
self.epsilon_closure(
- nlist,
- thread_caps,
- stack,
- next,
- haystack,
- at + 1,
+ stack, slots, next, input, at, next_sid,
);
}
None
}
- State::Match { id } => {
- slots.copy_from_slice(thread_caps);
- Some(id)
+ State::Dense(ref dense) => {
+ if let Some(next_sid) = dense.matches(input.haystack(), at) {
+ let slots = curr_slot_table.for_state(sid);
+ // OK because 'at <= haystack.len() < usize::MAX', so
+ // adding 1 will never wrap.
+ let at = at.wrapping_add(1);
+ self.epsilon_closure(
+ stack, slots, next, input, at, next_sid,
+ );
+ }
+ None
}
+ State::Match { pattern_id } => Some(pattern_id),
}
}
- #[inline(always)]
+ /// Compute the epsilon closure of 'sid', writing the closure into 'next'
+ /// while copying slot values from 'curr_slots' into corresponding states
+ /// in 'next'. 'curr_slots' should be the slot values corresponding to
+ /// 'sid'.
+ ///
+ /// The given 'stack' is used to perform a depth first traversal of the
+ /// NFA by recursively following all epsilon transitions out of 'sid'.
+ /// Conditional epsilon transitions are followed if and only if they are
+ /// satisfied for the position 'at' in the 'input' haystack.
+ ///
+ /// While this routine may write to 'curr_slots', once it returns, any
+ /// writes are undone and the original values (even if absent) are
+ /// restored.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn epsilon_closure(
&self,
- nlist: &mut Threads,
- thread_caps: &mut [Slot],
stack: &mut Vec<FollowEpsilon>,
- sid: StateID,
- haystack: &[u8],
+ curr_slots: &mut [Option<NonMaxUsize>],
+ next: &mut ActiveStates,
+ input: &Input<'_>,
at: usize,
+ sid: StateID,
) {
- stack.push(FollowEpsilon::StateID(sid));
+ instrument!(|c| {
+ c.record_closure(sid);
+ c.record_stack_push(sid);
+ });
+ stack.push(FollowEpsilon::Explore(sid));
while let Some(frame) = stack.pop() {
match frame {
- FollowEpsilon::StateID(sid) => {
- self.epsilon_closure_step(
- nlist,
- thread_caps,
- stack,
- sid,
- haystack,
- at,
- );
+ FollowEpsilon::RestoreCapture { slot, offset: pos } => {
+ curr_slots[slot] = pos;
}
- FollowEpsilon::Capture { slot, pos } => {
- thread_caps[slot] = pos;
+ FollowEpsilon::Explore(sid) => {
+ self.epsilon_closure_explore(
+ stack, curr_slots, next, input, at, sid,
+ );
}
}
}
}
- #[inline(always)]
- fn epsilon_closure_step(
+ /// Explore all of the epsilon transitions out of 'sid'. This is mostly
+ /// split out from 'epsilon_closure' in order to clearly delineate
+ /// the actual work of computing an epsilon closure from the stack
+ /// book-keeping.
+ ///
+ /// This will push any additional explorations needed on to 'stack'.
+ ///
+ /// 'curr_slots' should refer to the slots for the currently active NFA
+ /// state. That is, the current state we are stepping through. These
+ /// slots are mutated in place as new 'Capture' states are traversed
+ /// during epsilon closure, but the slots are restored to their original
+ /// values once the full epsilon closure is completed. The ultimate use of
+ /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that
+ /// the capturing group spans are forwarded from the currently active state
+ /// to the next.
+ ///
+ /// 'next' refers to the next set of active states. Computing an epsilon
+ /// closure may increase the next set of active states.
+ ///
+ /// 'input' refers to the caller's input configuration and 'at' refers to
+ /// the current position in the haystack. These are used to check whether
+ /// conditional epsilon transitions (like look-around) are satisfied at
+ /// the current position. If they aren't, then the epsilon closure won't
+ /// include them.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn epsilon_closure_explore(
&self,
- nlist: &mut Threads,
- thread_caps: &mut [Slot],
stack: &mut Vec<FollowEpsilon>,
- mut sid: StateID,
- haystack: &[u8],
+ curr_slots: &mut [Option<NonMaxUsize>],
+ next: &mut ActiveStates,
+ input: &Input<'_>,
at: usize,
+ mut sid: StateID,
) {
+ // We can avoid pushing some state IDs on to our stack in precisely
+ // the cases where a 'push(x)' would be immediately followed by a 'x
+ // = pop()'. This is achieved by this outer-loop. We simply set 'sid'
+ // to be the next state ID we want to explore once we're done with
+ // our initial exploration. In practice, this avoids a lot of stack
+ // thrashing.
loop {
- if !nlist.set.insert(sid) {
+ instrument!(|c| c.record_set_insert(sid));
+ // Record this state as part of our next set of active states. If
+ // we've already explored it, then no need to do it again.
+ if !next.set.insert(sid) {
return;
}
match *self.nfa.state(sid) {
State::Fail
- | State::Range { .. }
+ | State::Match { .. }
+ | State::ByteRange { .. }
| State::Sparse { .. }
- | State::Match { .. } => {
- let t = &mut nlist.caps(sid);
- t.copy_from_slice(thread_caps);
+ | State::Dense { .. } => {
+ next.slot_table.for_state(sid).copy_from_slice(curr_slots);
return;
}
State::Look { look, next } => {
- if !look.matches(haystack, at) {
+ // OK because we don't permit building a searcher with a
+ // Unicode word boundary if the requisite Unicode data is
+ // unavailable.
+ if !self.nfa.look_matcher().matches_inline(
+ look,
+ input.haystack(),
+ at,
+ ) {
return;
}
sid = next;
@@ -381,174 +1710,650 @@ impl PikeVM {
None => return,
Some(&sid) => sid,
};
+ instrument!(|c| {
+ for &alt in &alternates[1..] {
+ c.record_stack_push(alt);
+ }
+ });
stack.extend(
alternates[1..]
.iter()
.copied()
.rev()
- .map(FollowEpsilon::StateID),
+ .map(FollowEpsilon::Explore),
);
}
- State::Capture { next, slot } => {
- if slot < thread_caps.len() {
- stack.push(FollowEpsilon::Capture {
+ State::BinaryUnion { alt1, alt2 } => {
+ sid = alt1;
+ instrument!(|c| c.record_stack_push(sid));
+ stack.push(FollowEpsilon::Explore(alt2));
+ }
+ State::Capture { next, slot, .. } => {
+ // There's no need to do anything with slots that
+ // ultimately won't be copied into the caller-provided
+ // 'Captures' value. So we just skip dealing with them at
+ // all.
+ if slot.as_usize() < curr_slots.len() {
+ instrument!(|c| c.record_stack_push(sid));
+ stack.push(FollowEpsilon::RestoreCapture {
slot,
- pos: thread_caps[slot],
+ offset: curr_slots[slot],
});
- thread_caps[slot] = Some(at);
+ // OK because length of a slice must fit into an isize.
+ curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap());
}
sid = next;
}
}
}
}
+
+ /// Return the starting configuration of a PikeVM search.
+ ///
+ /// The "start config" is basically whether the search should be anchored
+ /// or not and the NFA state ID at which to begin the search. The state ID
+ /// returned always corresponds to an anchored starting state even when the
+ /// search is unanchored. This is because the PikeVM search loop deals with
+ /// unanchored searches with an explicit epsilon closure out of the start
+ /// state.
+ ///
+ /// This routine accounts for both the caller's `Input` configuration
+ /// and the pattern itself. For example, even if the caller asks for an
+ /// unanchored search, if the pattern itself is anchored, then the anchored
+ /// flag returned will always be 'true' because implementing an unanchored
+ /// search in that case would be incorrect.
+ ///
+ /// Similarly, if the caller requests an anchored search for a particular
+ /// pattern, then the starting state ID returned will reflect that.
+ ///
+ /// If a pattern ID is given in the input configuration that is not in
+ /// this regex, then `None` is returned.
+ fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> {
+ match input.get_anchored() {
+ // Only way we're unanchored is if both the caller asked for an
+ // unanchored search *and* the pattern is itself not anchored.
+ Anchored::No => Some((
+ self.nfa.is_always_start_anchored(),
+ self.nfa.start_anchored(),
+ )),
+ Anchored::Yes => Some((true, self.nfa.start_anchored())),
+ Anchored::Pattern(pid) => {
+ Some((true, self.nfa.start_pattern(pid)?))
+ }
+ }
+ }
}
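+
+// What follows is a minimal, self-contained sketch (hypothetical names, not
+// this crate's API) of the "curr"/"next" simulation implemented above: for
+// each haystack position, step every active state, collect successors into
+// the next set, then swap the two sets. Epsilon closures, capture slots and
+// match priority are all elided here.
+#[cfg(test)]
+mod curr_next_sketch {
+    use alloc::{vec, vec::Vec};
+
+    #[derive(Clone, Copy)]
+    enum MiniState {
+        Byte { byte: u8, next: usize },
+        Match,
+    }
+
+    fn mini_pikevm(states: &[MiniState], start: usize, haystack: &[u8]) -> bool {
+        let mut curr: Vec<usize> = vec![start];
+        let mut next: Vec<usize> = vec![];
+        for &b in haystack {
+            for &sid in &curr {
+                match states[sid] {
+                    MiniState::Byte { byte, next: n } if byte == b => next.push(n),
+                    // An active match state means a match ending here.
+                    MiniState::Match => return true,
+                    MiniState::Byte { .. } => {}
+                }
+            }
+            core::mem::swap(&mut curr, &mut next);
+            next.clear();
+            // Unanchored search: re-seed the start state at every position.
+            curr.push(start);
+        }
+        curr.iter().any(|&sid| matches!(states[sid], MiniState::Match))
+    }
+
+    #[test]
+    fn finds_ab() {
+        // NFA for the literal "ab": state 0 --a--> 1 --b--> 2 (match).
+        let states = [
+            MiniState::Byte { byte: b'a', next: 1 },
+            MiniState::Byte { byte: b'b', next: 2 },
+            MiniState::Match,
+        ];
+        assert!(mini_pikevm(&states, 0, b"xxab"));
+        assert!(!mini_pikevm(&states, 0, b"xxa"));
+    }
+}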
-/// An iterator over all non-overlapping leftmost matches for a particular
-/// infallible search.
+/// An iterator over all non-overlapping matches for a particular search.
///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found. If the underlying search returns an error, then this panics.
+/// The iterator yields a [`Match`] value until no more matches could be found.
///
-/// The lifetime variables are as follows:
+/// The lifetime parameters are as follows:
///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
+/// * `'r` represents the lifetime of the PikeVM.
+/// * `'c` represents the lifetime of the PikeVM's cache.
+/// * `'h` represents the lifetime of the haystack being searched.
+///
+/// This iterator can be created with the [`PikeVM::find_iter`] method.
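+///
+/// A short usage sketch, using the same API as the examples elsewhere in
+/// this module:
+///
+/// ```
+/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+///
+/// let re = PikeVM::new(r"[0-9]+")?;
+/// let mut cache = re.create_cache();
+/// let matches: Vec<Match> =
+///     re.find_iter(&mut cache, "a1 b22").collect();
+/// assert_eq!(matches, vec![Match::must(0, 1..2), Match::must(0, 4..6)]);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```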
#[derive(Debug)]
-pub struct FindLeftmostMatches<'r, 'c, 't> {
- vm: &'r PikeVM,
+pub struct FindMatches<'r, 'c, 'h> {
+ re: &'r PikeVM,
cache: &'c mut Cache,
- // scanner: Option<prefilter::Scanner<'r>>,
- text: &'t [u8],
- last_end: usize,
- last_match: Option<usize>,
+ caps: Captures,
+ it: iter::Searcher<'h>,
}
-impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> {
- fn new(
- vm: &'r PikeVM,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> FindLeftmostMatches<'r, 'c, 't> {
- FindLeftmostMatches { vm, cache, text, last_end: 0, last_match: None }
+impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
+ type Item = Match;
+
+ #[inline]
+ fn next(&mut self) -> Option<Match> {
+ // Splitting 'self' apart seems necessary to appease borrowck.
+ let FindMatches { re, ref mut cache, ref mut caps, ref mut it } =
+ *self;
+ // 'advance' converts errors into panics, which is OK here because
+ // the PikeVM can never return an error.
+ it.advance(|input| {
+ re.search(cache, input, caps);
+ Ok(caps.get_match())
+ })
}
}
-impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> {
- // type Item = Captures;
- type Item = MultiMatch;
+/// An iterator over all non-overlapping leftmost matches, with their capturing
+/// groups, for a particular search.
+///
+/// The iterator yields a [`Captures`] value until no more matches could be
+/// found.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'r` represents the lifetime of the PikeVM.
+/// * `'c` represents the lifetime of the PikeVM's cache.
+/// * `'h` represents the lifetime of the haystack being searched.
+///
+/// This iterator can be created with the [`PikeVM::captures_iter`] method.
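+///
+/// A short usage sketch, analogous to the `FindMatches` example above:
+///
+/// ```
+/// use regex_automata::nfa::thompson::pikevm::PikeVM;
+///
+/// let re = PikeVM::new(r"[0-9]+")?;
+/// let mut cache = re.create_cache();
+/// let ends: Vec<usize> = re
+///     .captures_iter(&mut cache, "a1 b22")
+///     .map(|caps| caps.get_match().unwrap().end())
+///     .collect();
+/// assert_eq!(ends, vec![2, 6]);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```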
+#[derive(Debug)]
+pub struct CapturesMatches<'r, 'c, 'h> {
+ re: &'r PikeVM,
+ cache: &'c mut Cache,
+ caps: Captures,
+ it: iter::Searcher<'h>,
+}
- // fn next(&mut self) -> Option<Captures> {
- fn next(&mut self) -> Option<MultiMatch> {
- if self.last_end > self.text.len() {
- return None;
- }
- let mut caps = self.vm.create_captures();
- let m = self.vm.find_leftmost_at(
- self.cache,
- self.text,
- self.last_end,
- self.text.len(),
- &mut caps,
- )?;
- if m.is_empty() {
- // This is an empty match. To ensure we make progress, start
- // the next search at the smallest possible starting position
- // of the next match following this one.
- self.last_end = if self.vm.config.get_utf8() {
- crate::util::next_utf8(self.text, m.end())
- } else {
- m.end() + 1
- };
- // Don't accept empty matches immediately following a match.
- // Just move on to the next match.
- if Some(m.end()) == self.last_match {
- return self.next();
- }
+impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> {
+ type Item = Captures;
+
+ #[inline]
+ fn next(&mut self) -> Option<Captures> {
+ // Splitting 'self' apart seems necessary to appease borrowck.
+ let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
+ *self;
+ // 'advance' converts errors into panics, which is OK here because
+ // the PikeVM can never return an error.
+ it.advance(|input| {
+ re.search(cache, input, caps);
+ Ok(caps.get_match())
+ });
+ if caps.is_match() {
+ Some(caps.clone())
} else {
- self.last_end = m.end();
+ None
}
- self.last_match = Some(m.end());
- Some(m)
}
}
+/// A cache represents mutable state that a [`PikeVM`] requires during a
+/// search.
+///
+/// For a given [`PikeVM`], its corresponding cache may be created either via
+/// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in
+/// every way, except the former does not require explicitly importing `Cache`.
+///
+/// A particular `Cache` is coupled with the [`PikeVM`] from which it
+/// was created. It may only be used with that `PikeVM`. A cache and its
+/// allocations may be re-purposed via [`Cache::reset`], in which case, it can
+/// only be used with the new `PikeVM` (and not the old one).
#[derive(Clone, Debug)]
-pub struct Captures {
- slots: Vec<Slot>,
+pub struct Cache {
+ /// Stack used while computing epsilon closure. This effectively lets us
+ /// move what is more naturally expressed through recursion to a stack
+ /// on the heap.
+ stack: Vec<FollowEpsilon>,
+ /// The current active states being explored for the current byte in the
+ /// haystack.
+ curr: ActiveStates,
+ /// The next set of states we're building that will be explored for the
+ /// next byte in the haystack.
+ next: ActiveStates,
}
-impl Captures {
- pub fn new(nfa: &NFA) -> Captures {
- Captures { slots: vec![None; nfa.capture_slot_len()] }
+impl Cache {
+ /// Create a new [`PikeVM`] cache.
+ ///
+ /// A potentially more convenient routine to create a cache is
+ /// [`PikeVM::create_cache`], as it does not require also importing the
+ /// `Cache` type.
+ ///
+ /// If you want to reuse the returned `Cache` with some other `PikeVM`,
+ /// then you must call [`Cache::reset`] with the desired `PikeVM`.
+ pub fn new(re: &PikeVM) -> Cache {
+ Cache {
+ stack: vec![],
+ curr: ActiveStates::new(re),
+ next: ActiveStates::new(re),
+ }
+ }
+
+ /// Reset this cache such that it can be used for searching with a
+ /// different [`PikeVM`].
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different `PikeVM`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different `PikeVM`.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re1 = PikeVM::new(r"\w")?;
+ /// let re2 = PikeVM::new(r"\W")?;
+ ///
+ /// let mut cache = re1.create_cache();
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..2)),
+ /// re1.find_iter(&mut cache, "Δ").next(),
+ /// );
+ ///
+ /// // Using 'cache' with re2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the PikeVM we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 're1' is also not
+ /// // allowed.
+ /// cache.reset(&re2);
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..3)),
+ /// re2.find_iter(&mut cache, "☃").next(),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset(&mut self, re: &PikeVM) {
+ self.curr.reset(re);
+ self.next.reset(re);
+ }
+
+ /// Returns the heap memory usage, in bytes, of this cache.
+ ///
+ /// This does **not** include the stack size used up by this cache. To
+ /// compute that, use `std::mem::size_of::<Cache>()`.
+ pub fn memory_usage(&self) -> usize {
+ use core::mem::size_of;
+ (self.stack.len() * size_of::<FollowEpsilon>())
+ + self.curr.memory_usage()
+ + self.next.memory_usage()
+ }
+
+ /// Clears this cache. This should be called at the start of every search
+ /// to ensure we start with a clean slate.
+ ///
+ /// This also sets the length of the capturing groups used in the current
+ /// search. This permits an optimization whereby 'SlotTable::for_state'
+ /// only returns the number of slots equivalent to the number of slots
+ /// given in the 'Captures' value. This may be less than the total number
+ /// of possible slots, e.g., when one only wants to track overall match
+ /// offsets. This in turn permits less copying of capturing group spans
+ /// in the PikeVM.
+ fn setup_search(&mut self, captures_slot_len: usize) {
+ self.stack.clear();
+ self.curr.setup_search(captures_slot_len);
+ self.next.setup_search(captures_slot_len);
}
}
+/// A set of active states used to "simulate" the execution of an NFA via the
+/// PikeVM.
+///
+/// There are two sets of these used during NFA simulation. One set corresponds
+/// to the "current" set of states being traversed for the current position
+/// in a haystack. The other set corresponds to the "next" set of states being
+/// built, which will become the new "current" set for the next position in the
+/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
+/// original paper on regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
+///
+/// In addition to representing a set of NFA states, this also maintains slot
+/// values for each state. These slot values are what turn the NFA simulation
+/// into the "Pike VM." Namely, they track capturing group values for each
+/// state. During the computation of epsilon closure, we copy slot values from
+/// states in the "current" set to the "next" set. Eventually, once a match
+/// is found, the slot values for that match state are what we write to the
+/// caller provided 'Captures' value.
#[derive(Clone, Debug)]
-pub struct Cache {
- stack: Vec<FollowEpsilon>,
- clist: Threads,
- nlist: Threads,
+struct ActiveStates {
+ /// The set of active NFA states. This set preserves insertion order, which
+ /// is critical for simulating the match semantics of backtracking regex
+ /// engines.
+ set: SparseSet,
+ /// The slots for every NFA state, where each slot stores a (possibly
+ /// absent) offset. Every capturing group has two slots. One for a start
+ /// offset and one for an end offset.
+ slot_table: SlotTable,
}
-type Slot = Option<usize>;
+impl ActiveStates {
+ /// Create a new set of active states for the given PikeVM. The active
+ /// states returned may only be used with the given PikeVM. (Use 'reset'
+ /// to re-purpose the allocation for a different PikeVM.)
+ fn new(re: &PikeVM) -> ActiveStates {
+ let mut active = ActiveStates {
+ set: SparseSet::new(0),
+ slot_table: SlotTable::new(),
+ };
+ active.reset(re);
+ active
+ }
+
+ /// Reset this set of active states such that it can be used with the given
+ /// PikeVM (and only that PikeVM).
+ fn reset(&mut self, re: &PikeVM) {
+ self.set.resize(re.get_nfa().states().len());
+ self.slot_table.reset(re);
+ }
+
+ /// Return the heap memory usage, in bytes, used by this set of active
+ /// states.
+ ///
+ /// This does not include the stack size of this value.
+ fn memory_usage(&self) -> usize {
+ self.set.memory_usage() + self.slot_table.memory_usage()
+ }
+ /// Setup this set of active states for a new search. The given slot
+ /// length should be the number of slots in a caller provided 'Captures'
+ /// (and may be zero).
+ fn setup_search(&mut self, captures_slot_len: usize) {
+ self.set.clear();
+ self.slot_table.setup_search(captures_slot_len);
+ }
+}
+
+/// A table of slots, where each row represent a state in an NFA. Thus, the
+/// table has room for storing slots for every single state in an NFA.
+///
+/// This table is represented with a single contiguous allocation. In general,
+/// the notion of "capturing group" doesn't really exist at this level of
+/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
+/// maps to a pair of slots, one for the start offset and one for the end
+/// offset.) Slots are indexed by the 'Capture' NFA state.
+///
+/// N.B. Not every state actually needs a row of slots. Namely, states that
+/// only have epsilon transitions currently never have anything written to
+/// their rows in this table. Thus, the table is somewhat wasteful in its heap
+/// usage. However, it is important to maintain fast random access by state
+/// ID, which means one giant table tends to work well. RE2 takes a different
+/// approach here and allocates each row as its own reference counted thing.
+/// I explored such a strategy at one point here, but couldn't get it to work
+/// well using entirely safe code. (To the ambitious reader: I encourage you to
+/// re-litigate that experiment.) I very much wanted to stick to safe code, but
+/// could be convinced otherwise if there was a solid argument and the safety
+/// was encapsulated well.
#[derive(Clone, Debug)]
-struct Threads {
- set: SparseSet,
- caps: Vec<Slot>,
- slots_per_thread: usize,
+struct SlotTable {
+ /// The actual table of offsets.
+ table: Vec<Option<NonMaxUsize>>,
+ /// The number of slots per state, i.e., the table's stride or the length
+ /// of each row.
+ slots_per_state: usize,
+ /// The number of slots in the caller-provided 'Captures' value for the
+ /// current search. Setting this to 'slots_per_state' is always correct,
+ /// but may be wasteful.
+ slots_for_captures: usize,
+}
+
+impl SlotTable {
+ /// Create a new slot table.
+ ///
+ /// One should call 'reset' with the corresponding PikeVM before use.
+ fn new() -> SlotTable {
+ SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
+ }
+
+ /// Reset this slot table such that it can be used with the given PikeVM
+ /// (and only that PikeVM).
+ fn reset(&mut self, re: &PikeVM) {
+ let nfa = re.get_nfa();
+ self.slots_per_state = nfa.group_info().slot_len();
+ // This is always correct, but may be reduced for a particular search
+ // if a 'Captures' has fewer slots, e.g., none at all or only slots
+ // for tracking the overall match instead of all slots for every
+ // group.
+ self.slots_for_captures = core::cmp::max(
+ self.slots_per_state,
+ nfa.pattern_len().checked_mul(2).unwrap(),
+ );
+ let len = nfa
+ .states()
+ .len()
+ .checked_mul(self.slots_per_state)
+ // Add space to account for scratch space used during a search.
+ .and_then(|x| x.checked_add(self.slots_for_captures))
+ // It seems like this could actually panic on legitimate inputs on
+ // 32-bit targets, and very likely to panic on 16-bit. Should we
+ // somehow convert this to an error? What about something similar
+ // for the lazy DFA cache? If you're tripping this assert, please
+ // file a bug.
+ .expect("slot table length doesn't overflow");
+ // This happens about as often as a regex is compiled, so it probably
+ // should be at debug level, but I found it quite distracting and not
+ // particularly useful.
+ trace!(
+ "resizing PikeVM active states table to {} entries \
+ (slots_per_state={})",
+ len,
+ self.slots_per_state,
+ );
+ self.table.resize(len, None);
+ }
+
+ /// Return the heap memory usage, in bytes, used by this slot table.
+ ///
+ /// This does not include the stack size of this value.
+ fn memory_usage(&self) -> usize {
+ self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>()
+ }
+
+ /// Perform any per-search setup for this slot table.
+ ///
+ /// In particular, this sets the length of the number of slots used in the
+ /// 'Captures' given by the caller (if any at all). This number may be
+ /// smaller than the total number of slots available, e.g., when the caller
+ /// is only interested in tracking the overall match and not the spans of
+ /// every matching capturing group. Only tracking the overall match can
+ /// save a substantial amount of time copying capturing spans during a
+ /// search.
+ fn setup_search(&mut self, captures_slot_len: usize) {
+ self.slots_for_captures = captures_slot_len;
+ }
+
+ /// Return a mutable slice of the slots for the given state.
+ ///
+ /// Note that the length of the slice returned may be less than the total
+ /// number of slots available for this state. In particular, the length
+ /// always matches the number of slots indicated via 'setup_search'.
+ fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
+ let i = sid.as_usize() * self.slots_per_state;
+ &mut self.table[i..i + self.slots_for_captures]
+ }
+
+ /// Return a slice of slots of appropriate length where every slot offset
+ /// is guaranteed to be absent. This is useful in cases where you need to
+ /// compute an epsilon closure outside of the user supplied regex, and thus
+ /// never want it to have any capturing slots set.
+ fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
+ let i = self.table.len() - self.slots_for_captures;
+ &mut self.table[i..i + self.slots_for_captures]
+ }
}
+/// Represents a stack frame for use while computing an epsilon closure.
+///
+/// (An "epsilon closure" refers to the set of reachable NFA states from a
+/// single state without consuming any input. That is, the set of all epsilon
+/// transitions not only from that single state, but from every other state
+/// reachable by an epsilon transition as well. This is why it's called a
+/// "closure." Computing an epsilon closure is also done during DFA
+/// determinization! Compare and contrast the epsilon closure here in this
+/// PikeVM and the one used for determinization in crate::util::determinize.)
+///
+/// Computing the epsilon closure in a Thompson NFA proceeds via a depth
+/// first traversal over all epsilon transitions from a particular state.
+/// (A depth first traversal is important because it emulates the same priority
+/// of matches that is typically found in backtracking regex engines.) This
+/// depth first traversal is naturally expressed using recursion, but to avoid
+/// a call stack size proportional to the size of a regex, we put our stack on
+/// the heap instead.
+///
+/// This stack thus consists of call frames. The typical call frame is
+/// `Explore`, which instructs epsilon closure to explore the epsilon
+/// transitions from that state. (Subsequent epsilon transitions are then
+/// pushed on to the stack as more `Explore` frames.) If the state ID being
+/// explored has no epsilon transitions, then the capturing group slots are
+/// copied from the original state that sparked the epsilon closure (from the
+/// 'step' routine) to the state ID being explored. This way, capturing group
+/// slots are forwarded from the previous state to the next.
+///
+/// The other stack frame, `RestoreCapture`, instructs the epsilon closure to
+/// set the position for a particular slot back to some particular offset. This
+/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will
+/// set the offset of the slot indicated in `Capture` to the current offset,
+/// and then push the old offset on to the stack as a `RestoreCapture` frame.
+/// Thus, the new offset is only used until the epsilon closure reverts back to
+/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon
+/// transition its "scope" to only states that come "after" it during depth
+/// first traversal.
#[derive(Clone, Debug)]
enum FollowEpsilon {
- StateID(StateID),
- Capture { slot: usize, pos: Slot },
+ /// Explore the epsilon transitions from a state ID.
+ Explore(StateID),
+ /// Reset the given `slot` to the given `offset` (which might be `None`).
+ RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
}
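+
+// A toy sketch (standalone and hypothetical) of the save/restore discipline
+// described above: before overwriting a slot, push its old value as a
+// 'RestoreCapture'-style frame, so that popping the frame undoes the write.
+#[cfg(test)]
+mod restore_capture_sketch {
+    use alloc::{vec, vec::Vec};
+
+    #[test]
+    fn capture_is_scoped() {
+        let mut slots: Vec<Option<usize>> = vec![None];
+        let mut stack: Vec<(usize, Option<usize>)> = vec![];
+
+        // 'Explore' hits a 'Capture' transition at offset 3: save, then set.
+        stack.push((0, slots[0]));
+        slots[0] = Some(3);
+        assert_eq!(Some(3), slots[0]);
+
+        // Popping the frame restores the pre-capture value, scoping the new
+        // offset to the states explored in between.
+        let (slot, old) = stack.pop().unwrap();
+        slots[slot] = old;
+        assert_eq!(None, slots[0]);
+    }
+}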
-impl Cache {
- pub fn new(nfa: &NFA) -> Cache {
- Cache {
- stack: vec![],
- clist: Threads::new(nfa),
- nlist: Threads::new(nfa),
+/// A set of counters that "instruments" a PikeVM search. To enable this, you
+/// must enable the 'internal-instrument-pikevm' feature. Then run your Rust
+/// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in
+/// the environment. The metrics collected will be dumped automatically for
+/// every search executed by the PikeVM.
+///
+/// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an
+/// absolute decrease in wall-clock performance, even if the 'trace' log level
+/// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't
+/// enabled.) The main point of instrumentation is to get counts of various
+/// events that occur during the PikeVM's execution.
+///
+/// This is a somewhat hacked together collection of metrics that are useful
+/// to gather from a PikeVM search. In particular, it lets us scrutinize the
+/// performance profile of a search beyond what general purpose profiling tools
+/// give us. Namely, we orient the profiling data around the specific states of
+/// the NFA.
+///
+/// In other words, this lets us see which parts of the NFA graph are most
+/// frequently activated. This then provides direction for optimization
+/// opportunities.
+///
+/// The really sad part about this is that it absolutely clutters up the PikeVM
+/// implementation. :'( Another approach would be to just manually add this
+/// code in whenever I want this kind of profiling data, but it's complicated
+/// and tedious enough that I went with this approach... for now.
+///
+/// When instrumentation is enabled (which also turns on 'logging'), then a
+/// `Counters` is initialized for every search and `trace`'d just before the
+/// search returns to the caller.
+///
+/// Tip: When debugging performance problems with the PikeVM, it's best to try
+/// to work with an NFA that is as small as possible. Otherwise the state graph
+/// is likely to be too big to digest.
+#[cfg(feature = "internal-instrument-pikevm")]
+#[derive(Clone, Debug)]
+struct Counters {
+ /// The number of times the NFA is in a particular permutation of states.
+ state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>,
+ /// The number of times 'step' is called for a particular state ID (which
+ /// indexes this array).
+ steps: Vec<u64>,
+ /// The number of times an epsilon closure was computed for a state.
+ closures: Vec<u64>,
+ /// The number of times a particular state ID is pushed on to a stack while
+ /// computing an epsilon closure.
+ stack_pushes: Vec<u64>,
+ /// The number of times a particular state ID is inserted into a sparse set
+ /// while computing an epsilon closure.
+ set_inserts: Vec<u64>,
+}
+
+#[cfg(feature = "internal-instrument-pikevm")]
+impl Counters {
+ fn empty() -> Counters {
+ Counters {
+ state_sets: alloc::collections::BTreeMap::new(),
+ steps: vec![],
+ closures: vec![],
+ stack_pushes: vec![],
+ set_inserts: vec![],
}
}
- fn clear(&mut self) {
- self.stack.clear();
- self.clist.set.clear();
- self.nlist.set.clear();
+ fn reset(&mut self, nfa: &NFA) {
+ let len = nfa.states().len();
+
+ self.state_sets.clear();
+
+ self.steps.clear();
+ self.steps.resize(len, 0);
+
+ self.closures.clear();
+ self.closures.resize(len, 0);
+
+ self.stack_pushes.clear();
+ self.stack_pushes.resize(len, 0);
+
+ self.set_inserts.clear();
+ self.set_inserts.resize(len, 0);
+ }
+
+ fn eprint(&self, nfa: &NFA) {
+ trace!("===== START PikeVM Instrumentation Output =====");
+ // We take the top-K most occurring state sets. Otherwise the output
+ // is likely to be overwhelming. And we probably only care about the
+ // most frequently occurring ones anyway.
+ const LIMIT: usize = 20;
+ let mut set_counts =
+ self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>();
+ set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count));
+ trace!("## PikeVM frequency of state sets (top {})", LIMIT);
+ for (set, count) in set_counts.iter().take(LIMIT) {
+ trace!("{:?}: {}", set, count);
+ }
+ if set_counts.len() > LIMIT {
+ trace!(
+ "... {} sets omitted (out of {} total)",
+ set_counts.len() - LIMIT,
+ set_counts.len(),
+ );
+ }
+
+ trace!("");
+ trace!("## PikeVM total frequency of events");
+ trace!(
+ "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}",
+ self.steps.iter().copied().sum::<u64>(),
+ self.closures.iter().copied().sum::<u64>(),
+ self.stack_pushes.iter().copied().sum::<u64>(),
+ self.set_inserts.iter().copied().sum::<u64>(),
+ );
+
+ trace!("");
+ trace!("## PikeVM frequency of events broken down by state");
+ for sid in 0..self.steps.len() {
+ trace!(
+ "{:06}: steps: {}, closures: {}, \
+ stack-pushes: {}, set-inserts: {}",
+ sid,
+ self.steps[sid],
+ self.closures[sid],
+ self.stack_pushes[sid],
+ self.set_inserts[sid],
+ );
+ }
+
+ trace!("");
+ trace!("## NFA debug display");
+ trace!("{:?}", nfa);
+ trace!("===== END PikeVM Instrumentation Output =====");
}
- fn swap(&mut self) {
- core::mem::swap(&mut self.clist, &mut self.nlist);
+ fn record_state_set(&mut self, set: &SparseSet) {
+ let set = set.iter().collect::<Vec<StateID>>();
+ *self.state_sets.entry(set).or_insert(0) += 1;
}
-}
-impl Threads {
- fn new(nfa: &NFA) -> Threads {
- let mut threads = Threads {
- set: SparseSet::new(0),
- caps: vec![],
- slots_per_thread: 0,
- };
- threads.resize(nfa);
- threads
+ fn record_step(&mut self, sid: StateID) {
+ self.steps[sid] += 1;
}
- fn resize(&mut self, nfa: &NFA) {
- if nfa.states().len() == self.set.capacity() {
- return;
- }
- self.slots_per_thread = nfa.capture_slot_len();
- self.set.resize(nfa.states().len());
- self.caps.resize(self.slots_per_thread * nfa.states().len(), None);
+ fn record_closure(&mut self, sid: StateID) {
+ self.closures[sid] += 1;
+ }
+
+ fn record_stack_push(&mut self, sid: StateID) {
+ self.stack_pushes[sid] += 1;
}
- fn caps(&mut self, sid: StateID) -> &mut [Slot] {
- let i = sid.as_usize() * self.slots_per_thread;
- &mut self.caps[i..i + self.slots_per_thread]
+ fn record_set_insert(&mut self, sid: StateID) {
+ self.set_inserts[sid] += 1;
}
}
diff --git a/vendor/regex-automata/src/nfa/thompson/range_trie.rs b/vendor/regex-automata/src/nfa/thompson/range_trie.rs
index 92f36ce3a..2d43a5b6f 100644
--- a/vendor/regex-automata/src/nfa/thompson/range_trie.rs
+++ b/vendor/regex-automata/src/nfa/thompson/range_trie.rs
@@ -1,165 +1,160 @@
-// I've called the primary data structure in this module a "range trie." As far
-// as I can tell, there is no prior art on a data structure like this, however,
-// it's likely someone somewhere has built something like it. Searching for
-// "range trie" turns up the paper "Range Tries for Scalable Address Lookup,"
-// but it does not appear relevant.
-//
-// The range trie is just like a trie in that it is a special case of a
-// deterministic finite state machine. It has states and each state has a set
-// of transitions to other states. It is acyclic, and, like a normal trie,
-// it makes no attempt to reuse common suffixes among its elements. The key
-// difference between a normal trie and a range trie below is that a range trie
-// operates on *contiguous sequences* of bytes instead of singleton bytes.
-// One could say say that our alphabet is ranges of bytes instead of bytes
-// themselves, except a key part of range trie construction is splitting ranges
-// apart to ensure there is at most one transition that can be taken for any
-// byte in a given state.
-//
-// I've tried to explain the details of how the range trie works below, so
-// for now, we are left with trying to understand what problem we're trying to
-// solve. Which is itself fairly involved!
-//
-// At the highest level, here's what we want to do. We want to convert a
-// sequence of Unicode codepoints into a finite state machine whose transitions
-// are over *bytes* and *not* Unicode codepoints. We want this because it makes
-// said finite state machines much smaller and much faster to execute. As a
-// simple example, consider a byte oriented automaton for all Unicode scalar
-// values (0x00 through 0x10FFFF, not including surrogate codepoints):
-//
-// [00-7F]
-// [C2-DF][80-BF]
-// [E0-E0][A0-BF][80-BF]
-// [E1-EC][80-BF][80-BF]
-// [ED-ED][80-9F][80-BF]
-// [EE-EF][80-BF][80-BF]
-// [F0-F0][90-BF][80-BF][80-BF]
-// [F1-F3][80-BF][80-BF][80-BF]
-// [F4-F4][80-8F][80-BF][80-BF]
-//
-// (These byte ranges are generated via the regex-syntax::utf8 module, which
-// was based on Russ Cox's code in RE2, which was in turn based on Ken
-// Thompson's implementation of the same idea in his Plan9 implementation of
-// grep.)
-//
-// It should be fairly straight-forward to see how one could compile this into
-// a DFA. The sequences are sorted and non-overlapping. Essentially, you could
-// build a trie from this fairly easy. The problem comes when your initial
-// range (in this case, 0x00-0x10FFFF) isn't so nice. For example, the class
-// represented by '\w' contains only a tenth of the codepoints that
-// 0x00-0x10FFFF contains, but if we were to write out the byte based ranges
-// as we did above, the list would stretch to 892 entries! This turns into
-// quite a large NFA with a few thousand states. Turning this beast into a DFA
-// takes quite a bit of time. We are thus left with trying to trim down the
-// number of states we produce as early as possible.
-//
-// One approach (used by RE2 and still by the regex crate, at time of writing)
-// is to try to find common suffixes while building NFA states for the above
-// and reuse them. This is very cheap to do and one can control precisely how
-// much extra memory you want to use for the cache.
-//
-// Another approach, however, is to reuse an algorithm for constructing a
-// *minimal* DFA from a sorted sequence of inputs. I don't want to go into
-// the full details here, but I explain it in more depth in my blog post on
-// FSTs[1]. Note that the algorithm was not invented by me, but was published
-// in paper by Daciuk et al. in 2000 called "Incremental Construction of
-// MinimalAcyclic Finite-State Automata." Like the suffix cache approach above,
-// it is also possible to control the amount of extra memory one uses, although
-// this usually comes with the cost of sacrificing true minimality. (But it's
-// typically close enough with a reasonably sized cache of states.)
-//
-// The catch is that Daciuk's algorithm only works if you add your keys in
-// lexicographic ascending order. In our case, since we're dealing with ranges,
-// we also need the additional requirement that ranges are either equivalent
-// or do not overlap at all. For example, if one were given the following byte
-// ranges:
-//
-// [BC-BF][80-BF]
-// [BC-BF][90-BF]
-//
-// Then Daciuk's algorithm would not work, since there is nothing to handle the
-// fact that the ranges overlap. They would need to be split apart. Thankfully,
-// Thompson's algorithm for producing byte ranges for Unicode codepoint ranges
-// meets both of our requirements. (A proof for this eludes me, but it appears
-// true.)
-//
-// ... however, we would also like to be able to compile UTF-8 automata in
-// reverse. We want this because in order to find the starting location of a
-// match using a DFA, we need to run a second DFA---a reversed version of the
-// forward DFA---backwards to discover the match location. Unfortunately, if
-// we reverse our byte sequences for 0x00-0x10FFFF, we get sequences that are
-// can overlap, even if they are sorted:
-//
-// [00-7F]
-// [80-BF][80-9F][ED-ED]
-// [80-BF][80-BF][80-8F][F4-F4]
-// [80-BF][80-BF][80-BF][F1-F3]
-// [80-BF][80-BF][90-BF][F0-F0]
-// [80-BF][80-BF][E1-EC]
-// [80-BF][80-BF][EE-EF]
-// [80-BF][A0-BF][E0-E0]
-// [80-BF][C2-DF]
-//
-// For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have
-// overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no
-// simple way to apply Daciuk's algorithm.
-//
-// And thus, the range trie was born. The range trie's only purpose is to take
-// sequences of byte ranges like the ones above, collect them into a trie and
-// then spit them in a sorted fashion with no overlapping ranges. For example,
-// 0x00-0x10FFFF gets translated to:
-//
-// [0-7F]
-// [80-BF][80-9F][80-8F][F1-F3]
-// [80-BF][80-9F][80-8F][F4]
-// [80-BF][80-9F][90-BF][F0]
-// [80-BF][80-9F][90-BF][F1-F3]
-// [80-BF][80-9F][E1-EC]
-// [80-BF][80-9F][ED]
-// [80-BF][80-9F][EE-EF]
-// [80-BF][A0-BF][80-8F][F1-F3]
-// [80-BF][A0-BF][80-8F][F4]
-// [80-BF][A0-BF][90-BF][F0]
-// [80-BF][A0-BF][90-BF][F1-F3]
-// [80-BF][A0-BF][E0]
-// [80-BF][A0-BF][E1-EC]
-// [80-BF][A0-BF][EE-EF]
-// [80-BF][C2-DF]
-//
-// We've thus satisfied our requirements for running Daciuk's algorithm. All
-// sequences of ranges are sorted, and any corresponding ranges are either
-// exactly equivalent or non-overlapping.
-//
-// In effect, a range trie is building a DFA from a sequence of arbitrary
-// byte ranges. But it uses an algoritm custom tailored to its input, so it
-// is not as costly as traditional DFA construction. While it is still quite
-// a bit more costly than the forward's case (which only needs Daciuk's
-// algorithm), it winds up saving a substantial amount of time if one is doing
-// a full DFA powerset construction later by virtue of producing a much much
-// smaller NFA.
-//
-// [1] - https://blog.burntsushi.net/transducers/
-// [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601
-
-use core::{cell::RefCell, fmt, mem, ops::RangeInclusive, u32};
+/*
+I've called the primary data structure in this module a "range trie." As far
+as I can tell, there is no prior art on a data structure like this, however,
+it's likely someone somewhere has built something like it. Searching for
+"range trie" turns up the paper "Range Tries for Scalable Address Lookup,"
+but it does not appear relevant.
+
+The range trie is just like a trie in that it is a special case of a
+deterministic finite state machine. It has states and each state has a set
+of transitions to other states. It is acyclic, and, like a normal trie,
+it makes no attempt to reuse common suffixes among its elements. The key
+difference between a normal trie and a range trie below is that a range trie
+operates on *contiguous sequences* of bytes instead of singleton bytes.
+One could say that our alphabet is ranges of bytes instead of bytes
+themselves, except a key part of range trie construction is splitting ranges
+apart to ensure there is at most one transition that can be taken for any
+byte in a given state.
+
+I've tried to explain the details of how the range trie works below, so
+for now, we are left with trying to understand what problem we're trying to
+solve. Which is itself fairly involved!
+
+At the highest level, here's what we want to do. We want to convert a
+sequence of Unicode codepoints into a finite state machine whose transitions
+are over *bytes* and *not* Unicode codepoints. We want this because it makes
+said finite state machines much smaller and much faster to execute. As a
+simple example, consider a byte oriented automaton for all Unicode scalar
+values (0x00 through 0x10FFFF, not including surrogate codepoints):
+
+ [00-7F]
+ [C2-DF][80-BF]
+ [E0-E0][A0-BF][80-BF]
+ [E1-EC][80-BF][80-BF]
+ [ED-ED][80-9F][80-BF]
+ [EE-EF][80-BF][80-BF]
+ [F0-F0][90-BF][80-BF][80-BF]
+ [F1-F3][80-BF][80-BF][80-BF]
+ [F4-F4][80-8F][80-BF][80-BF]
+
+(These byte ranges are generated via the regex-syntax::utf8 module, which
+was based on Russ Cox's code in RE2, which was in turn based on Ken
+Thompson's implementation of the same idea in his Plan9 implementation of
+grep.)
+
+It should be fairly straight-forward to see how one could compile this into
+a DFA. The sequences are sorted and non-overlapping. Essentially, you could
+build a trie from this fairly easily. The problem comes when your initial
+range (in this case, 0x00-0x10FFFF) isn't so nice. For example, the class
+represented by '\w' contains only a tenth of the codepoints that
+0x00-0x10FFFF contains, but if we were to write out the byte based ranges
+as we did above, the list would stretch to 892 entries! This turns into
+quite a large NFA with a few thousand states. Turning this beast into a DFA
+takes quite a bit of time. We are thus left with trying to trim down the
+number of states we produce as early as possible.
+
+One approach (used by RE2 and still by the regex crate, at time of writing)
+is to try to find common suffixes while building NFA states for the above
+and reuse them. This is very cheap to do and one can control precisely how
+much extra memory you want to use for the cache.
+
+Another approach, however, is to reuse an algorithm for constructing a
+*minimal* DFA from a sorted sequence of inputs. I don't want to go into
+the full details here, but I explain it in more depth in my blog post on
+FSTs[1]. Note that the algorithm was not invented by me, but was published
+in a paper by Daciuk et al. in 2000 called "Incremental Construction of
+Minimal Acyclic Finite-State Automata." Like the suffix cache approach above,
+it is also possible to control the amount of extra memory one uses, although
+this usually comes with the cost of sacrificing true minimality. (But it's
+typically close enough with a reasonably sized cache of states.)
+
+The catch is that Daciuk's algorithm only works if you add your keys in
+lexicographic ascending order. In our case, since we're dealing with ranges,
+we also need the additional requirement that ranges are either equivalent
+or do not overlap at all. For example, if one were given the following byte
+ranges:
+
+ [BC-BF][80-BF]
+ [BC-BF][90-BF]
+
+Then Daciuk's algorithm would not work, since there is nothing to handle the
+fact that the ranges overlap. They would need to be split apart. Thankfully,
+Thompson's algorithm for producing byte ranges for Unicode codepoint ranges
+meets both of our requirements. (A proof for this eludes me, but it appears
+true.)
+
+... however, we would also like to be able to compile UTF-8 automata in
+reverse. We want this because in order to find the starting location of a
+match using a DFA, we need to run a second DFA---a reversed version of the
+forward DFA---backwards to discover the match location. Unfortunately, if
+we reverse our byte sequences for 0x00-0x10FFFF, we get sequences that can
+overlap, even if they are sorted:
+
+ [00-7F]
+ [80-BF][80-9F][ED-ED]
+ [80-BF][80-BF][80-8F][F4-F4]
+ [80-BF][80-BF][80-BF][F1-F3]
+ [80-BF][80-BF][90-BF][F0-F0]
+ [80-BF][80-BF][E1-EC]
+ [80-BF][80-BF][EE-EF]
+ [80-BF][A0-BF][E0-E0]
+ [80-BF][C2-DF]
+
+For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have
+overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no
+simple way to apply Daciuk's algorithm.
+
+And thus, the range trie was born. The range trie's only purpose is to take
+sequences of byte ranges like the ones above, collect them into a trie and then
+spit them out in a sorted fashion with no overlapping ranges. For example,
+0x00-0x10FFFF gets translated to:
+
+ [0-7F]
+ [80-BF][80-9F][80-8F][F1-F3]
+ [80-BF][80-9F][80-8F][F4]
+ [80-BF][80-9F][90-BF][F0]
+ [80-BF][80-9F][90-BF][F1-F3]
+ [80-BF][80-9F][E1-EC]
+ [80-BF][80-9F][ED]
+ [80-BF][80-9F][EE-EF]
+ [80-BF][A0-BF][80-8F][F1-F3]
+ [80-BF][A0-BF][80-8F][F4]
+ [80-BF][A0-BF][90-BF][F0]
+ [80-BF][A0-BF][90-BF][F1-F3]
+ [80-BF][A0-BF][E0]
+ [80-BF][A0-BF][E1-EC]
+ [80-BF][A0-BF][EE-EF]
+ [80-BF][C2-DF]
+
+We've thus satisfied our requirements for running Daciuk's algorithm. All
+sequences of ranges are sorted, and any corresponding ranges are either
+exactly equivalent or non-overlapping.
+
+In effect, a range trie is building a DFA from a sequence of arbitrary byte
+ranges. But it uses an algorithm custom tailored to its input, so it is not as
+costly as traditional DFA construction. While it is still quite a bit more
+costly than the forward case (which only needs Daciuk's algorithm), it winds
+up saving a substantial amount of time if one is doing a full DFA powerset
+construction later by virtue of producing a much much smaller NFA.
+
+[1] - https://blog.burntsushi.net/transducers/
+[2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601
+*/
+
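+// A small illustrative sketch (not part of this module's API) of the
+// invariant the range trie establishes for Daciuk's algorithm: any two
+// corresponding ranges must be either exactly equal or non-overlapping.
+#[cfg(test)]
+mod invariant_sketch {
+    // Ranges are inclusive '(start, end)' byte pairs, as in the tables
+    // above.
+    fn equal_or_disjoint(a: (u8, u8), b: (u8, u8)) -> bool {
+        a == b || a.1 < b.0 || b.1 < a.0
+    }
+
+    #[test]
+    fn splitting_restores_the_invariant() {
+        // [80-BF] and [A0-BF] overlap without being equal, so Daciuk's
+        // algorithm cannot be applied directly...
+        assert!(!equal_or_disjoint((0x80, 0xBF), (0xA0, 0xBF)));
+        // ...but after splitting [80-BF] into [80-9F] and [A0-BF], every
+        // pair of corresponding ranges is either equal or disjoint.
+        assert!(equal_or_disjoint((0x80, 0x9F), (0xA0, 0xBF)));
+        assert!(equal_or_disjoint((0xA0, 0xBF), (0xA0, 0xBF)));
+    }
+}
+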
+use core::{cell::RefCell, convert::TryFrom, fmt, mem, ops::RangeInclusive};
use alloc::{format, string::String, vec, vec::Vec};
use regex_syntax::utf8::Utf8Range;
-/// A smaller state ID means more effective use of the CPU cache and less
-/// time spent copying. The implementation below will panic if the state ID
-/// space is exhausted, but in order for that to happen, the range trie itself
-/// would use well over 100GB of memory. Moreover, it's likely impossible
-/// for the state ID space to get that big. In fact, it's likely that even a
-/// u16 would be good enough here. But it's not quite clear how to prove this.
-type StateID = u32;
+use crate::util::primitives::StateID;
/// There is only one final state in this trie. Every sequence of byte ranges
/// added shares the same final state.
-const FINAL: StateID = 0;
+const FINAL: StateID = StateID::ZERO;
/// The root state of the trie.
-const ROOT: StateID = 1;
+const ROOT: StateID = StateID::new_unchecked(1);
/// A range trie represents an ordered set of sequences of bytes.
///
@@ -193,7 +188,7 @@ pub struct RangeTrie {
/// A stack for traversing this trie to yield sequences of byte ranges in
/// lexicographic order.
iter_stack: RefCell<Vec<NextIter>>,
- /// A bufer that stores the current sequence during iteration.
+ /// A buffer that stores the current sequence during iteration.
iter_ranges: RefCell<Vec<Utf8Range>>,
/// A stack used for traversing the trie in order to (deeply) duplicate
/// a state. States are recursively duplicated when ranges are split.
@@ -431,14 +426,16 @@ impl RangeTrie {
}
pub fn add_empty(&mut self) -> StateID {
- if self.states.len() as u64 > u32::MAX as u64 {
- // This generally should not happen since a range trie is only
- // ever used to compile a single sequence of Unicode scalar values.
- // If we ever got to this point, we would, at *minimum*, be using
- // 96GB in just the range trie alone.
- panic!("too many sequences added to range trie");
- }
- let id = self.states.len() as StateID;
+ let id = match StateID::try_from(self.states.len()) {
+ Ok(id) => id,
+ Err(_) => {
+ // This generally should not happen since a range trie is
+ // only ever used to compile a single sequence of Unicode
+ // scalar values. If we ever got to this point, we would, at
+ // *minimum*, be using 96GB in just the range trie alone.
+ panic!("too many sequences added to range trie");
+ }
+ };
// If we have some free states available, then use them to avoid
// more allocations.
if let Some(mut state) = self.free.pop() {
@@ -542,12 +539,12 @@ impl RangeTrie {
/// Return an immutable borrow for the state with the given ID.
fn state(&self, id: StateID) -> &State {
- &self.states[id as usize]
+ &self.states[id]
}
/// Return a mutable borrow for the state with the given ID.
fn state_mut(&mut self, id: StateID) -> &mut State {
- &mut self.states[id as usize]
+ &mut self.states[id]
}
}
@@ -625,7 +622,7 @@ struct NextIter {
}
/// The next state to process during insertion and any remaining ranges that we
-/// want to add for a partcular sequence of ranges. The first such instance
+/// want to add for a particular sequence of ranges. The first such instance
/// is always the root state along with all ranges given.
#[derive(Clone, Debug)]
struct NextInsert {
@@ -651,7 +648,7 @@ impl NextInsert {
let mut tmp = [Utf8Range { start: 0, end: 0 }; 4];
tmp[..len].copy_from_slice(ranges);
- NextInsert { state_id, ranges: tmp, len: len as u8 }
+ NextInsert { state_id, ranges: tmp, len: u8::try_from(len).unwrap() }
}
/// Push a new empty state to visit along with any remaining ranges that
@@ -679,7 +676,7 @@ impl NextInsert {
/// Return the remaining ranges to insert.
fn ranges(&self) -> &[Utf8Range] {
- &self.ranges[..self.len as usize]
+ &self.ranges[..usize::try_from(self.len).unwrap()]
}
}
@@ -871,7 +868,7 @@ impl fmt::Debug for RangeTrie {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "")?;
for (i, state) in self.states.iter().enumerate() {
- let status = if i == FINAL as usize { '*' } else { ' ' };
+ let status = if i == FINAL.as_usize() { '*' } else { ' ' };
writeln!(f, "{}{:06}: {:?}", status, i, state)?;
}
Ok(())
@@ -893,12 +890,19 @@ impl fmt::Debug for State {
impl fmt::Debug for Transition {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.range.start == self.range.end {
- write!(f, "{:02X} => {:02X}", self.range.start, self.next_id)
+ write!(
+ f,
+ "{:02X} => {:02X}",
+ self.range.start,
+ self.next_id.as_usize(),
+ )
} else {
write!(
f,
"{:02X}-{:02X} => {:02X}",
- self.range.start, self.range.end, self.next_id
+ self.range.start,
+ self.range.end,
+ self.next_id.as_usize(),
)
}
}
diff --git a/vendor/regex-automata/src/util/alphabet.rs b/vendor/regex-automata/src/util/alphabet.rs
index 0bc1ece58..22b5a7644 100644
--- a/vendor/regex-automata/src/util/alphabet.rs
+++ b/vendor/regex-automata/src/util/alphabet.rs
@@ -1,25 +1,65 @@
-use core::convert::TryFrom;
-
+/*!
+This module provides APIs for dealing with the alphabets of finite state
+machines.
+
+There are two principal types in this module, [`ByteClasses`] and [`Unit`].
+The former defines the alphabet of a finite state machine while the latter
+represents an element of that alphabet.
+
+To a first approximation, the alphabet of all automata in this crate is just
+a `u8`. Namely, every distinct byte value. All 256 of them. In practice, this
+can be quite wasteful when building a transition table for a DFA, since it
+requires storing a state identifier for each element in the alphabet. Instead,
+we collapse the alphabet of an automaton down into equivalence classes, where
+every byte in the same equivalence class never discriminates between a match or
+a non-match from any other byte in the same class. For example, the regex
+`[a-z]+` can be thought of as having an alphabet consisting of two
+equivalence classes: `a-z` and everything else.
+an automaton, it doesn't actually require representing every distinct byte.
+Just the equivalence classes.
+
+The downside of equivalence classes is that, of course, searching a haystack
+deals with individual byte values. Those byte values need to be mapped to
+their corresponding equivalence class. This is what `ByteClasses` does. In
+practice, doing this for every state transition has negligible impact on modern
+CPUs. Moreover, it helps make more efficient use of the CPU cache by (possibly
+considerably) shrinking the size of the transition table.
+
+One last hiccup concerns `Unit`. Namely, because of look-around and how the
+DFAs in this crate work, we need to add a sentinel value to our alphabet
+of equivalence classes that represents the "end" of a search. We call that
+sentinel [`Unit::eoi`] or "end of input." Thus, a `Unit` is either an
+equivalence class corresponding to a set of bytes, or it is a special "end of
+input" sentinel.
+
+In general, you should not expect to need either of these types unless you're
+doing lower level shenanigans with DFAs, or even building your own DFAs.
+(Although, you don't have to use these types to build your own DFAs of course.)
+For example, if you're walking a DFA's state graph, it's probably useful to
+make use of [`ByteClasses`] to visit each element in the DFA's alphabet instead
+of just visiting every distinct `u8` value. The latter isn't necessarily wrong,
+but it could be potentially very wasteful.
+*/
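+
+// A standalone sketch (hypothetical, not the actual ByteClasses
+// representation) of the equivalence-class idea described above: for
+// `[a-z]+`, every byte falls into one of two classes, so a DFA transition
+// table needs two columns per state instead of 256.
+#[cfg(test)]
+mod class_sketch {
+    fn toy_classes() -> [u8; 256] {
+        let mut classes = [0u8; 256];
+        for b in b'a'..=b'z' {
+            // Class 1 is "in [a-z]"; class 0 is "everything else".
+            classes[usize::from(b)] = 1;
+        }
+        classes
+    }
+
+    #[test]
+    fn bytes_collapse_to_two_classes() {
+        let classes = toy_classes();
+        // Searching still sees raw bytes; each one is mapped to its class
+        // before indexing into the (much smaller) transition table.
+        assert_eq!(1, classes[usize::from(b'q')]);
+        assert_eq!(0, classes[usize::from(b'Q')]);
+    }
+}
+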
use crate::util::{
- bytes::{DeserializeError, SerializeError},
- DebugByte,
+ escape::DebugByte,
+ wire::{self, DeserializeError, SerializeError},
};
-/// Unit represents a single unit of input for DFA based regex engines.
+/// Unit represents a single unit of haystack for DFA based regex engines.
///
-/// **NOTE:** It is not expected for consumers of this crate to need to use
-/// this type unless they are implementing their own DFA. And even then, it's
-/// not required: implementors may use other techniques to handle input.
+/// It is not expected for consumers of this crate to need to use this type
+/// unless they are implementing their own DFA. And even then, it's not
+/// required: implementors may use other techniques to handle haystack units.
///
-/// Typically, a single unit of input for a DFA would be a single byte.
+/// Typically, a single unit of haystack for a DFA would be a single byte.
/// However, for the DFAs in this crate, matches are delayed by a single byte
/// in order to handle look-ahead assertions (`\b`, `$` and `\z`). Thus, once
/// we have consumed the haystack, we must run the DFA through one additional
-/// transition using an input that indicates the haystack has ended.
+/// transition using a unit that indicates the haystack has ended.
///
-/// Since there is no way to represent a sentinel with a `u8` since all
-/// possible values *may* be valid inputs to a DFA, this type explicitly adds
-/// room for a sentinel value.
+/// There is no way to represent a sentinel with a `u8` since all possible
+/// values *may* be valid haystack units to a DFA. Therefore, this type
+/// explicitly adds room for a sentinel value.
///
/// The sentinel EOI value is always its own equivalence class and is
/// ultimately represented by adding 1 to the maximum equivalence class value.
@@ -36,74 +76,108 @@ use crate::util::{
/// Where EOI is the special sentinel value that is always in its own
/// singleton equivalence class.
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
-pub enum Unit {
+pub struct Unit(UnitKind);
+
+#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+enum UnitKind {
+ /// Represents a byte value, or more typically, an equivalence class
+ /// represented as a byte value.
U8(u8),
+ /// Represents the "end of input" sentinel. We regrettably use a `u16`
+ /// here since the maximum sentinel value is `256`. Thankfully, we don't
+ /// actually store a `Unit` anywhere, so this extra space shouldn't be too
+ /// bad.
EOI(u16),
}
impl Unit {
- /// Create a new input unit from a byte value.
+ /// Create a new haystack unit from a byte value.
///
- /// All possible byte values are legal. However, when creating an input
- /// unit for a specific DFA, one should be careful to only construct input
- /// units that are in that DFA's alphabet. Namely, one way to compact a
- /// DFA's in-memory representation is to collapse its transitions to a set
- /// of equivalence classes into a set of all possible byte values. If a
- /// DFA uses equivalence classes instead of byte values, then the byte
- /// given here should be the equivalence class.
+ /// All possible byte values are legal. However, when creating a haystack
+ /// unit for a specific DFA, one should be careful to only construct units
+ /// that are in that DFA's alphabet. Namely, one way to compact a DFA's
+ /// in-memory representation is to collapse transitions over the set of
+ /// all possible byte values into a smaller set of equivalence classes.
+ /// If a DFA uses equivalence classes instead of byte values, then the
+ /// byte given here should be the equivalence class.
pub fn u8(byte: u8) -> Unit {
- Unit::U8(byte)
+ Unit(UnitKind::U8(byte))
}
+ /// Create a new "end of input" haystack unit.
+ ///
+ /// The value given is the sentinel value used by this unit to represent
+ /// the "end of input." The value should be the total number of equivalence
+ /// classes in the corresponding alphabet. Its maximum value is `256`,
+ /// which occurs when every byte is its own equivalence class.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `num_byte_equiv_classes` is greater than `256`.
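+ ///
+ /// # Example
+ ///
+ /// A brief sketch of constructing and querying the sentinel:
+ ///
+ /// ```
+ /// use regex_automata::util::alphabet::Unit;
+ ///
+ /// // With 3 byte-based equivalence classes, the EOI sentinel is 3.
+ /// let eoi = Unit::eoi(3);
+ /// assert_eq!(Some(3), eoi.as_eoi());
+ /// assert_eq!(3, eoi.as_usize());
+ /// ```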
pub fn eoi(num_byte_equiv_classes: usize) -> Unit {
assert!(
num_byte_equiv_classes <= 256,
"max number of byte-based equivalent classes is 256, but got {}",
num_byte_equiv_classes,
);
- Unit::EOI(u16::try_from(num_byte_equiv_classes).unwrap())
+ Unit(UnitKind::EOI(u16::try_from(num_byte_equiv_classes).unwrap()))
}
+ /// If this unit is not an "end of input" sentinel, then returns its
+ /// underlying byte value. Otherwise return `None`.
pub fn as_u8(self) -> Option<u8> {
- match self {
- Unit::U8(b) => Some(b),
- Unit::EOI(_) => None,
+ match self.0 {
+ UnitKind::U8(b) => Some(b),
+ UnitKind::EOI(_) => None,
}
}
- #[cfg(feature = "alloc")]
- pub fn as_eoi(self) -> Option<usize> {
- match self {
- Unit::U8(_) => None,
- Unit::EOI(eoi) => Some(eoi as usize),
+ /// If this unit is an "end of input" sentinel, then return the underlying
+ /// sentinel value that was given to [`Unit::eoi`]. Otherwise return
+ /// `None`.
+ pub fn as_eoi(self) -> Option<u16> {
+ match self.0 {
+ UnitKind::U8(_) => None,
+ UnitKind::EOI(sentinel) => Some(sentinel),
}
}
+ /// Return this unit as a `usize`, regardless of whether it is a byte value
+ /// or an "end of input" sentinel. In the latter case, the underlying
+ /// sentinel value given to [`Unit::eoi`] is returned.
pub fn as_usize(self) -> usize {
- match self {
- Unit::U8(b) => b as usize,
- Unit::EOI(eoi) => eoi as usize,
+ match self.0 {
+ UnitKind::U8(b) => usize::from(b),
+ UnitKind::EOI(eoi) => usize::from(eoi),
}
}
- pub fn is_eoi(&self) -> bool {
- match *self {
- Unit::EOI(_) => true,
- _ => false,
- }
+ /// Returns true if and only if this unit is a byte value equivalent to the
+ /// byte given. This always returns false when this is an "end of input"
+ /// sentinel.
+ pub fn is_byte(self, byte: u8) -> bool {
+ self.as_u8().map_or(false, |b| b == byte)
}
- #[cfg(feature = "alloc")]
- pub fn is_word_byte(&self) -> bool {
- self.as_u8().map_or(false, crate::util::is_word_byte)
+ /// Returns true when this unit represents an "end of input" sentinel.
+ pub fn is_eoi(self) -> bool {
+ self.as_eoi().is_some()
+ }
+
+ /// Returns true when this unit corresponds to an ASCII word byte.
+ ///
+ /// This always returns false when this unit represents an "end of input"
+ /// sentinel.
+ pub fn is_word_byte(self) -> bool {
+ self.as_u8().map_or(false, crate::util::utf8::is_word_byte)
}
}
impl core::fmt::Debug for Unit {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
- match *self {
- Unit::U8(b) => write!(f, "{:?}", DebugByte(b)),
- Unit::EOI(_) => write!(f, "EOI"),
+ match self.0 {
+ UnitKind::U8(b) => write!(f, "{:?}", DebugByte(b)),
+ UnitKind::EOI(_) => write!(f, "EOI"),
}
}
}
@@ -113,23 +187,48 @@ impl core::fmt::Debug for Unit {
/// This is used in a DFA to reduce the size of the transition table. This can
/// have a particularly large impact not only on the total size of a dense DFA,
/// but also on compile times.
+///
+/// The essential idea here is that the alphabet of a DFA is shrunk from the
+/// usual 256 distinct byte values down to a set of equivalence classes. The
+/// guarantee you get is that any byte belonging to the same equivalence class
+/// can be treated as if it were any other byte in the same class, and the
+/// result of a search wouldn't change.
+///
+/// # Example
+///
+/// This example shows how to get byte classes from an
+/// [`NFA`](crate::nfa::thompson::NFA) and ask for the class of various bytes.
+///
+/// ```
+/// use regex_automata::nfa::thompson::NFA;
+///
+/// let nfa = NFA::new("[a-z]+")?;
+/// let classes = nfa.byte_classes();
+/// // 'a' and 'z' are in the same class for this regex.
+/// assert_eq!(classes.get(b'a'), classes.get(b'z'));
+/// // But 'a' and 'A' are not.
+/// assert_ne!(classes.get(b'a'), classes.get(b'A'));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
#[derive(Clone, Copy)]
pub struct ByteClasses([u8; 256]);
impl ByteClasses {
/// Creates a new set of equivalence classes where all bytes are mapped to
/// the same class.
+ #[inline]
pub fn empty() -> ByteClasses {
ByteClasses([0; 256])
}
/// Creates a new set of equivalence classes where each byte belongs to
/// its own equivalence class.
- #[cfg(feature = "alloc")]
+ #[inline]
pub fn singletons() -> ByteClasses {
let mut classes = ByteClasses::empty();
- for i in 0..256 {
- classes.set(i as u8, i as u8);
+ for b in 0..=255 {
+ classes.set(b, b);
}
classes
}
@@ -139,18 +238,19 @@ impl ByteClasses {
/// an error is returned. Upon success, the number of bytes read along with
/// the map are returned. The number of bytes read is always a multiple of
/// 8.
- pub fn from_bytes(
+ pub(crate) fn from_bytes(
slice: &[u8],
) -> Result<(ByteClasses, usize), DeserializeError> {
- if slice.len() < 256 {
- return Err(DeserializeError::buffer_too_small("byte class map"));
- }
+ wire::check_slice_len(slice, 256, "byte class map")?;
let mut classes = ByteClasses::empty();
for (b, &class) in slice[..256].iter().enumerate() {
- classes.set(b as u8, class);
+ classes.set(u8::try_from(b).unwrap(), class);
}
- for b in classes.iter() {
- if b.as_usize() >= classes.alphabet_len() {
+ // We specifically don't use 'classes.iter()' here because that
+ // iterator depends on 'classes.alphabet_len()' being correct. But that
+ // is precisely the thing we're trying to verify below!
+ for &b in classes.0.iter() {
+ if usize::from(b) >= classes.alphabet_len() {
return Err(DeserializeError::generic(
"found equivalence class greater than alphabet len",
));
@@ -163,7 +263,7 @@ impl ByteClasses {
/// buffer is too small, then an error is returned. Upon success, the total
/// number of bytes written is returned. The number of bytes written is
/// guaranteed to be a multiple of 8.
- pub fn write_to(
+ pub(crate) fn write_to(
&self,
mut dst: &mut [u8],
) -> Result<usize, SerializeError> {
@@ -179,41 +279,38 @@ impl ByteClasses {
}
/// Returns the total number of bytes written by `write_to`.
- pub fn write_to_len(&self) -> usize {
+ pub(crate) fn write_to_len(&self) -> usize {
256
}
/// Set the equivalence class for the given byte.
#[inline]
pub fn set(&mut self, byte: u8, class: u8) {
- self.0[byte as usize] = class;
+ self.0[usize::from(byte)] = class;
}
/// Get the equivalence class for the given byte.
#[inline]
pub fn get(&self, byte: u8) -> u8 {
- self.0[byte as usize]
- }
-
- /// Get the equivalence class for the given byte while forcefully
- /// eliding bounds checks.
- #[inline]
- pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
- *self.0.get_unchecked(byte as usize)
+ self.0[usize::from(byte)]
}
- /// Get the equivalence class for the given input unit and return the
+ /// Get the equivalence class for the given haystack unit and return the
/// class as a `usize`.
#[inline]
pub fn get_by_unit(&self, unit: Unit) -> usize {
- match unit {
- Unit::U8(b) => usize::try_from(self.get(b)).unwrap(),
- Unit::EOI(b) => usize::try_from(b).unwrap(),
+ match unit.0 {
+ UnitKind::U8(b) => usize::from(self.get(b)),
+ UnitKind::EOI(b) => usize::from(b),
}
}
+ /// Create a unit that represents the "end of input" sentinel based on the
+ /// number of equivalence classes.
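+ ///
+ /// For example, the byte classes for the regex `[a-z]+` happen to have an
+ /// alphabet length of `4` (see [`ByteClasses::representatives`]), so this
+ /// sketch shows the resulting sentinel:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit};
+ ///
+ /// let nfa = NFA::new("[a-z]+")?;
+ /// let classes = nfa.byte_classes();
+ /// assert_eq!(Unit::eoi(3), classes.eoi());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```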
#[inline]
pub fn eoi(&self) -> Unit {
+ // The alphabet length already includes the EOI sentinel, hence why
+ // we subtract 1.
Unit::eoi(self.alphabet_len().checked_sub(1).unwrap())
}
@@ -225,49 +322,153 @@ impl ByteClasses {
// Add one since the number of equivalence classes is one more than the
// largest class value. Then add another to account for the final EOI
// class that isn't explicitly represented.
- self.0[255] as usize + 1 + 1
+ usize::from(self.0[255]) + 1 + 1
}
/// Returns the stride, as a base-2 exponent, required for these
/// equivalence classes.
///
/// The stride is always the smallest power of 2 that is greater than or
- /// equal to the alphabet length. This is done so that converting between
- /// state IDs and indices can be done with shifts alone, which is much
- /// faster than integer division.
- #[cfg(feature = "alloc")]
+ /// equal to the alphabet length, and the `stride2` returned here is the
+ /// exponent applied to `2` to get the smallest power. This is done so that
+ /// converting between premultiplied state IDs and indices can be done with
+ /// shifts alone, which is much faster than integer division.
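+ ///
+ /// # Example
+ ///
+ /// A sketch using the byte classes of `[a-z]+`, which happen to have an
+ /// alphabet length of `4`:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa = NFA::new("[a-z]+")?;
+ /// let classes = nfa.byte_classes();
+ /// assert_eq!(4, classes.alphabet_len());
+ /// // 2^2 = 4 is the smallest power of 2 greater than or equal to 4.
+ /// assert_eq!(2, classes.stride2());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```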
+ #[inline]
pub fn stride2(&self) -> usize {
- self.alphabet_len().next_power_of_two().trailing_zeros() as usize
+ let zeros = self.alphabet_len().next_power_of_two().trailing_zeros();
+ usize::try_from(zeros).unwrap()
}
/// Returns true if and only if every byte in this set maps to its own
/// equivalence class. Equivalently, there are 257 equivalence classes
- /// and each class contains exactly one byte (plus the special EOI class).
+ /// and each class contains either exactly one byte or corresponds to the
+ /// singleton class containing the "end of input" sentinel.
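+ ///
+ /// # Example
+ ///
+ /// This property holds for [`ByteClasses::singletons`] by construction:
+ ///
+ /// ```
+ /// use regex_automata::util::alphabet::ByteClasses;
+ ///
+ /// let classes = ByteClasses::singletons();
+ /// assert!(classes.is_singleton());
+ /// assert_eq!(257, classes.alphabet_len());
+ /// ```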
#[inline]
pub fn is_singleton(&self) -> bool {
self.alphabet_len() == 257
}
/// Returns an iterator over all equivalence classes in this set.
+ #[inline]
pub fn iter(&self) -> ByteClassIter<'_> {
ByteClassIter { classes: self, i: 0 }
}
/// Returns an iterator over a sequence of representative bytes from each
- /// equivalence class. Namely, this yields exactly N items, where N is
- /// equivalent to the number of equivalence classes. Each item is an
- /// arbitrary byte drawn from each equivalence class.
+ /// equivalence class within the range of bytes given.
+ ///
+ /// When the given range is unbounded on both sides, the iterator yields
+ /// exactly N items, where N is equivalent to the number of equivalence
+ /// classes. Each item is an arbitrary byte drawn from each equivalence
+ /// class.
///
/// This is useful when one is determinizing an NFA and the NFA's alphabet
- /// hasn't been converted to equivalence classes yet. Picking an arbitrary
- /// byte from each equivalence class then permits a full exploration of
- /// the NFA instead of using every possible byte value.
- #[cfg(feature = "alloc")]
- pub fn representatives(&self) -> ByteClassRepresentatives<'_> {
- ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
+ /// hasn't been converted to equivalence classes. Picking an arbitrary byte
+ /// from each equivalence class then permits a full exploration of the NFA
+ /// instead of using every possible byte value and thus potentially saves
+ /// quite a lot of redundant work.
+ ///
+ /// # Example
+ ///
+ /// This shows an example of what a complete sequence of representatives
+ /// might look like from a real example.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit};
+ ///
+ /// let nfa = NFA::new("[a-z]+")?;
+ /// let classes = nfa.byte_classes();
+ /// let reps: Vec<Unit> = classes.representatives(..).collect();
+ /// // Note that the specific byte values yielded are not guaranteed!
+ /// let expected = vec![
+ /// Unit::u8(b'\x00'),
+ /// Unit::u8(b'a'),
+ /// Unit::u8(b'{'),
+ /// Unit::eoi(3),
+ /// ];
+ /// assert_eq!(expected, reps);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Note though, that you can ask for an arbitrary range of bytes, and only
+ /// representatives for that range will be returned:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit};
+ ///
+ /// let nfa = NFA::new("[a-z]+")?;
+ /// let classes = nfa.byte_classes();
+ /// let reps: Vec<Unit> = classes.representatives(b'A'..=b'z').collect();
+ /// // Note that the specific byte values yielded are not guaranteed!
+ /// let expected = vec![
+ /// Unit::u8(b'A'),
+ /// Unit::u8(b'a'),
+ /// ];
+ /// assert_eq!(expected, reps);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn representatives<R: core::ops::RangeBounds<u8>>(
+ &self,
+ range: R,
+ ) -> ByteClassRepresentatives<'_> {
+ use core::ops::Bound;
+
+ let cur_byte = match range.start_bound() {
+ Bound::Included(&i) => usize::from(i),
+ Bound::Excluded(&i) => usize::from(i).checked_add(1).unwrap(),
+ Bound::Unbounded => 0,
+ };
+ let end_byte = match range.end_bound() {
+ Bound::Included(&i) => {
+ Some(usize::from(i).checked_add(1).unwrap())
+ }
+ Bound::Excluded(&i) => Some(usize::from(i)),
+ Bound::Unbounded => None,
+ };
+ assert_ne!(
+ cur_byte,
+ usize::MAX,
+ "start range must be less than usize::MAX",
+ );
+ ByteClassRepresentatives {
+ classes: self,
+ cur_byte,
+ end_byte,
+ last_class: None,
+ }
}
/// Returns an iterator of the bytes in the given equivalence class.
+ ///
+ /// This is useful when one needs to know the actual bytes that belong to
+ /// an equivalence class. For example, conceptually speaking, accelerating
+ /// a DFA state occurs when a state only has a few outgoing transitions.
+ /// But in reality, what is required is that there are only a small
+ /// number of distinct bytes that can lead to an outgoing transition. The
+ /// difference is that any one transition can correspond to an equivalence
+ /// class which may contain many bytes. Therefore, DFA state acceleration
+ /// considers the actual elements in each equivalence class of each
+ /// outgoing transition.
+ ///
+ /// # Example
+ ///
+ /// This shows an example of how to get all of the elements in an
+ /// equivalence class.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit};
+ ///
+ /// let nfa = NFA::new("[a-z]+")?;
+ /// let classes = nfa.byte_classes();
+ /// let elements: Vec<Unit> = classes.elements(Unit::u8(1)).collect();
+ /// let expected: Vec<Unit> = (b'a'..=b'z').map(Unit::u8).collect();
+ /// assert_eq!(expected, elements);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
pub fn elements(&self, class: Unit) -> ByteClassElements {
ByteClassElements { classes: self, class, byte: 0 }
}
@@ -281,6 +482,12 @@ impl ByteClasses {
}
}
+impl Default for ByteClasses {
+ fn default() -> ByteClasses {
+ ByteClasses::singletons()
+ }
+}
+
impl core::fmt::Debug for ByteClasses {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
if self.is_singleton() {
@@ -307,6 +514,13 @@ impl core::fmt::Debug for ByteClasses {
}
/// An iterator over each equivalence class.
+///
+/// The last element in this iterator always corresponds to [`Unit::eoi`].
+///
+/// This is created by the [`ByteClasses::iter`] method.
+///
+/// The lifetime `'a` refers to the lifetime of the byte classes that this
+/// iterator was created from.
#[derive(Debug)]
pub struct ByteClassIter<'a> {
classes: &'a ByteClasses,
@@ -321,7 +535,7 @@ impl<'a> Iterator for ByteClassIter<'a> {
self.i += 1;
Some(self.classes.eoi())
} else if self.i < self.classes.alphabet_len() {
- let class = self.i as u8;
+ let class = u8::try_from(self.i).unwrap();
self.i += 1;
Some(Unit::u8(class))
} else {
@@ -331,31 +545,44 @@ impl<'a> Iterator for ByteClassIter<'a> {
}
/// An iterator over representative bytes from each equivalence class.
-#[cfg(feature = "alloc")]
+///
+/// This is created by the [`ByteClasses::representatives`] method.
+///
+/// The lifetime `'a` refers to the lifetime of the byte classes that this
+/// iterator was created from.
#[derive(Debug)]
pub struct ByteClassRepresentatives<'a> {
classes: &'a ByteClasses,
- byte: usize,
+ cur_byte: usize,
+ end_byte: Option<usize>,
last_class: Option<u8>,
}
-#[cfg(feature = "alloc")]
impl<'a> Iterator for ByteClassRepresentatives<'a> {
type Item = Unit;
fn next(&mut self) -> Option<Unit> {
- while self.byte < 256 {
- let byte = self.byte as u8;
+ while self.cur_byte < self.end_byte.unwrap_or(256) {
+ let byte = u8::try_from(self.cur_byte).unwrap();
let class = self.classes.get(byte);
- self.byte += 1;
+ self.cur_byte += 1;
if self.last_class != Some(class) {
self.last_class = Some(class);
return Some(Unit::u8(byte));
}
}
- if self.byte == 256 {
- self.byte += 1;
+ if self.cur_byte != usize::MAX && self.end_byte.is_none() {
+ // Using usize::MAX as a sentinel is OK because we ban usize::MAX
+ // from appearing as a start bound in iterator construction. But
+ // why do it this way? Well, we want to return the EOI class
+ // whenever the end of the given range is unbounded because EOI
+ // isn't really a "byte" per se, so the only way it should be
+ // excluded is if there is a bounded end to the range. Therefore,
+ // when the end is unbounded, we just need to know whether we've
+ // reported EOI or not. When we do, we set cur_byte to a value it
+ // can never otherwise be.
+ self.cur_byte = usize::MAX;
return Some(self.classes.eoi());
}
None
@@ -363,6 +590,11 @@ impl<'a> Iterator for ByteClassRepresentatives<'a> {
}
/// An iterator over all elements in an equivalence class.
+///
+/// This is created by the [`ByteClasses::elements`] method.
+///
+/// The lifetime `'a` refers to the lifetime of the byte classes that this
+/// iterator was created from.
#[derive(Debug)]
pub struct ByteClassElements<'a> {
classes: &'a ByteClasses,
@@ -375,9 +607,9 @@ impl<'a> Iterator for ByteClassElements<'a> {
fn next(&mut self) -> Option<Unit> {
while self.byte < 256 {
- let byte = self.byte as u8;
+ let byte = u8::try_from(self.byte).unwrap();
self.byte += 1;
- if self.class.as_u8() == Some(self.classes.get(byte)) {
+ if self.class.is_byte(self.classes.get(byte)) {
return Some(Unit::u8(byte));
}
}
@@ -394,7 +626,7 @@ impl<'a> Iterator for ByteClassElements<'a> {
/// An iterator over all elements in an equivalence class expressed as a
/// sequence of contiguous ranges.
#[derive(Debug)]
-pub struct ByteClassElementRanges<'a> {
+struct ByteClassElementRanges<'a> {
elements: ByteClassElements<'a>,
range: Option<(Unit, Unit)>,
}
@@ -426,6 +658,8 @@ impl<'a> Iterator for ByteClassElementRanges<'a> {
}
}
+/// A partitioning of bytes into equivalence classes.
+///
/// A byte class set keeps track of an *approximation* of equivalence classes
/// of bytes during NFA construction. That is, every byte in an equivalence
/// class cannot discriminate between a match and a non-match.
@@ -446,21 +680,28 @@ impl<'a> Iterator for ByteClassElementRanges<'a> {
/// rethinking how equivalence classes are computed, including changing the
/// representation here, which is only able to group contiguous bytes into the
/// same equivalence class.)
+#[cfg(feature = "alloc")]
#[derive(Clone, Debug)]
-pub struct ByteClassSet(ByteSet);
+pub(crate) struct ByteClassSet(ByteSet);
+#[cfg(feature = "alloc")]
+impl Default for ByteClassSet {
+ fn default() -> ByteClassSet {
+ ByteClassSet::empty()
+ }
+}
+
+#[cfg(feature = "alloc")]
impl ByteClassSet {
/// Create a new set of byte classes where all bytes are part of the same
/// equivalence class.
- #[cfg(feature = "alloc")]
- pub fn empty() -> Self {
+ pub(crate) fn empty() -> Self {
ByteClassSet(ByteSet::empty())
}
/// Indicate that the given range of bytes (inclusive) can discriminate a
/// match between it and all other bytes outside of the range.
- #[cfg(feature = "alloc")]
- pub fn set_range(&mut self, start: u8, end: u8) {
+ pub(crate) fn set_range(&mut self, start: u8, end: u8) {
debug_assert!(start <= end);
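+ // The underlying set records class *boundaries*: a byte is added when
+ // it is the last byte of some equivalence class. Marking 'start - 1'
+ // and 'end' therefore separates this range from the bytes on either
+ // side of it.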
if start > 0 {
self.0.add(start - 1);
@@ -469,8 +710,7 @@ impl ByteClassSet {
}
/// Add the contiguous ranges in the set given to this byte class set.
- #[cfg(feature = "alloc")]
- pub fn add_set(&mut self, set: &ByteSet) {
+ pub(crate) fn add_set(&mut self, set: &ByteSet) {
for (start, end) in set.iter_ranges() {
self.set_range(start, end);
}
@@ -479,8 +719,7 @@ impl ByteClassSet {
/// Convert this boolean set to a map that maps all byte values to their
/// corresponding equivalence class. The last mapping indicates the largest
/// equivalence class identifier (which is never bigger than 255).
- #[cfg(feature = "alloc")]
- pub fn byte_classes(&self) -> ByteClasses {
+ pub(crate) fn byte_classes(&self) -> ByteClasses {
let mut classes = ByteClasses::empty();
let mut class = 0u8;
let mut b = 0u8;
@@ -500,7 +739,7 @@ impl ByteClassSet {
/// A simple set of bytes that is reasonably cheap to copy and allocation free.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
-pub struct ByteSet {
+pub(crate) struct ByteSet {
bits: BitSet,
}
@@ -511,90 +750,113 @@ struct BitSet([u128; 2]);
impl ByteSet {
/// Create an empty set of bytes.
- #[cfg(feature = "alloc")]
- pub fn empty() -> ByteSet {
+ pub(crate) fn empty() -> ByteSet {
ByteSet { bits: BitSet([0; 2]) }
}
/// Add a byte to this set.
///
/// If the given byte already belongs to this set, then this is a no-op.
- #[cfg(feature = "alloc")]
- pub fn add(&mut self, byte: u8) {
+ pub(crate) fn add(&mut self, byte: u8) {
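+ // The 256 bits are split across two u128 "buckets": bucket 0 covers
+ // bytes 0-127 and bucket 1 covers bytes 128-255.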
let bucket = byte / 128;
let bit = byte % 128;
- self.bits.0[bucket as usize] |= 1 << bit;
- }
-
- /// Add an inclusive range of bytes.
- #[cfg(feature = "alloc")]
- pub fn add_all(&mut self, start: u8, end: u8) {
- for b in start..=end {
- self.add(b);
- }
+ self.bits.0[usize::from(bucket)] |= 1 << bit;
}
/// Remove a byte from this set.
///
/// If the given byte is not in this set, then this is a no-op.
- #[cfg(feature = "alloc")]
- pub fn remove(&mut self, byte: u8) {
+ pub(crate) fn remove(&mut self, byte: u8) {
let bucket = byte / 128;
let bit = byte % 128;
- self.bits.0[bucket as usize] &= !(1 << bit);
- }
-
- /// Remove an inclusive range of bytes.
- #[cfg(feature = "alloc")]
- pub fn remove_all(&mut self, start: u8, end: u8) {
- for b in start..=end {
- self.remove(b);
- }
+ self.bits.0[usize::from(bucket)] &= !(1 << bit);
}
/// Return true if and only if the given byte is in this set.
- pub fn contains(&self, byte: u8) -> bool {
+ pub(crate) fn contains(&self, byte: u8) -> bool {
let bucket = byte / 128;
let bit = byte % 128;
- self.bits.0[bucket as usize] & (1 << bit) > 0
+ self.bits.0[usize::from(bucket)] & (1 << bit) > 0
}
/// Return true if and only if the given inclusive range of bytes is in
/// this set.
- #[cfg(feature = "alloc")]
- pub fn contains_range(&self, start: u8, end: u8) -> bool {
+ pub(crate) fn contains_range(&self, start: u8, end: u8) -> bool {
(start..=end).all(|b| self.contains(b))
}
/// Returns an iterator over all bytes in this set.
- #[cfg(feature = "alloc")]
- pub fn iter(&self) -> ByteSetIter {
+ pub(crate) fn iter(&self) -> ByteSetIter {
ByteSetIter { set: self, b: 0 }
}
/// Returns an iterator over all contiguous ranges of bytes in this set.
- #[cfg(feature = "alloc")]
- pub fn iter_ranges(&self) -> ByteSetRangeIter {
+ pub(crate) fn iter_ranges(&self) -> ByteSetRangeIter {
ByteSetRangeIter { set: self, b: 0 }
}
- /// Return the number of bytes in this set.
- #[cfg(feature = "alloc")]
- pub fn len(&self) -> usize {
- (self.bits.0[0].count_ones() + self.bits.0[1].count_ones()) as usize
- }
-
/// Return true if and only if this set is empty.
- #[cfg(feature = "alloc")]
- pub fn is_empty(&self) -> bool {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn is_empty(&self) -> bool {
self.bits.0 == [0, 0]
}
+
+ /// Deserializes a byte set from the given slice. If the slice is of
+ /// incorrect length or is otherwise malformed, then an error is returned.
+ /// Upon success, the number of bytes read along with the set are returned.
+ /// The number of bytes read is always a multiple of 8.
+ pub(crate) fn from_bytes(
+ slice: &[u8],
+ ) -> Result<(ByteSet, usize), DeserializeError> {
+ use core::mem::size_of;
+
+ wire::check_slice_len(slice, 2 * size_of::<u128>(), "byte set")?;
+ let mut nread = 0;
+ let (low, nr) = wire::try_read_u128(slice, "byte set low bucket")?;
+ nread += nr;
+ let (high, nr) = wire::try_read_u128(slice, "byte set high bucket")?;
+ nread += nr;
+ Ok((ByteSet { bits: BitSet([low, high]) }, nread))
+ }
+
+ /// Writes this byte set to the given byte buffer. If the given buffer is
+ /// too small, then an error is returned. Upon success, the total number of
+ /// bytes written is returned. The number of bytes written is guaranteed to
+ /// be a multiple of 8.
+ pub(crate) fn write_to<E: crate::util::wire::Endian>(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ use core::mem::size_of;
+
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("byte set"));
+ }
+ let mut nw = 0;
+ E::write_u128(self.bits.0[0], &mut dst[nw..]);
+ nw += size_of::<u128>();
+ E::write_u128(self.bits.0[1], &mut dst[nw..]);
+ nw += size_of::<u128>();
+ assert_eq!(nwrite, nw, "expected to write a certain number of bytes",);
+ assert_eq!(
+ nw % 8,
+ 0,
+ "expected to write multiple of 8 bytes for byte set",
+ );
+ Ok(nw)
+ }
+
+ /// Returns the total number of bytes written by `write_to`.
+ pub(crate) fn write_to_len(&self) -> usize {
+ 2 * core::mem::size_of::<u128>()
+ }
}
impl core::fmt::Debug for BitSet {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let mut fmtd = f.debug_set();
- for b in (0..256).map(|b| b as u8) {
+ for b in 0u8..=255 {
if (ByteSet { bits: *self }).contains(b) {
fmtd.entry(&b);
}
@@ -604,7 +866,7 @@ impl core::fmt::Debug for BitSet {
}
#[derive(Debug)]
-pub struct ByteSetIter<'a> {
+pub(crate) struct ByteSetIter<'a> {
set: &'a ByteSet,
b: usize,
}
@@ -614,7 +876,7 @@ impl<'a> Iterator for ByteSetIter<'a> {
fn next(&mut self) -> Option<u8> {
while self.b <= 255 {
- let b = self.b as u8;
+ let b = u8::try_from(self.b).unwrap();
self.b += 1;
if self.set.contains(b) {
return Some(b);
@@ -625,7 +887,7 @@ impl<'a> Iterator for ByteSetIter<'a> {
}
#[derive(Debug)]
-pub struct ByteSetRangeIter<'a> {
+pub(crate) struct ByteSetRangeIter<'a> {
set: &'a ByteSet,
b: usize,
}
@@ -634,16 +896,17 @@ impl<'a> Iterator for ByteSetRangeIter<'a> {
type Item = (u8, u8);
fn next(&mut self) -> Option<(u8, u8)> {
+ let asu8 = |n: usize| u8::try_from(n).unwrap();
while self.b <= 255 {
- let start = self.b as u8;
+ let start = asu8(self.b);
self.b += 1;
if !self.set.contains(start) {
continue;
}
let mut end = start;
- while self.b <= 255 && self.set.contains(self.b as u8) {
- end = self.b as u8;
+ while self.b <= 255 && self.set.contains(asu8(self.b)) {
+ end = asu8(self.b);
self.b += 1;
}
return Some((start, end));
@@ -652,8 +915,7 @@ impl<'a> Iterator for ByteSetRangeIter<'a> {
}
}
-#[cfg(test)]
-#[cfg(feature = "alloc")]
+#[cfg(all(test, feature = "alloc"))]
mod tests {
use alloc::{vec, vec::Vec};
@@ -694,8 +956,8 @@ mod tests {
#[test]
fn full_byte_classes() {
let mut set = ByteClassSet::empty();
- for i in 0..256u16 {
- set.set_range(i as u8, i as u8);
+ for b in 0u8..=255 {
+ set.set_range(b, b);
}
assert_eq!(set.byte_classes().alphabet_len(), 257);
}
@@ -787,4 +1049,91 @@ mod tests {
let elements = classes.elements(Unit::eoi(1)).collect::<Vec<_>>();
assert_eq!(elements, vec![Unit::eoi(256)]);
}
+
+ #[test]
+ fn representatives() {
+ let mut set = ByteClassSet::empty();
+ set.set_range(b'b', b'd');
+ set.set_range(b'g', b'm');
+ set.set_range(b'z', b'z');
+ let classes = set.byte_classes();
+
+ let got: Vec<Unit> = classes.representatives(..).collect();
+ let expected = vec![
+ Unit::u8(b'\x00'),
+ Unit::u8(b'b'),
+ Unit::u8(b'e'),
+ Unit::u8(b'g'),
+ Unit::u8(b'n'),
+ Unit::u8(b'z'),
+ Unit::u8(b'\x7B'),
+ Unit::eoi(7),
+ ];
+ assert_eq!(expected, got);
+
+ let got: Vec<Unit> = classes.representatives(..0).collect();
+ assert!(got.is_empty());
+ let got: Vec<Unit> = classes.representatives(1..1).collect();
+ assert!(got.is_empty());
+ let got: Vec<Unit> = classes.representatives(255..255).collect();
+ assert!(got.is_empty());
+
+ // A weird case: this is the only guaranteed way to get an iterator
+ // of just the EOI class by excluding all possible byte values.
+ let got: Vec<Unit> = classes
+ .representatives((
+ core::ops::Bound::Excluded(255),
+ core::ops::Bound::Unbounded,
+ ))
+ .collect();
+ let expected = vec![Unit::eoi(7)];
+ assert_eq!(expected, got);
+
+ let got: Vec<Unit> = classes.representatives(..=255).collect();
+ let expected = vec![
+ Unit::u8(b'\x00'),
+ Unit::u8(b'b'),
+ Unit::u8(b'e'),
+ Unit::u8(b'g'),
+ Unit::u8(b'n'),
+ Unit::u8(b'z'),
+ Unit::u8(b'\x7B'),
+ ];
+ assert_eq!(expected, got);
+
+ let got: Vec<Unit> = classes.representatives(b'b'..=b'd').collect();
+ let expected = vec![Unit::u8(b'b')];
+ assert_eq!(expected, got);
+
+ let got: Vec<Unit> = classes.representatives(b'a'..=b'd').collect();
+ let expected = vec![Unit::u8(b'a'), Unit::u8(b'b')];
+ assert_eq!(expected, got);
+
+ let got: Vec<Unit> = classes.representatives(b'b'..=b'e').collect();
+ let expected = vec![Unit::u8(b'b'), Unit::u8(b'e')];
+ assert_eq!(expected, got);
+
+ let got: Vec<Unit> = classes.representatives(b'A'..=b'Z').collect();
+ let expected = vec![Unit::u8(b'A')];
+ assert_eq!(expected, got);
+
+ let got: Vec<Unit> = classes.representatives(b'A'..=b'z').collect();
+ let expected = vec![
+ Unit::u8(b'A'),
+ Unit::u8(b'b'),
+ Unit::u8(b'e'),
+ Unit::u8(b'g'),
+ Unit::u8(b'n'),
+ Unit::u8(b'z'),
+ ];
+ assert_eq!(expected, got);
+
+ let got: Vec<Unit> = classes.representatives(b'z'..).collect();
+ let expected = vec![Unit::u8(b'z'), Unit::u8(b'\x7B'), Unit::eoi(7)];
+ assert_eq!(expected, got);
+
+ let got: Vec<Unit> = classes.representatives(b'z'..=0xFF).collect();
+ let expected = vec![Unit::u8(b'z'), Unit::u8(b'\x7B')];
+ assert_eq!(expected, got);
+ }
}
diff --git a/vendor/regex-automata/src/util/captures.rs b/vendor/regex-automata/src/util/captures.rs
new file mode 100644
index 000000000..cd3a5f8f7
--- /dev/null
+++ b/vendor/regex-automata/src/util/captures.rs
@@ -0,0 +1,2547 @@
+/*!
+Provides types for dealing with capturing groups.
+
+Capturing groups refer to sub-patterns of regexes that some regex engines can
+report matching offsets for. For example, matching `[a-z]([0-9]+)` against
+`a789` would give `a789` as the overall match (for the implicit capturing group
+at index `0`) and `789` as the match for the capturing group `([0-9]+)` (an
+explicit capturing group at index `1`).
+
+Not all regex engines can report match offsets for capturing groups. Indeed,
+to a first approximation, regex engines that can report capturing group offsets
+tend to be quite a bit slower than regex engines that can't. This is because
+tracking capturing groups at search time usually requires more "power," which
+in turn adds overhead.
+
+Other regex implementations might call capturing groups "submatches."
+
+# Overview
+
+The main types in this module are:
+
+* [`Captures`] records the capturing group offsets found during a search. It
+provides convenience routines for looking up capturing group offsets by either
+index or name.
+* [`GroupInfo`] records the mapping between capturing groups and "slots,"
+where the latter are how capturing groups are recorded during a regex search.
+This also keeps a mapping from capturing group name to index, and capture
+group index to name. A `GroupInfo` is used by `Captures` internally to
+provide a convenient API. It is unlikely that you'll use a `GroupInfo`
+directly, but for example, if you've compiled a Thompson NFA, then you can use
+[`thompson::NFA::group_info`](crate::nfa::thompson::NFA::group_info) to get its
+underlying `GroupInfo`.
+*/
+
+use alloc::{string::String, sync::Arc, vec, vec::Vec};
+
+use crate::util::{
+ interpolate,
+ primitives::{
+ NonMaxUsize, PatternID, PatternIDError, PatternIDIter, SmallIndex,
+ },
+ search::{Match, Span},
+};
+
+/// The span offsets of capturing groups after a match has been found.
+///
+/// This type represents the output of regex engines that can report the
+/// offsets at which capturing group matches or "submatches" occur. For
+/// example, the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). When a match
+/// occurs, it will at minimum contain the [`PatternID`] of the pattern that
+/// matched. Depending upon how it was constructed, it may also contain the
+/// start/end offsets of the entire match of the pattern and the start/end
+/// offsets of each capturing group that participated in the match.
+///
+/// Values of this type are always created for a specific [`GroupInfo`]. It is
+/// unspecified behavior to use a `Captures` value in a search with any regex
+/// engine that has a different `GroupInfo` than the one the `Captures` were
+/// created with.
+///
+/// # Constructors
+///
+/// There are three constructors for this type that control what kind of
+/// information is available upon a match:
+///
+/// * [`Captures::all`]: Will store overall pattern match offsets in addition
+/// to the offsets of capturing groups that participated in the match.
+/// * [`Captures::matches`]: Will store only the overall pattern
+/// match offsets. The offsets of capturing groups (even ones that participated
+/// in the match) are not available.
+/// * [`Captures::empty`]: Will only store the pattern ID that matched. No
+/// match offsets are available at all.
+///
+/// If you aren't sure which to choose, then pick the first one. The first one
+/// is what convenience routines like
+/// [`PikeVM::create_captures`](crate::nfa::thompson::pikevm::PikeVM::create_captures)
+/// will use automatically.
+///
+/// The main difference between these choices is performance. Namely, if you
+/// ask for _less_ information, then the execution of regex search may be able
+/// to run more quickly.
+///
+/// # Notes
+///
+/// It is worth pointing out that this type is not coupled to any one specific
+/// regex engine. Instead, its coupling is with [`GroupInfo`], which is the
+/// thing that is responsible for mapping capturing groups to "slot" offsets.
+/// Slot offsets are indices into a single sequence of memory at which matching
+/// haystack offsets for the corresponding group are written by regex engines.
+///
+/// # Example
+///
+/// This example shows how to parse a simple date and extract the components of
+/// the date via capturing groups:
+///
+/// ```
+/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
+///
+/// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
+/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+///
+/// re.captures(&mut cache, "2010-03-14", &mut caps);
+/// assert!(caps.is_match());
+/// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
+/// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
+/// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: named capturing groups
+///
+/// This example is like the one above, but leverages the ability to name
+/// capturing groups in order to make the code a bit clearer:
+///
+/// ```
+/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
+///
+/// let re = PikeVM::new(r"^(?P<y>[0-9]{4})-(?P<m>[0-9]{2})-(?P<d>[0-9]{2})$")?;
+/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+///
+/// re.captures(&mut cache, "2010-03-14", &mut caps);
+/// assert!(caps.is_match());
+/// assert_eq!(Some(Span::from(0..4)), caps.get_group_by_name("y"));
+/// assert_eq!(Some(Span::from(5..7)), caps.get_group_by_name("m"));
+/// assert_eq!(Some(Span::from(8..10)), caps.get_group_by_name("d"));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone)]
+pub struct Captures {
+ /// The group info that these capture groups are coupled to. This is what
+ /// gives the "convenience" of the `Captures` API. Namely, it provides the
+ /// slot mapping and the name-to-index mapping for capture lookups by name.
+ group_info: GroupInfo,
+ /// The ID of the pattern that matched. Regex engines must set this to
+ /// None when no match occurs.
+ pid: Option<PatternID>,
+ /// The slot values, i.e., submatch offsets.
+ ///
+ /// In theory, the smallest sequence of slots would be something like
+ /// `max(groups(pattern) for pattern in regex) * 2`, but instead, we use
+ /// `sum(groups(pattern) for pattern in regex) * 2`. Why?
+ ///
+ /// Well, the former could be used in theory, because we don't generally
+ /// have any overlapping APIs that involve capturing groups. Therefore,
+ /// there's technically never any need to have slots set for multiple
+ /// patterns. However, this might change some day, in which case, we would
+ /// need to have slots available.
+ ///
+ /// The other reason is that during the execution of some regex engines,
+ /// there exists a point in time where multiple slots for different
+ /// patterns may be written to before knowing which pattern has matched.
+ /// Therefore, the regex engines themselves, in order to support multiple
+ /// patterns correctly, must have all slots available. If `Captures`
+ /// doesn't have all slots available, then regex engines can't write
+ /// directly into the caller provided `Captures` and must instead write
+ /// into some other storage and then copy the slots involved in the match
+ /// at the end of the search.
+ ///
+ /// So overall, at least as of the time of writing, it seems like the path
+ /// of least resistance is to just require allocating all possible slots
+ /// instead of the conceptual minimum. Another way to justify this is that
+ /// the most common case is a single pattern, in which case, there is no
+ /// inefficiency here since the 'max' and 'sum' calculations above are
+ /// equivalent in that case.
+ ///
+ /// N.B. The mapping from group index to slot is maintained by `GroupInfo`
+ /// and is considered an API guarantee. See `GroupInfo` for more details on
+ /// that mapping.
+ ///
+ /// N.B. `Option<NonMaxUsize>` has the same size as a `usize`.
+ slots: Vec<Option<NonMaxUsize>>,
+}
+
+impl Captures {
+ /// Create new storage for the offsets of all matching capturing groups.
+ ///
+ /// This routine provides the most information for matches---namely, the
+ /// spans of matching capturing groups---but also requires the regex search
+ /// routines to do the most work.
+ ///
+ /// It is unspecified behavior to use the returned `Captures` value in a
+ /// search with a `GroupInfo` other than the one that is provided to this
+ /// constructor.
+ ///
+ /// # Example
+ ///
+ /// This example shows that all capturing groups---but only ones that
+ /// participated in a match---are available to query after a match has
+ /// been found:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// util::captures::Captures,
+ /// Span, Match,
+ /// };
+ ///
+ /// let re = PikeVM::new(
+ /// r"^(?:(?P<lower>[a-z]+)|(?P<upper>[A-Z]+))(?P<digits>[0-9]+)$",
+ /// )?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = Captures::all(re.get_nfa().group_info().clone());
+ ///
+ /// re.captures(&mut cache, "ABC123", &mut caps);
+ /// assert!(caps.is_match());
+ /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
+ /// // The 'lower' group didn't match, so it won't have any offsets.
+ /// assert_eq!(None, caps.get_group_by_name("lower"));
+ /// assert_eq!(Some(Span::from(0..3)), caps.get_group_by_name("upper"));
+ /// assert_eq!(Some(Span::from(3..6)), caps.get_group_by_name("digits"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn all(group_info: GroupInfo) -> Captures {
+ let slots = group_info.slot_len();
+ Captures { group_info, pid: None, slots: vec![None; slots] }
+ }
+
+ /// Create new storage for only the full match spans of a pattern. This
+ /// does not include any capturing group offsets.
+ ///
+ /// It is unspecified behavior to use the returned `Captures` value in a
+ /// search with a `GroupInfo` other than the one that is provided to this
+ /// constructor.
+ ///
+ /// # Example
+ ///
+ /// This example shows that only overall match offsets are reported when
+ /// this constructor is used. Accessing any capturing groups other than
+ /// the 0th will always return `None`.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// util::captures::Captures,
+ /// Match,
+ /// };
+ ///
+ /// let re = PikeVM::new(
+ /// r"^(?:(?P<lower>[a-z]+)|(?P<upper>[A-Z]+))(?P<digits>[0-9]+)$",
+ /// )?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = Captures::matches(re.get_nfa().group_info().clone());
+ ///
+ /// re.captures(&mut cache, "ABC123", &mut caps);
+ /// assert!(caps.is_match());
+ /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
+ /// // We didn't ask for capturing group offsets, so they aren't available.
+ /// assert_eq!(None, caps.get_group_by_name("lower"));
+ /// assert_eq!(None, caps.get_group_by_name("upper"));
+ /// assert_eq!(None, caps.get_group_by_name("digits"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn matches(group_info: GroupInfo) -> Captures {
+ // This is OK because we know there are at least this many slots,
+ // and GroupInfo construction guarantees that the number of slots fits
+ // into a usize.
+ let slots = group_info.pattern_len().checked_mul(2).unwrap();
+ Captures { group_info, pid: None, slots: vec![None; slots] }
+ }
+
+ /// Create new storage for only tracking which pattern matched. No offsets
+ /// are stored at all.
+ ///
+ /// It is unspecified behavior to use the returned `Captures` value in a
+ /// search with a `GroupInfo` other than the one that is provided to this
+ /// constructor.
+ ///
+ /// # Example
+ ///
+ /// This example shows that only the pattern that matched can be accessed
+ /// from a `Captures` value created via this constructor.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// util::captures::Captures,
+ /// PatternID,
+ /// };
+ ///
+ /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = Captures::empty(re.get_nfa().group_info().clone());
+ ///
+ /// re.captures(&mut cache, "aABCz", &mut caps);
+ /// assert!(caps.is_match());
+ /// assert_eq!(Some(PatternID::must(0)), caps.pattern());
+ /// // We didn't ask for any offsets, so they aren't available.
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// re.captures(&mut cache, &"aABCz"[1..], &mut caps);
+ /// assert!(caps.is_match());
+ /// assert_eq!(Some(PatternID::must(1)), caps.pattern());
+ /// // We didn't ask for any offsets, so they aren't available.
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn empty(group_info: GroupInfo) -> Captures {
+ Captures { group_info, pid: None, slots: vec![] }
+ }
+
+ /// Returns true if and only if this `Captures` value represents a match.
+ ///
+ /// This is a convenience routine for `caps.pattern().is_some()`.
+ ///
+ /// # Example
+ ///
+ /// When using the PikeVM (for example), the lightest weight way of
+ /// detecting whether a match exists is to create capturing groups that
+ /// only track the ID of the pattern that matched (if any):
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// util::captures::Captures,
+ /// };
+ ///
+ /// let re = PikeVM::new(r"[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = Captures::empty(re.get_nfa().group_info().clone());
+ ///
+ /// re.captures(&mut cache, "aABCz", &mut caps);
+ /// assert!(caps.is_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_match(&self) -> bool {
+ self.pid.is_some()
+ }
+
+ /// Returns the identifier of the pattern that matched when this
+ /// `Captures` value represents a match. If no match was found, then this
+ /// always returns `None`.
+ ///
+ /// This returns a pattern ID in precisely the cases in which `is_match`
+ /// returns `true`. Similarly, the pattern ID returned is always the
+ /// same pattern ID found in the `Match` returned by `get_match`.
+ ///
+ /// # Example
+ ///
+ /// When using the PikeVM (for example), the lightest weight way of
+ /// detecting which pattern matched is to create capturing groups that only
+ /// track the ID of the pattern that matched (if any):
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// util::captures::Captures,
+ /// PatternID,
+ /// };
+ ///
+ /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = Captures::empty(re.get_nfa().group_info().clone());
+ ///
+ /// re.captures(&mut cache, "ABC", &mut caps);
+ /// assert_eq!(Some(PatternID::must(1)), caps.pattern());
+ /// // Recall that offsets are only available when using a non-empty
+ /// // Captures value. So even though a match occurred, this returns None!
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn pattern(&self) -> Option<PatternID> {
+ self.pid
+ }
+
+ /// Returns the pattern ID and the span of the match, if one occurred.
+ ///
+ /// This always returns `None` when `Captures` was created with
+ /// [`Captures::empty`], even if a match was found.
+ ///
+ /// If this routine returns a non-`None` value, then `is_match` is
+ /// guaranteed to return `true` and `pattern` is also guaranteed to return
+ /// a non-`None` value.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to get the full match from a search:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "ABC", &mut caps);
+ /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn get_match(&self) -> Option<Match> {
+ Some(Match::new(self.pattern()?, self.get_group(0)?))
+ }
+
+ /// Returns the span of a capturing group match corresponding to the group
+ /// index given, only if both the overall pattern matched and the capturing
+ /// group participated in that match.
+ ///
+ /// This returns `None` if `index` is invalid. `index` is valid if and only
+ /// if it's less than [`Captures::group_len`] for the matching pattern.
+ ///
+ /// This always returns `None` when `Captures` was created with
+ /// [`Captures::empty`], even if a match was found. This also always
+ /// returns `None` for any `index > 0` when `Captures` was created with
+ /// [`Captures::matches`].
+ ///
+ /// If this routine returns a non-`None` value, then `is_match` is
+ /// guaranteed to return `true`, `pattern` is guaranteed to return a
+ /// non-`None` value and `get_match` is guaranteed to return a non-`None`
+ /// value.
+ ///
+ /// By convention, the 0th capture group will always return the same
+ /// span as the span returned by `get_match`. This is because the 0th
+ /// capture group always corresponds to the entirety of the pattern's
+ /// match. (It is similarly always unnamed because it is implicit.) This
+ /// isn't necessarily true of all regex engines. For example, one can
+ /// hand-compile a [`thompson::NFA`](crate::nfa::thompson::NFA) via a
+ /// [`thompson::Builder`](crate::nfa::thompson::Builder), which isn't
+ /// technically forced to make the 0th capturing group always correspond to
+ /// the entire match.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to get the capturing groups, by index, from a
+ /// match:
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match};
+ ///
+ /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match());
+ /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1));
+ /// assert_eq!(Some(Span::from(6..17)), caps.get_group(2));
+ /// // Looking for a non-existent capturing group will return None:
+ /// assert_eq!(None, caps.get_group(3));
+ /// assert_eq!(None, caps.get_group(9944060567225171988));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn get_group(&self, index: usize) -> Option<Span> {
+ let pid = self.pattern()?;
+ // There's a little bit of work needed to map captures to slots in the
+ // fully general case. But in the overwhelmingly common case of a single
+ // pattern, we can just do some simple arithmetic.
+ let (slot_start, slot_end) = if self.group_info().pattern_len() == 1 {
+ (index.checked_mul(2)?, index.checked_mul(2)?.checked_add(1)?)
+ } else {
+ self.group_info().slots(pid, index)?
+ };
+ let start = self.slots.get(slot_start).copied()??;
+ let end = self.slots.get(slot_end).copied()??;
+ Some(Span { start: start.get(), end: end.get() })
+ }
+
+ /// Returns the span of a capturing group match corresponding to the group
+ /// name given, only if both the overall pattern matched and the capturing
+ /// group participated in that match.
+ ///
+ /// This returns `None` if `name` does not correspond to a valid capturing
+ /// group for the pattern that matched.
+ ///
+ /// This always returns `None` when `Captures` was created with
+ /// [`Captures::empty`], even if a match was found. This also always
+ /// returns `None` for any `index > 0` when `Captures` was created with
+ /// [`Captures::matches`].
+ ///
+ /// If this routine returns a non-`None` value, then `is_match` is
+ /// guaranteed to return `true`, `pattern` is guaranteed to return a
+ /// non-`None` value and `get_match` is guaranteed to return a non-`None`
+ /// value.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to get the capturing groups, by name, from a
+ /// match:
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match};
+ ///
+ /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match());
+ /// assert_eq!(Some(Span::from(0..5)), caps.get_group_by_name("first"));
+ /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last"));
+ /// // Looking for a non-existent capturing group will return None:
+ /// assert_eq!(None, caps.get_group_by_name("middle"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn get_group_by_name(&self, name: &str) -> Option<Span> {
+ let index = self.group_info().to_index(self.pattern()?, name)?;
+ self.get_group(index)
+ }
+
+ /// Returns an iterator of possible spans for every capturing group in the
+ /// matching pattern.
+ ///
+ /// If this `Captures` value does not correspond to a match, then the
+ /// iterator returned yields no elements.
+ ///
+ /// Note that the iterator returned yields elements of type `Option<Span>`.
+ /// A span is present if and only if it corresponds to a capturing group
+ /// that participated in a match.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to collect all capturing groups:
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
+ ///
+ /// let re = PikeVM::new(
+ /// // Matches first/last names, with an optional middle name.
+ /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$",
+ /// )?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "Harry James Potter", &mut caps);
+ /// assert!(caps.is_match());
+ /// let groups: Vec<Option<Span>> = caps.iter().collect();
+ /// assert_eq!(groups, vec![
+ /// Some(Span::from(0..18)),
+ /// Some(Span::from(0..5)),
+ /// Some(Span::from(6..11)),
+ /// Some(Span::from(12..18)),
+ /// ]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// This example uses the same regex as the previous example, but with a
+ /// haystack that omits the middle name. This results in a capturing group
+ /// that is present in the elements yielded by the iterator but without a
+ /// match:
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
+ ///
+ /// let re = PikeVM::new(
+ /// // Matches first/last names, with an optional middle name.
+ /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$",
+ /// )?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "Harry Potter", &mut caps);
+ /// assert!(caps.is_match());
+ /// let groups: Vec<Option<Span>> = caps.iter().collect();
+ /// assert_eq!(groups, vec![
+ /// Some(Span::from(0..12)),
+ /// Some(Span::from(0..5)),
+ /// None,
+ /// Some(Span::from(6..12)),
+ /// ]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn iter(&self) -> CapturesPatternIter<'_> {
+ let names = self
+ .pattern()
+ .map_or(GroupInfoPatternNames::empty().enumerate(), |pid| {
+ self.group_info().pattern_names(pid).enumerate()
+ });
+ CapturesPatternIter { caps: self, names }
+ }
+
+ /// Return the total number of capturing groups for the matching pattern.
+ ///
+ /// If this `Captures` value does not correspond to a match, then this
+ /// always returns `0`.
+ ///
+ /// This always returns the same number of elements yielded by
+ /// [`Captures::iter`]. That is, the number includes capturing groups even
+ /// if they don't participate in the match.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to count the total number of capturing groups
+ /// associated with a pattern. Notice that it includes groups that did not
+ /// participate in a match (just like `Captures::iter` does).
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::nfa::thompson::pikevm::PikeVM;
+ ///
+ /// let re = PikeVM::new(
+ /// // Matches first/last names, with an optional middle name.
+ /// r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$",
+ /// )?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "Harry Potter", &mut caps);
+ /// assert_eq!(4, caps.group_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn group_len(&self) -> usize {
+ let pid = match self.pattern() {
+ None => return 0,
+ Some(pid) => pid,
+ };
+ self.group_info().group_len(pid)
+ }
+
+ /// Returns a reference to the underlying group info on which these
+ /// captures are based.
+ ///
+ /// The difference between `GroupInfo` and `Captures` is that the former
+    /// defines the structure of capturing groups, whereas the latter is what
+    /// stores the actual match information. So whereas `Captures` only gives
+ /// you access to the current match, `GroupInfo` lets you query any
+ /// information about all capturing groups, even ones for patterns that
+ /// weren't involved in a match.
+ ///
+ /// Note that a `GroupInfo` uses reference counting internally, so it may
+ /// be cloned cheaply.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to get all capturing group names from the
+ /// underlying `GroupInfo`. Notice that we don't even need to run a
+ /// search.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
+ ///
+ /// let re = PikeVM::new_many(&[
+ /// r"(?P<foo>a)",
+ /// r"(a)(b)",
+ /// r"ab",
+ /// r"(?P<bar>a)(?P<quux>a)",
+ /// r"(?P<foo>z)",
+ /// ])?;
+ /// let caps = re.create_captures();
+ ///
+ /// let expected = vec![
+ /// (PatternID::must(0), 0, None),
+ /// (PatternID::must(0), 1, Some("foo")),
+ /// (PatternID::must(1), 0, None),
+ /// (PatternID::must(1), 1, None),
+ /// (PatternID::must(1), 2, None),
+ /// (PatternID::must(2), 0, None),
+ /// (PatternID::must(3), 0, None),
+ /// (PatternID::must(3), 1, Some("bar")),
+ /// (PatternID::must(3), 2, Some("quux")),
+ /// (PatternID::must(4), 0, None),
+ /// (PatternID::must(4), 1, Some("foo")),
+ /// ];
+ /// // We could also just use 're.get_nfa().group_info()'.
+ /// let got: Vec<(PatternID, usize, Option<&str>)> =
+ /// caps.group_info().all_names().collect();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn group_info(&self) -> &GroupInfo {
+ &self.group_info
+ }
+
+ /// Interpolates the capture references in `replacement` with the
+ /// corresponding substrings in `haystack` matched by each reference. The
+ /// interpolated string is returned.
+ ///
+ /// See the [`interpolate` module](interpolate) for documentation on the
+ /// format of the replacement string.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use interpolation, and also shows how it
+ /// can work with multi-pattern regexes.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
+ ///
+ /// let re = PikeVM::new_many(&[
+ /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
+ /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})",
+ /// ])?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ ///
+ /// let replacement = "year=$year, month=$month, day=$day";
+ ///
+ /// // This matches the first pattern.
+ /// let hay = "On 14-03-2010, I became a Tenneessee lamb.";
+ /// re.captures(&mut cache, hay, &mut caps);
+ /// let result = caps.interpolate_string(hay, replacement);
+ /// assert_eq!("year=2010, month=03, day=14", result);
+ ///
+ /// // And this matches the second pattern.
+ /// let hay = "On 2010-03-14, I became a Tenneessee lamb.";
+ /// re.captures(&mut cache, hay, &mut caps);
+ /// let result = caps.interpolate_string(hay, replacement);
+ /// assert_eq!("year=2010, month=03, day=14", result);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn interpolate_string(
+ &self,
+ haystack: &str,
+ replacement: &str,
+ ) -> String {
+ let mut dst = String::new();
+ self.interpolate_string_into(haystack, replacement, &mut dst);
+ dst
+ }
+
+ /// Interpolates the capture references in `replacement` with the
+ /// corresponding substrings in `haystack` matched by each reference. The
+ /// interpolated string is written to `dst`.
+ ///
+ /// See the [`interpolate` module](interpolate) for documentation on the
+ /// format of the replacement string.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use interpolation, and also shows how it
+ /// can work with multi-pattern regexes.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
+ ///
+ /// let re = PikeVM::new_many(&[
+ /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
+ /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})",
+ /// ])?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ ///
+ /// let replacement = "year=$year, month=$month, day=$day";
+ ///
+ /// // This matches the first pattern.
+ /// let hay = "On 14-03-2010, I became a Tenneessee lamb.";
+ /// re.captures(&mut cache, hay, &mut caps);
+ /// let mut dst = String::new();
+ /// caps.interpolate_string_into(hay, replacement, &mut dst);
+ /// assert_eq!("year=2010, month=03, day=14", dst);
+ ///
+ /// // And this matches the second pattern.
+ /// let hay = "On 2010-03-14, I became a Tenneessee lamb.";
+ /// re.captures(&mut cache, hay, &mut caps);
+ /// let mut dst = String::new();
+ /// caps.interpolate_string_into(hay, replacement, &mut dst);
+ /// assert_eq!("year=2010, month=03, day=14", dst);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn interpolate_string_into(
+ &self,
+ haystack: &str,
+ replacement: &str,
+ dst: &mut String,
+ ) {
+ interpolate::string(
+ replacement,
+ |index, dst| {
+ let span = match self.get_group(index) {
+ None => return,
+ Some(span) => span,
+ };
+ dst.push_str(&haystack[span]);
+ },
+ |name| self.group_info().to_index(self.pattern()?, name),
+ dst,
+ );
+ }
+
+ /// Interpolates the capture references in `replacement` with the
+ /// corresponding substrings in `haystack` matched by each reference. The
+ /// interpolated byte string is returned.
+ ///
+ /// See the [`interpolate` module](interpolate) for documentation on the
+ /// format of the replacement string.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use interpolation, and also shows how it
+ /// can work with multi-pattern regexes.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
+ ///
+ /// let re = PikeVM::new_many(&[
+ /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
+ /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})",
+ /// ])?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ ///
+ /// let replacement = b"year=$year, month=$month, day=$day";
+ ///
+ /// // This matches the first pattern.
+ /// let hay = b"On 14-03-2010, I became a Tenneessee lamb.";
+ /// re.captures(&mut cache, hay, &mut caps);
+ /// let result = caps.interpolate_bytes(hay, replacement);
+ /// assert_eq!(&b"year=2010, month=03, day=14"[..], result);
+ ///
+ /// // And this matches the second pattern.
+ /// let hay = b"On 2010-03-14, I became a Tenneessee lamb.";
+ /// re.captures(&mut cache, hay, &mut caps);
+ /// let result = caps.interpolate_bytes(hay, replacement);
+ /// assert_eq!(&b"year=2010, month=03, day=14"[..], result);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn interpolate_bytes(
+ &self,
+ haystack: &[u8],
+ replacement: &[u8],
+ ) -> Vec<u8> {
+ let mut dst = vec![];
+ self.interpolate_bytes_into(haystack, replacement, &mut dst);
+ dst
+ }
+
+ /// Interpolates the capture references in `replacement` with the
+ /// corresponding substrings in `haystack` matched by each reference. The
+ /// interpolated byte string is written to `dst`.
+ ///
+ /// See the [`interpolate` module](interpolate) for documentation on the
+ /// format of the replacement string.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use interpolation, and also shows how it
+ /// can work with multi-pattern regexes.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
+ ///
+ /// let re = PikeVM::new_many(&[
+ /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
+ /// r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})",
+ /// ])?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ ///
+ /// let replacement = b"year=$year, month=$month, day=$day";
+ ///
+ /// // This matches the first pattern.
+ /// let hay = b"On 14-03-2010, I became a Tenneessee lamb.";
+ /// re.captures(&mut cache, hay, &mut caps);
+ /// let mut dst = vec![];
+ /// caps.interpolate_bytes_into(hay, replacement, &mut dst);
+ /// assert_eq!(&b"year=2010, month=03, day=14"[..], dst);
+ ///
+ /// // And this matches the second pattern.
+ /// let hay = b"On 2010-03-14, I became a Tenneessee lamb.";
+ /// re.captures(&mut cache, hay, &mut caps);
+ /// let mut dst = vec![];
+ /// caps.interpolate_bytes_into(hay, replacement, &mut dst);
+ /// assert_eq!(&b"year=2010, month=03, day=14"[..], dst);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn interpolate_bytes_into(
+ &self,
+ haystack: &[u8],
+ replacement: &[u8],
+ dst: &mut Vec<u8>,
+ ) {
+ interpolate::bytes(
+ replacement,
+ |index, dst| {
+ let span = match self.get_group(index) {
+ None => return,
+ Some(span) => span,
+ };
+ dst.extend_from_slice(&haystack[span]);
+ },
+ |name| self.group_info().to_index(self.pattern()?, name),
+ dst,
+ );
+ }
+
+ /// This is a convenience routine for extracting the substrings
+ /// corresponding to matching capture groups in the given `haystack`. The
+    /// `haystack` should be the same string searched to find the match spans
+ /// this `Captures` value.
+ ///
+ /// This is identical to [`Captures::extract_bytes`], except it works with
+ /// `&str` instead of `&[u8]`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the number of explicit matching groups in this
+ /// `Captures` value is less than `N`. This also panics if this `Captures`
+ /// value does not correspond to a match.
+ ///
+ /// Note that this does *not* panic if the number of explicit matching
+ /// groups is bigger than `N`. In that case, only the first `N` matching
+ /// groups are extracted.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::pikevm::PikeVM;
+ ///
+ /// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ ///
+ /// let hay = "On 2010-03-14, I became a Tenneessee lamb.";
+ /// re.captures(&mut cache, hay, &mut caps);
+ /// assert!(caps.is_match());
+ /// let (full, [year, month, day]) = caps.extract(hay);
+ /// assert_eq!("2010-03-14", full);
+ /// assert_eq!("2010", year);
+ /// assert_eq!("03", month);
+ /// assert_eq!("14", day);
+ ///
+ /// // We can also ask for fewer than all capture groups.
+ /// let (full, [year]) = caps.extract(hay);
+ /// assert_eq!("2010-03-14", full);
+ /// assert_eq!("2010", year);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn extract<'h, const N: usize>(
+ &self,
+ haystack: &'h str,
+ ) -> (&'h str, [&'h str; N]) {
+ let mut matched = self.iter().flatten();
+ let whole_match = &haystack[matched.next().expect("a match")];
+ let group_matches = [0; N].map(|_| {
+ let sp = matched.next().expect("too few matching groups");
+ &haystack[sp]
+ });
+ (whole_match, group_matches)
+ }
+
+ /// This is a convenience routine for extracting the substrings
+ /// corresponding to matching capture groups in the given `haystack`. The
+    /// `haystack` should be the same string searched to find the match spans
+ /// this `Captures` value.
+ ///
+ /// This is identical to [`Captures::extract`], except it works with
+ /// `&[u8]` instead of `&str`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the number of explicit matching groups in this
+ /// `Captures` value is less than `N`. This also panics if this `Captures`
+ /// value does not correspond to a match.
+ ///
+ /// Note that this does *not* panic if the number of explicit matching
+ /// groups is bigger than `N`. In that case, only the first `N` matching
+ /// groups are extracted.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::pikevm::PikeVM;
+ ///
+ /// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ ///
+ /// let hay = b"On 2010-03-14, I became a Tenneessee lamb.";
+ /// re.captures(&mut cache, hay, &mut caps);
+ /// assert!(caps.is_match());
+ /// let (full, [year, month, day]) = caps.extract_bytes(hay);
+ /// assert_eq!(b"2010-03-14", full);
+ /// assert_eq!(b"2010", year);
+ /// assert_eq!(b"03", month);
+ /// assert_eq!(b"14", day);
+ ///
+ /// // We can also ask for fewer than all capture groups.
+ /// let (full, [year]) = caps.extract_bytes(hay);
+ /// assert_eq!(b"2010-03-14", full);
+ /// assert_eq!(b"2010", year);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn extract_bytes<'h, const N: usize>(
+ &self,
+ haystack: &'h [u8],
+ ) -> (&'h [u8], [&'h [u8]; N]) {
+ let mut matched = self.iter().flatten();
+ let whole_match = &haystack[matched.next().expect("a match")];
+ let group_matches = [0; N].map(|_| {
+ let sp = matched.next().expect("too few matching groups");
+ &haystack[sp]
+ });
+ (whole_match, group_matches)
+ }
+}
+
+/// Lower level "slot" oriented APIs. One does not typically need to use these
+/// when executing a search. They are instead mostly intended for folks that
+/// are writing their own regex engine while reusing this `Captures` type.
+impl Captures {
+ /// Clear this `Captures` value.
+ ///
+ /// After clearing, all slots inside this `Captures` value will be set to
+ /// `None`. Similarly, any pattern ID that it was previously associated
+ /// with (for a match) is erased.
+ ///
+ /// It is not usually necessary to call this routine. Namely, a `Captures`
+ /// value only provides high level access to the capturing groups of the
+ /// pattern that matched, and only low level access to individual slots.
+ /// Thus, even if slots corresponding to groups that aren't associated
+ /// with the matching pattern are set, then it won't impact the higher
+ /// level APIs. Namely, higher level APIs like [`Captures::get_group`] will
+ /// return `None` if no pattern ID is present, even if there are spans set
+ /// in the underlying slots.
+ ///
+ /// Thus, to "clear" a `Captures` value of a match, it is usually only
+ /// necessary to call [`Captures::set_pattern`] with `None`.
+ ///
+ /// # Example
+ ///
+ /// This example shows what happens when a `Captures` value is cleared.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::nfa::thompson::pikevm::PikeVM;
+ ///
+ /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
+ /// assert!(caps.is_match());
+ /// let slots: Vec<Option<usize>> =
+ /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect();
+ /// // Note that the following ordering is considered an API guarantee.
+ /// assert_eq!(slots, vec![
+ /// Some(0),
+ /// Some(17),
+ /// Some(0),
+ /// Some(5),
+ /// Some(6),
+ /// Some(17),
+ /// ]);
+ ///
+ /// // Now clear the slots. Everything is gone and it is no longer a match.
+ /// caps.clear();
+ /// assert!(!caps.is_match());
+ /// let slots: Vec<Option<usize>> =
+ /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect();
+ /// assert_eq!(slots, vec![
+ /// None,
+ /// None,
+ /// None,
+ /// None,
+ /// None,
+ /// None,
+ /// ]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn clear(&mut self) {
+ self.pid = None;
+ for slot in self.slots.iter_mut() {
+ *slot = None;
+ }
+ }
+
+ /// Set the pattern on this `Captures` value.
+ ///
+ /// When the pattern ID is `None`, then this `Captures` value does not
+ /// correspond to a match (`is_match` will return `false`). Otherwise, it
+ /// corresponds to a match.
+ ///
+ /// This is useful in search implementations where you might want to
+ /// initially call `set_pattern(None)` in order to avoid the cost of
+ /// calling `clear()` if it turns out to not be necessary.
+ ///
+ /// # Example
+ ///
+ /// This example shows that `set_pattern` merely overwrites the pattern ID.
+ /// It does not actually change the underlying slot values.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::nfa::thompson::pikevm::PikeVM;
+ ///
+ /// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
+ /// assert!(caps.is_match());
+ /// assert!(caps.pattern().is_some());
+ /// let slots: Vec<Option<usize>> =
+ /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect();
+ /// // Note that the following ordering is considered an API guarantee.
+ /// assert_eq!(slots, vec![
+ /// Some(0),
+ /// Some(17),
+ /// Some(0),
+ /// Some(5),
+ /// Some(6),
+ /// Some(17),
+ /// ]);
+ ///
+ /// // Now set the pattern to None. Note that the slot values remain.
+ /// caps.set_pattern(None);
+ /// assert!(!caps.is_match());
+ /// assert!(!caps.pattern().is_some());
+ /// let slots: Vec<Option<usize>> =
+ /// caps.slots().iter().map(|s| s.map(|x| x.get())).collect();
+ /// // Note that the following ordering is considered an API guarantee.
+ /// assert_eq!(slots, vec![
+ /// Some(0),
+ /// Some(17),
+ /// Some(0),
+ /// Some(5),
+ /// Some(6),
+ /// Some(17),
+ /// ]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn set_pattern(&mut self, pid: Option<PatternID>) {
+ self.pid = pid;
+ }
+
+ /// Returns the underlying slots, where each slot stores a single offset.
+ ///
+    /// Every capturing group generally corresponds to two slots: one
+ /// slot for the starting position and another for the ending position.
+ /// Typically, either both are present or neither are. (The weasel word
+ /// "typically" is used here because it really depends on the regex engine
+ /// implementation. Every sensible regex engine likely adheres to this
+ /// invariant, and every regex engine in this crate is sensible.)
+ ///
+ /// Generally speaking, callers should prefer to use higher level routines
+ /// like [`Captures::get_match`] or [`Captures::get_group`].
+ ///
+ /// An important note here is that a regex engine may not reset all of the
+ /// slots to `None` values when no match occurs, or even when a match of
+ /// a different pattern occurs. But this depends on how the regex engine
+ /// implementation deals with slots.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to get the underlying slots from a regex match.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// util::primitives::{PatternID, NonMaxUsize},
+ /// };
+ ///
+ /// let re = PikeVM::new_many(&[
+ /// r"[a-z]+",
+ /// r"[0-9]+",
+ /// ])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "123", &mut caps);
+ /// assert_eq!(Some(PatternID::must(1)), caps.pattern());
+ /// // Note that the only guarantee we have here is that slots 2 and 3
+ /// // are set to correct values. The contents of the first two slots are
+ /// // unspecified since the 0th pattern did not match.
+ /// let expected = &[
+ /// None,
+ /// None,
+ /// NonMaxUsize::new(0),
+ /// NonMaxUsize::new(3),
+ /// ];
+ /// assert_eq!(expected, caps.slots());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn slots(&self) -> &[Option<NonMaxUsize>] {
+ &self.slots
+ }
+
+ /// Returns the underlying slots as a mutable slice, where each slot stores
+ /// a single offset.
+ ///
+ /// This tends to be most useful for regex engine implementations for
+ /// writing offsets for matching capturing groups to slots.
+ ///
+ /// See [`Captures::slots`] for more information about slots.
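+    ///
+    /// # Example
+    ///
+    /// A minimal sketch, added here for illustration, of how an external
+    /// regex engine implementation might record a match by writing offsets
+    /// directly into the slots. It relies on the guaranteed slot mapping
+    /// documented on [`GroupInfo`]: the implicit group of the first pattern
+    /// occupies slots `0` and `1`.
+    ///
+    /// ```
+    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+    /// use regex_automata::{
+    ///     nfa::thompson::pikevm::PikeVM,
+    ///     util::primitives::NonMaxUsize,
+    ///     PatternID,
+    /// };
+    ///
+    /// let re = PikeVM::new(r"a")?;
+    /// let mut caps = re.create_captures();
+    ///
+    /// // Pretend to be a regex engine that found a match of pattern 0 at
+    /// // span 0..1: write both halves of the implicit group, then mark
+    /// // which pattern matched.
+    /// caps.slots_mut()[0] = NonMaxUsize::new(0);
+    /// caps.slots_mut()[1] = NonMaxUsize::new(1);
+    /// caps.set_pattern(Some(PatternID::ZERO));
+    ///
+    /// assert!(caps.is_match());
+    /// assert_eq!(Some(0..1), caps.get_match().map(|m| m.range()));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```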
+ #[inline]
+ pub fn slots_mut(&mut self) -> &mut [Option<NonMaxUsize>] {
+ &mut self.slots
+ }
+}
+
+impl core::fmt::Debug for Captures {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut dstruct = f.debug_struct("Captures");
+ dstruct.field("pid", &self.pid);
+ if let Some(pid) = self.pid {
+ dstruct.field("spans", &CapturesDebugMap { pid, caps: self });
+ }
+ dstruct.finish()
+ }
+}
+
+/// A little helper type to provide a nice map-like debug representation for
+/// our capturing group spans.
+struct CapturesDebugMap<'a> {
+ pid: PatternID,
+ caps: &'a Captures,
+}
+
+impl<'a> core::fmt::Debug for CapturesDebugMap<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ struct Key<'a>(usize, Option<&'a str>);
+
+ impl<'a> core::fmt::Debug for Key<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "{}", self.0)?;
+ if let Some(name) = self.1 {
+ write!(f, "/{:?}", name)?;
+ }
+ Ok(())
+ }
+ }
+
+ let mut map = f.debug_map();
+ let names = self.caps.group_info().pattern_names(self.pid);
+ for (group_index, maybe_name) in names.enumerate() {
+ let key = Key(group_index, maybe_name);
+ match self.caps.get_group(group_index) {
+ None => map.entry(&key, &None::<()>),
+ Some(span) => map.entry(&key, &span),
+ };
+ }
+ map.finish()
+ }
+}
+
+/// An iterator over all capturing groups in a `Captures` value.
+///
+/// This iterator includes capturing groups that did not participate in a
+/// match. See the [`Captures::iter`] method documentation for more details
+/// and examples.
+///
+/// The lifetime parameter `'a` refers to the lifetime of the underlying
+/// `Captures` value.
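+///
+/// # Example
+///
+/// A small sketch, added here for illustration, showing that this iterator
+/// is exact-sized: its length always matches [`Captures::group_len`].
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::nfa::thompson::pikevm::PikeVM;
+///
+/// let re = PikeVM::new(r"(a)(b)?")?;
+/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+///
+/// re.captures(&mut cache, "ab", &mut caps);
+/// assert!(caps.is_match());
+/// // One implicit group plus two explicit groups.
+/// assert_eq!(3, caps.iter().len());
+/// assert_eq!(caps.group_len(), caps.iter().len());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```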
+#[derive(Clone, Debug)]
+pub struct CapturesPatternIter<'a> {
+ caps: &'a Captures,
+ names: core::iter::Enumerate<GroupInfoPatternNames<'a>>,
+}
+
+impl<'a> Iterator for CapturesPatternIter<'a> {
+ type Item = Option<Span>;
+
+ fn next(&mut self) -> Option<Option<Span>> {
+ let (group_index, _) = self.names.next()?;
+ Some(self.caps.get_group(group_index))
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.names.size_hint()
+ }
+
+ fn count(self) -> usize {
+ self.names.count()
+ }
+}
+
+impl<'a> ExactSizeIterator for CapturesPatternIter<'a> {}
+impl<'a> core::iter::FusedIterator for CapturesPatternIter<'a> {}
+
+/// Represents information about capturing groups in a compiled regex.
+///
+/// The information encapsulated by this type consists of the following. For
+/// each pattern:
+///
+/// * A map from every capture group name to its corresponding capture group
+/// index.
+/// * A map from every capture group index to its corresponding capture group
+/// name.
+/// * A map from capture group index to its corresponding slot index. A slot
+/// refers to one half of a capturing group. That is, a capture slot is either
+/// the start or end of a capturing group. A slot is usually the mechanism
+/// by which a regex engine records offsets for each capturing group during a
+/// search.
+///
+/// A `GroupInfo` uses reference counting internally and is thus cheap to
+/// clone.
+///
+/// # Mapping from capture groups to slots
+///
+/// One of the main responsibilities of a `GroupInfo` is to build a mapping
+/// from `(PatternID, u32)` (where the `u32` is a capture index) to something
+/// called a "slot." As mentioned above, a slot refers to one half of a
+/// capturing group. Both combined provide the start and end offsets of
+/// a capturing group that participated in a match.
+///
+/// **The mapping between group indices and slots is an API guarantee.** That
+/// is, the mapping won't change within a semver compatible release.
+///
+/// Slots exist primarily because this is a convenient mechanism by which
+/// regex engines report group offsets at search time. For example, the
+/// [`nfa::thompson::State::Capture`](crate::nfa::thompson::State::Capture)
+/// NFA state includes the slot index. When a regex engine transitions through
+/// this state, it will likely use the slot index to write the current haystack
+/// offset to some region of memory. When a match is found, those slots are
+/// then reported to the caller, typically via a convenient abstraction like a
+/// [`Captures`] value.
+///
+/// Because this crate provides first-class support for multi-pattern regexes,
+/// and for some performance-related reasons, the mapping between capturing
+/// groups and slots is a little complex. However, in the case of a single
+/// pattern, the mapping can be described very simply: for every capture
+/// group index `i`, its corresponding slots are at `i * 2` and `i * 2 + 1`.
+/// Notice that the pattern ID isn't involved at all here: since this formula
+/// only applies to a single-pattern regex, the pattern ID is always `0`.
+///
+/// In the multi-pattern case, the mapping is a bit more complicated. To talk
+/// about it, we must define what we mean by "implicit" vs "explicit"
+/// capturing groups:
+///
+/// * An **implicit** capturing group refers to the capturing group that is
+/// present for every pattern automatically, and corresponds to the overall
+/// match of a pattern. Every pattern has precisely one implicit capturing
+/// group. It is always unnamed and it always corresponds to the capture group
+/// index `0`.
+/// * An **explicit** capturing group refers to any capturing group that
+/// appears in the concrete syntax of the pattern. (Or, if an NFA was hand
+/// built without any concrete syntax, it refers to any capturing group with an
+/// index greater than `0`.)
+///
+/// Some examples:
+///
+/// * `\w+` has one implicit capturing group and zero explicit capturing
+/// groups.
+/// * `(\w+)` has one implicit group and one explicit group.
+/// * `foo(\d+)(?:\pL+)(\d+)` has one implicit group and two explicit groups.
+///
+/// Turning back to the slot mapping, we can now state it as follows:
+///
+/// * Given a pattern ID `pid`, the slots for its implicit group are always
+/// at `pid * 2` and `pid * 2 + 1`.
+/// * Given a pattern ID `0`, the slots for its explicit groups start
+/// at `group_info.pattern_len() * 2`.
+/// * Given a pattern ID `pid > 0`, the slots for its explicit groups start
+/// immediately following where the slots for the explicit groups of `pid - 1`
+/// end.
+///
+/// In particular, while there is a concrete formula one can use to determine
+/// where the slots for the implicit group of any pattern are, there is no
+/// general formula for determining where the slots for explicit capturing
+/// groups are. This is because each pattern can contain a different number
+/// of groups.
+///
+/// The intended way of getting the slots for a particular capturing group
+/// (whether implicit or explicit) is via the [`GroupInfo::slot`] or
+/// [`GroupInfo::slots`] method.
+///
+/// See below for a concrete example of how capturing groups get mapped to
+/// slots.
+///
+/// # Example
+///
+/// This example shows how to build a new `GroupInfo` and query it for
+/// information.
+///
+/// ```
+/// use regex_automata::util::{captures::GroupInfo, primitives::PatternID};
+///
+/// let info = GroupInfo::new(vec![
+/// vec![None, Some("foo")],
+/// vec![None],
+/// vec![None, None, None, Some("bar"), None],
+/// vec![None, None, Some("foo")],
+/// ])?;
+/// // The number of patterns being tracked.
+/// assert_eq!(4, info.pattern_len());
+/// // We can query the number of groups for any pattern.
+/// assert_eq!(2, info.group_len(PatternID::must(0)));
+/// assert_eq!(1, info.group_len(PatternID::must(1)));
+/// assert_eq!(5, info.group_len(PatternID::must(2)));
+/// assert_eq!(3, info.group_len(PatternID::must(3)));
+/// // An invalid pattern always has zero groups.
+/// assert_eq!(0, info.group_len(PatternID::must(999)));
+/// // 2 slots per group
+/// assert_eq!(22, info.slot_len());
+///
+/// // We can map a group index for a particular pattern to its name, if
+/// // one exists.
+/// assert_eq!(Some("foo"), info.to_name(PatternID::must(3), 2));
+/// assert_eq!(None, info.to_name(PatternID::must(2), 4));
+/// // Or map a name to its group index.
+/// assert_eq!(Some(1), info.to_index(PatternID::must(0), "foo"));
+/// assert_eq!(Some(2), info.to_index(PatternID::must(3), "foo"));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: mapping from capture groups to slots
+///
+/// This example shows the specific mapping from capture group indices for
+/// each pattern to their corresponding slots. The slot values shown in this
+/// example are considered an API guarantee.
+///
+/// ```
+/// use regex_automata::util::{captures::GroupInfo, primitives::PatternID};
+///
+/// let info = GroupInfo::new(vec![
+/// vec![None, Some("foo")],
+/// vec![None],
+/// vec![None, None, None, Some("bar"), None],
+/// vec![None, None, Some("foo")],
+/// ])?;
+///
+/// // We first show the slots for each pattern's implicit group.
+/// assert_eq!(Some((0, 1)), info.slots(PatternID::must(0), 0));
+/// assert_eq!(Some((2, 3)), info.slots(PatternID::must(1), 0));
+/// assert_eq!(Some((4, 5)), info.slots(PatternID::must(2), 0));
+/// assert_eq!(Some((6, 7)), info.slots(PatternID::must(3), 0));
+///
+/// // And now we show the slots for each pattern's explicit group.
+/// assert_eq!(Some((8, 9)), info.slots(PatternID::must(0), 1));
+/// assert_eq!(Some((10, 11)), info.slots(PatternID::must(2), 1));
+/// assert_eq!(Some((12, 13)), info.slots(PatternID::must(2), 2));
+/// assert_eq!(Some((14, 15)), info.slots(PatternID::must(2), 3));
+/// assert_eq!(Some((16, 17)), info.slots(PatternID::must(2), 4));
+/// assert_eq!(Some((18, 19)), info.slots(PatternID::must(3), 1));
+/// assert_eq!(Some((20, 21)), info.slots(PatternID::must(3), 2));
+///
+/// // Asking for the slots for an invalid pattern ID or even for an invalid
+/// // group index for a specific pattern will return None. So for example,
+/// // you're guaranteed to not get the slots for a different pattern than the
+/// // one requested.
+/// assert_eq!(None, info.slots(PatternID::must(5), 0));
+/// assert_eq!(None, info.slots(PatternID::must(1), 1));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug, Default)]
+pub struct GroupInfo(Arc<GroupInfoInner>);
+
+impl GroupInfo {
+ /// Creates a new group info from a sequence of patterns, where each
+ /// sequence of patterns yields a sequence of possible group names. The
+ /// index of each pattern in the sequence corresponds to its `PatternID`,
+ /// and the index of each group in each pattern's sequence corresponds to
+ /// its corresponding group index.
+ ///
+ /// While this constructor is very generic and therefore perhaps hard to
+ /// chew on, an example of a valid concrete type that can be passed to
+ /// this constructor is `Vec<Vec<Option<String>>>`. The outer `Vec`
+ /// corresponds to the patterns, i.e., one `Vec<Option<String>>` per
+ /// pattern. The inner `Vec` corresponds to the capturing groups for
+ /// each pattern. The `Option<String>` corresponds to the name of the
+ /// capturing group, if present.
+ ///
+ /// It is legal to pass an empty iterator to this constructor. It will
+ /// return an empty group info with zero slots. An empty group info is
+ /// useful for cases where you have no patterns or for cases where slots
+ /// aren't being used at all (e.g., for most DFAs in this crate).
+ ///
+ /// # Errors
+ ///
+ /// This constructor returns an error if the given capturing groups are
+ /// invalid in some way. Those reasons include, but are not necessarily
+ /// limited to:
+ ///
+ /// * Too many patterns (i.e., `PatternID` would overflow).
+ /// * Too many capturing groups (e.g., `u32` would overflow).
+ /// * A pattern is given that has no capturing groups. (All patterns must
+ /// have at least an implicit capturing group at index `0`.)
+ /// * The capturing group at index `0` has a name. It must be unnamed.
+ /// * There are duplicate capturing group names within the same pattern.
+ /// (Multiple capturing groups with the same name may exist, but they
+ /// must be in different patterns.)
+ ///
+ /// An example below shows how to trigger some of the above error
+ /// conditions.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a new `GroupInfo` and query it for
+ /// information.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// let info = GroupInfo::new(vec![
+ /// vec![None, Some("foo")],
+ /// vec![None],
+ /// vec![None, None, None, Some("bar"), None],
+ /// vec![None, None, Some("foo")],
+ /// ])?;
+ /// // The number of patterns being tracked.
+ /// assert_eq!(4, info.pattern_len());
+ /// // 2 slots per group
+ /// assert_eq!(22, info.slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: empty `GroupInfo`
+ ///
+    /// This example shows how to build an empty `GroupInfo` and query it for
+ /// information.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// let info = GroupInfo::empty();
+ /// // Everything is zero.
+ /// assert_eq!(0, info.pattern_len());
+ /// assert_eq!(0, info.slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: error conditions
+ ///
+ /// This example shows how to provoke some of the ways in which building
+ /// a `GroupInfo` can fail.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// // Either the group info is empty, or all patterns must have at least
+ /// // one capturing group.
+ /// assert!(GroupInfo::new(vec![
+ /// vec![None, Some("a")], // ok
+ /// vec![None], // ok
+ /// vec![], // not ok
+ /// ]).is_err());
+ /// // Note that building an empty group info is OK.
+ /// assert!(GroupInfo::new(Vec::<Vec<Option<String>>>::new()).is_ok());
+ ///
+ /// // The first group in each pattern must correspond to an implicit
+    /// // anonymous group, i.e., one that is not named. By convention, this
+ /// // group corresponds to the overall match of a regex. Every other group
+ /// // in a pattern is explicit and optional.
+ /// assert!(GroupInfo::new(vec![vec![Some("foo")]]).is_err());
+ ///
+ /// // There must not be duplicate group names within the same pattern.
+ /// assert!(GroupInfo::new(vec![
+ /// vec![None, Some("foo"), Some("foo")],
+ /// ]).is_err());
+ /// // But duplicate names across distinct patterns is OK.
+ /// assert!(GroupInfo::new(vec![
+ /// vec![None, Some("foo")],
+ /// vec![None, Some("foo")],
+ /// ]).is_ok());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+    /// There are other ways in which building a `GroupInfo` can fail, but
+    /// they are difficult to demonstrate. For example, construction fails if
+    /// the number of patterns given would overflow `PatternID`.
+ pub fn new<P, G, N>(pattern_groups: P) -> Result<GroupInfo, GroupInfoError>
+ where
+ P: IntoIterator<Item = G>,
+ G: IntoIterator<Item = Option<N>>,
+ N: AsRef<str>,
+ {
+ let mut group_info = GroupInfoInner {
+ slot_ranges: vec![],
+ name_to_index: vec![],
+ index_to_name: vec![],
+ memory_extra: 0,
+ };
+ for (pattern_index, groups) in pattern_groups.into_iter().enumerate() {
+ // If we can't convert the pattern index to an ID, then the caller
+ // tried to build capture info for too many patterns.
+ let pid = PatternID::new(pattern_index)
+ .map_err(GroupInfoError::too_many_patterns)?;
+
+ let mut groups_iter = groups.into_iter().enumerate();
+ match groups_iter.next() {
+ None => return Err(GroupInfoError::missing_groups(pid)),
+ Some((_, Some(_))) => {
+ return Err(GroupInfoError::first_must_be_unnamed(pid))
+ }
+ Some((_, None)) => {}
+ }
+ group_info.add_first_group(pid);
+ // Now iterate over the rest, which correspond to all of the
+ // (conventionally) explicit capture groups in a regex pattern.
+ for (group_index, maybe_name) in groups_iter {
+ // Just like for patterns, if the group index can't be
+ // converted to a "small" index, then the caller has given too
+ // many groups for a particular pattern.
+ let group = SmallIndex::new(group_index).map_err(|_| {
+ GroupInfoError::too_many_groups(pid, group_index)
+ })?;
+ group_info.add_explicit_group(pid, group, maybe_name)?;
+ }
+ }
+ group_info.fixup_slot_ranges()?;
+ Ok(GroupInfo(Arc::new(group_info)))
+ }
+
+ /// This creates an empty `GroupInfo`.
+ ///
+ /// This is a convenience routine for calling `GroupInfo::new` with an
+ /// iterator that yields no elements.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a new empty `GroupInfo` and query it
+ /// for information.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// let info = GroupInfo::empty();
+ /// // Everything is zero.
+ /// assert_eq!(0, info.pattern_len());
+ /// assert_eq!(0, info.all_group_len());
+ /// assert_eq!(0, info.slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn empty() -> GroupInfo {
+ GroupInfo::new(core::iter::empty::<[Option<&str>; 0]>())
+ .expect("empty group info is always valid")
+ }
+
+ /// Return the capture group index corresponding to the given name in the
+ /// given pattern. If no such capture group name exists in the given
+ /// pattern, then this returns `None`.
+ ///
+ /// If the given pattern ID is invalid, then this returns `None`.
+ ///
+ /// This also returns `None` for all inputs if these captures are empty
+ /// (e.g., built from an empty [`GroupInfo`]). To check whether captures
+    /// are present for a specific pattern, use [`GroupInfo::group_len`].
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find the capture index for the given pattern
+ /// and group name.
+ ///
+ /// Remember that capture indices are relative to the pattern, such that
+ /// the same capture index value may refer to different capturing groups
+ /// for distinct patterns.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let (pid0, pid1) = (PatternID::must(0), PatternID::must(1));
+ ///
+ /// let nfa = NFA::new_many(&[
+ /// r"a(?P<quux>\w+)z(?P<foo>\s+)",
+ /// r"a(?P<foo>\d+)z",
+ /// ])?;
+ /// let groups = nfa.group_info();
+ /// assert_eq!(Some(2), groups.to_index(pid0, "foo"));
+ /// // Recall that capture index 0 is always unnamed and refers to the
+ /// // entire pattern. So the first capturing group present in the pattern
+ /// // itself always starts at index 1.
+ /// assert_eq!(Some(1), groups.to_index(pid1, "foo"));
+ ///
+ /// // And if a name does not exist for a particular pattern, None is
+ /// // returned.
+ /// assert!(groups.to_index(pid0, "quux").is_some());
+ /// assert!(groups.to_index(pid1, "quux").is_none());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn to_index(&self, pid: PatternID, name: &str) -> Option<usize> {
+ let indices = self.0.name_to_index.get(pid.as_usize())?;
+ indices.get(name).cloned().map(|i| i.as_usize())
+ }
+
+ /// Return the capture name for the given index and given pattern. If the
+ /// corresponding group does not have a name, then this returns `None`.
+ ///
+ /// If the pattern ID is invalid, then this returns `None`.
+ ///
+ /// If the group index is invalid for the given pattern, then this returns
+    /// `None`. A group `index` is valid for a pattern `pid` if and only if
+    /// `index < group_info.group_len(pid)`.
+ ///
+ /// This also returns `None` for all inputs if these captures are empty
+ /// (e.g., built from an empty [`GroupInfo`]). To check whether captures
+    /// are present for a specific pattern, use [`GroupInfo::group_len`].
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find the capture group name for the given
+ /// pattern and group index.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let (pid0, pid1) = (PatternID::must(0), PatternID::must(1));
+ ///
+ /// let nfa = NFA::new_many(&[
+ /// r"a(?P<foo>\w+)z(\s+)x(\d+)",
+ /// r"a(\d+)z(?P<foo>\s+)",
+ /// ])?;
+ /// let groups = nfa.group_info();
+ /// assert_eq!(None, groups.to_name(pid0, 0));
+ /// assert_eq!(Some("foo"), groups.to_name(pid0, 1));
+ /// assert_eq!(None, groups.to_name(pid0, 2));
+ /// assert_eq!(None, groups.to_name(pid0, 3));
+ ///
+ /// assert_eq!(None, groups.to_name(pid1, 0));
+ /// assert_eq!(None, groups.to_name(pid1, 1));
+ /// assert_eq!(Some("foo"), groups.to_name(pid1, 2));
+ /// // '3' is not a valid capture index for the second pattern.
+ /// assert_eq!(None, groups.to_name(pid1, 3));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn to_name(&self, pid: PatternID, group_index: usize) -> Option<&str> {
+ let pattern_names = self.0.index_to_name.get(pid.as_usize())?;
+ pattern_names.get(group_index)?.as_deref()
+ }
+
+ /// Return an iterator of all capture groups and their names (if present)
+ /// for a particular pattern.
+ ///
+ /// If the given pattern ID is invalid or if this `GroupInfo` is empty,
+ /// then the iterator yields no elements.
+ ///
+ /// The number of elements yielded by this iterator is always equal to
+ /// the result of calling [`GroupInfo::group_len`] with the same
+ /// `PatternID`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to get a list of all capture group names for
+ /// a particular pattern.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?;
+    /// // The first is the implicit group that is always unnamed. The next
+ /// // 5 groups are the explicit groups found in the concrete syntax above.
+ /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")];
+ /// let got: Vec<Option<&str>> =
+ /// nfa.group_info().pattern_names(PatternID::ZERO).collect();
+ /// assert_eq!(expected, got);
+ ///
+ /// // Using an invalid pattern ID will result in nothing yielded.
+ /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count();
+ /// assert_eq!(0, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn pattern_names(&self, pid: PatternID) -> GroupInfoPatternNames<'_> {
+ GroupInfoPatternNames {
+ it: self
+ .0
+ .index_to_name
+ .get(pid.as_usize())
+ .map(|indices| indices.iter())
+ .unwrap_or([].iter()),
+ }
+ }
+
+ /// Return an iterator of all capture groups for all patterns supported by
+ /// this `GroupInfo`. Each item yielded is a triple of the group's pattern
+ /// ID, index in the pattern and the group's name, if present.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to get a list of all capture groups found in
+ /// one NFA, potentially spanning multiple patterns.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let nfa = NFA::new_many(&[
+ /// r"(?P<foo>a)",
+ /// r"a",
+ /// r"(a)",
+ /// ])?;
+ /// let expected = vec![
+ /// (PatternID::must(0), 0, None),
+ /// (PatternID::must(0), 1, Some("foo")),
+ /// (PatternID::must(1), 0, None),
+ /// (PatternID::must(2), 0, None),
+ /// (PatternID::must(2), 1, None),
+ /// ];
+ /// let got: Vec<(PatternID, usize, Option<&str>)> =
+ /// nfa.group_info().all_names().collect();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Unlike other capturing group related routines, this routine doesn't
+ /// panic even if captures aren't enabled on this NFA:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
+ ///
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build_many(&[
+ /// r"(?P<foo>a)",
+ /// r"a",
+ /// r"(a)",
+ /// ])?;
+ /// // When captures aren't enabled, there's nothing to return.
+ /// assert_eq!(0, nfa.group_info().all_names().count());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn all_names(&self) -> GroupInfoAllNames<'_> {
+ GroupInfoAllNames {
+ group_info: self,
+ pids: PatternID::iter(self.pattern_len()),
+ current_pid: None,
+ names: None,
+ }
+ }
+
+ /// Returns the starting and ending slot corresponding to the given
+ /// capturing group for the given pattern. The ending slot is always one
+ /// more than the starting slot returned.
+ ///
+ /// Note that this is like [`GroupInfo::slot`], except that it also returns
+ /// the ending slot value for convenience.
+ ///
+ /// If either the pattern ID or the capture index is invalid, then this
+ /// returns None.
+ ///
+ /// # Example
+ ///
+ /// This example shows that the starting slots for the first capturing
+ /// group of each pattern are distinct.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let nfa = NFA::new_many(&["a", "b"])?;
+ /// assert_ne!(
+ /// nfa.group_info().slots(PatternID::must(0), 0),
+ /// nfa.group_info().slots(PatternID::must(1), 0),
+ /// );
+ ///
+ /// // Also, the start and end slot values are never equivalent.
+ /// let (start, end) = nfa.group_info().slots(PatternID::ZERO, 0).unwrap();
+ /// assert_ne!(start, end);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn slots(
+ &self,
+ pid: PatternID,
+ group_index: usize,
+ ) -> Option<(usize, usize)> {
+        // Since 'slot' only ever returns valid starting slots, we know that
+ // there must also be an end slot and that end slot is always one more
+ // than the start slot.
+ self.slot(pid, group_index).map(|start| (start, start + 1))
+ }
+
+ /// Returns the starting slot corresponding to the given capturing group
+ /// for the given pattern. The ending slot is always one more than the
+ /// value returned.
+ ///
+ /// If either the pattern ID or the capture index is invalid, then this
+ /// returns None.
+ ///
+ /// # Example
+ ///
+ /// This example shows that the starting slots for the first capturing
+ /// group of each pattern are distinct.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let nfa = NFA::new_many(&["a", "b"])?;
+ /// assert_ne!(
+ /// nfa.group_info().slot(PatternID::must(0), 0),
+ /// nfa.group_info().slot(PatternID::must(1), 0),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
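+    ///
+    /// A second, concrete sketch (added for illustration) showing the slot
+    /// values for a single pattern, per the mapping documented on
+    /// [`GroupInfo`]:
+    ///
+    /// ```
+    /// use regex_automata::{nfa::thompson::NFA, PatternID};
+    ///
+    /// let nfa = NFA::new(r"(a)(b)")?;
+    /// let info = nfa.group_info();
+    /// // The implicit group of pattern 0 starts at slot 0, and explicit
+    /// // groups begin after the implicit slots of all patterns.
+    /// assert_eq!(Some(0), info.slot(PatternID::ZERO, 0));
+    /// assert_eq!(Some(2), info.slot(PatternID::ZERO, 1));
+    /// assert_eq!(Some(4), info.slot(PatternID::ZERO, 2));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```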
+ #[inline]
+ pub fn slot(&self, pid: PatternID, group_index: usize) -> Option<usize> {
+ if group_index >= self.group_len(pid) {
+ return None;
+ }
+ // At this point, we know that 'pid' refers to a real pattern and that
+ // 'group_index' refers to a real group. We therefore also know that
+ // the pattern and group can be combined to return a correct slot.
+ // That's why we don't need to use checked arithmetic below.
+ if group_index == 0 {
+ Some(pid.as_usize() * 2)
+ } else {
+ // As above, we don't need to check that our slot is less than the
+ // end of our range since we already know the group index is a
+ // valid index for the given pattern.
+ let (start, _) = self.0.slot_ranges[pid];
+ Some(start.as_usize() + ((group_index - 1) * 2))
+ }
+ }
+
+ /// Returns the total number of patterns in this `GroupInfo`.
+ ///
+ /// This may return zero if the `GroupInfo` was constructed with no
+ /// patterns.
+ ///
+ /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because
+ /// `GroupInfo` construction will fail if too many patterns are added.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+ /// assert_eq!(3, nfa.group_info().pattern_len());
+ ///
+ /// let nfa = NFA::never_match();
+ /// assert_eq!(0, nfa.group_info().pattern_len());
+ ///
+ /// let nfa = NFA::always_match();
+ /// assert_eq!(1, nfa.group_info().pattern_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn pattern_len(&self) -> usize {
+ self.0.pattern_len()
+ }
+
+ /// Return the number of capture groups in a pattern.
+ ///
+ /// If the pattern ID is invalid, then this returns `0`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how the values returned by this routine may vary
+ /// for different patterns and NFA configurations.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID};
+ ///
+ /// let nfa = NFA::new(r"(a)(b)(c)")?;
+ /// // There are 3 explicit groups in the pattern's concrete syntax and
+ /// // 1 unnamed and implicit group spanning the entire pattern.
+ /// assert_eq!(4, nfa.group_info().group_len(PatternID::ZERO));
+ ///
+ /// let nfa = NFA::new(r"abc")?;
+ /// // There is just the unnamed implicit group.
+ /// assert_eq!(1, nfa.group_info().group_len(PatternID::ZERO));
+ ///
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"abc")?;
+ /// // We disabled capturing groups, so there are none.
+ /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO));
+ ///
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"(a)(b)(c)")?;
+ /// // We disabled capturing groups, so there are none, even if there are
+ /// // explicit groups in the concrete syntax.
+ /// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn group_len(&self, pid: PatternID) -> usize {
+ self.0.group_len(pid)
+ }
+
+ /// Return the total number of capture groups across all patterns.
+ ///
+ /// This includes implicit groups that represent the entire match of a
+ /// pattern.
+ ///
+ /// # Example
+ ///
+ /// This example shows how the values returned by this routine may vary
+ /// for different patterns and NFA configurations.
+ ///
+ /// ```
+    /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
+ ///
+ /// let nfa = NFA::new(r"(a)(b)(c)")?;
+ /// // There are 3 explicit groups in the pattern's concrete syntax and
+ /// // 1 unnamed and implicit group spanning the entire pattern.
+ /// assert_eq!(4, nfa.group_info().all_group_len());
+ ///
+ /// let nfa = NFA::new(r"abc")?;
+ /// // There is just the unnamed implicit group.
+ /// assert_eq!(1, nfa.group_info().all_group_len());
+ ///
+ /// let nfa = NFA::new_many(&["(a)", "b", "(c)"])?;
+    /// // Each pattern has one implicit group, and two of the
+    /// // patterns have one explicit group each.
+ /// assert_eq!(5, nfa.group_info().all_group_len());
+ ///
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"abc")?;
+ /// // We disabled capturing groups, so there are none.
+ /// assert_eq!(0, nfa.group_info().all_group_len());
+ ///
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"(a)(b)(c)")?;
+ /// // We disabled capturing groups, so there are none, even if there are
+ /// // explicit groups in the concrete syntax.
+    /// assert_eq!(0, nfa.group_info().all_group_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn all_group_len(&self) -> usize {
+ self.slot_len() / 2
+ }
+
+ /// Returns the total number of slots in this `GroupInfo` across all
+ /// patterns.
+ ///
+ /// The total number of slots is always twice the total number of capturing
+ /// groups, including both implicit and explicit groups.
+ ///
+ /// # Example
+ ///
+ /// This example shows the relationship between the number of capturing
+ /// groups and slots.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+ /// // There are 11 total groups here.
+ /// let info = GroupInfo::new(vec![
+ /// vec![None, Some("foo")],
+ /// vec![None],
+ /// vec![None, None, None, Some("bar"), None],
+ /// vec![None, None, Some("foo")],
+ /// ])?;
+ /// // 2 slots per group gives us 11*2=22 slots.
+ /// assert_eq!(22, info.slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn slot_len(&self) -> usize {
+ self.0.small_slot_len().as_usize()
+ }
+
+ /// Returns the total number of slots for implicit capturing groups.
+ ///
+ /// This is like [`GroupInfo::slot_len`], except it doesn't include the
+ /// explicit slots for each pattern. Since there are always exactly 2
+ /// implicit slots for each pattern, the number of implicit slots is always
+ /// equal to twice the number of patterns.
+ ///
+ /// # Example
+ ///
+ /// This example shows the relationship between the number of capturing
+ /// groups, implicit slots and explicit slots.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+    /// // There are 3 total groups here: 1 implicit and 2 explicit.
+    /// let info = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?;
+    /// // 2 slots per group gives us 3*2=6 slots.
+ /// assert_eq!(6, info.slot_len());
+ /// // 2 implicit slots per pattern gives us 2 implicit slots since there
+ /// // is 1 pattern.
+ /// assert_eq!(2, info.implicit_slot_len());
+ /// // 2 explicit capturing groups gives us 2*2=4 explicit slots.
+ /// assert_eq!(4, info.explicit_slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn implicit_slot_len(&self) -> usize {
+ self.pattern_len() * 2
+ }
+
+ /// Returns the total number of slots for explicit capturing groups.
+ ///
+ /// This is like [`GroupInfo::slot_len`], except it doesn't include the
+ /// implicit slots for each pattern. (There are always 2 implicit slots for
+ /// each pattern.)
+ ///
+ /// For a non-empty `GroupInfo`, it is always the case that `slot_len` is
+ /// strictly greater than `explicit_slot_len`. For an empty `GroupInfo`,
+ /// both the total number of slots and the number of explicit slots is
+ /// `0`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the relationship between the number of capturing
+ /// groups, implicit slots and explicit slots.
+ ///
+ /// ```
+ /// use regex_automata::util::captures::GroupInfo;
+ ///
+    /// // There are 3 total groups here: 1 implicit and 2 explicit.
+    /// let info = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?;
+    /// // 2 slots per group gives us 3*2=6 slots.
+ /// assert_eq!(6, info.slot_len());
+ /// // 2 implicit slots per pattern gives us 2 implicit slots since there
+ /// // is 1 pattern.
+ /// assert_eq!(2, info.implicit_slot_len());
+ /// // 2 explicit capturing groups gives us 2*2=4 explicit slots.
+ /// assert_eq!(4, info.explicit_slot_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn explicit_slot_len(&self) -> usize {
+ self.slot_len().saturating_sub(self.implicit_slot_len())
+ }
+
+ /// Returns the memory usage, in bytes, of this `GroupInfo`.
+ ///
+ /// This does **not** include the stack size used up by this `GroupInfo`.
+ /// To compute that, use `std::mem::size_of::<GroupInfo>()`.
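+    ///
+    /// # Example
+    ///
+    /// A minimal sketch, added here for illustration, showing that the
+    /// reported heap usage grows as named capturing groups are added:
+    ///
+    /// ```
+    /// use regex_automata::util::captures::GroupInfo;
+    ///
+    /// let small = GroupInfo::new(vec![vec![None::<&str>]])?;
+    /// let big = GroupInfo::new(vec![vec![None, Some("a_rather_long_name")]])?;
+    /// // Both track one pattern, but the named group in 'big' costs
+    /// // additional heap memory for the name maps.
+    /// assert!(big.memory_usage() > small.memory_usage());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```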
+ #[inline]
+ pub fn memory_usage(&self) -> usize {
+ use core::mem::size_of as s;
+
+ s::<GroupInfoInner>()
+ + self.0.slot_ranges.len() * s::<(SmallIndex, SmallIndex)>()
+ + self.0.name_to_index.len() * s::<CaptureNameMap>()
+ + self.0.index_to_name.len() * s::<Vec<Option<Arc<str>>>>()
+ + self.0.memory_extra
+ }
+}
+
+/// A map from capture group name to its corresponding capture group index.
+///
+/// This type is actually wrapped inside a Vec indexed by pattern ID on a
+/// `GroupInfo`, since multiple patterns may have the same capture group name.
+/// That is, each pattern gets its own namespace of capture group names.
+///
+/// Perhaps a more memory efficient representation would be
+/// `HashMap<(PatternID, Arc<str>), usize>`, but this makes it difficult to
+/// look up a capture index by name without producing an `Arc<str>`, which
+/// requires an allocation. To fix this, I think we'd need to define our own
+/// unsized type or something? Anyway, I didn't give this much thought since it
+/// probably doesn't matter much in the grand scheme of things. But it did
+/// stand out to me as mildly wasteful.
+#[cfg(feature = "std")]
+type CaptureNameMap = std::collections::HashMap<Arc<str>, SmallIndex>;
+#[cfg(not(feature = "std"))]
+type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, SmallIndex>;
+
+/// The inner guts of `GroupInfo`. This type only exists so that it can
+/// be wrapped in an `Arc` to make `GroupInfo` reference counted.
+#[derive(Debug, Default)]
+struct GroupInfoInner {
+ slot_ranges: Vec<(SmallIndex, SmallIndex)>,
+ name_to_index: Vec<CaptureNameMap>,
+ index_to_name: Vec<Vec<Option<Arc<str>>>>,
+ memory_extra: usize,
+}
+
+impl GroupInfoInner {
+ /// This adds the first unnamed group for the given pattern ID. The given
+ /// pattern ID must be zero if this is the first time this method is
+ /// called, or must be exactly one more than the pattern ID supplied to the
+ /// previous call to this method. (This method panics if this rule is
+ /// violated.)
+ ///
+ /// This can be thought of as initializing the GroupInfo state for the
+ /// given pattern and closing off the state for any previous pattern.
+ fn add_first_group(&mut self, pid: PatternID) {
+ assert_eq!(pid.as_usize(), self.slot_ranges.len());
+ assert_eq!(pid.as_usize(), self.name_to_index.len());
+ assert_eq!(pid.as_usize(), self.index_to_name.len());
+ // This is the start of our slots for the explicit capturing groups.
+ // Note that since the slots for the 0th group for every pattern appear
+ // before any slots for the nth group (where n > 0) in any pattern, we
+ // will have to fix up the slot ranges once we know how many patterns
+ // we've added capture groups for.
+ let slot_start = self.small_slot_len();
+ self.slot_ranges.push((slot_start, slot_start));
+ self.name_to_index.push(CaptureNameMap::new());
+ self.index_to_name.push(vec![None]);
+ self.memory_extra += core::mem::size_of::<Option<Arc<str>>>();
+ }
+
+ /// Add an explicit capturing group for the given pattern with the given
+ /// index. If the group has a name, then that must be given as well.
+ ///
+ /// Note that every capturing group except for the first or zeroth group is
+ /// explicit.
+ ///
+ /// This returns an error if adding this group would result in overflowing
+ /// slot indices or if a capturing group with the same name for this
+ /// pattern has already been added.
+ fn add_explicit_group<N: AsRef<str>>(
+ &mut self,
+ pid: PatternID,
+ group: SmallIndex,
+ maybe_name: Option<N>,
+ ) -> Result<(), GroupInfoError> {
+ // We also need to check that the slot index generated for
+ // this group is also valid. Although, this is a little weird
+ // because we offset these indices below, at which point, we'll
+ // have to recheck them. Gosh this is annoying. Note that
+ // the '+2' below is OK because 'end' is guaranteed to be less
+ // than isize::MAX.
+ let end = &mut self.slot_ranges[pid].1;
+ *end = SmallIndex::new(end.as_usize() + 2).map_err(|_| {
+ GroupInfoError::too_many_groups(pid, group.as_usize())
+ })?;
+ if let Some(name) = maybe_name {
+ let name = Arc::<str>::from(name.as_ref());
+ if self.name_to_index[pid].contains_key(&*name) {
+ return Err(GroupInfoError::duplicate(pid, &name));
+ }
+ let len = name.len();
+ self.name_to_index[pid].insert(Arc::clone(&name), group);
+ self.index_to_name[pid].push(Some(name));
+ // Adds the memory used by the Arc<str> in both maps.
+ self.memory_extra +=
+ 2 * (len + core::mem::size_of::<Option<Arc<str>>>());
+ // And also the value entry for the 'name_to_index' map.
+ // This is probably an underestimate for 'name_to_index' since
+ // hashmaps/btrees likely have some non-zero overhead, but we
+ // assume here that they have zero overhead.
+ self.memory_extra += core::mem::size_of::<SmallIndex>();
+ } else {
+ self.index_to_name[pid].push(None);
+ self.memory_extra += core::mem::size_of::<Option<Arc<str>>>();
+ }
+ // This is a sanity assert that checks that our group index
+ // is in line with the number of groups added so far for this
+ // pattern.
+ assert_eq!(group.one_more(), self.group_len(pid));
+ // And is also in line with the 'index_to_name' map.
+ assert_eq!(group.one_more(), self.index_to_name[pid].len());
+ Ok(())
+ }
+
+ /// This corrects the slot ranges to account for the slots corresponding
+ /// to the zeroth group of each pattern. That is, every slot range is
+ /// offset by 'pattern_len() * 2', since each pattern uses two slots to
+ /// represent the zeroth group.
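+ ///
+ /// As a worked example (illustrative only): with two patterns, where
+ /// pattern 0 has one explicit group and pattern 1 has none, the ranges are
+ /// built as `(0, 2)` and `(2, 2)`. The implicit slots `0..4` (two per
+ /// pattern) must come first, so this fixup shifts both ranges by `2*2=4`,
+ /// yielding `(4, 6)` and `(6, 6)`. The final layout is then: slots `0,1`
+ /// for pattern 0's group 0, slots `2,3` for pattern 1's group 0 and slots
+ /// `4,5` for pattern 0's explicit group. Note how `group_len` recovers the
+ /// group count from a fixed-up range: `1 + ((6-4)/2) = 2` for pattern 0.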
+ fn fixup_slot_ranges(&mut self) -> Result<(), GroupInfoError> {
+ use crate::util::primitives::IteratorIndexExt;
+ // Since we know number of patterns fits in PatternID and
+ // PatternID::MAX < isize::MAX, it follows that multiplying by 2 will
+ // never overflow usize.
+ let offset = self.pattern_len().checked_mul(2).unwrap();
+ for (pid, &mut (ref mut start, ref mut end)) in
+ self.slot_ranges.iter_mut().with_pattern_ids()
+ {
+ let group_len = 1 + ((end.as_usize() - start.as_usize()) / 2);
+ let new_end = match end.as_usize().checked_add(offset) {
+ Some(new_end) => new_end,
+ None => {
+ return Err(GroupInfoError::too_many_groups(
+ pid, group_len,
+ ))
+ }
+ };
+ *end = SmallIndex::new(new_end).map_err(|_| {
+ GroupInfoError::too_many_groups(pid, group_len)
+ })?;
+ // Since start <= end, if end is valid then start must be too.
+ *start = SmallIndex::new(start.as_usize() + offset).unwrap();
+ }
+ Ok(())
+ }
+
+ /// Return the total number of patterns represented by this capture slot
+ /// info.
+ fn pattern_len(&self) -> usize {
+ self.slot_ranges.len()
+ }
+
+ /// Return the total number of capturing groups for the given pattern. If
+ /// the given pattern isn't valid for this capture slot info, then 0 is
+ /// returned.
+ fn group_len(&self, pid: PatternID) -> usize {
+ let (start, end) = match self.slot_ranges.get(pid.as_usize()) {
+ None => return 0,
+ Some(range) => range,
+ };
+ // The difference between any two SmallIndex values always fits in a
+ // usize since we know that SmallIndex::MAX <= isize::MAX-1. We also
+ // know that start<=end by construction and that the number of groups
+ // never exceeds SmallIndex and thus never overflows usize.
+ 1 + ((end.as_usize() - start.as_usize()) / 2)
+ }
+
+ /// Return the total number of slots in this capture slot info as a
+ /// "small index."
+ fn small_slot_len(&self) -> SmallIndex {
+ // Since slots are allocated in order of pattern (starting at 0) and
+ // then in order of capture group, it follows that the number of slots
+ // is the end of the range of slots for the last pattern. This is
+ // true even when the last pattern has no capturing groups, since
+ // 'slot_ranges' will still represent it explicitly with an empty
+ // range.
+ self.slot_ranges.last().map_or(SmallIndex::ZERO, |&(_, end)| end)
+ }
+}
+
+/// An error that may occur when building a `GroupInfo`.
+///
+/// Building a `GroupInfo` does a variety of checks to make sure the
+/// capturing groups satisfy a number of invariants. This includes, but is not
+/// limited to, ensuring that the first capturing group is unnamed and that
+/// there are no duplicate capture groups for a specific pattern.
+#[derive(Clone, Debug)]
+pub struct GroupInfoError {
+ kind: GroupInfoErrorKind,
+}
+
+/// The kind of error that occurs when building a `GroupInfo` fails.
+///
+/// We keep this un-exported because it's not clear how useful it is to
+/// export it.
+#[derive(Clone, Debug)]
+enum GroupInfoErrorKind {
+ /// This occurs when too many patterns have been added. i.e., It would
+ /// otherwise overflow a `PatternID`.
+ TooManyPatterns { err: PatternIDError },
+ /// This occurs when too many capturing groups have been added for a
+ /// particular pattern.
+ TooManyGroups {
+ /// The ID of the pattern that had too many groups.
+ pattern: PatternID,
+ /// The minimum number of groups that the caller has tried to add for
+ /// a pattern.
+ minimum: usize,
+ },
+ /// An error that occurs when a pattern has no capture groups. Either the
+ /// group info must be empty, or all patterns must have at least one group
+ /// (corresponding to the unnamed group for the entire pattern).
+ MissingGroups {
+ /// The ID of the pattern that had no capturing groups.
+ pattern: PatternID,
+ },
+ /// An error that occurs when one tries to provide a name for the capture
+ /// group at index 0. This capturing group must currently always be
+ /// unnamed.
+ FirstMustBeUnnamed {
+ /// The ID of the pattern that was found to have a named first
+ /// capturing group.
+ pattern: PatternID,
+ },
+ /// An error that occurs when duplicate capture group names for the same
+ /// pattern are added.
+ ///
+ /// NOTE: At time of writing, this error can never occur if you're using
+ /// regex-syntax, since the parser itself will reject patterns with
+ /// duplicate capture group names. This error can only occur when the
+ /// builder is used to hand construct NFAs.
+ Duplicate {
+ /// The pattern in which the duplicate capture group name was found.
+ pattern: PatternID,
+ /// The duplicate name.
+ name: String,
+ },
+}
+
+impl GroupInfoError {
+ fn too_many_patterns(err: PatternIDError) -> GroupInfoError {
+ GroupInfoError { kind: GroupInfoErrorKind::TooManyPatterns { err } }
+ }
+
+ fn too_many_groups(pattern: PatternID, minimum: usize) -> GroupInfoError {
+ GroupInfoError {
+ kind: GroupInfoErrorKind::TooManyGroups { pattern, minimum },
+ }
+ }
+
+ fn missing_groups(pattern: PatternID) -> GroupInfoError {
+ GroupInfoError { kind: GroupInfoErrorKind::MissingGroups { pattern } }
+ }
+
+ fn first_must_be_unnamed(pattern: PatternID) -> GroupInfoError {
+ GroupInfoError {
+ kind: GroupInfoErrorKind::FirstMustBeUnnamed { pattern },
+ }
+ }
+
+ fn duplicate(pattern: PatternID, name: &str) -> GroupInfoError {
+ GroupInfoError {
+ kind: GroupInfoErrorKind::Duplicate {
+ pattern,
+ name: String::from(name),
+ },
+ }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for GroupInfoError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ match self.kind {
+ GroupInfoErrorKind::TooManyPatterns { .. }
+ | GroupInfoErrorKind::TooManyGroups { .. }
+ | GroupInfoErrorKind::MissingGroups { .. }
+ | GroupInfoErrorKind::FirstMustBeUnnamed { .. }
+ | GroupInfoErrorKind::Duplicate { .. } => None,
+ }
+ }
+}
+
+impl core::fmt::Display for GroupInfoError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ use self::GroupInfoErrorKind::*;
+
+ match self.kind {
+ TooManyPatterns { ref err } => {
+ write!(f, "too many patterns to build capture info: {}", err)
+ }
+ TooManyGroups { pattern, minimum } => {
+ write!(
+ f,
+ "too many capture groups (at least {}) were \
+ found for pattern {}",
+ minimum,
+ pattern.as_usize()
+ )
+ }
+ MissingGroups { pattern } => write!(
+ f,
+ "no capturing groups found for pattern {} \
+ (either all patterns have zero groups or all patterns have \
+ at least one group)",
+ pattern.as_usize(),
+ ),
+ FirstMustBeUnnamed { pattern } => write!(
+ f,
+ "first capture group (at index 0) for pattern {} has a name \
+ (it must be unnamed)",
+ pattern.as_usize(),
+ ),
+ Duplicate { pattern, ref name } => write!(
+ f,
+ "duplicate capture group name '{}' found for pattern {}",
+ name,
+ pattern.as_usize(),
+ ),
+ }
+ }
+}
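+
+// For example (a hedged illustration via the public constructor), adding the
+// same name twice for one pattern surfaces as the `Duplicate` kind through
+// the `Display` impl above:
+//
+//     use regex_automata::util::captures::GroupInfo;
+//
+//     let err = GroupInfo::new(vec![vec![None, Some("x"), Some("x")]])
+//         .unwrap_err();
+//     assert!(err.to_string().contains("duplicate capture group name"));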
+
+/// An iterator over capturing groups and their names for a specific pattern.
+///
+/// This iterator is created by [`GroupInfo::pattern_names`].
+///
+/// The lifetime parameter `'a` refers to the lifetime of the `GroupInfo`
+/// from which this iterator was created.
+#[derive(Clone, Debug)]
+pub struct GroupInfoPatternNames<'a> {
+ it: core::slice::Iter<'a, Option<Arc<str>>>,
+}
+
+impl GroupInfoPatternNames<'static> {
+ fn empty() -> GroupInfoPatternNames<'static> {
+ GroupInfoPatternNames { it: [].iter() }
+ }
+}
+
+impl<'a> Iterator for GroupInfoPatternNames<'a> {
+ type Item = Option<&'a str>;
+
+ fn next(&mut self) -> Option<Option<&'a str>> {
+ self.it.next().map(|x| x.as_deref())
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+
+ fn count(self) -> usize {
+ self.it.count()
+ }
+}
+
+impl<'a> ExactSizeIterator for GroupInfoPatternNames<'a> {}
+impl<'a> core::iter::FusedIterator for GroupInfoPatternNames<'a> {}
+
+/// An iterator over capturing groups and their names for a `GroupInfo`.
+///
+/// This iterator is created by [`GroupInfo::all_names`].
+///
+/// The lifetime parameter `'a` refers to the lifetime of the `GroupInfo`
+/// from which this iterator was created.
+#[derive(Debug)]
+pub struct GroupInfoAllNames<'a> {
+ group_info: &'a GroupInfo,
+ pids: PatternIDIter,
+ current_pid: Option<PatternID>,
+ names: Option<core::iter::Enumerate<GroupInfoPatternNames<'a>>>,
+}
+
+impl<'a> Iterator for GroupInfoAllNames<'a> {
+ type Item = (PatternID, usize, Option<&'a str>);
+
+ fn next(&mut self) -> Option<(PatternID, usize, Option<&'a str>)> {
+ // If the group info has no captures, then we never have anything
+ // to yield. We need to consider this case explicitly (at time of
+ // writing) because 'pattern_capture_names' will panic if captures
+ // aren't enabled.
+ if self.group_info.0.index_to_name.is_empty() {
+ return None;
+ }
+ if self.current_pid.is_none() {
+ self.current_pid = Some(self.pids.next()?);
+ }
+ let pid = self.current_pid.unwrap();
+ if self.names.is_none() {
+ self.names = Some(self.group_info.pattern_names(pid).enumerate());
+ }
+ let (group_index, name) = match self.names.as_mut().unwrap().next() {
+ Some((group_index, name)) => (group_index, name),
+ None => {
+ self.current_pid = None;
+ self.names = None;
+ return self.next();
+ }
+ };
+ Some((pid, group_index, name))
+ }
+}
diff --git a/vendor/regex-automata/src/util/determinize/mod.rs b/vendor/regex-automata/src/util/determinize/mod.rs
index b384de8e1..30a82afb8 100644
--- a/vendor/regex-automata/src/util/determinize/mod.rs
+++ b/vendor/regex-automata/src/util/determinize/mod.rs
@@ -13,11 +13,9 @@ in common, as defined by this module:
word boundaries, line boundaries, etc., is all the same. This also includes
the look-behind assertions that are satisfied by each starting state
classification.
-
* The representation of DFA states as sets of NFA states, including
convenience types for building these DFA states that are amenable to reusing
allocations.
-
* Routines for the "classical" parts of determinization: computing the
epsilon closure, tracking match states (with corresponding pattern IDs, since
we support multi-pattern finite automata) and, of course, computing the
@@ -44,19 +42,21 @@ pub(crate) use self::state::{
};
use crate::{
- nfa::thompson::{self, Look, LookSet},
+ nfa::thompson,
util::{
alphabet,
- id::StateID,
- matchtypes::MatchKind,
+ look::{Look, LookSet},
+ primitives::StateID,
+ search::MatchKind,
sparse_set::{SparseSet, SparseSets},
start::Start,
+ utf8,
},
};
mod state;
-/// Compute the set of all eachable NFA states, including the full epsilon
+/// Compute the set of all reachable NFA states, including the full epsilon
/// closure, from a DFA state for a single unit of input. The set of reachable
/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
/// also includes any look-behind assertions satisfied by `unit`, in addition
@@ -100,6 +100,15 @@ pub(crate) fn next(
) -> StateBuilderNFA {
sparses.clear();
+ // Whether the NFA is matched in reverse or not. We use this in some
+ // conditional logic for dealing with the exceptionally annoying CRLF-aware
+ // line anchors.
+ let rev = nfa.is_reverse();
+ // The look-around matcher that our NFA is configured with. We don't
+ // actually use it to match look-around assertions, but we do need its
+ // configuration for constructing states consistent with how it matches.
+ let lookm = nfa.look_matcher();
+
// Put the NFA state IDs into a sparse set in case we need to
// re-compute their epsilon closure.
//
@@ -113,43 +122,66 @@ pub(crate) fn next(
sparses.set1.insert(nfa_id);
});
- // Compute look-ahead assertions originating from the current state.
- // Based on the input unit we're transitioning over, some additional
- // set of assertions may be true. Thus, we re-compute this state's
- // epsilon closure (but only if necessary).
+ // Compute look-ahead assertions originating from the current state. Based
+ // on the input unit we're transitioning over, some additional set of
+ // assertions may be true. Thus, we re-compute this state's epsilon closure
+ // (but only if necessary). Notably, when we build a DFA state initially,
+ // we don't enable any look-ahead assertions because we don't know whether
+ // they're true or not at that point.
if !state.look_need().is_empty() {
// Add look-ahead assertions that are now true based on the current
// input unit.
let mut look_have = state.look_have().clone();
match unit.as_u8() {
+ Some(b'\r') => {
+ if !rev || !state.is_half_crlf() {
+ look_have = look_have.insert(Look::EndCRLF);
+ }
+ }
Some(b'\n') => {
- look_have.insert(Look::EndLine);
+ if rev || !state.is_half_crlf() {
+ look_have = look_have.insert(Look::EndCRLF);
+ }
}
Some(_) => {}
None => {
- look_have.insert(Look::EndText);
- look_have.insert(Look::EndLine);
+ look_have = look_have.insert(Look::End);
+ look_have = look_have.insert(Look::EndLF);
+ look_have = look_have.insert(Look::EndCRLF);
}
}
+ if unit.is_byte(lookm.get_line_terminator()) {
+ look_have = look_have.insert(Look::EndLF);
+ }
+ if state.is_half_crlf()
+ && ((rev && !unit.is_byte(b'\r'))
+ || (!rev && !unit.is_byte(b'\n')))
+ {
+ look_have = look_have.insert(Look::StartCRLF);
+ }
if state.is_from_word() == unit.is_word_byte() {
- look_have.insert(Look::WordBoundaryUnicodeNegate);
- look_have.insert(Look::WordBoundaryAsciiNegate);
+ look_have = look_have.insert(Look::WordUnicodeNegate);
+ look_have = look_have.insert(Look::WordAsciiNegate);
} else {
- look_have.insert(Look::WordBoundaryUnicode);
- look_have.insert(Look::WordBoundaryAscii);
+ look_have = look_have.insert(Look::WordUnicode);
+ look_have = look_have.insert(Look::WordAscii);
}
// If we have new assertions satisfied that are among the set of
- // assertions that exist in this state (that is, just because
- // we added an EndLine assertion above doesn't mean there is an
- // EndLine conditional epsilon transition in this state), then we
- // re-compute this state's epsilon closure using the updated set of
- // assertions.
+ // assertions that exist in this state (that is, just because we added
+ // an EndLF assertion above doesn't mean there is an EndLF conditional
+ // epsilon transition in this state), then we re-compute this state's
+ // epsilon closure using the updated set of assertions.
+ //
+ // Note that since our DFA states omit unconditional epsilon
+ // transitions, this check is necessary for correctness. If we re-did
+ // the epsilon closure below needlessly, it could change based on the
+ // fact that we omitted epsilon states originally.
if !look_have
.subtract(state.look_have())
.intersect(state.look_need())
.is_empty()
{
- for nfa_id in &sparses.set1 {
+ for nfa_id in sparses.set1.iter() {
epsilon_closure(
nfa,
nfa_id,
@@ -166,24 +198,36 @@ pub(crate) fn next(
// Convert our empty builder into one that can record assertions and match
// pattern IDs.
let mut builder = empty_builder.into_matches();
- // Set whether the StartLine look-behind assertion is true for this
+ // Set whether the StartLF look-behind assertion is true for this
// transition or not. The look-behind assertion for ASCII word boundaries
// is handled below.
- if nfa.has_any_anchor() {
- if unit.as_u8().map_or(false, |b| b == b'\n') {
- // Why only handle StartLine here and not StartText? That's
- // because StartText can only impact the starting state, which
- // is speical cased in start state handling.
- builder.look_have().insert(Look::StartLine);
- }
+ if nfa.look_set_any().contains_anchor_line()
+ && unit.is_byte(lookm.get_line_terminator())
+ {
+ // Why only handle StartLF here and not Start? That's because Start
+ // can only impact the starting state, which is special cased in
+ // start state handling.
+ builder.set_look_have(|have| have.insert(Look::StartLF));
+ }
+ // We also need to add StartCRLF to our assertions too, if we can. This
+ // is unfortunately a bit more complicated, because it depends on the
+ // direction of the search. In the forward direction, ^ matches after a
+ // \n, but in the reverse direction, ^ only matches after a \r. (This is
+ // further complicated by the fact that reverse a regex means changing a ^
+ // to a $ and vice versa.)
+ if nfa.look_set_any().contains_anchor_crlf()
+ && ((rev && unit.is_byte(b'\r')) || (!rev && unit.is_byte(b'\n')))
+ {
+ builder.set_look_have(|have| have.insert(Look::StartCRLF));
}
- for nfa_id in &sparses.set1 {
+ for nfa_id in sparses.set1.iter() {
match *nfa.state(nfa_id) {
thompson::State::Union { .. }
+ | thompson::State::BinaryUnion { .. }
| thompson::State::Fail
| thompson::State::Look { .. }
| thompson::State::Capture { .. } => {}
- thompson::State::Match { id } => {
+ thompson::State::Match { pattern_id } => {
// Notice here that we are calling the NEW state a match
// state if the OLD state we are transitioning from
// contains an NFA match state. This is precisely how we
@@ -204,17 +248,25 @@ pub(crate) fn next(
 				// IDs in a set, we are guaranteed not to have any duplicative
// match states. Thus, it is impossible to add the same pattern
// ID more than once.
- builder.add_match_pattern_id(id);
+ //
+ // N.B. We delay matches by 1 byte as a way to hack 1-byte
+ // look-around into DFA searches. This lets us support ^, $
+ // and ASCII-only \b. The delay is also why we need a special
+ // "end-of-input" (EOI) sentinel and why we need to follow the
+ // EOI sentinel at the end of every search. This final EOI
+ // transition is necessary to report matches found at the end
+ // of a haystack.
+ builder.add_match_pattern_id(pattern_id);
if !match_kind.continue_past_first_match() {
break;
}
}
- thompson::State::Range { range: ref r } => {
- if r.matches_unit(unit) {
+ thompson::State::ByteRange { ref trans } => {
+ if trans.matches_unit(unit) {
epsilon_closure(
nfa,
- r.next,
- *builder.look_have(),
+ trans.next,
+ builder.look_have(),
stack,
&mut sparses.set2,
);
@@ -225,7 +277,18 @@ pub(crate) fn next(
epsilon_closure(
nfa,
next,
- *builder.look_have(),
+ builder.look_have(),
+ stack,
+ &mut sparses.set2,
+ );
+ }
+ }
+ thompson::State::Dense(ref dense) => {
+ if let Some(next) = dense.matches_unit(unit) {
+ epsilon_closure(
+ nfa,
+ next,
+ builder.look_have(),
stack,
&mut sparses.set2,
);
@@ -250,11 +313,15 @@ pub(crate) fn next(
// if one was detected once it enters a quit state (and indeed, the search
// routines in this crate do just that), but it seems better to prevent
// these things by construction if possible.)
- if nfa.has_word_boundary()
- && unit.is_word_byte()
- && !sparses.set2.is_empty()
- {
- builder.set_is_from_word();
+ if !sparses.set2.is_empty() {
+ if nfa.look_set_any().contains_word() && unit.is_word_byte() {
+ builder.set_is_from_word();
+ }
+ if nfa.look_set_any().contains_anchor_crlf()
+ && ((rev && unit.is_byte(b'\n')) || (!rev && unit.is_byte(b'\r')))
+ {
+ builder.set_is_half_crlf();
+ }
}
let mut builder_nfa = builder.into_nfa();
add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
@@ -303,8 +370,9 @@ pub(crate) fn epsilon_closure(
break;
}
match *nfa.state(id) {
- thompson::State::Range { .. }
+ thompson::State::ByteRange { .. }
| thompson::State::Sparse { .. }
+ | thompson::State::Dense { .. }
| thompson::State::Fail
| thompson::State::Match { .. } => break,
thompson::State::Look { look, next } => {
@@ -323,6 +391,10 @@ pub(crate) fn epsilon_closure(
// to the top of the stack.
stack.extend(alternates[1..].iter().rev());
}
+ thompson::State::BinaryUnion { alt1, alt2 } => {
+ id = alt1;
+ stack.push(alt2);
+ }
thompson::State::Capture { next, .. } => {
id = next;
}
@@ -336,15 +408,15 @@ pub(crate) fn epsilon_closure(
/// were added to `set`.
///
/// The DFA builder state given should already have its complete set of match
-/// pattern IDs added (if any) and any look-behind assertions (StartLine,
-/// StartText and whether this state is being generated for a transition over a
-/// word byte when applicable) that are true immediately prior to transitioning
-/// into this state (via `builder.look_have()`). The match pattern IDs should
-/// correspond to matches that occured on the previous transition, since all
-/// matches are delayed by one byte. The things that should _not_ be set are
-/// look-ahead assertions (EndLine, EndText and whether the next byte is a
-/// word byte or not). The builder state should also not have anything in
-/// `look_need` set, as this routine will compute that for you.
+/// pattern IDs added (if any) and any look-behind assertions (StartLF, Start
+/// and whether this state is being generated for a transition over a word byte
+/// when applicable) that are true immediately prior to transitioning into this
+/// state (via `builder.look_have()`). The match pattern IDs should correspond
+/// to matches that occurred on the previous transition, since all matches are
+/// delayed by one byte. The things that should _not_ be set are look-ahead
+/// assertions (EndLF, End and whether the next byte is a word byte or not).
+/// The builder state should also not have anything in `look_need` set, as this
+/// routine will compute that for you.
///
/// The given NFA should be able to resolve all identifiers in `set` to a
/// particular NFA state. Additionally, `set` must have capacity equivalent
@@ -354,56 +426,114 @@ pub(crate) fn add_nfa_states(
set: &SparseSet,
builder: &mut StateBuilderNFA,
) {
- for nfa_id in set {
+ for nfa_id in set.iter() {
match *nfa.state(nfa_id) {
- thompson::State::Range { .. } => {
+ thompson::State::ByteRange { .. } => {
builder.add_nfa_state_id(nfa_id);
}
thompson::State::Sparse { .. } => {
builder.add_nfa_state_id(nfa_id);
}
+ thompson::State::Dense { .. } => {
+ builder.add_nfa_state_id(nfa_id);
+ }
thompson::State::Look { look, .. } => {
builder.add_nfa_state_id(nfa_id);
- builder.look_need().insert(look);
+ builder.set_look_need(|need| need.insert(look));
}
thompson::State::Union { .. }
- | thompson::State::Capture { .. } => {
- // Pure epsilon transitions don't need to be tracked
- // as part of the DFA state. Tracking them is actually
- // superfluous; they won't cause any harm other than making
- // determinization slower.
+ | thompson::State::BinaryUnion { .. } => {
+ // Pure epsilon transitions don't need to be tracked as part
+ // of the DFA state. Tracking them is actually superfluous;
+ // they won't cause any harm other than making determinization
+ // slower.
//
// Why aren't these needed? Well, in an NFA, epsilon
- // transitions are really just jumping points to other
- // states. So once you hit an epsilon transition, the same
- // set of resulting states always appears. Therefore,
- // putting them in a DFA's set of ordered NFA states is
- // strictly redundant.
+ // transitions are really just jumping points to other states.
+ // So once you hit an epsilon transition, the same set of
+ // resulting states always appears. Therefore, putting them in
+ // a DFA's set of ordered NFA states is strictly redundant.
//
// Look-around states are also epsilon transitions, but
// they are *conditional*. So their presence could be
// discriminatory, and thus, they are tracked above.
//
- // But wait... why are epsilon states in our `set` in the
- // first place? Why not just leave them out? They're in
- // our `set` because it was generated by computing an
- // epsilon closure, and we want to keep track of all states
- // we visited to avoid re-visiting them. In exchange, we
- // have to do this second iteration over our collected
- // states to finalize our DFA state.
+ // But wait... why are epsilon states in our `set` in the first
+ // place? Why not just leave them out? They're in our `set`
+ // because it was generated by computing an epsilon closure,
+ // and we want to keep track of all states we visited to avoid
+ // re-visiting them. In exchange, we have to do this second
+ // iteration over our collected states to finalize our DFA
+ // state. In theory, we could avoid this second iteration if
+ // we maintained two sets during epsilon closure: the set of
+ // visited states (to avoid cycles) and the set of states that
+ // will actually be used to construct the next DFA state.
+ //
+ // Note that this optimization requires that we re-compute the
+ // epsilon closure to account for look-ahead in 'next' *only
+ // when necessary*. Namely, only when the set of look-around
+ // assertions changes and only when those changes are within
+ // the set of assertions that are needed in order to step
+ // through the closure correctly. Otherwise, if we re-do the
+ // epsilon closure needlessly, it could change based on the
+ // fact that we are omitting epsilon states here.
+ //
+ // -----
+ //
+ // Welp, scratch the above. It turns out that recording these
+ // is in fact necessary to seemingly handle one particularly
+ // annoying case: when a conditional epsilon transition is
+ // put inside of a repetition operator. One specific case I
+ // ran into was the regex `(?:\b|%)+` on the haystack `z%`.
+ // The correct leftmost first matches are: [0, 0] and [1, 1].
+ // But the DFA was reporting [0, 0] and [1, 2]. To understand
+ // why this happens, consider the NFA for the aforementioned
+ // regex:
//
- // Note that this optimization requires that we re-compute
- // the epsilon closure to account for look-ahead in 'next'
- // *only when necessary*. Namely, only when the set of
- // look-around assertions changes and only when those
- // changes are within the set of assertions that are
- // needed in order to step through the closure correctly.
- // Otherwise, if we re-do the epsilon closure needlessly,
- // it could change based on the fact that we are omitting
- // epsilon states here.
+ // >000000: binary-union(4, 1)
+ // 000001: \x00-\xFF => 0
+ // 000002: WordAscii => 5
+ // 000003: % => 5
+ // ^000004: binary-union(2, 3)
+ // 000005: binary-union(4, 6)
+ // 000006: MATCH(0)
+ //
+ // The problem here is that one of the DFA start states is
+ // going to consist of the NFA states [2, 3] by computing the
+ // epsilon closure of state 4. State 4 isn't included because
+ // we previously were not keeping track of union states. But
+ // only a subset of transitions out of this state will be able
+ // to follow WordAscii, and in those cases, the epsilon closure
+ // is redone. The only problem is that computing the epsilon
+ // closure from [2, 3] is different than computing the epsilon
+ // closure from [4]. In the former case, assuming the WordAscii
+ // assertion is satisfied, you get: [2, 3, 6]. In the latter
+ // case, you get: [2, 6, 3]. Notice that '6' is the match state
+ // and appears AFTER '3' in the former case. This leads to a
+ // preferential but incorrect match of '%' before returning
+ // a match. In the latter case, the match is preferred over
+ // continuing to accept the '%'.
+ //
+ // It almost feels like we might be able to fix the NFA states
+ // to avoid this, or to at least only keep track of union
+ // states where this actually matters, since in the vast
+ // majority of cases, this doesn't matter.
+ //
+ // Another alternative would be to define a new HIR property
+ // called "assertion is repeated anywhere" and compute it
+ // inductively over the entire pattern. If it happens anywhere,
+ // which is probably pretty rare, then we record union states.
+ // Otherwise we don't.
+ builder.add_nfa_state_id(nfa_id);
}
+ // Capture states we definitely do not need to record, since they
+ // are unconditional epsilon transitions with no branching.
+ thompson::State::Capture { .. } => {}
+ // It's not totally clear whether we need to record fail states or
+ // not, but we do so out of an abundance of caution. Since they are
+ // quite rare in practice, there isn't much cost to recording them.
thompson::State::Fail => {
- break;
+ builder.add_nfa_state_id(nfa_id);
}
thompson::State::Match { .. } => {
// Normally, the NFA match state doesn't actually need to
@@ -420,74 +550,61 @@ pub(crate) fn add_nfa_states(
// there's no reason to track which look-around assertions were
// satisfied when this state was created.
if builder.look_need().is_empty() {
- builder.look_have().clear();
+ builder.set_look_have(|_| LookSet::empty());
}
}
/// Sets the appropriate look-behind assertions on the given state based on
/// this starting configuration.
pub(crate) fn set_lookbehind_from_start(
+ nfa: &thompson::NFA,
start: &Start,
builder: &mut StateBuilderMatches,
) {
+ let rev = nfa.is_reverse();
+ let lineterm = nfa.look_matcher().get_line_terminator();
match *start {
Start::NonWordByte => {}
Start::WordByte => {
builder.set_is_from_word();
}
Start::Text => {
- builder.look_have().insert(Look::StartText);
- builder.look_have().insert(Look::StartLine);
+ builder.set_look_have(|have| {
+ have.insert(Look::Start)
+ .insert(Look::StartLF)
+ .insert(Look::StartCRLF)
+ });
}
- Start::Line => {
- builder.look_have().insert(Look::StartLine);
+ Start::LineLF => {
+ if rev {
+ builder.set_is_half_crlf();
+ builder.set_look_have(|have| have.insert(Look::StartLF));
+ } else {
+ builder.set_look_have(|have| have.insert(Look::StartCRLF));
+ }
+ if lineterm == b'\n' {
+ builder.set_look_have(|have| have.insert(Look::StartLF));
+ }
+ }
+ Start::LineCR => {
+ if rev {
+ builder.set_look_have(|have| have.insert(Look::StartCRLF));
+ } else {
+ builder.set_is_half_crlf();
+ }
+ if lineterm == b'\r' {
+ builder.set_look_have(|have| have.insert(Look::StartLF));
+ }
+ }
+ Start::CustomLineTerminator => {
+ builder.set_look_have(|have| have.insert(Look::StartLF));
+ // This is a bit of a tricky case, but if the line terminator was
+ // set to a word byte, then we also need to behave as if the start
+ // configuration is Start::WordByte. That is, we need to mark our
+ // state as having come from a word byte.
+ if utf8::is_word_byte(lineterm) {
+ builder.set_is_from_word();
+ }
}
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::Start;
-
- #[test]
- #[should_panic]
- fn start_fwd_bad_range() {
- Start::from_position_fwd(&[], 0, 1);
- }
-
- #[test]
- #[should_panic]
- fn start_rev_bad_range() {
- Start::from_position_rev(&[], 0, 1);
- }
-
- #[test]
- fn start_fwd() {
- let f = Start::from_position_fwd;
-
- assert_eq!(Start::Text, f(&[], 0, 0));
- assert_eq!(Start::Text, f(b"abc", 0, 3));
- assert_eq!(Start::Text, f(b"\nabc", 0, 3));
-
- assert_eq!(Start::Line, f(b"\nabc", 1, 3));
-
- assert_eq!(Start::WordByte, f(b"abc", 1, 3));
-
- assert_eq!(Start::NonWordByte, f(b" abc", 1, 3));
- }
-
- #[test]
- fn start_rev() {
- let f = Start::from_position_rev;
-
- assert_eq!(Start::Text, f(&[], 0, 0));
- assert_eq!(Start::Text, f(b"abc", 0, 3));
- assert_eq!(Start::Text, f(b"abc\n", 0, 4));
-
- assert_eq!(Start::Line, f(b"abc\nz", 0, 3));
-
- assert_eq!(Start::WordByte, f(b"abc", 0, 2));
-
- assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3));
}
}
diff --git a/vendor/regex-automata/src/util/determinize/state.rs b/vendor/regex-automata/src/util/determinize/state.rs
index 567e600d6..e64123587 100644
--- a/vendor/regex-automata/src/util/determinize/state.rs
+++ b/vendor/regex-automata/src/util/determinize/state.rs
@@ -10,13 +10,13 @@ The term "DFA state" is somewhat overloaded in this crate. In some cases, it
refers to the set of transitions over an alphabet for a particular state. In
other cases, it refers to a set of NFA states. The former is really about the
final representation of a state in a DFA's transition table, where as the
-latter---what this module is focusedon---is closer to an intermediate form that
-is used to help eventually build the transition table.
+latter---what this module is focused on---is closer to an intermediate form
+that is used to help eventually build the transition table.
This module exports four types. All four types represent the same idea: an
ordered set of NFA states. This ordered set represents the epsilon closure of a
particular NFA state, where the "epsilon closure" is the set of NFA states that
-can be transitioned to without consuming any input. i.e., Follow all of theNFA
+can be transitioned to without consuming any input. i.e., Follow all of the NFA
state's epsilon transitions. In addition, this implementation of DFA states
cares about two other things: the ordered set of pattern IDs corresponding
to the patterns that match if the state is a match state, and the set of
@@ -46,9 +46,11 @@ a copy). Here are the three types described succinctly:
and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A
`StateBuilderEmpty` can only be used to query its underlying memory capacity,
or to convert into a builder for recording pattern IDs and/or assertions.
+
* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero
or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches`
can only be used for adding pattern IDs and recording assertions.
+
* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or
more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA`
can only be used for adding NFA state IDs and recording some assertions.
@@ -58,7 +60,7 @@ DFA state to check if it already exists. If it does, then there's no need to
freeze it into a `State`. It it doesn't exist, then `StateBuilderNFA::to_state`
can be called to freeze the builder into an immutable `State`. In either
case, `clear` should be called on the builder to turn it back into a
-`StateBuilderEmpty` that reuses the underyling memory.
+`StateBuilderEmpty` that reuses the underlying memory.
The main purpose for splitting the builder into these distinct types is to
make it impossible to do things like adding a pattern ID after adding an NFA
@@ -68,7 +70,7 @@ type below.) If we just used one type for everything, it would be possible for
callers to use an incorrect interleaving of calls and thus result in a corrupt
representation. I chose to use more type machinery to make this impossible to
do because 1) determinization is itself pretty complex and it wouldn't be too
-hard to foul this up and 2) there isn't too much machinery involve and it's
+hard to foul this up and 2) there isn't too much machinery involved and it's
well contained.
As an optimization, sometimes states won't have certain things set. For
@@ -88,12 +90,11 @@ use core::{convert::TryFrom, mem};
use alloc::{sync::Arc, vec::Vec};
-use crate::{
- nfa::thompson::LookSet,
- util::{
- bytes::{self, Endian},
- id::{PatternID, StateID},
- },
+use crate::util::{
+ int::{I32, U32},
+ look::LookSet,
+ primitives::{PatternID, StateID},
+ wire::{self, Endian},
};
/// A DFA state that, at its core, is represented by an ordered set of NFA
@@ -102,7 +103,7 @@ use crate::{
/// This type is intended to be used only in NFA-to-DFA conversion via powerset
/// construction.
///
-/// It may be cheaply cloned and accessed safely from mulitple threads
+/// It may be cheaply cloned and accessed safely from multiple threads
/// simultaneously.
#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
pub(crate) struct State(Arc<[u8]>);
@@ -138,6 +139,10 @@ impl State {
self.repr().is_from_word()
}
+ pub(crate) fn is_half_crlf(&self) -> bool {
+ self.repr().is_half_crlf()
+ }
+
pub(crate) fn look_have(&self) -> LookSet {
self.repr().look_have()
}
@@ -146,8 +151,8 @@ impl State {
self.repr().look_need()
}
- pub(crate) fn match_count(&self) -> usize {
- self.repr().match_count()
+ pub(crate) fn match_len(&self) -> usize {
+ self.repr().match_len()
}
pub(crate) fn match_pattern(&self, index: usize) -> PatternID {
@@ -158,6 +163,7 @@ impl State {
self.repr().match_pattern_ids()
}
+ #[cfg(all(test, not(miri)))]
pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) {
self.repr().iter_match_pattern_ids(f)
}
@@ -191,7 +197,7 @@ impl StateBuilderEmpty {
}
pub(crate) fn into_matches(mut self) -> StateBuilderMatches {
- self.0.extend_from_slice(&[0, 0, 0]);
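+		// Five bytes of fixed-size prefix: one flag byte, a two byte
+		// 'look_have' set and a two byte 'look_need' set (matching the
+		// offsets used by 'Repr' below). Any pattern ID count and pattern
+		// IDs are appended after this prefix.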
+ self.0.extend_from_slice(&[0, 0, 0, 0, 0]);
StateBuilderMatches(self.0)
}
@@ -224,30 +230,23 @@ impl StateBuilderMatches {
StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO }
}
- pub(crate) fn clear(self) -> StateBuilderEmpty {
- let mut builder = StateBuilderEmpty(self.0);
- builder.clear();
- builder
- }
-
- pub(crate) fn is_match(&self) -> bool {
- self.repr().is_match()
- }
-
- pub(crate) fn is_from_word(&self) -> bool {
- self.repr().is_from_word()
- }
-
pub(crate) fn set_is_from_word(&mut self) {
self.repr_vec().set_is_from_word()
}
- pub(crate) fn look_have(&mut self) -> &mut LookSet {
- LookSet::from_repr_mut(&mut self.0[1])
+ pub(crate) fn set_is_half_crlf(&mut self) {
+ self.repr_vec().set_is_half_crlf()
+ }
+
+ pub(crate) fn look_have(&self) -> LookSet {
+ LookSet::read_repr(&self.0[1..])
}
- pub(crate) fn look_need(&mut self) -> &mut LookSet {
- LookSet::from_repr_mut(&mut self.0[2])
+ pub(crate) fn set_look_have(
+ &mut self,
+ set: impl FnMut(LookSet) -> LookSet,
+ ) {
+ self.repr_vec().set_look_have(set)
}
pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) {
@@ -295,20 +294,22 @@ impl StateBuilderNFA {
builder
}
- pub(crate) fn is_match(&self) -> bool {
- self.repr().is_match()
- }
-
- pub(crate) fn is_from_word(&self) -> bool {
- self.repr().is_from_word()
+ pub(crate) fn look_need(&self) -> LookSet {
+ self.repr().look_need()
}
- pub(crate) fn look_have(&mut self) -> &mut LookSet {
- LookSet::from_repr_mut(&mut self.repr[1])
+ pub(crate) fn set_look_have(
+ &mut self,
+ set: impl FnMut(LookSet) -> LookSet,
+ ) {
+ self.repr_vec().set_look_have(set)
}
- pub(crate) fn look_need(&mut self) -> &mut LookSet {
- LookSet::from_repr_mut(&mut self.repr[2])
+ pub(crate) fn set_look_need(
+ &mut self,
+ set: impl FnMut(LookSet) -> LookSet,
+ ) {
+ self.repr_vec().set_look_need(set)
}
pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) {
@@ -316,10 +317,6 @@ impl StateBuilderNFA {
.add_nfa_state_id(&mut self.prev_nfa_state_id, sid)
}
- pub(crate) fn memory_usage(&self) -> usize {
- self.repr.len()
- }
-
pub(crate) fn as_bytes(&self) -> &[u8] {
&self.repr
}
@@ -355,8 +352,8 @@ impl StateBuilderNFA {
///
/// Byte 1 corresponds to the look-behind assertions that were satisfied by
/// the transition that created this state. This generally only includes the
-/// StartLine and StartText assertions. (Look-ahead assertions are not tracked
-/// as part of states. Instead, these are applied by re-computing the epsilon
+/// StartLF and Start assertions. (Look-ahead assertions are not tracked as
+/// part of states. Instead, these are applied by re-computing the epsilon
/// closure of a state when computing the transition function. See `next` in
/// the parent module.)
///
@@ -425,6 +422,14 @@ impl<'a> Repr<'a> {
self.0[0] & (1 << 2) > 0
}
+ /// Returns true if and only if this state is marked as being inside of a
+ /// CRLF terminator. In the forward direction, this means the state was
+ /// created after seeing a `\r`. In the reverse direction, this means the
+ /// state was created after seeing a `\n`.
+ fn is_half_crlf(&self) -> bool {
+ self.0[0] & (1 << 3) > 0
+ }
+
/// The set of look-behind assertions that were true in the transition that
/// created this state.
///
@@ -436,7 +441,7 @@ impl<'a> Repr<'a> {
/// these are re-computed on demand via epsilon closure when computing the
/// transition function.
fn look_have(&self) -> LookSet {
- LookSet::from_repr(self.0[1])
+ LookSet::read_repr(&self.0[1..])
}
/// The set of look-around (both behind and ahead) assertions that appear
@@ -447,34 +452,34 @@ impl<'a> Repr<'a> {
/// state has no conditional epsilon transitions, then there is no need
/// to re-compute the epsilon closure.
fn look_need(&self) -> LookSet {
- LookSet::from_repr(self.0[2])
+ LookSet::read_repr(&self.0[3..])
}
/// Returns the total number of match pattern IDs in this state.
///
/// If this state is not a match state, then this always returns 0.
- fn match_count(&self) -> usize {
+ fn match_len(&self) -> usize {
if !self.is_match() {
return 0;
} else if !self.has_pattern_ids() {
1
} else {
- self.encoded_pattern_count()
+ self.encoded_pattern_len()
}
}
/// Returns the pattern ID for this match state at the given index.
///
- /// If the given index is greater than or equal to `match_count()` for this
+ /// If the given index is greater than or equal to `match_len()` for this
/// state, then this could panic or return incorrect results.
fn match_pattern(&self, index: usize) -> PatternID {
if !self.has_pattern_ids() {
PatternID::ZERO
} else {
- let offset = 7 + index * PatternID::SIZE;
+ let offset = 9 + index * PatternID::SIZE;
// This is OK since we only ever serialize valid PatternIDs to
// states.
- bytes::read_pattern_id_unchecked(&self.0[offset..]).0
+ wire::read_pattern_id_unchecked(&self.0[offset..]).0
}
}
@@ -502,9 +507,9 @@ impl<'a> Repr<'a> {
f(PatternID::ZERO);
return;
}
- let mut pids = &self.0[7..self.pattern_offset_end()];
+ let mut pids = &self.0[9..self.pattern_offset_end()];
while !pids.is_empty() {
- let pid = bytes::read_u32(pids);
+ let pid = wire::read_u32(pids);
pids = &pids[PatternID::SIZE..];
// This is OK since we only ever serialize valid PatternIDs to
// states. And since pattern IDs can never exceed a usize, the
@@ -525,20 +530,20 @@ impl<'a> Repr<'a> {
// This is OK since we only ever serialize valid StateIDs to
// states. And since state IDs can never exceed an isize, they must
// always be able to fit into a usize, and thus cast is OK.
- f(StateID::new_unchecked(sid as usize))
+ f(StateID::new_unchecked(sid.as_usize()))
}
}
/// Returns the offset into this state's representation where the pattern
/// IDs end and the NFA state IDs begin.
fn pattern_offset_end(&self) -> usize {
- let encoded = self.encoded_pattern_count();
+ let encoded = self.encoded_pattern_len();
if encoded == 0 {
- return 3;
+ return 5;
}
// This arithmetic is OK since we were able to address this many bytes
// when writing to the state, thus, it must fit into a usize.
- encoded.checked_mul(4).unwrap().checked_add(7).unwrap()
+ encoded.checked_mul(4).unwrap().checked_add(9).unwrap()
}
/// Returns the total number of *encoded* pattern IDs in this state.
@@ -546,13 +551,13 @@ impl<'a> Repr<'a> {
/// This may return 0 even when this is a match state, since the pattern
/// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in
/// the match state (the overwhelming common case).
- fn encoded_pattern_count(&self) -> usize {
+ fn encoded_pattern_len(&self) -> usize {
if !self.has_pattern_ids() {
return 0;
}
// This unwrap is OK since the total number of patterns is always
// guaranteed to fit into a usize.
- usize::try_from(bytes::read_u32(&self.0[3..7])).unwrap()
+ usize::try_from(wire::read_u32(&self.0[5..9])).unwrap()
}
}
@@ -563,6 +568,7 @@ impl<'a> core::fmt::Debug for Repr<'a> {
f.debug_struct("Repr")
.field("is_match", &self.is_match())
.field("is_from_word", &self.is_from_word())
+ .field("is_half_crlf", &self.is_half_crlf())
.field("look_have", &self.look_have())
.field("look_need", &self.look_need())
.field("match_pattern_ids", &self.match_pattern_ids())
@@ -608,14 +614,36 @@ impl<'a> ReprVec<'a> {
self.0[0] |= 1 << 2;
}
- /// Return a mutable reference to the 'look_have' assertion set.
- fn look_have_mut(&mut self) -> &mut LookSet {
- LookSet::from_repr_mut(&mut self.0[1])
+ /// Set this state as having seen half of a CRLF terminator.
+ ///
+ /// In the forward direction, this should be set when a `\r` has been seen.
+ /// In the reverse direction, this should be set when a `\n` has been seen.
+ fn set_is_half_crlf(&mut self) {
+ self.0[0] |= 1 << 3;
}
- /// Return a mutable reference to the 'look_need' assertion set.
- fn look_need_mut(&mut self) -> &mut LookSet {
- LookSet::from_repr_mut(&mut self.0[2])
+ /// The set of look-behind assertions that were true in the transition that
+ /// created this state.
+ fn look_have(&self) -> LookSet {
+ self.repr().look_have()
+ }
+
+ /// The set of look-around (both behind and ahead) assertions that appear
+ /// at least once in this state's set of NFA states.
+ fn look_need(&self) -> LookSet {
+ self.repr().look_need()
+ }
+
+ /// Mutate the set of look-behind assertions that were true in the
+ /// transition that created this state.
+ fn set_look_have(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
+ set(self.look_have()).write_repr(&mut self.0[1..]);
+ }
+
+ /// Mutate the set of look-around (both behind and ahead) assertions that
+ /// appear at least once in this state's set of NFA states.
+ fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
+ set(self.look_need()).write_repr(&mut self.0[3..]);
}
/// Add a pattern ID to this state. All match states must have at least
@@ -675,14 +703,14 @@ impl<'a> ReprVec<'a> {
return;
}
let patsize = PatternID::SIZE;
- let pattern_bytes = self.0.len() - 7;
+ let pattern_bytes = self.0.len() - 9;
// Every pattern ID uses 4 bytes, so number of bytes should be
// divisible by 4.
assert_eq!(pattern_bytes % patsize, 0);
// This unwrap is OK since we are guaranteed that the maximum number
// of possible patterns fits into a u32.
let count32 = u32::try_from(pattern_bytes / patsize).unwrap();
- bytes::NE::write_u32(count32, &mut self.0[3..7]);
+ wire::NE::write_u32(count32, &mut self.0[5..9]);
}
/// Add an NFA state ID to this state. The order in which NFA states are
@@ -704,7 +732,7 @@ impl<'a> ReprVec<'a> {
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn write_vari32(data: &mut Vec<u8>, n: i32) {
- let mut un = (n as u32) << 1;
+ let mut un = n.to_bits() << 1;
if n < 0 {
un = !un;
}
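+	// Worked example (illustrative): this zig-zag step maps 0, -1, 1, -2,
+	// 2, ... to 0, 1, 2, 3, 4, ..., so small magnitudes of either sign stay
+	// in a single varint byte. E.g., n = -3: to_bits() gives 0xFFFF_FFFD,
+	// the shift gives 0xFFFF_FFFA and negation gives 5; n = 3 gives 6.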
@@ -717,7 +745,7 @@ fn write_vari32(data: &mut Vec<u8>, n: i32) {
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn read_vari32(data: &[u8]) -> (i32, usize) {
let (un, i) = read_varu32(data);
- let mut n = (un >> 1) as i32;
+ let mut n = i32::from_bits(un >> 1);
if un & 1 != 0 {
n = !n;
}
@@ -733,10 +761,10 @@ fn read_vari32(data: &[u8]) -> (i32, usize) {
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
while n >= 0b1000_0000 {
- data.push((n as u8) | 0b1000_0000);
+ data.push(n.low_u8() | 0b1000_0000);
n >>= 7;
}
- data.push(n as u8);
+ data.push(n.low_u8());
}
/// Read an unsigned 32-bit varint. Also, return the number of bytes read.
@@ -750,9 +778,9 @@ fn read_varu32(data: &[u8]) -> (u32, usize) {
let mut shift: u32 = 0;
for (i, &b) in data.iter().enumerate() {
if b < 0b1000_0000 {
- return (n | ((b as u32) << shift), i + 1);
+ return (n | (u32::from(b) << shift), i + 1);
}
- n |= ((b as u32) & 0b0111_1111) << shift;
+ n |= (u32::from(b) & 0b0111_1111) << shift;
shift += 7;
}
(0, 0)
@@ -760,7 +788,7 @@ fn read_varu32(data: &[u8]) -> (u32, usize) {
/// Push a native-endian encoded `n` on to `dst`.
fn write_u32(dst: &mut Vec<u8>, n: u32) {
- use crate::util::bytes::{Endian, NE};
+ use crate::util::wire::NE;
let start = dst.len();
dst.extend(core::iter::repeat(0).take(mem::size_of::<u32>()));
@@ -775,6 +803,7 @@ mod tests {
use super::*;
+ #[cfg(not(miri))]
quickcheck! {
fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool {
// Builders states do not permit duplicate IDs.
@@ -829,7 +858,9 @@ mod tests {
s.iter_nfa_state_ids(|sid| got_sids.push(sid));
got_pids == pids && got_sids == sids
}
+ }
+ quickcheck! {
fn prop_read_write_varu32(n: u32) -> bool {
let mut buf = vec![];
write_varu32(&mut buf, n);
@@ -845,6 +876,7 @@ mod tests {
}
}
+ #[cfg(not(miri))]
fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> {
let mut set = alloc::collections::BTreeSet::new();
let mut deduped = vec![];
@@ -858,6 +890,7 @@ mod tests {
deduped
}
+ #[cfg(not(miri))]
fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> {
let mut set = alloc::collections::BTreeSet::new();
let mut deduped = vec![];
diff --git a/vendor/regex-automata/src/util/empty.rs b/vendor/regex-automata/src/util/empty.rs
new file mode 100644
index 000000000..e16af3b6e
--- /dev/null
+++ b/vendor/regex-automata/src/util/empty.rs
@@ -0,0 +1,265 @@
+/*!
+This module provides helper routines for dealing with zero-width matches.
+
+The main problem being solved here is this:
+
+1. The caller wants to search something that they know is valid UTF-8, such
+as a Rust `&str`.
+2. The regex used by the caller can match the empty string. For example, `a*`.
+3. The caller should never get match offsets returned that occur within the
+encoding of a UTF-8 codepoint. It is logically incorrect, and also means that,
+e.g., slicing the `&str` at those offsets will lead to a panic.
+
+So the question here is, how do we prevent the caller from getting match
+offsets that split a codepoint? For example, strictly speaking, the regex `a*`
+matches `☃` at the positions `[0, 0]`, `[1, 1]`, `[2, 2]` and `[3, 3]` since
+the UTF-8 encoding of `☃` is `\xE2\x98\x83`. In particular, the `NFA` that
+underlies all of the matching engines in this crate doesn't have anything in
+its state graph that prevents matching between UTF-8 code units. Indeed, any
+engine derived from the `NFA` will match at those positions by virtue of the
+fact that the `NFA` is byte oriented. That is, its transitions are defined over
+bytes and the matching engines work by proceeding one byte at a time.
+
+(An alternative architecture would be to define the transitions in an `NFA`
+over codepoints, or `char`. And then make the matching engines proceed by
+decoding one codepoint at a time. This is a viable strategy, but it doesn't
+work for DFA matching engines because designing a fast and memory efficient
+transition table for an alphabet as large as Unicode is quite difficult. More
+to the point, the top-level `regex` crate supports matching on arbitrary bytes
+when Unicode mode is disabled and one is searching a `&[u8]`. So in that case,
+you can't just limit yourself to decoding codepoints and matching those. You
+really do need to be able to follow byte oriented transitions on the `NFA`.)
+
+In an older version of the regex crate, we handled this case not in the regex
+engine, but in the iterators over matches. Namely, since this case only arises
+when the match is empty, we "just" incremented the next starting position
+of the search by `N`, where `N` is the length of the codepoint encoded at
+the current position. The alternative or more "natural" solution of just
+incrementing by `1` would result in executing a search of `a*` on `☃` like
+this:
+
+* Start search at `0`.
+* Found match at `[0, 0]`.
+* Next start position is `0`.
+* To avoid an infinite loop, since it's an empty match, increment by `1`.
+* Start search at `1`.
+* Found match at `[1, 1]`. Oops.
+
+But if we instead incremented by `3` (the length in bytes of `☃`), then we get
+the following:
+
+* Start search at `0`.
+* Found match at `[0, 0]`.
+* Next start position is `0`.
+* To avoid an infinite loop, since it's an empty match, increment by `3`.
+* Start search at `3`.
+* Found match at `[3, 3]`.
+
+And we get the correct result. But does this technique work in all cases?
+Crucially, it requires that a zero-width match that splits a codepoint never
+occurs beyond the starting position of the search. Because if it did, merely
+incrementing the start position by the number of bytes in the codepoint at
+the current position wouldn't be enough. A zero-width match could just occur
+anywhere. It turns out that it is _almost_ true. We can convince ourselves by
+looking at all possible patterns that can match the empty string:
+
+* Patterns like `a*`, `a{0}`, `(?:)`, `a|` and `|a` all unconditionally match
+the empty string. That is, assuming there isn't an `a` at the current position,
+they will all match the empty string at the start of a search. There is no way
+to move past it because any other match would not be "leftmost."
+* `^` only matches at the beginning of the haystack, where the start position
+is `0`. Since we know we're searching valid UTF-8 (if it isn't valid UTF-8,
+then this entire problem goes away because it implies your string type supports
+invalid UTF-8 and thus must deal with offsets that not only split a codepoint
+but occur in entirely invalid UTF-8 somehow), it follows that `^` never matches
+between the code units of a codepoint because the start of a valid UTF-8 string
+is never within the encoding of a codepoint.
+* `$` follows basically the same logic as `^`, but for the end of a string. A
+valid UTF-8 string can't have an incomplete codepoint at the end of it.
+* `(?m:^)` follows similarly to `^`, but it can match immediately following
+a `\n`. However, since a `\n` is always a codepoint itself and can never
+appear within a codepoint, it follows that the position immediately following
+a `\n` in a string that is valid UTF-8 is guaranteed to not be between the
+code units of another codepoint. (One caveat here is that the line terminator
+for multi-line anchors can now be changed to any arbitrary byte, including
+things like `\x98` which might occur within a codepoint. However, this wasn't
+supported by the old regex crate. If it was, it would pose the same problems
+as `(?-u:\B)`, as we'll discuss below.)
+* `(?m:$)` admits a similar argument as for `(?m:^)`. The only difference is
+that `(?m:$)` matches just before a `\n`. But the same argument applies.
+* `(?Rm:^)` and `(?Rm:$)` weren't supported by the old regex crate, but the
+CRLF aware line anchors follow a similar argument as for `(?m:^)` and `(?m:$)`.
+Namely, they only ever match at a boundary where one side is either a `\r` or
+a `\n`, neither of which can occur within a codepoint.
+* `\b` only matches at positions where both sides are valid codepoints, so
+this cannot split a codepoint.
+* `\B`, like `\b`, also only matches at positions where both sides are valid
+codepoints. So this cannot split a codepoint either.
+* `(?-u:\b)` matches only at positions where at least one side of it is an ASCII
+word byte. Since ASCII bytes cannot appear as code units in non-ASCII codepoints
+(one of the many amazing qualities of UTF-8), it follows that this too cannot
+split a codepoint.
+* `(?-u:\B)` finally represents a problem. It can match between *any* two
+bytes that are either both word bytes or non-word bytes. Since code units like
+`\xE2` and `\x98` (from the UTF-8 encoding of `☃`) are both non-word bytes,
+`(?-u:\B)` will match at the position between them.
+
+Thus, our approach of incrementing one codepoint at a time after seeing an
+empty match is flawed because `(?-u:\B)` can result in an empty match that
+splits a codepoint at a position past the starting point of a search. For
+example, searching `(?-u:\B)` on `a☃` would produce the following matches: `[2,
+2]`, `[3, 3]` and `[4, 4]`. The positions at `0` and `1` don't match because
+they correspond to word boundaries since `a` is an ASCII word byte.
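+
+Here is a sketch of checking that claim with the `regex` crate's `bytes` API,
+which does permit `(?-u:\B)` because `&[u8]` haystacks may contain invalid
+UTF-8 anyway. (`regex` is not a dependency of this crate, so this block is
+purely illustrative.)
+
+```ignore
+use regex::bytes::Regex;
+
+let re = Regex::new(r"(?-u:\B)").unwrap();
+let hay = "a☃".as_bytes(); // [0x61, 0xE2, 0x98, 0x83]
+let spans: Vec<(usize, usize)> =
+    re.find_iter(hay).map(|m| (m.start(), m.end())).collect();
+// The matches at 2 and 3 split the UTF-8 encoding of '☃'.
+assert_eq!(spans, vec![(2, 2), (3, 3), (4, 4)]);
+```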
+
+So what did the old regex crate do to avoid this? It banned `(?-u:\B)` from
+regexes that could match `&str`. That might sound extreme, but a lot of other
+things were banned too. For example, all of `(?-u:.)`, `(?-u:[^a])` and
+`(?-u:\W)` can match invalid UTF-8 too, including individual code units within
+a codepoint. The key difference is that those expressions could never produce
+an empty match. That ban happens when translating an `Ast` to an `Hir`,
+because it is that process that reasons about whether an `Hir` can produce
+*non-empty* matches at invalid UTF-8 boundaries. Bottom line though is that we
+side-stepped the
+`(?-u:\B)` issue by banning it.
+
+If banning `(?-u:\B)` were the only issue with the old regex crate's approach,
+then I probably would have kept it. `\B` is rarely used, so it's not such a big
+deal to have to work around it. However, the problem with the above approach
+is that it doesn't compose. The logic for avoiding splitting a codepoint only
+lived in the iterator, which means if anyone wants to implement their own
+iterator over regex matches, they have to deal with this extremely subtle edge
+case to get full correctness.
+
+Instead, in this crate, we take the approach of pushing this complexity down
+to the lowest layers of each regex engine. The approach is pretty simple:
+
+* If this corner case doesn't apply, don't do anything. (For example, if UTF-8
+mode isn't enabled or if the regex cannot match the empty string.)
+* If an empty match is reported, explicitly check if it splits a codepoint.
+* If it doesn't, we're done, return the match.
+* If it does, then ignore the match and re-run the search.
+* Repeat the above process until the end of the haystack is reached or a match
+is found that doesn't split a codepoint or isn't zero width.
+
+And that's pretty much what this module provides. Every regex engine uses
+these methods in its lowest level public APIs, just above the layer where its
+internal engine is used. That way, all regex engines can be arbitrarily
+composed without worrying about handling this case, and iterators don't need to
+handle it explicitly.
+
+(It turns out that a new feature I added, support for changing the line
+terminator in a regex to any arbitrary byte, also provokes the above problem.
+Namely, the byte could be invalid UTF-8 or a UTF-8 continuation byte. So that
+support would need to be limited or banned when UTF-8 mode is enabled, just
+like we did for `(?-u:\B)`. But thankfully our more robust approach in this
+crate handles that case just fine too.)
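+
+As a concrete illustration, here is roughly how a forward search routine might
+layer `skip_splits_fwd` (defined below) on top of an internal search.
+`search_imp` and `utf8_empty_mode` are illustrative stand-ins for an engine's
+internals, not APIs in this crate:
+
+```ignore
+fn try_find(
+    &self,
+    input: &Input<'_>,
+) -> Result<Option<HalfMatch>, MatchError> {
+    let hm = match self.search_imp(input)? {
+        None => return Ok(None),
+        Some(hm) => hm,
+    };
+    // If the match ends on a char boundary, there is nothing to fix. (In
+    // UTF-8 mode, a match ending elsewhere must be an empty match that
+    // split a codepoint.)
+    if !self.utf8_empty_mode() || input.is_char_boundary(hm.offset()) {
+        return Ok(Some(hm));
+    }
+    // Otherwise, retry the search until the match no longer splits a
+    // codepoint (or the haystack is exhausted).
+    skip_splits_fwd(input, hm, hm.offset(), |input| {
+        Ok(self.search_imp(input)?.map(|hm| (hm, hm.offset())))
+    })
+}
+```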
+*/
+
+use crate::util::search::{Input, MatchError};
+
+#[cold]
+#[inline(never)]
+pub(crate) fn skip_splits_fwd<T, F>(
+ input: &Input<'_>,
+ init_value: T,
+ match_offset: usize,
+ find: F,
+) -> Result<Option<T>, MatchError>
+where
+ F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>,
+{
+ skip_splits(true, input, init_value, match_offset, find)
+}
+
+#[cold]
+#[inline(never)]
+pub(crate) fn skip_splits_rev<T, F>(
+ input: &Input<'_>,
+ init_value: T,
+ match_offset: usize,
+ find: F,
+) -> Result<Option<T>, MatchError>
+where
+ F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>,
+{
+ skip_splits(false, input, init_value, match_offset, find)
+}
+
+fn skip_splits<T, F>(
+ forward: bool,
+ input: &Input<'_>,
+ init_value: T,
+ mut match_offset: usize,
+ mut find: F,
+) -> Result<Option<T>, MatchError>
+where
+ F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>,
+{
+ // If our config says to do an anchored search, then we're definitely
+ // done. We just need to determine whether we have a valid match or
+ // not. If we don't, then we're not allowed to continue, so we report
+ // no match.
+ //
+ // This is actually quite a subtle correctness thing. The key here is
+ // that if we got an empty match that splits a codepoint after doing an
+ // anchored search in UTF-8 mode, then that implies that we must have
+ // *started* the search at a location that splits a codepoint. This
+ // follows from the fact that if a match is reported from an anchored
+ // search, then the start offset of the match *must* match the start
+ // offset of the search.
+ //
+ // It also follows that no other non-empty match is possible. For
+ // example, you might write a regex like '(?:)|SOMETHING' and start its
+ // search in the middle of a codepoint. The first branch is an empty
+ // regex that will bubble up a match at the first position, and then
+ // get rejected here and report no match. But what if 'SOMETHING' could
+ // have matched? We reason that such a thing is impossible, because
+ // if it does, it must report a match that starts in the middle of a
+ // codepoint. This in turn implies that a match is reported whose span
+ // does not correspond to valid UTF-8, and this breaks the promise
+ // made when UTF-8 mode is enabled. (That promise *can* be broken, for
+ // example, by enabling UTF-8 mode but building an NFA by hand that
+ // produces non-empty matches that span invalid UTF-8. This is an
+ // unchecked precondition violation of UTF-8 mode, and the resulting
+ // behavior is documented as unspecified.)
+ //
+ // I believe this actually means that if an anchored search is run, and
+ // UTF-8 mode is enabled and the start position splits a codepoint,
+ // then it is correct to immediately report no match without even
+ // executing the regex engine. But it doesn't really seem worth writing
+ // out that case in every regex engine to save a tiny bit of work in an
+ // extremely pathological case, so we just handle it here.
+ if input.get_anchored().is_anchored() {
+ return Ok(if input.is_char_boundary(match_offset) {
+ Some(init_value)
+ } else {
+ None
+ });
+ }
+ // Otherwise, we have an unanchored search, so just keep looking for
+ // matches until we have one that does not split a codepoint or we hit
+ // EOI.
+ let mut value = init_value;
+ let mut input = input.clone();
+ while !input.is_char_boundary(match_offset) {
+ if forward {
+ // The unwrap is OK here because overflowing usize while
+ // iterating over a slice is impossible, as it would require
+ // a slice of length greater than isize::MAX, which is itself
+ // impossible.
+ input.set_start(input.start().checked_add(1).unwrap());
+ } else {
+ input.set_end(match input.end().checked_sub(1) {
+ None => return Ok(None),
+ Some(end) => end,
+ });
+ }
+ match find(&input)? {
+ None => return Ok(None),
+ Some((new_value, new_match_end)) => {
+ value = new_value;
+ match_offset = new_match_end;
+ }
+ }
+ }
+ Ok(Some(value))
+}
diff --git a/vendor/regex-automata/src/util/escape.rs b/vendor/regex-automata/src/util/escape.rs
new file mode 100644
index 000000000..7f6aa15f5
--- /dev/null
+++ b/vendor/regex-automata/src/util/escape.rs
@@ -0,0 +1,84 @@
+/*!
+Provides convenience routines for escaping raw bytes.
+
+Since this crate tends to deal with `&[u8]` everywhere and the default
+`Debug` implementation just shows decimal integers, it makes debugging those
+representations quite difficult. This module provides types that show `&[u8]`
+as if it were a string, with invalid UTF-8 escaped into its byte-by-byte hex
+representation.
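+
+For example, the outputs below follow from the implementations in this module:
+
+```
+use regex_automata::util::escape::{DebugByte, DebugHaystack};
+
+// A printable ASCII byte is shown as-is, while anything else is shown as a
+// hex escape (with capitalized hex digits for single bytes).
+assert_eq!("a", format!("{:?}", DebugByte(b'a')));
+assert_eq!(r"\xFF", format!("{:?}", DebugByte(0xFF)));
+// Valid UTF-8 is shown as a string and invalid bytes are hex escaped.
+assert_eq!(r#""abc\xff""#, format!("{:?}", DebugHaystack(b"abc\xFF")));
+```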
+*/
+
+use crate::util::utf8;
+
+/// Provides a convenient `Debug` implementation for a `u8`.
+///
+/// The `Debug` impl treats the byte as ASCII and emits a human readable
+/// representation of it. If the byte isn't ASCII, then it's emitted as a hex
+/// escape sequence.
+#[derive(Clone, Copy)]
+pub struct DebugByte(pub u8);
+
+impl core::fmt::Debug for DebugByte {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ // Special case ASCII space. It's too hard to read otherwise, so
+ // put quotes around it. I sometimes wonder whether just '\x20' would
+ // be better...
+ if self.0 == b' ' {
+ return write!(f, "' '");
+ }
+ // 10 bytes is enough to cover any output from ascii::escape_default.
+ let mut bytes = [0u8; 10];
+ let mut len = 0;
+ for (i, mut b) in core::ascii::escape_default(self.0).enumerate() {
+ // capitalize \xab to \xAB
+ if i >= 2 && b'a' <= b && b <= b'f' {
+ b -= 32;
+ }
+ bytes[len] = b;
+ len += 1;
+ }
+ write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap())
+ }
+}
+
+/// Provides a convenient `Debug` implementation for `&[u8]`.
+///
+/// This generally works best when the bytes are presumed to be mostly UTF-8,
+/// but will work for anything. Any bytes that aren't valid UTF-8 are emitted
+/// as hex escape sequences.
+pub struct DebugHaystack<'a>(pub &'a [u8]);
+
+impl<'a> core::fmt::Debug for DebugHaystack<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "\"")?;
+ // This is a sad re-implementation of a similar impl found in bstr.
+ let mut bytes = self.0;
+ while let Some(result) = utf8::decode(bytes) {
+ let ch = match result {
+ Ok(ch) => ch,
+ Err(byte) => {
+ write!(f, r"\x{:02x}", byte)?;
+ bytes = &bytes[1..];
+ continue;
+ }
+ };
+ bytes = &bytes[ch.len_utf8()..];
+ match ch {
+ '\0' => write!(f, "\\0")?,
+ // ASCII control characters except \0, \n, \r, \t
+ '\x01'..='\x08'
+ | '\x0b'
+ | '\x0c'
+ | '\x0e'..='\x19'
+ | '\x7f' => {
+ write!(f, "\\x{:02x}", u32::from(ch))?;
+ }
+ '\n' | '\r' | '\t' | _ => {
+ write!(f, "{}", ch.escape_debug())?;
+ }
+ }
+ }
+ write!(f, "\"")?;
+ Ok(())
+ }
+}
diff --git a/vendor/regex-automata/src/util/id.rs b/vendor/regex-automata/src/util/id.rs
deleted file mode 100644
index 70bf0a93b..000000000
--- a/vendor/regex-automata/src/util/id.rs
+++ /dev/null
@@ -1,608 +0,0 @@
-/*!
-Type definitions for identifier types.
-
-A [`StateID`] represents the possible set of identifiers used in regex engine
-implementations in this crate. For example, they are used to identify both NFA
-and DFA states.
-
-A [`PatternID`] represents the possible set of identifiers for patterns. All
-regex engine implementations in this crate support searching for multiple
-patterns simultaneously. A `PatternID` is how each pattern is uniquely
-identified for a particular instance of a regex engine. Namely, a pattern is
-assigned an auto-incrementing integer, starting at `0`, based on the order of
-patterns supplied during the construction of the regex engine.
-
-These identifier types represent a way for this crate to make correctness
-guarantees around the possible set of values that a `StateID` or a `PatternID`
-might represent. Similarly, they also provide a way of constraining the size of
-these identifiers to reduce space usage while still guaranteeing that all such
-identifiers are repsentable by a `usize` for the current target.
-
-Moreover, the identifier types clamp the range of permissible values to a range
-that is typically smaller than its internal representation. (With the maximum
-value being, e.g., `StateID::MAX`.) Users of these types may not rely this
-clamping for the purpose of memory safety. Users may, however, rely on these
-invariants to avoid panics or other types of logic bugs.
-*/
-
-// Continuing from the above comment about correctness guarantees, an example
-// of a way in which we use the guarantees on these types is delta encoding.
-// Namely, we require that IDs can be at most 2^31 - 2, which means the
-// difference between any two IDs is always representable as an i32.
-
-use core::{
- convert::{Infallible, TryFrom},
- mem, ops,
-};
-
-#[cfg(feature = "alloc")]
-use alloc::vec::Vec;
-
-/// An identifier for a regex pattern.
-///
-/// The identifier for a pattern corresponds to its relative position among
-/// other patterns in a single finite state machine. Namely, when building
-/// a multi-pattern regex engine, one must supply a sequence of patterns to
-/// match. The position (starting at 0) of each pattern in that sequence
-/// represents its identifier. This identifier is in turn used to identify and
-/// report matches of that pattern in various APIs.
-///
-/// A pattern ID is guaranteed to be representable by a `usize`. Similarly,
-/// the number of patterns in any regex engine in this crate is guaranteed to
-/// be representable by a `usize`. This applies to regex engines that have
-/// been deserialized; a deserialization error will be returned if it contains
-/// pattern IDs that violate these requirements in your current environment.
-///
-/// For extra convenience in some cases, this type also guarantees that all
-/// IDs can fit into an `i32` and an `isize` without overflowing.
-///
-/// # Representation
-///
-/// This type is always represented internally by a `u32` and is marked as
-/// `repr(transparent)`. Thus, this type always has the same representation as
-/// a `u32`.
-///
-/// # Indexing
-///
-/// For convenience, callers may use a `PatternID` to index slices.
-///
-/// # Safety
-///
-/// While a `PatternID` is meant to guarantee that its value fits into `usize`
-/// (while using a possibly smaller representation than `usize` on some
-/// targets), callers must not rely on this property for safety. Callers may
-/// choose to rely on this property for correctness however.
-#[repr(transparent)]
-#[derive(
- Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
-)]
-pub struct PatternID(u32);
-
-impl PatternID {
- /// The maximum pattern ID value, represented as a `usize`.
- #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
- pub const MAX: PatternID =
- PatternID::new_unchecked(core::i32::MAX as usize - 1);
-
- /// The maximum pattern ID value, represented as a `usize`.
- #[cfg(target_pointer_width = "16")]
- pub const MAX: PatternID = PatternID::new_unchecked(core::isize::MAX - 1);
-
- /// The total number of patterns that are allowed in any single regex
- /// engine.
- pub const LIMIT: usize = PatternID::MAX.as_usize() + 1;
-
- /// The zero pattern ID value.
- pub const ZERO: PatternID = PatternID::new_unchecked(0);
-
- /// The number of bytes that a single `PatternID` uses in memory.
- pub const SIZE: usize = core::mem::size_of::<PatternID>();
-
- /// Create a new pattern ID.
- ///
- /// If the given identifier exceeds [`PatternID::MAX`], then this returns
- /// an error.
- #[inline]
- pub fn new(id: usize) -> Result<PatternID, PatternIDError> {
- PatternID::try_from(id)
- }
-
- /// Create a new pattern ID without checking whether the given value
- /// exceeds [`PatternID::MAX`].
- ///
- /// While this is unchecked, providing an incorrect value must never
- /// sacrifice memory safety, as documented above.
- #[inline]
- pub const fn new_unchecked(id: usize) -> PatternID {
- PatternID(id as u32)
- }
-
- /// Like [`PatternID::new`], but panics if the given ID is not valid.
- #[inline]
- pub fn must(id: usize) -> PatternID {
- PatternID::new(id).unwrap()
- }
-
- /// Return this pattern ID as a `usize`.
- #[inline]
- pub const fn as_usize(&self) -> usize {
- self.0 as usize
- }
-
- /// Return the internal u32 of this pattern ID.
- #[inline]
- pub const fn as_u32(&self) -> u32 {
- self.0
- }
-
- /// Return the internal u32 of this pattern ID represented as an i32.
- ///
- /// This is guaranteed to never overflow an `i32`.
- #[inline]
- pub const fn as_i32(&self) -> i32 {
- self.0 as i32
- }
-
- /// Returns one more than this pattern ID as a usize.
- ///
- /// Since a pattern ID has constraints on its maximum value, adding `1` to
- /// it will always fit in a `usize` (and a `u32`).
- #[inline]
- pub fn one_more(&self) -> usize {
- self.as_usize().checked_add(1).unwrap()
- }
-
- /// Decode this pattern ID from the bytes given using the native endian
- /// byte order for the current target.
- ///
- /// If the decoded integer is not representable as a pattern ID for the
- /// current target, then this returns an error.
- #[inline]
- pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<PatternID, PatternIDError> {
- let id = u32::from_ne_bytes(bytes);
- if id > PatternID::MAX.as_u32() {
- return Err(PatternIDError { attempted: id as u64 });
- }
- Ok(PatternID::new_unchecked(id as usize))
- }
-
- /// Decode this pattern ID from the bytes given using the native endian
- /// byte order for the current target.
- ///
- /// This is analogous to [`PatternID::new_unchecked`] in that is does not
- /// check whether the decoded integer is representable as a pattern ID.
- #[inline]
- pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> PatternID {
- PatternID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
- }
-
- /// Return the underlying pattern ID integer as raw bytes in native endian
- /// format.
- #[inline]
- pub fn to_ne_bytes(&self) -> [u8; 4] {
- self.0.to_ne_bytes()
- }
-
- /// Returns an iterator over all pattern IDs from 0 up to and not including
- /// the given length.
- ///
- /// If the given length exceeds [`PatternID::LIMIT`], then this panics.
- #[cfg(feature = "alloc")]
- pub(crate) fn iter(len: usize) -> PatternIDIter {
- PatternIDIter::new(len)
- }
-}
-
-/// This error occurs when a pattern ID could not be constructed.
-///
-/// This occurs when given an integer exceeding the maximum pattern ID value.
-///
-/// When the `std` feature is enabled, this implements the `Error` trait.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub struct PatternIDError {
- attempted: u64,
-}
-
-impl PatternIDError {
- /// Returns the value that failed to constructed a pattern ID.
- pub fn attempted(&self) -> u64 {
- self.attempted
- }
-}
-
-#[cfg(feature = "std")]
-impl std::error::Error for PatternIDError {}
-
-impl core::fmt::Display for PatternIDError {
- fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
- write!(
- f,
- "failed to create PatternID from {:?}, which exceeds {:?}",
- self.attempted(),
- PatternID::MAX,
- )
- }
-}
-
-/// An identifier for a state in a regex engine.
-///
-/// A state ID is guaranteed to be representable by a `usize`. Similarly, the
-/// number of states in any regex engine in this crate is guaranteed to be
-/// representable by a `usize`. This applies to regex engines that have been
-/// deserialized; a deserialization error will be returned if it contains state
-/// IDs that violate these requirements in your current environment.
-///
-/// For extra convenience in some cases, this type also guarantees that all
-/// IDs can fit into an `i32` and an `isize` without overflowing.
-///
-/// # Representation
-///
-/// This type is always represented internally by a `u32` and is marked as
-/// `repr(transparent)`. Thus, this type always has the same representation as
-/// a `u32`.
-///
-/// # Indexing
-///
-/// For convenience, callers may use a `StateID` to index slices.
-///
-/// # Safety
-///
-/// While a `StateID` is meant to guarantee that its value fits into `usize`
-/// (while using a possibly smaller representation than `usize` on some
-/// targets), callers must not rely on this property for safety. Callers may
-/// choose to rely on this property for correctness however.
-#[repr(transparent)]
-#[derive(
- Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
-)]
-pub struct StateID(u32);
-
-impl StateID {
- /// The maximum state ID value.
- #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
- pub const MAX: StateID =
- StateID::new_unchecked(core::i32::MAX as usize - 1);
-
- /// The maximum state ID value.
- #[cfg(target_pointer_width = "16")]
- pub const MAX: StateID = StateID::new_unchecked(core::isize::MAX - 1);
-
- /// The total number of states that are allowed in any single regex
- /// engine, represented as a `usize`.
- pub const LIMIT: usize = StateID::MAX.as_usize() + 1;
-
- /// The zero state ID value.
- pub const ZERO: StateID = StateID::new_unchecked(0);
-
- /// The number of bytes that a single `StateID` uses in memory.
- pub const SIZE: usize = core::mem::size_of::<StateID>();
-
- /// Create a new state ID.
- ///
- /// If the given identifier exceeds [`StateID::MAX`], then this returns
- /// an error.
- #[inline]
- pub fn new(id: usize) -> Result<StateID, StateIDError> {
- StateID::try_from(id)
- }
-
- /// Create a new state ID without checking whether the given value
- /// exceeds [`StateID::MAX`].
- ///
- /// While this is unchecked, providing an incorrect value must never
- /// sacrifice memory safety, as documented above.
- #[inline]
- pub const fn new_unchecked(id: usize) -> StateID {
- StateID(id as u32)
- }
-
- /// Like [`StateID::new`], but panics if the given ID is not valid.
- #[inline]
- pub fn must(id: usize) -> StateID {
- StateID::new(id).unwrap()
- }
-
- /// Return this state ID as a `usize`.
- #[inline]
- pub const fn as_usize(&self) -> usize {
- self.0 as usize
- }
-
- /// Return the internal u32 of this state ID.
- #[inline]
- pub const fn as_u32(&self) -> u32 {
- self.0
- }
-
- /// Return the internal u32 of this pattern ID represented as an i32.
- ///
- /// This is guaranteed to never overflow an `i32`.
- #[inline]
- pub const fn as_i32(&self) -> i32 {
- self.0 as i32
- }
-
- /// Returns one more than this state ID as a usize.
- ///
- /// Since a state ID has constraints on its maximum value, adding `1` to
- /// it will always fit in a `usize` (and a `u32`).
- #[inline]
- pub fn one_more(&self) -> usize {
- self.as_usize().checked_add(1).unwrap()
- }
-
- /// Decode this state ID from the bytes given using the native endian byte
- /// order for the current target.
- ///
- /// If the decoded integer is not representable as a state ID for the
- /// current target, then this returns an error.
- #[inline]
- pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<StateID, StateIDError> {
- let id = u32::from_ne_bytes(bytes);
- if id > StateID::MAX.as_u32() {
- return Err(StateIDError { attempted: id as u64 });
- }
- Ok(StateID::new_unchecked(id as usize))
- }
-
- /// Decode this state ID from the bytes given using the native endian
- /// byte order for the current target.
- ///
- /// This is analogous to [`StateID::new_unchecked`] in that is does not
- /// check whether the decoded integer is representable as a state ID.
- #[inline]
- pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> StateID {
- StateID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
- }
-
- /// Return the underlying state ID integer as raw bytes in native endian
- /// format.
- #[inline]
- pub fn to_ne_bytes(&self) -> [u8; 4] {
- self.0.to_ne_bytes()
- }
-
- /// Returns an iterator over all state IDs from 0 up to and not including
- /// the given length.
- ///
- /// If the given length exceeds [`StateID::LIMIT`], then this panics.
- #[cfg(feature = "alloc")]
- pub(crate) fn iter(len: usize) -> StateIDIter {
- StateIDIter::new(len)
- }
-}
-
-/// This error occurs when a state ID could not be constructed.
-///
-/// This occurs when given an integer exceeding the maximum state ID value.
-///
-/// When the `std` feature is enabled, this implements the `Error` trait.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub struct StateIDError {
- attempted: u64,
-}
-
-impl StateIDError {
- /// Returns the value that failed to constructed a state ID.
- pub fn attempted(&self) -> u64 {
- self.attempted
- }
-}
-
-#[cfg(feature = "std")]
-impl std::error::Error for StateIDError {}
-
-impl core::fmt::Display for StateIDError {
- fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
- write!(
- f,
- "failed to create StateID from {:?}, which exceeds {:?}",
- self.attempted(),
- StateID::MAX,
- )
- }
-}
-
-/// A macro for defining exactly identical (modulo names) impls for ID types.
-macro_rules! impls {
- ($ty:ident, $tyerr:ident, $tyiter:ident) => {
- #[derive(Clone, Debug)]
- pub(crate) struct $tyiter {
- rng: ops::Range<usize>,
- }
-
- impl $tyiter {
- #[cfg(feature = "alloc")]
- fn new(len: usize) -> $tyiter {
- assert!(
- len <= $ty::LIMIT,
- "cannot create iterator with IDs when number of \
- elements exceed {:?}",
- $ty::LIMIT,
- );
- $tyiter { rng: 0..len }
- }
- }
-
- impl Iterator for $tyiter {
- type Item = $ty;
-
- fn next(&mut self) -> Option<$ty> {
- if self.rng.start >= self.rng.end {
- return None;
- }
- let next_id = self.rng.start + 1;
- let id = mem::replace(&mut self.rng.start, next_id);
- // new_unchecked is OK since we asserted that the number of
- // elements in this iterator will fit in an ID at construction.
- Some($ty::new_unchecked(id))
- }
- }
-
- impl<T> core::ops::Index<$ty> for [T] {
- type Output = T;
-
- #[inline]
- fn index(&self, index: $ty) -> &T {
- &self[index.as_usize()]
- }
- }
-
- impl<T> core::ops::IndexMut<$ty> for [T] {
- #[inline]
- fn index_mut(&mut self, index: $ty) -> &mut T {
- &mut self[index.as_usize()]
- }
- }
-
- #[cfg(feature = "alloc")]
- impl<T> core::ops::Index<$ty> for Vec<T> {
- type Output = T;
-
- #[inline]
- fn index(&self, index: $ty) -> &T {
- &self[index.as_usize()]
- }
- }
-
- #[cfg(feature = "alloc")]
- impl<T> core::ops::IndexMut<$ty> for Vec<T> {
- #[inline]
- fn index_mut(&mut self, index: $ty) -> &mut T {
- &mut self[index.as_usize()]
- }
- }
-
- impl TryFrom<usize> for $ty {
- type Error = $tyerr;
-
- fn try_from(id: usize) -> Result<$ty, $tyerr> {
- if id > $ty::MAX.as_usize() {
- return Err($tyerr { attempted: id as u64 });
- }
- Ok($ty::new_unchecked(id))
- }
- }
-
- impl TryFrom<u8> for $ty {
- type Error = Infallible;
-
- fn try_from(id: u8) -> Result<$ty, Infallible> {
- Ok($ty::new_unchecked(id as usize))
- }
- }
-
- impl TryFrom<u16> for $ty {
- type Error = $tyerr;
-
- fn try_from(id: u16) -> Result<$ty, $tyerr> {
- if id as u32 > $ty::MAX.as_u32() {
- return Err($tyerr { attempted: id as u64 });
- }
- Ok($ty::new_unchecked(id as usize))
- }
- }
-
- impl TryFrom<u32> for $ty {
- type Error = $tyerr;
-
- fn try_from(id: u32) -> Result<$ty, $tyerr> {
- if id > $ty::MAX.as_u32() {
- return Err($tyerr { attempted: id as u64 });
- }
- Ok($ty::new_unchecked(id as usize))
- }
- }
-
- impl TryFrom<u64> for $ty {
- type Error = $tyerr;
-
- fn try_from(id: u64) -> Result<$ty, $tyerr> {
- if id > $ty::MAX.as_u32() as u64 {
- return Err($tyerr { attempted: id });
- }
- Ok($ty::new_unchecked(id as usize))
- }
- }
-
- #[cfg(test)]
- impl quickcheck::Arbitrary for $ty {
- fn arbitrary(gen: &mut quickcheck::Gen) -> $ty {
- use core::cmp::max;
-
- let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs();
- if id > $ty::MAX.as_i32() {
- $ty::MAX
- } else {
- $ty::new(usize::try_from(id).unwrap()).unwrap()
- }
- }
- }
- };
-}
-
-impls!(PatternID, PatternIDError, PatternIDIter);
-impls!(StateID, StateIDError, StateIDIter);
-
-/// A utility trait that defines a couple of adapters for making it convenient
-/// to access indices as ID types. We require ExactSizeIterator so that
-/// iterator construction can do a single check to make sure the index of each
-/// element is representable by its ID type.
-#[cfg(feature = "alloc")]
-pub(crate) trait IteratorIDExt: Iterator {
- fn with_pattern_ids(self) -> WithPatternIDIter<Self>
- where
- Self: Sized + ExactSizeIterator,
- {
- WithPatternIDIter::new(self)
- }
-
- fn with_state_ids(self) -> WithStateIDIter<Self>
- where
- Self: Sized + ExactSizeIterator,
- {
- WithStateIDIter::new(self)
- }
-}
-
-#[cfg(feature = "alloc")]
-impl<I: Iterator> IteratorIDExt for I {}
-
-#[cfg(feature = "alloc")]
-macro_rules! iditer {
- ($ty:ident, $iterty:ident, $withiterty:ident) => {
- /// An iterator adapter that is like std::iter::Enumerate, but attaches
- /// IDs. It requires ExactSizeIterator. At construction, it ensures
- /// that the index of each element in the iterator is representable in
- /// the corresponding ID type.
- #[derive(Clone, Debug)]
- pub(crate) struct $withiterty<I> {
- it: I,
- ids: $iterty,
- }
-
- impl<I: Iterator + ExactSizeIterator> $withiterty<I> {
- fn new(it: I) -> $withiterty<I> {
- let ids = $ty::iter(it.len());
- $withiterty { it, ids }
- }
- }
-
- impl<I: Iterator + ExactSizeIterator> Iterator for $withiterty<I> {
- type Item = ($ty, I::Item);
-
- fn next(&mut self) -> Option<($ty, I::Item)> {
- let item = self.it.next()?;
- // Number of elements in this iterator must match, according
- // to contract of ExactSizeIterator.
- let id = self.ids.next().unwrap();
- Some((id, item))
- }
- }
- };
-}
-
-#[cfg(feature = "alloc")]
-iditer!(PatternID, PatternIDIter, WithPatternIDIter);
-#[cfg(feature = "alloc")]
-iditer!(StateID, StateIDIter, WithStateIDIter);
diff --git a/vendor/regex-automata/src/util/int.rs b/vendor/regex-automata/src/util/int.rs
new file mode 100644
index 000000000..e6b13bff9
--- /dev/null
+++ b/vendor/regex-automata/src/util/int.rs
@@ -0,0 +1,252 @@
+/*!
+This module provides several integer oriented traits for converting between
+fixed size integers and integers whose size varies based on the target (like
+`usize`).
+
+The driving design principle of this module is to attempt to centralize as many
+`as` casts as possible here. And in particular, we separate casts into two
+buckets:
+
+* Casts that we use for their truncating behavior. In this case, we use more
+descriptive names, like `low_u32` and `high_u32`.
+* Casts that we use for converting back-and-forth between `usize`. These
+conversions are generally necessary because we often store indices in different
+formats to save on memory, which requires converting to and from `usize`. In
+this case, we very specifically do not want to overflow, and so the methods
+defined here will panic if the `as` cast would be lossy in debug mode. (A
+normal `as` cast will never panic!)
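+
+For example, here is a minimal sketch of both buckets in action. (The
+panicking case assumes debug assertions are enabled, and these traits are
+crate-internal, so the block is illustrative.)
+
+```ignore
+use crate::util::int::{U32, Usize};
+
+let x: u32 = 0x1234_5678;
+// Bucket one: truncation is the intent, and the name says so.
+assert_eq!(0x5678, x.low_u16());
+assert_eq!(0x1234, x.high_u16());
+// Bucket two: non-lossy conversions that are checked in debug mode.
+assert_eq!(5, 5u32.as_usize());
+assert_eq!(5u16, 5usize.as_u16());
+// Whereas 70_000usize.as_u16() would panic in debug mode with
+// "usize overflowed u16". A plain 'as' cast would silently truncate.
+```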
+
+For `as` casts between raw pointers, we use `cast`, so `as` isn't needed there.
+
+For regex engines, floating point is just never used, so we don't have to worry
+about `as` casts for those.
+
+Otherwise, this module pretty much covers all of our `as` needs except for one
+thing: const contexts. There are a select few places in this crate where we
+still need to use `as` because const functions on traits aren't stable yet.
+If we wind up significantly expanding our const footprint in this crate, it
+might be worth defining free functions to handle those cases. But at the time
+of writing, that just seemed like too much ceremony. Instead, I comment each
+such use of `as` in a const context with a "fixme" notice.
+
+NOTE: for simplicity, we don't take target pointer width into account here for
+`usize` conversions. Since we currently only panic in debug mode, skipping the
+check when it can be proven it isn't needed at compile time doesn't really
+matter. Now, if we wind up wanting to do as many checks as possible in release
+mode, then we would want to skip those when we know the conversions are always
+non-lossy.
+
+NOTE: this module isn't an exhaustive API. For example, we still use things
+like `u64::from` where possible, or even `usize::try_from()` for when we do
+explicitly want to panic or when we want to return an error for overflow.
+*/
+
+pub(crate) trait U8 {
+ fn as_usize(self) -> usize;
+}
+
+impl U8 for u8 {
+ fn as_usize(self) -> usize {
+ usize::from(self)
+ }
+}
+
+pub(crate) trait U16 {
+ fn as_usize(self) -> usize;
+ fn low_u8(self) -> u8;
+ fn high_u8(self) -> u8;
+}
+
+impl U16 for u16 {
+ fn as_usize(self) -> usize {
+ usize::from(self)
+ }
+
+ fn low_u8(self) -> u8 {
+ self as u8
+ }
+
+ fn high_u8(self) -> u8 {
+ (self >> 8) as u8
+ }
+}
+
+pub(crate) trait U32 {
+ fn as_usize(self) -> usize;
+ fn low_u8(self) -> u8;
+ fn low_u16(self) -> u16;
+ fn high_u16(self) -> u16;
+}
+
+impl U32 for u32 {
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("u32 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn low_u8(self) -> u8 {
+ self as u8
+ }
+
+ fn low_u16(self) -> u16 {
+ self as u16
+ }
+
+ fn high_u16(self) -> u16 {
+ (self >> 16) as u16
+ }
+}
+
+pub(crate) trait U64 {
+ fn as_usize(self) -> usize;
+ fn low_u8(self) -> u8;
+ fn low_u16(self) -> u16;
+ fn low_u32(self) -> u32;
+ fn high_u32(self) -> u32;
+}
+
+impl U64 for u64 {
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("u64 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn low_u8(self) -> u8 {
+ self as u8
+ }
+
+ fn low_u16(self) -> u16 {
+ self as u16
+ }
+
+ fn low_u32(self) -> u32 {
+ self as u32
+ }
+
+ fn high_u32(self) -> u32 {
+ (self >> 32) as u32
+ }
+}
+
+pub(crate) trait I32 {
+ fn as_usize(self) -> usize;
+ fn to_bits(self) -> u32;
+ fn from_bits(n: u32) -> i32;
+}
+
+impl I32 for i32 {
+ fn as_usize(self) -> usize {
+ #[cfg(debug_assertions)]
+ {
+ usize::try_from(self).expect("i32 overflowed usize")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as usize
+ }
+ }
+
+ fn to_bits(self) -> u32 {
+ self as u32
+ }
+
+ fn from_bits(n: u32) -> i32 {
+ n as i32
+ }
+}
+
+pub(crate) trait Usize {
+ fn as_u8(self) -> u8;
+ fn as_u16(self) -> u16;
+ fn as_u32(self) -> u32;
+ fn as_u64(self) -> u64;
+}
+
+impl Usize for usize {
+ fn as_u8(self) -> u8 {
+ #[cfg(debug_assertions)]
+ {
+ u8::try_from(self).expect("usize overflowed u8")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as u8
+ }
+ }
+
+ fn as_u16(self) -> u16 {
+ #[cfg(debug_assertions)]
+ {
+ u16::try_from(self).expect("usize overflowed u16")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as u16
+ }
+ }
+
+ fn as_u32(self) -> u32 {
+ #[cfg(debug_assertions)]
+ {
+ u32::try_from(self).expect("usize overflowed u32")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as u32
+ }
+ }
+
+ fn as_u64(self) -> u64 {
+ #[cfg(debug_assertions)]
+ {
+ u64::try_from(self).expect("usize overflowed u64")
+ }
+ #[cfg(not(debug_assertions))]
+ {
+ self as u64
+ }
+ }
+}
+
+// Pointers aren't integers, but we convert pointers to integers to perform
+// offset arithmetic in some places. (And no, we don't convert the integers
+// back to pointers.) So add 'as_usize' conversions here too for completeness.
+//
+// These 'as' casts are actually okay because they're always non-lossy. But the
+// idea here is to just try and remove as much 'as' as possible, particularly
+// in this crate where we are being really paranoid about offsets and making
+// sure we don't panic on inputs that might be untrusted. This way, the 'as'
+// casts become easier to audit if they're all in one place, even when some of
+// them are actually okay 100% of the time.
+
+pub(crate) trait Pointer {
+ fn as_usize(self) -> usize;
+}
+
+impl<T> Pointer for *const T {
+ fn as_usize(self) -> usize {
+ self as usize
+ }
+}
+
+pub(crate) trait PointerMut {
+ fn as_usize(self) -> usize;
+}
+
+impl<T> PointerMut for *mut T {
+ fn as_usize(self) -> usize {
+ self as usize
+ }
+}
diff --git a/vendor/regex-automata/src/util/interpolate.rs b/vendor/regex-automata/src/util/interpolate.rs
new file mode 100644
index 000000000..f274629df
--- /dev/null
+++ b/vendor/regex-automata/src/util/interpolate.rs
@@ -0,0 +1,579 @@
+/*!
+Provides routines for interpolating capture group references.
+
+That is, if a replacement string contains references like `$foo` or `${foo1}`,
+then they are replaced with the corresponding capture values for the groups
+named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}`
+is supported as well, with `1` corresponding to a capture group index and not
+a name.
+
+This module provides the free functions [`string`] and [`bytes`], which
+interpolate Rust Unicode strings and byte strings, respectively.
+
+# Format
+
+These routines support two different kinds of capture references: unbraced and
+braced.
+
+For the unbraced format, the format supported is `$ref` where `ref` is one or
+more characters in the class `[0-9A-Za-z_]`. `ref` is always the longest
+possible parse. So for example, `$1a` corresponds to the capture group named
+`1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then
+it is treated as a capture group index itself and not a name.
+
+For the braced format, the format supported is `${ref}` where `ref` can be any
+sequence of bytes except for `}`. If no closing brace occurs, then it is not
+considered a capture reference. As with the unbraced format, if `ref` matches
+`^[0-9]+$`, then it is treated as a capture group index and not a name.
+
+The braced format is useful for exerting precise control over the name of the
+capture reference. For example, `${1}a` corresponds to the capture group
+reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above)
+corresponds to the capture group reference `1a`. The braced format is also
+useful for expressing capture group names that use characters not supported by
+the unbraced format. For example, `${foo[bar].baz}` refers to the capture group
+named `foo[bar].baz`.
+
+If a capture group reference is found and it does not refer to a valid capture
+group, then it will be replaced with the empty string.
+
+To write a literal `$`, use `$$`.
+
+To be clear, and as exhibited via the type signatures in the routines in this
+module, it is impossible for a replacement string to be invalid. A replacement
+string may not have the intended semantics, but the interpolation procedure
+itself can never fail.
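+
+For example, here is the `$1a` versus `${1}a` distinction from above, using
+[`string`]:
+
+```
+use regex_automata::util::interpolate;
+
+let mut dst = String::new();
+interpolate::string(
+    "${1}a and $1a",
+    |index, dst| {
+        if index == 1 {
+            dst.push_str("ONE");
+        }
+    },
+    // No capture group is named "1a", so the unbraced reference resolves
+    // to nothing and interpolates to the empty string.
+    |_name| None,
+    &mut dst,
+);
+assert_eq!("ONEa and ", dst);
+```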
+*/
+
+use alloc::{string::String, vec::Vec};
+
+use crate::util::memchr::memchr;
+
+/// Accepts a replacement string and interpolates capture references with their
+/// corresponding values.
+///
+/// `append` should be a function that appends the string value of a capture
+/// group at a particular index to the string given. If the capture group
+/// index is invalid, then nothing should be appended.
+///
+/// `name_to_index` should be a function that maps a capture group name to a
+/// capture group index. If the given name doesn't exist, then `None` should
+/// be returned.
+///
+/// Finally, `dst` is where the final interpolated contents should be written.
+/// If `replacement` contains no capture group references, then `dst` will be
+/// equivalent to `replacement`.
+///
+/// See the [module documentation](self) for details about the format
+/// supported.
+///
+/// # Example
+///
+/// ```
+/// use regex_automata::util::interpolate;
+///
+/// let mut dst = String::new();
+/// interpolate::string(
+/// "foo $bar baz",
+/// |index, dst| {
+/// if index == 0 {
+/// dst.push_str("BAR");
+/// }
+/// },
+/// |name| {
+/// if name == "bar" {
+/// Some(0)
+/// } else {
+/// None
+/// }
+/// },
+/// &mut dst,
+/// );
+/// assert_eq!("foo BAR baz", dst);
+/// ```
+pub fn string(
+ mut replacement: &str,
+ mut append: impl FnMut(usize, &mut String),
+ mut name_to_index: impl FnMut(&str) -> Option<usize>,
+ dst: &mut String,
+) {
+ while !replacement.is_empty() {
+ match memchr(b'$', replacement.as_bytes()) {
+ None => break,
+ Some(i) => {
+ dst.push_str(&replacement[..i]);
+ replacement = &replacement[i..];
+ }
+ }
+ // Handle escaping of '$'.
+ if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
+ dst.push_str("$");
+ replacement = &replacement[2..];
+ continue;
+ }
+ debug_assert!(!replacement.is_empty());
+ let cap_ref = match find_cap_ref(replacement.as_bytes()) {
+ Some(cap_ref) => cap_ref,
+ None => {
+ dst.push_str("$");
+ replacement = &replacement[1..];
+ continue;
+ }
+ };
+ replacement = &replacement[cap_ref.end..];
+ match cap_ref.cap {
+ Ref::Number(i) => append(i, dst),
+ Ref::Named(name) => {
+ if let Some(i) = name_to_index(name) {
+ append(i, dst);
+ }
+ }
+ }
+ }
+ dst.push_str(replacement);
+}
+
+/// Accepts a replacement byte string and interpolates capture references with
+/// their corresponding values.
+///
+/// `append` should be a function that appends the byte string value of a
+/// capture group at a particular index to the byte string given. If the
+/// capture group index is invalid, then nothing should be appended.
+///
+/// `name_to_index` should be a function that maps a capture group name to a
+/// capture group index. If the given name doesn't exist, then `None` should
+/// be returned.
+///
+/// Finally, `dst` is where the final interpolated contents should be written.
+/// If `replacement` contains no capture group references, then `dst` will be
+/// equivalent to `replacement`.
+///
+/// See the [module documentation](self) for details about the format
+/// supported.
+///
+/// # Example
+///
+/// ```
+/// use regex_automata::util::interpolate;
+///
+/// let mut dst = vec![];
+/// interpolate::bytes(
+/// b"foo $bar baz",
+/// |index, dst| {
+/// if index == 0 {
+/// dst.extend_from_slice(b"BAR");
+/// }
+/// },
+/// |name| {
+/// if name == "bar" {
+/// Some(0)
+/// } else {
+/// None
+/// }
+/// },
+/// &mut dst,
+/// );
+/// assert_eq!(&b"foo BAR baz"[..], dst);
+/// ```
+pub fn bytes(
+ mut replacement: &[u8],
+ mut append: impl FnMut(usize, &mut Vec<u8>),
+ mut name_to_index: impl FnMut(&str) -> Option<usize>,
+ dst: &mut Vec<u8>,
+) {
+ while !replacement.is_empty() {
+ match memchr(b'$', replacement) {
+ None => break,
+ Some(i) => {
+ dst.extend_from_slice(&replacement[..i]);
+ replacement = &replacement[i..];
+ }
+ }
+ // Handle escaping of '$'.
+ if replacement.get(1).map_or(false, |&b| b == b'$') {
+ dst.push(b'$');
+ replacement = &replacement[2..];
+ continue;
+ }
+ debug_assert!(!replacement.is_empty());
+ let cap_ref = match find_cap_ref(replacement) {
+ Some(cap_ref) => cap_ref,
+ None => {
+ dst.push(b'$');
+ replacement = &replacement[1..];
+ continue;
+ }
+ };
+ replacement = &replacement[cap_ref.end..];
+ match cap_ref.cap {
+ Ref::Number(i) => append(i, dst),
+ Ref::Named(name) => {
+ if let Some(i) = name_to_index(name) {
+ append(i, dst);
+ }
+ }
+ }
+ }
+ dst.extend_from_slice(replacement);
+}
+
+/// `CaptureRef` represents a reference to a capture group inside some text.
+/// The reference is either a capture group name or a number.
+///
+/// It is also tagged with the position in the text following the
+/// capture reference.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+struct CaptureRef<'a> {
+ cap: Ref<'a>,
+ end: usize,
+}
+
+/// A reference to a capture group in some text.
+///
+/// e.g., `$2`, `$foo`, `${foo}`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum Ref<'a> {
+ Named(&'a str),
+ Number(usize),
+}
+
+impl<'a> From<&'a str> for Ref<'a> {
+ fn from(x: &'a str) -> Ref<'a> {
+ Ref::Named(x)
+ }
+}
+
+impl From<usize> for Ref<'static> {
+ fn from(x: usize) -> Ref<'static> {
+ Ref::Number(x)
+ }
+}
+
+/// Parses a possible reference to a capture group name in the given text,
+/// starting at the beginning of `replacement`.
+///
+/// If no such valid reference could be found, None is returned.
+///
+/// Note that this returns a "possible" reference because this routine doesn't
+/// know whether the reference is to a valid group or not. If it winds up not
+/// being a valid reference, then it should be replaced with the empty string.
+fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
+ let mut i = 0;
+ let rep: &[u8] = replacement;
+ if rep.len() <= 1 || rep[0] != b'$' {
+ return None;
+ }
+ i += 1;
+ if rep[i] == b'{' {
+ return find_cap_ref_braced(rep, i + 1);
+ }
+ let mut cap_end = i;
+ while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
+ cap_end += 1;
+ }
+ if cap_end == i {
+ return None;
+ }
+ // We just verified that the range 0..cap_end is valid ASCII, so it must
+ // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
+ // check via an unchecked conversion or by parsing the number straight from
+ // &[u8].
+ let cap = core::str::from_utf8(&rep[i..cap_end])
+ .expect("valid UTF-8 capture name");
+ Some(CaptureRef {
+ cap: match cap.parse::<usize>() {
+ Ok(i) => Ref::Number(i),
+ Err(_) => Ref::Named(cap),
+ },
+ end: cap_end,
+ })
+}
+
+/// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening
+/// brace has been found at `i-1` in `rep`. This then looks for a closing
+/// brace and returns the capture reference within the brace.
+fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
+ assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]);
+ let start = i;
+ while rep.get(i).map_or(false, |&b| b != b'}') {
+ i += 1;
+ }
+ if !rep.get(i).map_or(false, |&b| b == b'}') {
+ return None;
+ }
+ // When looking at braced names, we don't put any restrictions on the name,
+ // so it's possible it could be invalid UTF-8. But a capture group name
+ // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
+ // safely return None.
+ let cap = match core::str::from_utf8(&rep[start..i]) {
+ Err(_) => return None,
+ Ok(cap) => cap,
+ };
+ Some(CaptureRef {
+ cap: match cap.parse::<usize>() {
+ Ok(i) => Ref::Number(i),
+ Err(_) => Ref::Named(cap),
+ },
+ end: i + 1,
+ })
+}
+
+/// Returns true if and only if the given byte is allowed in a capture name
+/// written in non-brace form.
+fn is_valid_cap_letter(b: u8) -> bool {
+ match b {
+ b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
+ _ => false,
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use alloc::{string::String, vec, vec::Vec};
+
+ use super::{find_cap_ref, CaptureRef};
+
+ macro_rules! find {
+ ($name:ident, $text:expr) => {
+ #[test]
+ fn $name() {
+ assert_eq!(None, find_cap_ref($text.as_bytes()));
+ }
+ };
+ ($name:ident, $text:expr, $capref:expr) => {
+ #[test]
+ fn $name() {
+ assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
+ }
+ };
+ }
+
+ macro_rules! c {
+ ($name_or_number:expr, $pos:expr) => {
+ CaptureRef { cap: $name_or_number.into(), end: $pos }
+ };
+ }
+
+ find!(find_cap_ref1, "$foo", c!("foo", 4));
+ find!(find_cap_ref2, "${foo}", c!("foo", 6));
+ find!(find_cap_ref3, "$0", c!(0, 2));
+ find!(find_cap_ref4, "$5", c!(5, 2));
+ find!(find_cap_ref5, "$10", c!(10, 3));
+ // See https://github.com/rust-lang/regex/pull/585
+ // for more on characters following numbers
+ find!(find_cap_ref6, "$42a", c!("42a", 4));
+ find!(find_cap_ref7, "${42}a", c!(42, 5));
+ find!(find_cap_ref8, "${42");
+ find!(find_cap_ref9, "${42 ");
+ find!(find_cap_ref10, " $0 ");
+ find!(find_cap_ref11, "$");
+ find!(find_cap_ref12, " ");
+ find!(find_cap_ref13, "");
+ find!(find_cap_ref14, "$1-$2", c!(1, 2));
+ find!(find_cap_ref15, "$1_$2", c!("1_", 3));
+ find!(find_cap_ref16, "$x-$y", c!("x", 2));
+ find!(find_cap_ref17, "$x_$y", c!("x_", 3));
+ find!(find_cap_ref18, "${#}", c!("#", 4));
+ find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
+ find!(find_cap_ref20, "${¾}", c!("¾", 5));
+ find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
+ find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
+ find!(find_cap_ref23, "${☃}", c!("☃", 6));
+ find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
+ find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
+ find!(find_cap_ref26, "${名字}", c!("名字", 9));
+
+ fn interpolate_string(
+ mut name_to_index: Vec<(&'static str, usize)>,
+ caps: Vec<&'static str>,
+ replacement: &str,
+ ) -> String {
+ name_to_index.sort_by_key(|x| x.0);
+
+ let mut dst = String::new();
+ super::string(
+ replacement,
+ |i, dst| {
+ if let Some(&s) = caps.get(i) {
+ dst.push_str(s);
+ }
+ },
+ |name| -> Option<usize> {
+ name_to_index
+ .binary_search_by_key(&name, |x| x.0)
+ .ok()
+ .map(|i| name_to_index[i].1)
+ },
+ &mut dst,
+ );
+ dst
+ }
+
+ fn interpolate_bytes(
+ mut name_to_index: Vec<(&'static str, usize)>,
+ caps: Vec<&'static str>,
+ replacement: &str,
+ ) -> String {
+ name_to_index.sort_by_key(|x| x.0);
+
+ let mut dst = vec![];
+ super::bytes(
+ replacement.as_bytes(),
+ |i, dst| {
+ if let Some(&s) = caps.get(i) {
+ dst.extend_from_slice(s.as_bytes());
+ }
+ },
+ |name| -> Option<usize> {
+ name_to_index
+ .binary_search_by_key(&name, |x| x.0)
+ .ok()
+ .map(|i| name_to_index[i].1)
+ },
+ &mut dst,
+ );
+ String::from_utf8(dst).unwrap()
+ }
+
+ macro_rules! interp {
+ ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
+ #[test]
+ fn $name() {
+ assert_eq!(
+ $expected,
+ interpolate_string($map, $caps, $hay),
+ "interpolate::string failed",
+ );
+ assert_eq!(
+ $expected,
+ interpolate_bytes($map, $caps, $hay),
+ "interpolate::bytes failed",
+ );
+ }
+ };
+ }
+
+ interp!(
+ interp1,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test $foo test",
+ "test xxx test",
+ );
+
+ interp!(
+ interp2,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test$footest",
+ "test",
+ );
+
+ interp!(
+ interp3,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test${foo}test",
+ "testxxxtest",
+ );
+
+ interp!(
+ interp4,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test$2test",
+ "test",
+ );
+
+ interp!(
+ interp5,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test${2}test",
+ "testxxxtest",
+ );
+
+ interp!(
+ interp6,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test $$foo test",
+ "test $foo test",
+ );
+
+ interp!(
+ interp7,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test $foo",
+ "test xxx",
+ );
+
+ interp!(
+ interp8,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "$foo test",
+ "xxx test",
+ );
+
+ interp!(
+ interp9,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test $bar$foo",
+ "test yyyxxx",
+ );
+
+ interp!(
+ interp10,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test $ test",
+ "test $ test",
+ );
+
+ interp!(
+ interp11,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${} test",
+ "test test",
+ );
+
+ interp!(
+ interp12,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${ } test",
+ "test test",
+ );
+
+ interp!(
+ interp13,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${a b} test",
+ "test test",
+ );
+
+ interp!(
+ interp14,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${a} test",
+ "test test",
+ );
+
+ // This is a funny case where a braced reference is never closed, but
+ // within the unclosed braced reference, there is an unbraced reference.
+ // In this case, the braced reference is just treated literally and the
+ // unbraced reference is found.
+ interp!(
+ interp15,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${wat $bar ok",
+ "test ${wat yyy ok",
+ );
+}
diff --git a/vendor/regex-automata/src/util/iter.rs b/vendor/regex-automata/src/util/iter.rs
new file mode 100644
index 000000000..a789fa042
--- /dev/null
+++ b/vendor/regex-automata/src/util/iter.rs
@@ -0,0 +1,1027 @@
+/*!
+Generic helpers for iteration of matches from a regex engine in a haystack.
+
+The principal type in this module is a [`Searcher`]. A `Searcher` provides
+its own lower level iterator-like API in addition to methods for constructing
+types that implement `Iterator`. The documentation for `Searcher` explains a
+bit more about why these different APIs exist.
+
+Currently, this module supports iteration over any regex engine that works
+with the [`HalfMatch`], [`Match`] or [`Captures`] types.
+*/
+
+#[cfg(feature = "alloc")]
+use crate::util::captures::Captures;
+use crate::util::search::{HalfMatch, Input, Match, MatchError};
+
+/// A searcher for creating iterators and performing lower level iteration.
+///
+/// This searcher encapsulates the logic required for finding all successive
+/// non-overlapping matches in a haystack. In theory, iteration would look
+/// something like this:
+///
+/// 1. Set the start position to `0`.
+/// 2. Execute a regex search. If no match, end iteration.
+/// 3. Report the match and set the start position to the end of the match.
+/// 4. Go back to (2).
+///
+/// And if this were indeed the case, it's likely that `Searcher` wouldn't
+/// exist. Unfortunately, because a regex may match the empty string, the above
+/// logic won't work for all possible regexes. Namely, if an empty match is
+/// found, then step (3) would set the start position of the search to the
+/// position it was at. Thus, iteration would never end.
+///
+/// Instead, a `Searcher` knows how to detect these cases and forcefully
+/// advance iteration in the case of an empty match that overlaps with a
+/// previous match.
+///
+/// If you know that your regex cannot match any empty string, then the simple
+/// algorithm described above will work correctly.
+///
+/// When possible, prefer the iterators defined on the regex engine you're
+/// using. This tries to abstract over the regex engine and is thus a bit more
+/// unwieldy to use.
+///
+/// In particular, a `Searcher` is not itself an iterator. Instead, it provides
+/// `advance` routines that permit moving the search along explicitly. It also
+/// provides various routines, like [`Searcher::into_matches_iter`], that
+/// accept a closure (representing how a regex engine executes a search) and
+/// returns a conventional iterator.
+///
+/// The lifetime parameters come from the [`Input`] type passed to
+/// [`Searcher::new`]:
+///
+/// * `'h` is the lifetime of the underlying haystack.
+///
+/// # Searcher vs Iterator
+///
+/// Why does a search type with "advance" APIs exist at all when we also have
+/// iterators? Unfortunately, the reasoning behind this split is a complex
+/// combination of the following things:
+///
+/// 1. While many of the regex engines expose their own iterators, it is also
+/// nice to expose this lower level iteration helper because it permits callers
+/// to provide their own `Input` configuration. Moreover, a `Searcher` can work
+/// with _any_ regex engine instead of only the ones defined in this crate.
+/// This way, everyone benefits from a shared iteration implementation.
+/// 2. There are many different regex engines that, while they have the same
+/// match semantics, have slightly different APIs. Iteration is just
+/// complex enough to want to share code, and so we need a way of abstracting
+/// over those different regex engines. While we could define a new trait that
+/// describes any regex engine search API, it would wind up looking very close
+/// to a closure. While there may still be reasons for the more generic trait
+/// to exist, for now and for the purposes of iteration, we use a closure.
+/// Closures also provide a lot of easy flexibility at the call site, in that
+/// they permit the caller to borrow any kind of state they want for use during
+/// each search call.
+/// 3. As a result of using closures, and because closures are anonymous types
+/// that cannot be named, it is difficult to encapsulate them without both
+/// costs to speed and added complexity to the public API. For example, in
+/// defining an iterator type like
+/// [`dfa::regex::FindMatches`](crate::dfa::regex::FindMatches),
+/// if we use a closure internally, it's not possible to name this type in the
+/// return type of the iterator constructor. Thus, the only way around it is
+/// to erase the type by boxing it and turning it into a `Box<dyn FnMut ...>`.
+/// This boxed closure is unlikely to be inlined _and_ it infects the public
+/// API in subtle ways. Namely, unless you declare the closure as implementing
+/// `Send` and `Sync`, then the resulting iterator type won't implement it
+/// either. But there are practical issues with requiring the closure to
+/// implement `Send` and `Sync` that result in other API complexities that
+/// are beyond the scope of this already long exposition.
+/// 4. Some regex engines expose more complex match information than just
+/// "which pattern matched" and "at what offsets." For example, the PikeVM
+/// exposes match spans for each capturing group that participated in the
+/// match. In such cases, it can be quite beneficial to reuse the capturing
+/// group allocation on subsequent searches. A proper iterator doesn't permit
+/// this API due to its interface, so it's useful to have something a bit lower
+/// level that permits callers to amortize allocations while also reusing a
+/// shared implementation of iteration. (See the documentation for
+/// [`Searcher::advance`] for an example of using the "advance" API with the
+/// PikeVM.)
+///
+/// What this boils down to is that there are "advance" APIs which require
+/// handing a closure to it for every call, and there are also APIs to create
+/// iterators from a closure. The former are useful for _implementing_
+/// iterators or when you need more flexibility, while the latter are useful
+/// for conveniently writing custom iterators on-the-fly.
+///
+/// # Example: iterating with captures
+///
+/// Several regex engines in this crate offer convenient iterator APIs over
+/// [`Captures`] values. Doing so requires allocating a new `Captures`
+/// value for each iteration step. This can be more costly than you
+/// might want. Instead of implementing your own iterator to avoid that
+/// cost (which can be a little subtle if you want to handle empty matches
+/// correctly), you can use this `Searcher` to do it for you:
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::pikevm::PikeVM,
+/// util::iter::Searcher,
+/// Input, Span,
+/// };
+///
+/// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?;
+/// let haystack = "foo1 foo12 foo123";
+///
+/// let mut caps = re.create_captures();
+/// let mut cache = re.create_cache();
+/// let mut matches = vec![];
+/// let mut searcher = Searcher::new(Input::new(haystack));
+/// while let Some(_) = searcher.advance(|input| {
+/// re.search(&mut cache, input, &mut caps);
+/// Ok(caps.get_match())
+/// }) {
+/// // The unwrap is OK since 'numbers' matches if the pattern matches.
+/// matches.push(caps.get_group_by_name("numbers").unwrap());
+/// }
+/// assert_eq!(matches, vec![
+/// Span::from(3..4),
+/// Span::from(8..10),
+/// Span::from(14..17),
+/// ]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Searcher<'h> {
+ /// The input parameters to give to each regex engine call.
+ ///
+ /// The start position of the search is mutated during iteration.
+ input: Input<'h>,
+ /// Records the end offset of the most recent match. This is necessary to
+ /// handle a corner case for preventing empty matches from overlapping with
+ /// the ending bounds of a prior match.
+ last_match_end: Option<usize>,
+}
+
+impl<'h> Searcher<'h> {
+ /// Create a new fallible non-overlapping matches iterator.
+ ///
+    /// The given `input` provides the parameters (including the haystack).
+    /// The `finder` closure that calls the underlying regex engine is not
+    /// given here, but to the "advance" methods and iterator constructors.
+    /// Such closures may borrow any additional state that is needed, such as
+    /// a prefilter scanner.
+ pub fn new(input: Input<'h>) -> Searcher<'h> {
+ Searcher { input, last_match_end: None }
+ }
+
+ /// Returns the current `Input` used by this searcher.
+ ///
+ /// The `Input` returned is generally equivalent to the one given to
+ /// [`Searcher::new`], but its start position may be different to reflect
+ /// the start of the next search to be executed.
+ pub fn input<'s>(&'s self) -> &'s Input<'h> {
+ &self.input
+ }
+
+ /// Return the next half match for an infallible search if one exists, and
+ /// advance to the next position.
+ ///
+ /// This is like `try_advance_half`, except errors are converted into
+ /// panics.
+ ///
+ /// # Panics
+ ///
+ /// If the given closure returns an error, then this panics. This is useful
+ /// when you know your underlying regex engine has been configured to not
+ /// return an error.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use a `Searcher` to iterate over all matches
+ /// when using a DFA, which only provides "half" matches.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// util::iter::Searcher,
+ /// HalfMatch, Input,
+ /// };
+ ///
+ /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22");
+ /// let mut it = Searcher::new(input);
+ ///
+ /// let expected = Some(HalfMatch::must(0, 10));
+ /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
+ /// assert_eq!(expected, got);
+ ///
+ /// let expected = Some(HalfMatch::must(0, 21));
+ /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
+ /// assert_eq!(expected, got);
+ ///
+ /// let expected = Some(HalfMatch::must(0, 32));
+ /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
+ /// assert_eq!(expected, got);
+ ///
+ /// let expected = None;
+ /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// This correctly moves iteration forward even when an empty match occurs:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// util::iter::Searcher,
+ /// HalfMatch, Input,
+ /// };
+ ///
+ /// let re = DFA::new(r"a|")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let input = Input::new("abba");
+ /// let mut it = Searcher::new(input);
+ ///
+ /// let expected = Some(HalfMatch::must(0, 1));
+ /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
+ /// assert_eq!(expected, got);
+ ///
+ /// let expected = Some(HalfMatch::must(0, 2));
+ /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
+ /// assert_eq!(expected, got);
+ ///
+ /// let expected = Some(HalfMatch::must(0, 4));
+ /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
+ /// assert_eq!(expected, got);
+ ///
+ /// let expected = None;
+ /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn advance_half<F>(&mut self, finder: F) -> Option<HalfMatch>
+ where
+ F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>,
+ {
+ match self.try_advance_half(finder) {
+ Ok(m) => m,
+ Err(err) => panic!(
+ "unexpected regex half find error: {}\n\
+ to handle find errors, use 'try' or 'search' methods",
+ err,
+ ),
+ }
+ }
+
+ /// Return the next match for an infallible search if one exists, and
+ /// advance to the next position.
+ ///
+ /// The search is advanced even in the presence of empty matches by
+ /// forbidding empty matches from overlapping with any other match.
+ ///
+ /// This is like `try_advance`, except errors are converted into panics.
+ ///
+ /// # Panics
+ ///
+ /// If the given closure returns an error, then this panics. This is useful
+ /// when you know your underlying regex engine has been configured to not
+ /// return an error.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use a `Searcher` to iterate over all matches
+ /// when using a regex based on lazy DFAs:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::regex::Regex,
+ /// util::iter::Searcher,
+ /// Match, Input,
+ /// };
+ ///
+ /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22");
+ /// let mut it = Searcher::new(input);
+ ///
+ /// let expected = Some(Match::must(0, 0..10));
+ /// let got = it.advance(|input| re.try_search(&mut cache, input));
+ /// assert_eq!(expected, got);
+ ///
+ /// let expected = Some(Match::must(0, 11..21));
+ /// let got = it.advance(|input| re.try_search(&mut cache, input));
+ /// assert_eq!(expected, got);
+ ///
+ /// let expected = Some(Match::must(0, 22..32));
+ /// let got = it.advance(|input| re.try_search(&mut cache, input));
+ /// assert_eq!(expected, got);
+ ///
+ /// let expected = None;
+ /// let got = it.advance(|input| re.try_search(&mut cache, input));
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// This example shows the same as above, but with the PikeVM. This example
+ /// is useful because it shows how to use this API even when the regex
+ /// engine doesn't directly return a `Match`.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// util::iter::Searcher,
+ /// Match, Input,
+ /// };
+ ///
+ /// let re = PikeVM::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22");
+ /// let mut it = Searcher::new(input);
+ ///
+ /// let expected = Some(Match::must(0, 0..10));
+ /// let got = it.advance(|input| {
+ /// re.search(&mut cache, input, &mut caps);
+ /// Ok(caps.get_match())
+ /// });
+ /// // Note that if we wanted to extract capturing group spans, we could
+ /// // do that here with 'caps'.
+ /// assert_eq!(expected, got);
+ ///
+ /// let expected = Some(Match::must(0, 11..21));
+ /// let got = it.advance(|input| {
+ /// re.search(&mut cache, input, &mut caps);
+ /// Ok(caps.get_match())
+ /// });
+ /// assert_eq!(expected, got);
+ ///
+ /// let expected = Some(Match::must(0, 22..32));
+ /// let got = it.advance(|input| {
+ /// re.search(&mut cache, input, &mut caps);
+ /// Ok(caps.get_match())
+ /// });
+ /// assert_eq!(expected, got);
+ ///
+ /// let expected = None;
+ /// let got = it.advance(|input| {
+ /// re.search(&mut cache, input, &mut caps);
+ /// Ok(caps.get_match())
+ /// });
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn advance<F>(&mut self, finder: F) -> Option<Match>
+ where
+ F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>,
+ {
+ match self.try_advance(finder) {
+ Ok(m) => m,
+ Err(err) => panic!(
+ "unexpected regex find error: {}\n\
+ to handle find errors, use 'try' or 'search' methods",
+ err,
+ ),
+ }
+ }
+
+ /// Return the next half match for a fallible search if one exists, and
+ /// advance to the next position.
+ ///
+ /// This is like `advance_half`, except it permits callers to handle errors
+ /// during iteration.
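+    ///
+    /// # Example
+    ///
+    /// A quick sketch of an error actually surfacing during iteration. The
+    /// lazy DFA here is configured to quit on `b'#'` (an illustrative
+    /// choice), so the search that reaches that byte reports an error that
+    /// the caller can handle instead of panicking:
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, util::iter::Searcher, Input};
+    ///
+    /// let re = DFA::builder()
+    ///     .configure(DFA::config().quit(b'#', true))
+    ///     .build(r"[0-9]+")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let mut it = Searcher::new(Input::new("123 # 456"));
+    ///
+    /// // The first match ends before the quit byte is ever seen.
+    /// let got = it.try_advance_half(|input| {
+    ///     re.try_search_fwd(&mut cache, input)
+    /// })?;
+    /// assert!(got.is_some());
+    ///
+    /// // The next search runs into '#', and the error is returned to the
+    /// // caller rather than panicking.
+    /// let result = it.try_advance_half(|input| {
+    ///     re.try_search_fwd(&mut cache, input)
+    /// });
+    /// assert!(result.is_err());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```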
+ #[inline]
+ pub fn try_advance_half<F>(
+ &mut self,
+ mut finder: F,
+ ) -> Result<Option<HalfMatch>, MatchError>
+ where
+ F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>,
+ {
+ let mut m = match finder(&self.input)? {
+ None => return Ok(None),
+ Some(m) => m,
+ };
+ if Some(m.offset()) == self.last_match_end {
+ m = match self.handle_overlapping_empty_half_match(m, finder)? {
+ None => return Ok(None),
+ Some(m) => m,
+ };
+ }
+ self.input.set_start(m.offset());
+ self.last_match_end = Some(m.offset());
+ Ok(Some(m))
+ }
+
+ /// Return the next match for a fallible search if one exists, and advance
+ /// to the next position.
+ ///
+ /// This is like `advance`, except it permits callers to handle errors
+ /// during iteration.
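+    ///
+    /// # Example
+    ///
+    /// A quick sketch of surfacing a search error to the caller. The lazy
+    /// DFA regex here is configured to quit on `b'#'` (an illustrative
+    /// choice), which makes `try_search` fallible in practice:
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::{dfa::DFA, regex::Regex},
+    ///     util::iter::Searcher,
+    ///     Input,
+    /// };
+    ///
+    /// let re = Regex::builder()
+    ///     .dfa(DFA::config().quit(b'#', true))
+    ///     .build(r"[0-9]+")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let mut it = Searcher::new(Input::new("123 # 456"));
+    ///
+    /// // The first match is found without reaching the quit byte.
+    /// let got = it.try_advance(|input| re.try_search(&mut cache, input))?;
+    /// assert!(got.is_some());
+    ///
+    /// // The next search sees '#' and reports an error instead of
+    /// // panicking, leaving the caller free to decide what to do.
+    /// let result = it.try_advance(|input| re.try_search(&mut cache, input));
+    /// assert!(result.is_err());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```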
+ #[inline]
+ pub fn try_advance<F>(
+ &mut self,
+ mut finder: F,
+ ) -> Result<Option<Match>, MatchError>
+ where
+ F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>,
+ {
+ let mut m = match finder(&self.input)? {
+ None => return Ok(None),
+ Some(m) => m,
+ };
+ if m.is_empty() && Some(m.end()) == self.last_match_end {
+ m = match self.handle_overlapping_empty_match(m, finder)? {
+ None => return Ok(None),
+ Some(m) => m,
+ };
+ }
+ self.input.set_start(m.end());
+ self.last_match_end = Some(m.end());
+ Ok(Some(m))
+ }
+
+ /// Given a closure that executes a single search, return an iterator over
+ /// all successive non-overlapping half matches.
+ ///
+ /// The iterator returned yields result values. If the underlying regex
+ /// engine is configured to never return an error, consider calling
+ /// [`TryHalfMatchesIter::infallible`] to convert errors into panics.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use a `Searcher` to create a proper
+ /// iterator over half matches.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// util::iter::Searcher,
+ /// HalfMatch, Input,
+ /// };
+ ///
+ /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22");
+ /// let mut it = Searcher::new(input).into_half_matches_iter(|input| {
+ /// re.try_search_fwd(&mut cache, input)
+ /// });
+ ///
+ /// let expected = Some(Ok(HalfMatch::must(0, 10)));
+ /// assert_eq!(expected, it.next());
+ ///
+ /// let expected = Some(Ok(HalfMatch::must(0, 21)));
+ /// assert_eq!(expected, it.next());
+ ///
+ /// let expected = Some(Ok(HalfMatch::must(0, 32)));
+ /// assert_eq!(expected, it.next());
+ ///
+ /// let expected = None;
+ /// assert_eq!(expected, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn into_half_matches_iter<F>(
+ self,
+ finder: F,
+ ) -> TryHalfMatchesIter<'h, F>
+ where
+ F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>,
+ {
+ TryHalfMatchesIter { it: self, finder }
+ }
+
+ /// Given a closure that executes a single search, return an iterator over
+ /// all successive non-overlapping matches.
+ ///
+ /// The iterator returned yields result values. If the underlying regex
+ /// engine is configured to never return an error, consider calling
+ /// [`TryMatchesIter::infallible`] to convert errors into panics.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use a `Searcher` to create a proper
+ /// iterator over matches.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::regex::Regex,
+ /// util::iter::Searcher,
+ /// Match, Input,
+ /// };
+ ///
+ /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22");
+ /// let mut it = Searcher::new(input).into_matches_iter(|input| {
+ /// re.try_search(&mut cache, input)
+ /// });
+ ///
+ /// let expected = Some(Ok(Match::must(0, 0..10)));
+ /// assert_eq!(expected, it.next());
+ ///
+ /// let expected = Some(Ok(Match::must(0, 11..21)));
+ /// assert_eq!(expected, it.next());
+ ///
+ /// let expected = Some(Ok(Match::must(0, 22..32)));
+ /// assert_eq!(expected, it.next());
+ ///
+ /// let expected = None;
+ /// assert_eq!(expected, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn into_matches_iter<F>(self, finder: F) -> TryMatchesIter<'h, F>
+ where
+ F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>,
+ {
+ TryMatchesIter { it: self, finder }
+ }
+
+ /// Given a closure that executes a single search, return an iterator over
+ /// all successive non-overlapping `Captures` values.
+ ///
+ /// The iterator returned yields result values. If the underlying regex
+ /// engine is configured to never return an error, consider calling
+ /// [`TryCapturesIter::infallible`] to convert errors into panics.
+ ///
+ /// Unlike the other iterator constructors, this accepts an initial
+ /// `Captures` value. This `Captures` value is reused for each search, and
+ /// the iterator implementation clones it before returning it. The caller
+ /// must provide this value because the iterator is purposely ignorant
+ /// of the underlying regex engine and thus doesn't know how to create
+ /// one itself. More to the point, a `Captures` value itself has a few
+ /// different constructors, which change which kind of information is
+ /// available to query in exchange for search performance.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use a `Searcher` to create a proper iterator
+ /// over `Captures` values, which provides access to all capturing group
+ /// spans for each match.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// util::iter::Searcher,
+ /// Input,
+ /// };
+ ///
+ /// let re = PikeVM::new(
+ /// r"(?P<y>[0-9]{4})-(?P<m>[0-9]{2})-(?P<d>[0-9]{2})",
+ /// )?;
+ /// let (mut cache, caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let haystack = "2010-03-14 2016-10-08 2020-10-22";
+ /// let input = Input::new(haystack);
+ /// let mut it = Searcher::new(input)
+ /// .into_captures_iter(caps, |input, caps| {
+ /// re.search(&mut cache, input, caps);
+ /// Ok(())
+ /// });
+ ///
+ /// let got = it.next().expect("first date")?;
+ /// let year = got.get_group_by_name("y").expect("must match");
+ /// assert_eq!("2010", &haystack[year]);
+ ///
+ /// let got = it.next().expect("second date")?;
+ /// let month = got.get_group_by_name("m").expect("must match");
+ /// assert_eq!("10", &haystack[month]);
+ ///
+ /// let got = it.next().expect("third date")?;
+ /// let day = got.get_group_by_name("d").expect("must match");
+ /// assert_eq!("22", &haystack[day]);
+ ///
+ /// assert!(it.next().is_none());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "alloc")]
+ #[inline]
+ pub fn into_captures_iter<F>(
+ self,
+ caps: Captures,
+ finder: F,
+ ) -> TryCapturesIter<'h, F>
+ where
+ F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>,
+ {
+ TryCapturesIter { it: self, caps, finder }
+ }
+
+ /// Handles the special case of a match that begins where the previous
+ /// match ended. Without this special handling, it'd be possible to get
+ /// stuck where an empty match never results in forward progress. This
+    /// also makes it more consistent with how prevailing general purpose regex
+ /// engines work.
+ #[cold]
+ #[inline(never)]
+ fn handle_overlapping_empty_half_match<F>(
+ &mut self,
+ _: HalfMatch,
+ mut finder: F,
+ ) -> Result<Option<HalfMatch>, MatchError>
+ where
+ F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>,
+ {
+ // Since we are only here when 'm.offset()' matches the offset of the
+ // last match, it follows that this must have been an empty match.
+ // Since we both need to make progress *and* prevent overlapping
+ // matches, we discard this match and advance the search by 1.
+ //
+ // Note that this may start a search in the middle of a codepoint. The
+ // regex engines themselves are expected to deal with that and not
+ // report any matches within a codepoint if they are configured in
+ // UTF-8 mode.
+ self.input.set_start(self.input.start().checked_add(1).unwrap());
+ finder(&self.input)
+ }
+
+ /// Handles the special case of an empty match by ensuring that 1) the
+ /// iterator always advances and 2) empty matches never overlap with other
+ /// matches.
+ ///
+ /// (1) is necessary because we principally make progress by setting the
+ /// starting location of the next search to the ending location of the last
+ /// match. But if a match is empty, then this results in a search that does
+ /// not advance and thus does not terminate.
+ ///
+ /// (2) is not strictly necessary, but makes intuitive sense and matches
+    /// the prevailing behavior of most general purpose regex engines. The
+ /// "intuitive sense" here is that we want to report NON-overlapping
+ /// matches. So for example, given the regex 'a|(?:)' against the haystack
+ /// 'a', without the special handling, you'd get the matches [0, 1) and [1,
+ /// 1), where the latter overlaps with the end bounds of the former.
+ ///
+ /// Note that we mark this cold and forcefully prevent inlining because
+ /// handling empty matches like this is extremely rare and does require
+ /// quite a bit of code, comparatively. Keeping this code out of the main
+ /// iterator function keeps it smaller and more amenable to inlining
+ /// itself.
+ #[cold]
+ #[inline(never)]
+ fn handle_overlapping_empty_match<F>(
+ &mut self,
+ m: Match,
+ mut finder: F,
+ ) -> Result<Option<Match>, MatchError>
+ where
+ F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>,
+ {
+ assert!(m.is_empty());
+ self.input.set_start(self.input.start().checked_add(1).unwrap());
+ finder(&self.input)
+ }
+}
+
+/// An iterator over all non-overlapping half matches for a fallible search.
+///
+/// The iterator yields a `Result<HalfMatch, MatchError>` value until no more
+/// matches could be found.
+///
+/// The type parameters are as follows:
+///
+/// * `F` represents the type of a closure that executes the search.
+///
+/// The lifetime parameters come from the [`Input`] type:
+///
+/// * `'h` is the lifetime of the underlying haystack.
+///
+/// When possible, prefer the iterators defined on the regex engine you're
+/// using. This tries to abstract over the regex engine and is thus a bit more
+/// unwieldy to use.
+///
+/// This iterator is created by [`Searcher::into_half_matches_iter`].
+pub struct TryHalfMatchesIter<'h, F> {
+ it: Searcher<'h>,
+ finder: F,
+}
+
+impl<'h, F> TryHalfMatchesIter<'h, F> {
+ /// Return an infallible version of this iterator.
+ ///
+ /// Any item yielded that corresponds to an error results in a panic. This
+ /// is useful if your underlying regex engine is configured in a way that
+ /// it is guaranteed to never return an error.
+ pub fn infallible(self) -> HalfMatchesIter<'h, F> {
+ HalfMatchesIter(self)
+ }
+
+ /// Returns the current `Input` used by this iterator.
+ ///
+ /// The `Input` returned is generally equivalent to the one used to
+ /// construct this iterator, but its start position may be different to
+ /// reflect the start of the next search to be executed.
+ pub fn input<'i>(&'i self) -> &'i Input<'h> {
+ self.it.input()
+ }
+}
+
+impl<'h, F> Iterator for TryHalfMatchesIter<'h, F>
+where
+ F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>,
+{
+ type Item = Result<HalfMatch, MatchError>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Result<HalfMatch, MatchError>> {
+ self.it.try_advance_half(&mut self.finder).transpose()
+ }
+}
+
+impl<'h, F> core::fmt::Debug for TryHalfMatchesIter<'h, F> {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ f.debug_struct("TryHalfMatchesIter")
+ .field("it", &self.it)
+ .field("finder", &"<closure>")
+ .finish()
+ }
+}
+
+/// An iterator over all non-overlapping half matches for an infallible search.
+///
+/// The iterator yields a [`HalfMatch`] value until no more matches could be
+/// found.
+///
+/// The type parameters are as follows:
+///
+/// * `F` represents the type of a closure that executes the search.
+///
+/// The lifetime parameters come from the [`Input`] type:
+///
+/// * `'h` is the lifetime of the underlying haystack.
+///
+/// When possible, prefer the iterators defined on the regex engine you're
+/// using. This tries to abstract over the regex engine and is thus a bit more
+/// unwieldy to use.
+///
+/// This iterator is created by [`Searcher::into_half_matches_iter`] and
+/// then calling [`TryHalfMatchesIter::infallible`].
+#[derive(Debug)]
+pub struct HalfMatchesIter<'h, F>(TryHalfMatchesIter<'h, F>);
+
+impl<'h, F> HalfMatchesIter<'h, F> {
+ /// Returns the current `Input` used by this iterator.
+ ///
+ /// The `Input` returned is generally equivalent to the one used to
+ /// construct this iterator, but its start position may be different to
+ /// reflect the start of the next search to be executed.
+ pub fn input<'i>(&'i self) -> &'i Input<'h> {
+ self.0.it.input()
+ }
+}
+
+impl<'h, F> Iterator for HalfMatchesIter<'h, F>
+where
+ F: FnMut(&Input<'_>) -> Result<Option<HalfMatch>, MatchError>,
+{
+ type Item = HalfMatch;
+
+ #[inline]
+ fn next(&mut self) -> Option<HalfMatch> {
+ match self.0.next()? {
+ Ok(m) => Some(m),
+ Err(err) => panic!(
+ "unexpected regex half find error: {}\n\
+ to handle find errors, use 'try' or 'search' methods",
+ err,
+ ),
+ }
+ }
+}
+
+/// An iterator over all non-overlapping matches for a fallible search.
+///
+/// The iterator yields a `Result<Match, MatchError>` value until no more
+/// matches could be found.
+///
+/// The type parameters are as follows:
+///
+/// * `F` represents the type of a closure that executes the search.
+///
+/// The lifetime parameters come from the [`Input`] type:
+///
+/// * `'h` is the lifetime of the underlying haystack.
+///
+/// When possible, prefer the iterators defined on the regex engine you're
+/// using. This tries to abstract over the regex engine and is thus a bit more
+/// unwieldy to use.
+///
+/// This iterator is created by [`Searcher::into_matches_iter`].
+pub struct TryMatchesIter<'h, F> {
+ it: Searcher<'h>,
+ finder: F,
+}
+
+impl<'h, F> TryMatchesIter<'h, F> {
+ /// Return an infallible version of this iterator.
+ ///
+ /// Any item yielded that corresponds to an error results in a panic. This
+ /// is useful if your underlying regex engine is configured in a way that
+ /// it is guaranteed to never return an error.
+ pub fn infallible(self) -> MatchesIter<'h, F> {
+ MatchesIter(self)
+ }
+
+ /// Returns the current `Input` used by this iterator.
+ ///
+ /// The `Input` returned is generally equivalent to the one used to
+ /// construct this iterator, but its start position may be different to
+ /// reflect the start of the next search to be executed.
+ pub fn input<'i>(&'i self) -> &'i Input<'h> {
+ self.it.input()
+ }
+}
+
+impl<'h, F> Iterator for TryMatchesIter<'h, F>
+where
+ F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>,
+{
+ type Item = Result<Match, MatchError>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Result<Match, MatchError>> {
+ self.it.try_advance(&mut self.finder).transpose()
+ }
+}
+
+impl<'h, F> core::fmt::Debug for TryMatchesIter<'h, F> {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ f.debug_struct("TryMatchesIter")
+ .field("it", &self.it)
+ .field("finder", &"<closure>")
+ .finish()
+ }
+}
+
+/// An iterator over all non-overlapping matches for an infallible search.
+///
+/// The iterator yields a [`Match`] value until no more matches could be found.
+///
+/// The type parameters are as follows:
+///
+/// * `F` represents the type of a closure that executes the search.
+///
+/// The lifetime parameters come from the [`Input`] type:
+///
+/// * `'h` is the lifetime of the underlying haystack.
+///
+/// When possible, prefer the iterators defined on the regex engine you're
+/// using. This tries to abstract over the regex engine and is thus a bit more
+/// unwieldy to use.
+///
+/// This iterator is created by [`Searcher::into_matches_iter`] and
+/// then calling [`TryMatchesIter::infallible`].
+#[derive(Debug)]
+pub struct MatchesIter<'h, F>(TryMatchesIter<'h, F>);
+
+impl<'h, F> MatchesIter<'h, F> {
+ /// Returns the current `Input` used by this iterator.
+ ///
+ /// The `Input` returned is generally equivalent to the one used to
+ /// construct this iterator, but its start position may be different to
+ /// reflect the start of the next search to be executed.
+ pub fn input<'i>(&'i self) -> &'i Input<'h> {
+ self.0.it.input()
+ }
+}
+
+impl<'h, F> Iterator for MatchesIter<'h, F>
+where
+ F: FnMut(&Input<'_>) -> Result<Option<Match>, MatchError>,
+{
+ type Item = Match;
+
+ #[inline]
+ fn next(&mut self) -> Option<Match> {
+ match self.0.next()? {
+ Ok(m) => Some(m),
+ Err(err) => panic!(
+ "unexpected regex find error: {}\n\
+ to handle find errors, use 'try' or 'search' methods",
+ err,
+ ),
+ }
+ }
+}
+
+/// An iterator over all non-overlapping captures for a fallible search.
+///
+/// The iterator yields a `Result<Captures, MatchError>` value until no more
+/// matches could be found.
+///
+/// The type parameters are as follows:
+///
+/// * `F` represents the type of a closure that executes the search.
+///
+/// The lifetime parameters come from the [`Input`] type:
+///
+/// * `'h` is the lifetime of the underlying haystack.
+///
+/// When possible, prefer the iterators defined on the regex engine you're
+/// using. This tries to abstract over the regex engine and is thus a bit more
+/// unwieldy to use.
+///
+/// This iterator is created by [`Searcher::into_captures_iter`].
+#[cfg(feature = "alloc")]
+pub struct TryCapturesIter<'h, F> {
+ it: Searcher<'h>,
+ caps: Captures,
+ finder: F,
+}
+
+#[cfg(feature = "alloc")]
+impl<'h, F> TryCapturesIter<'h, F> {
+ /// Return an infallible version of this iterator.
+ ///
+ /// Any item yielded that corresponds to an error results in a panic. This
+ /// is useful if your underlying regex engine is configured in a way that
+ /// it is guaranteed to never return an error.
+ pub fn infallible(self) -> CapturesIter<'h, F> {
+ CapturesIter(self)
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<'h, F> Iterator for TryCapturesIter<'h, F>
+where
+ F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>,
+{
+ type Item = Result<Captures, MatchError>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Result<Captures, MatchError>> {
+ let TryCapturesIter { ref mut it, ref mut caps, ref mut finder } =
+ *self;
+ let result = it
+ .try_advance(|input| {
+ (finder)(input, caps)?;
+ Ok(caps.get_match())
+ })
+ .transpose()?;
+ match result {
+ Ok(_) => Some(Ok(caps.clone())),
+ Err(err) => Some(Err(err)),
+ }
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<'h, F> core::fmt::Debug for TryCapturesIter<'h, F> {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ f.debug_struct("TryCapturesIter")
+ .field("it", &self.it)
+ .field("caps", &self.caps)
+ .field("finder", &"<closure>")
+ .finish()
+ }
+}
+
+/// An iterator over all non-overlapping captures for an infallible search.
+///
+/// The iterator yields a [`Captures`] value until no more matches could be
+/// found.
+///
+/// The type parameters are as follows:
+///
+/// * `F` represents the type of a closure that executes the search.
+///
+/// The lifetime parameters come from the [`Input`] type:
+///
+/// * `'h` is the lifetime of the underlying haystack.
+///
+/// When possible, prefer the iterators defined on the regex engine you're
+/// using. This tries to abstract over the regex engine and is thus a bit more
+/// unwieldy to use.
+///
+/// This iterator is created by [`Searcher::into_captures_iter`] and then
+/// calling [`TryCapturesIter::infallible`].
+#[cfg(feature = "alloc")]
+#[derive(Debug)]
+pub struct CapturesIter<'h, F>(TryCapturesIter<'h, F>);
+
+#[cfg(feature = "alloc")]
+impl<'h, F> Iterator for CapturesIter<'h, F>
+where
+ F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>,
+{
+ type Item = Captures;
+
+ #[inline]
+ fn next(&mut self) -> Option<Captures> {
+ match self.0.next()? {
+ Ok(m) => Some(m),
+ Err(err) => panic!(
+ "unexpected regex captures error: {}\n\
+ to handle find errors, use 'try' or 'search' methods",
+ err,
+ ),
+ }
+ }
+}
diff --git a/vendor/regex-automata/src/util/lazy.rs b/vendor/regex-automata/src/util/lazy.rs
index d8cac6ef4..de27a2a6e 100644
--- a/vendor/regex-automata/src/util/lazy.rs
+++ b/vendor/regex-automata/src/util/lazy.rs
@@ -1,31 +1,465 @@
-use core::{
- cell::Cell,
- ptr,
- sync::atomic::{AtomicPtr, Ordering},
-};
-
-use alloc::{boxed::Box, vec::Vec};
-
-#[inline(always)]
-pub(crate) fn get_or_init<T: Send + Sync + 'static>(
- location: &'static AtomicPtr<T>,
- init: impl FnOnce() -> T,
-) -> &'static T {
- let mut ptr = location.load(Ordering::Acquire);
- if ptr.is_null() {
- let new_dfa = Box::new(init());
- ptr = Box::into_raw(new_dfa);
- let result = location.compare_exchange(
- ptr::null_mut(),
- ptr,
- Ordering::AcqRel,
- Ordering::Acquire,
- );
- if let Err(old) = result {
- let redundant = unsafe { Box::from_raw(ptr) };
- drop(redundant);
- ptr = old;
- }
- }
- unsafe { &*ptr }
+/*!
+A lazily initialized value for safe sharing between threads.
+
+The principal type in this module is `Lazy`, which makes it easy to construct
+values that are shared safely across multiple threads simultaneously.
+*/
+
+use core::fmt;
+
+/// A lazily initialized value that implements `Deref` for `T`.
+///
+/// A `Lazy` takes an initialization function and permits callers from any
+/// thread to access the result of that initialization function in a safe
+/// manner. In effect, this permits one-time initialization of global resources
+/// in a (possibly) multi-threaded program.
+///
+/// This type and its functionality are available even when neither the `alloc`
+/// nor the `std` features are enabled. In exchange, a `Lazy` does **not**
+/// guarantee that the given `create` function is called at most once. It
+/// might be called multiple times. Moreover, a call to `Lazy::get` (either
+/// explicitly or implicitly via `Lazy`'s `Deref` impl) may block until a `T`
+/// is available.
+///
+/// This is very similar to `lazy_static` or `once_cell`, except it doesn't
+/// guarantee that the initialization function will be run once and it works
+/// in no-alloc no-std environments. With that said, if you need stronger
+/// guarantees or a more flexible API, then it is recommended to use either
+/// `lazy_static` or `once_cell`.
+///
+/// # Warning: may use a spin lock
+///
+/// When this crate is compiled _without_ the `alloc` feature, then this type
+/// may use a spin lock internally. This can have subtle effects that may
+/// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more
+/// thorough treatment of this topic.
+///
+/// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
+///
+/// # Example
+///
+/// This type is useful for creating regexes once, and then using them from
+/// multiple threads simultaneously without worrying about synchronization.
+///
+/// ```
+/// use regex_automata::{dfa::regex::Regex, util::lazy::Lazy, Match};
+///
+/// static RE: Lazy<Regex> = Lazy::new(|| Regex::new("foo[0-9]+bar").unwrap());
+///
+/// let expected = Some(Match::must(0, 3..14));
+/// assert_eq!(expected, RE.find(b"zzzfoo12345barzzz"));
+/// ```
+pub struct Lazy<T, F = fn() -> T>(lazy::Lazy<T, F>);
+
+impl<T, F> Lazy<T, F> {
+ /// Create a new `Lazy` value that is initialized via the given function.
+ ///
+ /// The `T` type is automatically inferred from the return type of the
+ /// `create` function given.
+ pub const fn new(create: F) -> Lazy<T, F> {
+ Lazy(lazy::Lazy::new(create))
+ }
+}
+
+impl<T, F: Fn() -> T> Lazy<T, F> {
+ /// Return a reference to the lazily initialized value.
+ ///
+ /// This routine may block if another thread is initializing a `T`.
+ ///
+    /// Note that given an `x` which has type `Lazy`, this must be called via
+ /// `Lazy::get(x)` and not `x.get()`. This routine is defined this way
+ /// because `Lazy` impls `Deref` with a target of `T`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the `create` function inside this lazy value panics.
+ /// If the panic occurred in another thread, then this routine _may_ also
+ /// panic (but is not guaranteed to do so).
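+    ///
+    /// # Example
+    ///
+    /// A small sketch of the fully qualified call syntax:
+    ///
+    /// ```
+    /// use regex_automata::util::lazy::Lazy;
+    ///
+    /// static SQUARES: Lazy<Vec<u64>> =
+    ///     Lazy::new(|| (0..10).map(|n| n * n).collect());
+    ///
+    /// // 'Lazy::get(&SQUARES)' and the implicit deref are equivalent.
+    /// assert_eq!(Lazy::get(&SQUARES)[3], SQUARES[3]);
+    /// ```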
+ pub fn get(this: &Lazy<T, F>) -> &T {
+ this.0.get()
+ }
+}
+
+impl<T, F: Fn() -> T> core::ops::Deref for Lazy<T, F> {
+ type Target = T;
+
+ fn deref(&self) -> &T {
+ Lazy::get(self)
+ }
+}
+
+impl<T: fmt::Debug, F: Fn() -> T> fmt::Debug for Lazy<T, F> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ self.0.fmt(f)
+ }
+}
+
+#[cfg(feature = "alloc")]
+mod lazy {
+ use core::{
+ fmt,
+ marker::PhantomData,
+ sync::atomic::{AtomicPtr, Ordering},
+ };
+
+ use alloc::boxed::Box;
+
+ /// A non-std lazy initialized value.
+ ///
+ /// This might run the initialization function more than once, but will
+ /// never block.
+ ///
+ /// I wish I could get these semantics into the non-alloc non-std Lazy
+ /// type below, but I'm not sure how to do it. If you can do an alloc,
+ /// then the implementation becomes very simple if you don't care about
+ /// redundant work precisely because a pointer can be atomically swapped.
+ ///
+ /// Perhaps making this approach work in the non-alloc non-std case
+ /// requires asking the caller for a pointer? It would make the API less
+ /// convenient I think.
+ pub(super) struct Lazy<T, F> {
+ data: AtomicPtr<T>,
+ create: F,
+ // This indicates to the compiler that this type can drop T. It's not
+ // totally clear how the absence of this marker could lead to trouble,
+ // but putting here doesn't have any downsides so we hedge until somone
+ // can from the Unsafe Working Group can tell us definitively that we
+ // don't need it.
+ //
+ // See: https://github.com/BurntSushi/regex-automata/issues/30
+ owned: PhantomData<Box<T>>,
+ }
+
+ // SAFETY: So long as T and &T (and F and &F) can themselves be safely
+    // shared among threads, so too can a Lazy<T, _>. Namely, the Lazy API
+    // only permits accessing a &T and initialization is free of data races.
+    // So if T is thread safe, then so too is Lazy<T, _>.
+ //
+ // We specifically require that T: Send in order for Lazy<T> to be Sync.
+ // Without that requirement, it's possible to send a T from one thread to
+ // another via Lazy's destructor.
+ //
+ // It's not clear whether we need F: Send+Sync for Lazy to be Sync. But
+ // we're conservative for now and keep both.
+ unsafe impl<T: Send + Sync, F: Send + Sync> Sync for Lazy<T, F> {}
+
+ impl<T, F> Lazy<T, F> {
+ /// Create a new alloc but non-std lazy value that is racily
+ /// initialized. That is, the 'create' function may be called more than
+ /// once.
+ pub(super) const fn new(create: F) -> Lazy<T, F> {
+ Lazy {
+ data: AtomicPtr::new(core::ptr::null_mut()),
+ create,
+ owned: PhantomData,
+ }
+ }
+ }
+
+ impl<T, F: Fn() -> T> Lazy<T, F> {
+ /// Get the underlying lazy value. If it hasn't been initialized
+ /// yet, then always attempt to initialize it (even if some other
+ /// thread is initializing it) and atomically attach it to this lazy
+ /// value before returning it.
+ pub(super) fn get(&self) -> &T {
+ if let Some(data) = self.poll() {
+ return data;
+ }
+ let data = (self.create)();
+ let mut ptr = Box::into_raw(Box::new(data));
+ // We attempt to stuff our initialized value into our atomic
+ // pointer. Upon success, we don't need to do anything. But if
+ // someone else beat us to the punch, then we need to make sure
+ // our newly created value is dropped.
+ let result = self.data.compare_exchange(
+ core::ptr::null_mut(),
+ ptr,
+ Ordering::AcqRel,
+ Ordering::Acquire,
+ );
+ if let Err(old) = result {
+ // SAFETY: We created 'ptr' via Box::into_raw above, so turning
+ // it back into a Box via from_raw is safe.
+ drop(unsafe { Box::from_raw(ptr) });
+ ptr = old;
+ }
+ // SAFETY: We just set the pointer above to a non-null value, even
+ // in the error case, and set it to a fully initialized value
+ // returned by 'create'.
+ unsafe { &*ptr }
+ }
+
+ /// If this lazy value has been initialized successfully, then return
+ /// that value. Otherwise return None immediately. This never attempts
+ /// to run initialization itself.
+ fn poll(&self) -> Option<&T> {
+ let ptr = self.data.load(Ordering::Acquire);
+ if ptr.is_null() {
+ return None;
+ }
+ // SAFETY: We just checked that the pointer is not null. Since it's
+ // not null, it must have been fully initialized by 'get' at some
+ // point.
+ Some(unsafe { &*ptr })
+ }
+ }
+
+ impl<T: fmt::Debug, F: Fn() -> T> fmt::Debug for Lazy<T, F> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.debug_struct("Lazy").field("data", &self.poll()).finish()
+ }
+ }
+
+ impl<T, F> Drop for Lazy<T, F> {
+ fn drop(&mut self) {
+ let ptr = *self.data.get_mut();
+ if !ptr.is_null() {
+ // SAFETY: We just checked that 'ptr' is not null. And since
+ // we have exclusive access, there are no races to worry about.
+ drop(unsafe { Box::from_raw(ptr) });
+ }
+ }
+ }
+}
+
+#[cfg(not(feature = "alloc"))]
+mod lazy {
+ use core::{
+ cell::Cell,
+ fmt,
+ mem::MaybeUninit,
+ panic::{RefUnwindSafe, UnwindSafe},
+ sync::atomic::{AtomicU8, Ordering},
+ };
+
+ /// Our 'Lazy' value can be in one of three states:
+ ///
+ /// * INIT is where it starts, and also ends up back here if the
+ /// 'create' routine panics.
+ /// * BUSY is where it sits while initialization is running in exactly
+ /// one thread.
+ /// * DONE is where it sits after 'create' has completed and 'data' has
+ /// been fully initialized.
+ const LAZY_STATE_INIT: u8 = 0;
+ const LAZY_STATE_BUSY: u8 = 1;
+ const LAZY_STATE_DONE: u8 = 2;
+
+ /// A non-alloc non-std lazy initialized value.
+ ///
+ /// This guarantees initialization only happens once, but uses a spinlock
+ /// to block in the case of simultaneous access. Blocking occurs so that
+ /// one thread waits while another thread initializes the value.
+ ///
+ /// I would much rather have the semantics of the 'alloc' Lazy type above.
+ /// Namely, that we might run the initialization function more than once,
+ /// but we never otherwise block. However, I don't know how to do that in
+ /// a non-alloc non-std context.
+ pub(super) struct Lazy<T, F> {
+ state: AtomicU8,
+ create: Cell<Option<F>>,
+ data: Cell<MaybeUninit<T>>,
+ }
+
+ // SAFETY: So long as T and &T (and F and &F) can themselves be safely
+    // shared among threads, so too can a Lazy<T, _>. Namely, the Lazy API
+    // only permits accessing a &T and initialization is free of data races.
+    // So if T is thread safe, then so too is Lazy<T, _>.
+ unsafe impl<T: Send + Sync, F: Send + Sync> Sync for Lazy<T, F> {}
+ // A reference to a Lazy is unwind safe because we specifically take
+ // precautions to poison all accesses to a Lazy if the caller-provided
+ // 'create' function panics.
+ impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> RefUnwindSafe
+ for Lazy<T, F>
+ {
+ }
+
+ impl<T, F> Lazy<T, F> {
+ /// Create a new non-alloc non-std lazy value that is initialized
+ /// exactly once on first use using the given function.
+ pub(super) const fn new(create: F) -> Lazy<T, F> {
+ Lazy {
+ state: AtomicU8::new(LAZY_STATE_INIT),
+ create: Cell::new(Some(create)),
+ data: Cell::new(MaybeUninit::uninit()),
+ }
+ }
+ }
+
+ impl<T, F: FnOnce() -> T> Lazy<T, F> {
+        /// Get the underlying lazy value. If it hasn't been initialized
+ /// yet, then either initialize it or block until some other thread
+ /// initializes it. If the 'create' function given to Lazy::new panics
+ /// (even in another thread), then this panics too.
+ pub(super) fn get(&self) -> &T {
+ // This is effectively a spinlock. We loop until we enter a DONE
+ // state, and if possible, initialize it ourselves. The only way
+            // we exit the loop is if 'create' panics, we initialize 'data', or
+ // some other thread initializes 'data'.
+ //
+ // Yes, I have read spinlocks considered harmful[1]. And that
+ // article is why this spinlock is only active when 'alloc' isn't
+ // enabled. I did this because I don't think there is really
+ // another choice without 'alloc', other than not providing this at
+ // all. But I think that's a big bummer.
+ //
+ // [1]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
+ while self.state.load(Ordering::Acquire) != LAZY_STATE_DONE {
+ // Check if we're the first ones to get here. If so, we'll be
+ // the ones who initialize.
+ let result = self.state.compare_exchange(
+ LAZY_STATE_INIT,
+ LAZY_STATE_BUSY,
+ Ordering::AcqRel,
+ Ordering::Acquire,
+ );
+ // This means we saw the INIT state and nobody else can. So we
+ // must take responsibility for initializing. And by virtue of
+ // observing INIT, we have also told anyone else trying to
+ // get here that we are BUSY. If someone else sees BUSY, then
+ // they will spin until we finish initialization.
+ if let Ok(_) = result {
+ // Since we are guaranteed to be the only ones here, we
+ // know that 'create' is there... Unless someone else got
+ // here before us and 'create' panicked. In which case,
+ // 'self.create' is now 'None' and we forward the panic
+ // to the caller. (i.e., We implement poisoning.)
+ //
+ // SAFETY: Our use of 'self.state' guarantees that we are
+ // the only thread executing this line, and thus there are
+ // no races.
+ let create = unsafe {
+ (*self.create.as_ptr()).take().expect(
+ "Lazy's create function panicked, \
+                             preventing initialization, \
+ poisoning current thread",
+ )
+ };
+ let guard = Guard { state: &self.state };
+ // SAFETY: Our use of 'self.state' guarantees that we are
+ // the only thread executing this line, and thus there are
+ // no races.
+ unsafe {
+ (*self.data.as_ptr()).as_mut_ptr().write(create());
+ }
+ // All is well. 'self.create' ran successfully, so we
+ // forget the guard.
+ core::mem::forget(guard);
+ // Everything is initialized, so we can declare success.
+ self.state.store(LAZY_STATE_DONE, Ordering::Release);
+ break;
+ }
+ core::hint::spin_loop();
+ }
+ // We only get here if data is fully initialized, and thus poll
+ // will always return something.
+ self.poll().unwrap()
+ }
+
+ /// If this lazy value has been initialized successfully, then return
+ /// that value. Otherwise return None immediately. This never blocks.
+ fn poll(&self) -> Option<&T> {
+ if self.state.load(Ordering::Acquire) == LAZY_STATE_DONE {
+ // SAFETY: The DONE state only occurs when data has been fully
+ // initialized.
+ Some(unsafe { &*(*self.data.as_ptr()).as_ptr() })
+ } else {
+ None
+ }
+ }
+ }
+
+ impl<T: fmt::Debug, F: FnMut() -> T> fmt::Debug for Lazy<T, F> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.debug_struct("Lazy")
+ .field("state", &self.state.load(Ordering::Acquire))
+ .field("create", &"<closure>")
+ .field("data", &self.poll())
+ .finish()
+ }
+ }
+
+ impl<T, F> Drop for Lazy<T, F> {
+ fn drop(&mut self) {
+ if *self.state.get_mut() == LAZY_STATE_DONE {
+ // SAFETY: state is DONE if and only if data has been fully
+ // initialized. At which point, it is safe to drop.
+ unsafe {
+ // MSRV(1.60): Use assume_init_drop. The below is how
+ // assume_init_drop is implemented.
+ core::ptr::drop_in_place(
+ (*self.data.as_ptr()).as_mut_ptr(),
+ )
+ }
+ }
+ }
+ }
+
+ /// A guard that will reset a Lazy's state back to INIT when dropped. The
+ /// idea here is to 'forget' this guard on success. On failure (when a
+ /// panic occurs), the Drop impl runs and causes all in-progress and future
+ /// 'get' calls to panic. Without this guard, all in-progress and future
+ /// 'get' calls would spin forever. Crashing is much better than getting
+ /// stuck in an infinite loop.
+ struct Guard<'a> {
+ state: &'a AtomicU8,
+ }
+
+ impl<'a> Drop for Guard<'a> {
+ fn drop(&mut self) {
+ // We force ourselves back into an INIT state. This will in turn
+ // cause any future 'get' calls to attempt calling 'self.create'
+ // again which will in turn panic because 'self.create' will now
+ // be 'None'.
+ self.state.store(LAZY_STATE_INIT, Ordering::Release);
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ fn assert_send<T: Send>() {}
+ fn assert_sync<T: Sync>() {}
+ fn assert_unwind<T: core::panic::UnwindSafe>() {}
+ fn assert_refunwind<T: core::panic::RefUnwindSafe>() {}
+
+ #[test]
+ fn oibits() {
+ assert_send::<Lazy<u64>>();
+ assert_sync::<Lazy<u64>>();
+ assert_unwind::<Lazy<u64>>();
+ assert_refunwind::<Lazy<u64>>();
+ }
+
+ // This is a regression test because we used to rely on the inferred Sync
+ // impl for the Lazy type defined above (for 'alloc' mode). In the
+ // inferred impl, it only requires that T: Sync for Lazy<T>: Sync. But
+ // if we have that, we can actually make use of the fact that Lazy<T> drops
+ // T to create a value on one thread and drop it on another. This *should*
+ // require T: Send, but our missing bounds before let it sneak by.
+ //
+ // Basically, this test should not compile, so we... comment it out. We
+ // don't have a great way of testing compile-fail tests right now.
+ //
+ // See: https://github.com/BurntSushi/regex-automata/issues/30
+ /*
+ #[test]
+ fn sync_not_send() {
+ #[allow(dead_code)]
+ fn inner<T: Sync + Default>() {
+ let lazy = Lazy::new(move || T::default());
+ std::thread::scope(|scope| {
+ scope.spawn(|| {
+ Lazy::get(&lazy); // We create T in this thread
+ });
+ });
+ // And drop in this thread.
+ drop(lazy);
+            // So we have sent a !Send type over threads. (With some more
+            // legwork, it's possible to even sneak the value out of drop
+            // through a thread local.)
+ }
+ }
+ */
}
diff --git a/vendor/regex-automata/src/util/look.rs b/vendor/regex-automata/src/util/look.rs
new file mode 100644
index 000000000..aee31b34e
--- /dev/null
+++ b/vendor/regex-automata/src/util/look.rs
@@ -0,0 +1,1748 @@
+/*!
+Types and routines for working with look-around assertions.
+
+This module principally defines three types:
+
+* [`Look`] enumerates all of the assertions supported by this crate.
+* [`LookSet`] provides a way to efficiently store a set of [`Look`] values.
+* [`LookMatcher`] provides routines for checking whether a `Look` or a
+`LookSet` matches at a particular position in a haystack.
+*/
+
+// LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` were basically
+// copied verbatim from the regex-syntax crate. I would have no problems using
+// the regex-syntax types and defining the matching routines (only found
+// in this crate) as free functions, except the `Look` and `LookSet` types
+// are used in lots of places. Including in places we expect to work when
+// regex-syntax is *not* enabled, such as in the definition of the NFA itself.
+//
+// Thankfully the code we copy is pretty simple and there isn't much of it.
+// Otherwise, the rest of this module deals with *matching* the assertions,
+// which is not something that regex-syntax handles.
+
+use crate::util::{escape::DebugByte, utf8};
+
+/// A look-around assertion.
+///
+/// An assertion matches at a position between characters in a haystack.
+/// Namely, it does not actually "consume" any input as most parts of a regular
+/// expression do. Assertions are a way of stating that some property must be
+/// true at a particular point during matching.
+///
+/// For example, `(?m)^[a-z]+$` is a pattern that:
+///
+/// * Scans the haystack for a position at which `(?m:^)` is satisfied. That
+/// occurs at either the beginning of the haystack, or immediately following
+/// a `\n` character.
+/// * Looks for one or more occurrences of `[a-z]`.
+/// * Once `[a-z]+` has matched as much as it can, an overall match is only
+/// reported when `[a-z]+` stops just before a `\n`.
+///
+/// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not.
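+///
+/// As a quick sketch of that behavior using one of the engines in this
+/// crate (any of them would do):
+///
+/// ```
+/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+///
+/// let re = PikeVM::new(r"(?m)^[a-z]+$")?;
+/// let mut cache = re.create_cache();
+///
+/// // '^' is satisfied after the first '\n' and '$' just before the last.
+/// assert_eq!(Some(Match::must(0, 1..4)), re.find(&mut cache, "\nabc\n"));
+/// // The '1' prevents '$' from being satisfied after '[a-z]+'.
+/// assert_eq!(None, re.find(&mut cache, "\nabc1\n"));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```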
+///
+/// Assertions are also called "look-around," "look-behind" and "look-ahead."
+/// Specifically, some assertions are look-behind (like `^`), other assertions
+/// are look-ahead (like `$`) and yet other assertions are both look-ahead and
+/// look-behind (like `\b`).
+///
+/// # Assertions in an NFA
+///
+/// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be
+/// thought of as a conditional epsilon transition. That is, a matching engine
+/// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits
+/// moving through conditional epsilon transitions when their condition
+/// is satisfied at whatever position the `PikeVM` is currently at in the
+/// haystack.
+///
+/// How assertions are handled in a `DFA` is trickier, since a DFA does not
+/// have epsilon transitions at all. In this case, they are compiled into the
+/// automaton itself, at the expense of more states than what would be required
+/// without an assertion.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Look {
+ /// Match the beginning of text. Specifically, this matches at the starting
+ /// position of the input.
+ Start = 1 << 0,
+ /// Match the end of text. Specifically, this matches at the ending
+ /// position of the input.
+ End = 1 << 1,
+ /// Match the beginning of a line or the beginning of text. Specifically,
+ /// this matches at the starting position of the input, or at the position
+ /// immediately following a `\n` character.
+ StartLF = 1 << 2,
+ /// Match the end of a line or the end of text. Specifically, this matches
+ /// at the end position of the input, or at the position immediately
+ /// preceding a `\n` character.
+ EndLF = 1 << 3,
+ /// Match the beginning of a line or the beginning of text. Specifically,
+ /// this matches at the starting position of the input, or at the position
+ /// immediately following either a `\r` or `\n` character, but never after
+ /// a `\r` when a `\n` follows.
+ StartCRLF = 1 << 4,
+ /// Match the end of a line or the end of text. Specifically, this matches
+ /// at the end position of the input, or at the position immediately
+ /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
+ /// precedes it.
+ EndCRLF = 1 << 5,
+ /// Match an ASCII-only word boundary. That is, this matches a position
+ /// where the left adjacent character and right adjacent character
+ /// correspond to a word and non-word or a non-word and word character.
+ WordAscii = 1 << 6,
+ /// Match an ASCII-only negation of a word boundary.
+ WordAsciiNegate = 1 << 7,
+ /// Match a Unicode-aware word boundary. That is, this matches a position
+ /// where the left adjacent character and right adjacent character
+ /// correspond to a word and non-word or a non-word and word character.
+ WordUnicode = 1 << 8,
+ /// Match a Unicode-aware negation of a word boundary.
+ WordUnicodeNegate = 1 << 9,
+}
+
+impl Look {
+ /// Flip the look-around assertion to its equivalent for reverse searches.
+ /// For example, `StartLF` gets translated to `EndLF`.
+ ///
+ /// Some assertions, such as `WordUnicode`, remain the same since they
+ /// match the same positions regardless of the direction of the search.
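+    ///
+    /// # Example
+    ///
+    /// A short sketch of both cases:
+    ///
+    /// ```
+    /// use regex_automata::util::look::Look;
+    ///
+    /// assert_eq!(Look::End, Look::Start.reversed());
+    /// assert_eq!(Look::StartLF, Look::EndLF.reversed());
+    /// // Word boundaries are their own reversal.
+    /// assert_eq!(Look::WordUnicode, Look::WordUnicode.reversed());
+    /// ```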
+ #[inline]
+ pub const fn reversed(self) -> Look {
+ match self {
+ Look::Start => Look::End,
+ Look::End => Look::Start,
+ Look::StartLF => Look::EndLF,
+ Look::EndLF => Look::StartLF,
+ Look::StartCRLF => Look::EndCRLF,
+ Look::EndCRLF => Look::StartCRLF,
+ Look::WordAscii => Look::WordAscii,
+ Look::WordAsciiNegate => Look::WordAsciiNegate,
+ Look::WordUnicode => Look::WordUnicode,
+ Look::WordUnicodeNegate => Look::WordUnicodeNegate,
+ }
+ }
+
+ /// Return the underlying representation of this look-around enumeration
+ /// as an integer. Giving the return value to the [`Look::from_repr`]
+ /// constructor is guaranteed to return the same look-around variant that
+ /// one started with within a semver compatible release of this crate.
+ #[inline]
+ pub const fn as_repr(self) -> u16 {
+ // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
+ // actual int.
+ self as u16
+ }
+
+ /// Given the underlying representation of a `Look` value, return the
+ /// corresponding `Look` value if the representation is valid. Otherwise
+ /// `None` is returned.
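+    ///
+    /// # Example
+    ///
+    /// A quick sketch of the round trip, along with a rejected value:
+    ///
+    /// ```
+    /// use regex_automata::util::look::Look;
+    ///
+    /// let repr = Look::WordUnicode.as_repr();
+    /// assert_eq!(Some(Look::WordUnicode), Look::from_repr(repr));
+    /// // Values that don't correspond to exactly one assertion are rejected.
+    /// assert_eq!(None, Look::from_repr(0b11));
+    /// ```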
+ #[inline]
+ pub const fn from_repr(repr: u16) -> Option<Look> {
+ match repr {
+ 0b00_0000_0001 => Some(Look::Start),
+ 0b00_0000_0010 => Some(Look::End),
+ 0b00_0000_0100 => Some(Look::StartLF),
+ 0b00_0000_1000 => Some(Look::EndLF),
+ 0b00_0001_0000 => Some(Look::StartCRLF),
+ 0b00_0010_0000 => Some(Look::EndCRLF),
+ 0b00_0100_0000 => Some(Look::WordAscii),
+ 0b00_1000_0000 => Some(Look::WordAsciiNegate),
+ 0b01_0000_0000 => Some(Look::WordUnicode),
+ 0b10_0000_0000 => Some(Look::WordUnicodeNegate),
+ _ => None,
+ }
+ }
+
+ /// Returns a convenient single codepoint representation of this
+ /// look-around assertion. Each assertion is guaranteed to be represented
+ /// by a distinct character.
+ ///
+    /// This is useful for representing a look-around assertion in human
+    /// friendly but succinct output intended for a programmer working on
+ /// regex internals.
+ #[inline]
+ pub const fn as_char(self) -> char {
+ match self {
+ Look::Start => 'A',
+ Look::End => 'z',
+ Look::StartLF => '^',
+ Look::EndLF => '$',
+ Look::StartCRLF => 'r',
+ Look::EndCRLF => 'R',
+ Look::WordAscii => 'b',
+ Look::WordAsciiNegate => 'B',
+ Look::WordUnicode => '𝛃',
+ Look::WordUnicodeNegate => '𝚩',
+ }
+ }
+}
+
+/// LookSet is a memory-efficient set of look-around assertions.
+///
+/// This is useful for efficiently tracking look-around assertions. For
+/// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties
+/// that return `LookSet`s.
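+///
+/// # Example
+///
+/// A brief sketch of basic set operations:
+///
+/// ```
+/// use regex_automata::util::look::{Look, LookSet};
+///
+/// let set = LookSet::empty()
+///     .insert(Look::StartLF)
+///     .insert(Look::WordAscii);
+/// assert_eq!(2, set.len());
+/// assert!(set.contains(Look::StartLF));
+/// assert!(!set.contains(Look::End));
+/// assert!(set.remove(Look::WordAscii) == LookSet::singleton(Look::StartLF));
+/// ```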
+#[derive(Clone, Copy, Default, Eq, PartialEq)]
+pub struct LookSet {
+    /// The underlying representation of this set is exposed to make it
+    /// possible to store it somewhere efficiently. The representation is that
+ /// of a bitset, where each assertion occupies bit `i` where `i =
+ /// Look::as_repr()`.
+ ///
+ /// Note that users of this internal representation must permit the full
+ /// range of `u16` values to be represented. For example, even if the
+ /// current implementation only makes use of the 10 least significant bits,
+ /// it may use more bits in a future semver compatible release.
+ pub bits: u16,
+}
+
+impl LookSet {
+ /// Create an empty set of look-around assertions.
+ #[inline]
+ pub fn empty() -> LookSet {
+ LookSet { bits: 0 }
+ }
+
+ /// Create a full set of look-around assertions.
+ ///
+ /// This set contains all possible look-around assertions.
+ #[inline]
+ pub fn full() -> LookSet {
+ LookSet { bits: !0 }
+ }
+
+ /// Create a look-around set containing the look-around assertion given.
+ ///
+ /// This is a convenience routine for creating an empty set and inserting
+    /// one look-around assertion.
+ #[inline]
+ pub fn singleton(look: Look) -> LookSet {
+ LookSet::empty().insert(look)
+ }
+
+ /// Returns the total number of look-around assertions in this set.
+ #[inline]
+ pub fn len(self) -> usize {
+ // OK because max value always fits in a u8, which in turn always
+ // fits in a usize, regardless of target.
+ usize::try_from(self.bits.count_ones()).unwrap()
+ }
+
+ /// Returns true if and only if this set is empty.
+ #[inline]
+ pub fn is_empty(self) -> bool {
+ self.len() == 0
+ }
+
+ /// Returns true if and only if the given look-around assertion is in this
+ /// set.
+ #[inline]
+ pub fn contains(self, look: Look) -> bool {
+ self.bits & look.as_repr() != 0
+ }
+
+ /// Returns true if and only if this set contains any anchor assertions.
+ /// This includes both "start/end of haystack" and "start/end of line."
+ #[inline]
+ pub fn contains_anchor(&self) -> bool {
+ self.contains_anchor_haystack() || self.contains_anchor_line()
+ }
+
+ /// Returns true if and only if this set contains any "start/end of
+ /// haystack" anchors. This doesn't include "start/end of line" anchors.
+ #[inline]
+ pub fn contains_anchor_haystack(&self) -> bool {
+ self.contains(Look::Start) || self.contains(Look::End)
+ }
+
+ /// Returns true if and only if this set contains any "start/end of line"
+ /// anchors. This doesn't include "start/end of haystack" anchors. This
+ /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
+ #[inline]
+ pub fn contains_anchor_line(&self) -> bool {
+ self.contains(Look::StartLF)
+ || self.contains(Look::EndLF)
+ || self.contains(Look::StartCRLF)
+ || self.contains(Look::EndCRLF)
+ }
+
+ /// Returns true if and only if this set contains any "start/end of line"
+ /// anchors that only treat `\n` as line terminators. This does not include
+ /// haystack anchors or CRLF aware line anchors.
+ #[inline]
+ pub fn contains_anchor_lf(&self) -> bool {
+ self.contains(Look::StartLF) || self.contains(Look::EndLF)
+ }
+
+ /// Returns true if and only if this set contains any "start/end of line"
+ /// anchors that are CRLF-aware. This doesn't include "start/end of
+ /// haystack" or "start/end of line-feed" anchors.
+ #[inline]
+ pub fn contains_anchor_crlf(&self) -> bool {
+ self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
+ }
+
+ /// Returns true if and only if this set contains any word boundary or
+ /// negated word boundary assertions. This includes both Unicode and ASCII
+ /// word boundaries.
+ #[inline]
+ pub fn contains_word(self) -> bool {
+ self.contains_word_unicode() || self.contains_word_ascii()
+ }
+
+ /// Returns true if and only if this set contains any Unicode word boundary
+ /// or negated Unicode word boundary assertions.
+ #[inline]
+ pub fn contains_word_unicode(self) -> bool {
+ self.contains(Look::WordUnicode)
+ || self.contains(Look::WordUnicodeNegate)
+ }
+
+ /// Returns true if and only if this set contains any ASCII word boundary
+ /// or negated ASCII word boundary assertions.
+ #[inline]
+ pub fn contains_word_ascii(self) -> bool {
+ self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate)
+ }
+
+ /// Returns an iterator over all of the look-around assertions in this set.
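+ ///
+ /// # Example
+ ///
+ /// With the current representation, assertions are yielded in ascending
+ /// order of their bit position:
+ ///
+ /// ```
+ /// use regex_automata::util::look::{Look, LookSet};
+ ///
+ /// let set = LookSet::empty().insert(Look::WordAscii).insert(Look::StartLF);
+ /// let mut it = set.iter();
+ /// assert!(matches!(it.next(), Some(Look::StartLF)));
+ /// assert!(matches!(it.next(), Some(Look::WordAscii)));
+ /// assert!(it.next().is_none());
+ /// ```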
+ #[inline]
+ pub fn iter(self) -> LookSetIter {
+ LookSetIter { set: self }
+ }
+
+ /// Return a new set that is equivalent to the original, but with the given
+ /// assertion added to it. If the assertion is already in the set, then the
+ /// returned set is equivalent to the original.
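+ ///
+ /// # Example
+ ///
+ /// A short sketch showing that insertion is idempotent:
+ ///
+ /// ```
+ /// use regex_automata::util::look::{Look, LookSet};
+ ///
+ /// let set = LookSet::empty().insert(Look::WordAscii);
+ /// assert!(set.contains(Look::WordAscii));
+ /// // Inserting an assertion that is already present is a no-op.
+ /// assert_eq!(set, set.insert(Look::WordAscii));
+ /// ```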
+ #[inline]
+ pub fn insert(self, look: Look) -> LookSet {
+ LookSet { bits: self.bits | look.as_repr() }
+ }
+
+ /// Updates this set in place with the result of inserting the given
+ /// assertion into this set.
+ #[inline]
+ pub fn set_insert(&mut self, look: Look) {
+ *self = self.insert(look);
+ }
+
+ /// Return a new set that is equivalent to the original, but with the given
+ /// assertion removed from it. If the assertion is not in the set, then the
+ /// returned set is equivalent to the original.
+ #[inline]
+ pub fn remove(self, look: Look) -> LookSet {
+ LookSet { bits: self.bits & !look.as_repr() }
+ }
+
+ /// Updates this set in place with the result of removing the given
+ /// assertion from this set.
+ #[inline]
+ pub fn set_remove(&mut self, look: Look) {
+ *self = self.remove(look);
+ }
+
+ /// Returns a new set that is the result of subtracting the given set from
+ /// this set.
+ #[inline]
+ pub fn subtract(self, other: LookSet) -> LookSet {
+ LookSet { bits: self.bits & !other.bits }
+ }
+
+ /// Updates this set in place with the result of subtracting the given set
+ /// from this set.
+ #[inline]
+ pub fn set_subtract(&mut self, other: LookSet) {
+ *self = self.subtract(other);
+ }
+
+ /// Returns a new set that is the union of this and the one given.
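+ ///
+ /// # Example
+ ///
+ /// A brief sketch of the set algebra provided by `union` and `subtract`:
+ ///
+ /// ```
+ /// use regex_automata::util::look::{Look, LookSet};
+ ///
+ /// let anchors = LookSet::empty().insert(Look::Start).insert(Look::End);
+ /// let words = LookSet::singleton(Look::WordAscii);
+ /// let all = anchors.union(words);
+ /// assert_eq!(3, all.len());
+ /// assert_eq!(anchors, all.subtract(words));
+ /// ```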
+ #[inline]
+ pub fn union(self, other: LookSet) -> LookSet {
+ LookSet { bits: self.bits | other.bits }
+ }
+
+ /// Updates this set in place with the result of unioning it with the one
+ /// given.
+ #[inline]
+ pub fn set_union(&mut self, other: LookSet) {
+ *self = self.union(other);
+ }
+
+ /// Returns a new set that is the intersection of this and the one given.
+ #[inline]
+ pub fn intersect(self, other: LookSet) -> LookSet {
+ LookSet { bits: self.bits & other.bits }
+ }
+
+ /// Updates this set in place with the result of intersecting it with the
+ /// one given.
+ #[inline]
+ pub fn set_intersect(&mut self, other: LookSet) {
+ *self = self.intersect(other);
+ }
+
+ /// Return a `LookSet` from the slice given as a native endian 16-bit
+ /// integer.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `slice.len() < 2`.
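+ ///
+ /// # Example
+ ///
+ /// A minimal round-trip through the 16-bit representation:
+ ///
+ /// ```
+ /// use regex_automata::util::look::{Look, LookSet};
+ ///
+ /// let set = LookSet::singleton(Look::WordUnicode);
+ /// let mut buf = [0u8; 2];
+ /// set.write_repr(&mut buf);
+ /// assert_eq!(set, LookSet::read_repr(&buf));
+ /// ```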
+ #[inline]
+ pub fn read_repr(slice: &[u8]) -> LookSet {
+ let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap());
+ LookSet { bits }
+ }
+
+ /// Write a `LookSet` as a native endian 16-bit integer to the beginning
+ /// of the slice given.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `slice.len() < 2`.
+ #[inline]
+ pub fn write_repr(self, slice: &mut [u8]) {
+ let raw = self.bits.to_ne_bytes();
+ slice[0] = raw[0];
+ slice[1] = raw[1];
+ }
+
+ /// Checks that all assertions in this set can be matched.
+ ///
+ /// Some assertions, such as Unicode word boundaries, require optional (but
+ /// enabled by default) tables that may not be available. If there are
+ /// assertions in this set that require tables that are not available, then
+ /// this will return an error.
+ ///
+ /// Specifically, this returns an error when the
+ /// `unicode-word-boundary` feature is _not_ enabled _and_ this set
+ /// contains a Unicode word boundary assertion.
+ ///
+ /// It can be useful to use this on the result of
+ /// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any)
+ /// when building a matcher engine to ensure methods like
+ /// [`LookMatcher::matches_set`] do not panic at search time.
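+ ///
+ /// # Example
+ ///
+ /// ASCII word boundaries require no optional tables, so checking them
+ /// always succeeds:
+ ///
+ /// ```
+ /// use regex_automata::util::look::{Look, LookSet};
+ ///
+ /// assert!(LookSet::singleton(Look::WordAscii).available().is_ok());
+ /// ```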
+ pub fn available(self) -> Result<(), UnicodeWordBoundaryError> {
+ if self.contains_word_unicode() {
+ UnicodeWordBoundaryError::check()?;
+ }
+ Ok(())
+ }
+}
+
+impl core::fmt::Debug for LookSet {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ if self.is_empty() {
+ return write!(f, "∅");
+ }
+ for look in self.iter() {
+ write!(f, "{}", look.as_char())?;
+ }
+ Ok(())
+ }
+}
+
+/// An iterator over all look-around assertions in a [`LookSet`].
+///
+/// This iterator is created by [`LookSet::iter`].
+#[derive(Clone, Debug)]
+pub struct LookSetIter {
+ set: LookSet,
+}
+
+impl Iterator for LookSetIter {
+ type Item = Look;
+
+ #[inline]
+ fn next(&mut self) -> Option<Look> {
+ if self.set.is_empty() {
+ return None;
+ }
+ // The number of trailing zeros in a non-zero u16 is at most 15, so
+ // the conversion to u16 here always succeeds.
+ let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
+ let look = Look::from_repr(1 << repr)?;
+ self.set = self.set.remove(look);
+ Some(look)
+ }
+}
+
+/// A matcher for look-around assertions.
+///
+/// This matcher permits configuring aspects of how look-around assertions are
+/// matched.
+///
+/// # Example
+///
+/// A `LookMatcher` can change the line terminator used for matching multi-line
+/// anchors such as `(?m:^)` and `(?m:$)`.
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::{self, pikevm::PikeVM},
+/// util::look::LookMatcher,
+/// Match, Input,
+/// };
+///
+/// let mut lookm = LookMatcher::new();
+/// lookm.set_line_terminator(b'\x00');
+///
+/// let re = PikeVM::builder()
+/// .thompson(thompson::Config::new().look_matcher(lookm))
+/// .build(r"(?m)^[a-z]+$")?;
+/// let mut cache = re.create_cache();
+///
+/// // Multi-line assertions now use NUL as a terminator.
+/// assert_eq!(
+/// Some(Match::must(0, 1..4)),
+/// re.find(&mut cache, b"\x00abc\x00"),
+/// );
+/// // ... and \n is no longer recognized as a terminator.
+/// assert_eq!(
+/// None,
+/// re.find(&mut cache, b"\nabc\n"),
+/// );
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct LookMatcher {
+ lineterm: DebugByte,
+}
+
+impl LookMatcher {
+ /// Creates a new default matcher for look-around assertions.
+ pub fn new() -> LookMatcher {
+ LookMatcher { lineterm: DebugByte(b'\n') }
+ }
+
+ /// Sets the line terminator for use with `(?m:^)` and `(?m:$)`.
+ ///
+ /// Namely, instead of `^` matching after `\n` and `$` matching immediately
+ /// before a `\n`, this will cause it to match after and before the byte
+ /// given.
+ ///
+ /// It can occasionally be useful to set the line terminator to the NUL
+ /// byte when searching binary data.
+ ///
+ /// Note that this does not apply to CRLF-aware line anchors such as
+ /// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to
+ /// use `\r` and `\n`.
+ pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher {
+ self.lineterm.0 = byte;
+ self
+ }
+
+ /// Returns the line terminator that was configured for this matcher.
+ ///
+ /// If no line terminator was configured, then this returns `\n`.
+ ///
+ /// Note that the line terminator should only be used for matching `(?m:^)`
+ /// and `(?m:$)` assertions. It specifically should _not_ be used for
+ /// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`.
+ pub fn get_line_terminator(&self) -> u8 {
+ self.lineterm.0
+ }
+
+ /// Returns true when the position `at` in `haystack` satisfies the given
+ /// look-around assertion.
+ ///
+ /// # Panics
+ ///
+ /// This panics when testing any Unicode word boundary assertion in this
+ /// set and when the Unicode word data is not available. Specifically, this
+ /// only occurs when the `unicode-word-boundary` feature is not enabled.
+ ///
+ /// Since it's generally expected that this routine is called inside of
+ /// a matching engine, callers should check the error condition when
+ /// building the matching engine. If there is a Unicode word boundary
+ /// in the matcher and the data isn't available, then the matcher should
+ /// fail to build.
+ ///
+ /// Callers can check the error condition with [`LookSet::available`].
+ ///
+ /// This also may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
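+ ///
+ /// # Example
+ ///
+ /// A small sketch using the default matcher to test an ASCII word
+ /// boundary:
+ ///
+ /// ```
+ /// use regex_automata::util::look::{Look, LookMatcher};
+ ///
+ /// let lookm = LookMatcher::new();
+ /// // A boundary exists between 'b' and the space that follows it...
+ /// assert!(lookm.matches(Look::WordAscii, b"ab cd", 2));
+ /// // ... but not between 'a' and 'b'.
+ /// assert!(!lookm.matches(Look::WordAscii, b"ab cd", 1));
+ /// ```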
+ #[inline]
+ pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool {
+ self.matches_inline(look, haystack, at)
+ }
+
+ /// Like `matches`, but forcefully inlined.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn matches_inline(
+ &self,
+ look: Look,
+ haystack: &[u8],
+ at: usize,
+ ) -> bool {
+ match look {
+ Look::Start => self.is_start(haystack, at),
+ Look::End => self.is_end(haystack, at),
+ Look::StartLF => self.is_start_lf(haystack, at),
+ Look::EndLF => self.is_end_lf(haystack, at),
+ Look::StartCRLF => self.is_start_crlf(haystack, at),
+ Look::EndCRLF => self.is_end_crlf(haystack, at),
+ Look::WordAscii => self.is_word_ascii(haystack, at),
+ Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at),
+ Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(),
+ Look::WordUnicodeNegate => {
+ self.is_word_unicode_negate(haystack, at).unwrap()
+ }
+ }
+ }
+
+ /// Returns true when _all_ of the assertions in the given set match at the
+ /// given position in the haystack.
+ ///
+ /// # Panics
+ ///
+ /// This panics when testing any Unicode word boundary assertion in this
+ /// set and when the Unicode word data is not available. Specifically, this
+ /// only occurs when the `unicode-word-boundary` feature is not enabled.
+ ///
+ /// Since it's generally expected that this routine is called inside of
+ /// a matching engine, callers should check the error condition when
+ /// building the matching engine. If there is a Unicode word boundary
+ /// in the matcher and the data isn't available, then the matcher should
+ /// fail to build.
+ ///
+ /// Callers can check the error condition with [`LookSet::available`].
+ ///
+ /// This also may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
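+ ///
+ /// # Example
+ ///
+ /// Every assertion in the set must hold, so requiring both "start of
+ /// haystack" and "start of line" succeeds only at offset 0 here:
+ ///
+ /// ```
+ /// use regex_automata::util::look::{Look, LookMatcher, LookSet};
+ ///
+ /// let lookm = LookMatcher::new();
+ /// let set = LookSet::empty().insert(Look::Start).insert(Look::StartLF);
+ /// assert!(lookm.matches_set(set, b"abc", 0));
+ /// assert!(!lookm.matches_set(set, b"abc", 1));
+ /// ```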
+ #[inline]
+ pub fn matches_set(
+ &self,
+ set: LookSet,
+ haystack: &[u8],
+ at: usize,
+ ) -> bool {
+ self.matches_set_inline(set, haystack, at)
+ }
+
+ /// Like `matches_set`, but forcefully inlined for perf.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn matches_set_inline(
+ &self,
+ set: LookSet,
+ haystack: &[u8],
+ at: usize,
+ ) -> bool {
+ // This used to use LookSet::iter with Look::matches on each element,
+ // but that proved to be quite disastrous for perf. The manual "if
+ // the set has this assertion, check it" turns out to be quite a bit
+ // faster.
+ if set.contains(Look::Start) {
+ if !self.is_start(haystack, at) {
+ return false;
+ }
+ }
+ if set.contains(Look::End) {
+ if !self.is_end(haystack, at) {
+ return false;
+ }
+ }
+ if set.contains(Look::StartLF) {
+ if !self.is_start_lf(haystack, at) {
+ return false;
+ }
+ }
+ if set.contains(Look::EndLF) {
+ if !self.is_end_lf(haystack, at) {
+ return false;
+ }
+ }
+ if set.contains(Look::StartCRLF) {
+ if !self.is_start_crlf(haystack, at) {
+ return false;
+ }
+ }
+ if set.contains(Look::EndCRLF) {
+ if !self.is_end_crlf(haystack, at) {
+ return false;
+ }
+ }
+ if set.contains(Look::WordAscii) {
+ if !self.is_word_ascii(haystack, at) {
+ return false;
+ }
+ }
+ if set.contains(Look::WordAsciiNegate) {
+ if !self.is_word_ascii_negate(haystack, at) {
+ return false;
+ }
+ }
+ if set.contains(Look::WordUnicode) {
+ if !self.is_word_unicode(haystack, at).unwrap() {
+ return false;
+ }
+ }
+ if set.contains(Look::WordUnicodeNegate) {
+ if !self.is_word_unicode_negate(haystack, at).unwrap() {
+ return false;
+ }
+ }
+ true
+ }
+
+ /// Split up the given byte classes into equivalence classes in a way that
+ /// is consistent with this look-around assertion.
+ #[cfg(feature = "alloc")]
+ pub(crate) fn add_to_byteset(
+ &self,
+ look: Look,
+ set: &mut crate::util::alphabet::ByteClassSet,
+ ) {
+ match look {
+ Look::Start | Look::End => {}
+ Look::StartLF | Look::EndLF => {
+ set.set_range(self.lineterm.0, self.lineterm.0);
+ }
+ Look::StartCRLF | Look::EndCRLF => {
+ set.set_range(b'\r', b'\r');
+ set.set_range(b'\n', b'\n');
+ }
+ Look::WordAscii
+ | Look::WordAsciiNegate
+ | Look::WordUnicode
+ | Look::WordUnicodeNegate => {
+ // We need to mark all ranges of bytes whose pairs result in
+ // evaluating \b differently. This isn't technically correct
+ // for Unicode word boundaries, but DFAs can't handle those
+ // anyway, and thus, the byte classes don't need to either
+ // since they are themselves only used in DFAs.
+ //
+ // FIXME: It seems like the calls to 'set_range' here are
+ // completely invariant, which means we could just hard-code
+ // them here without needing to write a loop. And we only need
+ // to do this dance at most once per regex.
+ //
+ // FIXME: Is this correct for \B?
+ let iswb = utf8::is_word_byte;
+ // This unwrap is OK because we guard every use of 'asu8' with
+ // a check that the input is <= 255.
+ let asu8 = |b: u16| u8::try_from(b).unwrap();
+ let mut b1: u16 = 0;
+ let mut b2: u16;
+ while b1 <= 255 {
+ b2 = b1 + 1;
+ while b2 <= 255 && iswb(asu8(b1)) == iswb(asu8(b2)) {
+ b2 += 1;
+ }
+ // The guards above guarantee that b2 can never get any
+ // bigger.
+ assert!(b2 <= 256);
+ // Subtracting 1 from b2 is always OK because it is always
+ // at least 1 greater than b1, and the assert above
+ // guarantees that the asu8 conversion will succeed.
+ set.set_range(asu8(b1), asu8(b2.checked_sub(1).unwrap()));
+ b1 = b2;
+ }
+ }
+ }
+ }
+
+ /// Returns true when [`Look::Start`] is satisfied `at` the given position
+ /// in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ #[inline]
+ pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool {
+ at == 0
+ }
+
+ /// Returns true when [`Look::End`] is satisfied `at` the given position in
+ /// `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ #[inline]
+ pub fn is_end(&self, haystack: &[u8], at: usize) -> bool {
+ at == haystack.len()
+ }
+
+ /// Returns true when [`Look::StartLF`] is satisfied `at` the given
+ /// position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ #[inline]
+ pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool {
+ self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0
+ }
+
+ /// Returns true when [`Look::EndLF`] is satisfied `at` the given position
+ /// in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ #[inline]
+ pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool {
+ self.is_end(haystack, at) || haystack[at] == self.lineterm.0
+ }
+
+ /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given
+ /// position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ #[inline]
+ pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool {
+ self.is_start(haystack, at)
+ || haystack[at - 1] == b'\n'
+ || (haystack[at - 1] == b'\r'
+ && (at >= haystack.len() || haystack[at] != b'\n'))
+ }
+
+ /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given
+ /// position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ #[inline]
+ pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool {
+ self.is_end(haystack, at)
+ || haystack[at] == b'\r'
+ || (haystack[at] == b'\n'
+ && (at == 0 || haystack[at - 1] != b'\r'))
+ }
+
+ /// Returns true when [`Look::WordAscii`] is satisfied `at` the given
+ /// position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ #[inline]
+ pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool {
+ let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+ let word_after =
+ at < haystack.len() && utf8::is_word_byte(haystack[at]);
+ word_before != word_after
+ }
+
+ /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given
+ /// position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ #[inline]
+ pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool {
+ !self.is_word_ascii(haystack, at)
+ }
+
+ /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given
+ /// position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when Unicode word boundary tables
+ /// are not available. Specifically, this only occurs when the
+ /// `unicode-word-boundary` feature is not enabled.
+ #[inline]
+ pub fn is_word_unicode(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ ) -> Result<bool, UnicodeWordBoundaryError> {
+ let word_before = is_word_char::rev(haystack, at)?;
+ let word_after = is_word_char::fwd(haystack, at)?;
+ Ok(word_before != word_after)
+ }
+
+ /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the
+ /// given position in `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// This may panic when `at > haystack.len()`. Note that `at ==
+ /// haystack.len()` is legal and guaranteed not to panic.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when Unicode word boundary tables
+ /// are not available. Specifically, this only occurs when the
+ /// `unicode-word-boundary` feature is not enabled.
+ #[inline]
+ pub fn is_word_unicode_negate(
+ &self,
+ haystack: &[u8],
+ at: usize,
+ ) -> Result<bool, UnicodeWordBoundaryError> {
+ // This is pretty subtle. Why do we need to do UTF-8 decoding here?
+ // Well... at time of writing, the is_word_char_{fwd,rev} routines will
+ // only return true if there is a valid UTF-8 encoding of a "word"
+ // codepoint, and false in every other case (including invalid UTF-8).
+ // This means that in regions of invalid UTF-8 (which might be a
+ // subset of valid UTF-8!), it would result in \B matching. While this
+ // would be questionable in the context of truly invalid UTF-8, it is
+ // *certainly* wrong to report match boundaries that split the encoding
+ // of a codepoint. So to work around this, we ensure that we can decode
+ // a codepoint on either side of `at`. If either direction fails, then
+ // we don't permit \B to match at all.
+ //
+ // Now, this isn't exactly optimal from a perf perspective. We could
+ // try and detect this in is_word_char::{fwd,rev}, but it's not clear
+ // if it's worth it. \B is, after all, rarely used. Even worse,
+ // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this
+ // will wind up doing UTF-8 decoding twice. Owch. We could fix this
+ // with more code complexity, but it just doesn't feel worth it for \B.
+ //
+ // And in particular, we do *not* have to do this with \b, because \b
+ // *requires* that at least one side of `at` be a "word" codepoint,
+ // which in turn implies one side of `at` must be valid UTF-8. This in
+ // turn implies that \b can never split a valid UTF-8 encoding of a
+ // codepoint. In the case where one side of `at` is truly invalid UTF-8
+ // and the other side IS a word codepoint, then we want \b to match
+ // since it represents a valid UTF-8 boundary. It also makes sense. For
+ // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'.
+ //
+ // Note also that this is not just '!is_word_unicode(..)' like it is
+ // for the ASCII case. For example, neither \b nor \B is satisfied
+ // within invalid UTF-8 sequences.
+ let word_before = at > 0
+ && match utf8::decode_last(&haystack[..at]) {
+ None | Some(Err(_)) => return Ok(false),
+ Some(Ok(_)) => is_word_char::rev(haystack, at)?,
+ };
+ let word_after = at < haystack.len()
+ && match utf8::decode(&haystack[at..]) {
+ None | Some(Err(_)) => return Ok(false),
+ Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
+ };
+ Ok(word_before == word_after)
+ }
+}
+
+impl Default for LookMatcher {
+ fn default() -> LookMatcher {
+ LookMatcher::new()
+ }
+}
+
+/// An error that occurs when the Unicode-aware `\w` class is unavailable.
+///
+/// This error can occur when the data tables necessary for the Unicode aware
+/// Perl character class `\w` are unavailable. The `\w` class is used to
+/// determine whether a codepoint is considered a word character or not when
+/// determining whether a Unicode aware `\b` (or `\B`) matches at a particular
+/// position.
+///
+/// This error can only occur when the `unicode-word-boundary` feature is
+/// disabled.
+#[derive(Clone, Debug)]
+pub struct UnicodeWordBoundaryError(());
+
+impl UnicodeWordBoundaryError {
+ #[cfg(not(feature = "unicode-word-boundary"))]
+ pub(crate) fn new() -> UnicodeWordBoundaryError {
+ UnicodeWordBoundaryError(())
+ }
+
+ /// Returns an error if and only if Unicode word boundary data is
+ /// unavailable.
+ pub fn check() -> Result<(), UnicodeWordBoundaryError> {
+ is_word_char::check()
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for UnicodeWordBoundaryError {}
+
+impl core::fmt::Display for UnicodeWordBoundaryError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ write!(
+ f,
+ "Unicode-aware \\b and \\B are unavailable because the \
+ requisite data tables are missing, please enable the \
+ unicode-word-boundary feature"
+ )
+ }
+}
+
+// Below are FOUR different ways for checking whether a "word"
+// codepoint exists at a particular position in the haystack. The four
+// different approaches are, in order of preference:
+//
+// 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the
+// first call, and then use that DFA for all subsequent calls.
+// 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available.
+// 3. Do UTF-8 decoding and use our own 'perl_word' table.
+// 4. Return an error.
+//
+// The reason for all of these approaches is a combination of perf and
+// permitting one to build regex-automata without the Unicode data necessary
+// for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would
+// still work.)
+//
+// The DFA approach is the fastest, but it requires the regex parser, the
+// NFA compiler, the DFA builder and the DFA search runtime. That's a lot to
+// bring in, but if it's available, it's (probably) the best we can do.
+//
+// Approaches (2) and (3) are effectively equivalent, but (2) reuses the
+// data in regex-syntax and avoids duplicating it in regex-automata.
+//
+// Finally, (4) unconditionally returns an error since the requisite data isn't
+// available anywhere.
+//
+// There are actually more approaches possible that we didn't implement. For
+// example, if the DFA builder is available but the syntax parser is not, we
+// could technically hand construct our own NFA from the 'perl_word' data
+// table. But to avoid some pretty hairy code duplication, we would in turn
+// need to pull the UTF-8 compiler out of the NFA compiler. Yikes.
+//
+// A possibly more sensible alternative is to use a lazy DFA when the full
+// DFA builder isn't available...
+//
+// Yet another choice would be to build the full DFA and then embed it into the
+// source. Then we'd only need to bring in the DFA search runtime, which is
+// considerably smaller than the DFA builder code. The problem here is that the
+// Debian people have spooked me[1] into avoiding cyclic dependencies. Namely,
+// we'd need to build regex-cli, which depends on regex-automata in order to
+// build some part of regex-automata. But to be honest, something like this has
+// to be allowed somehow? I just don't know what the right process is.
+//
+// There are perhaps other choices as well. Why did I stop at these 4? Because
+// I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA
+// approach eventually, as the benefits of the DFA approach are somewhat
+// compelling. The 'boundary-words-holmes' benchmark tests this:
+//
+// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv
+//
+// Then I changed the code below so that the util/unicode_data/perl_word table
+// was used and re-ran the benchmark:
+//
+// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv
+//
+// And compared them:
+//
+// $ regex-cli bench diff dfa.csv table.csv
+// benchmark engine dfa table
+// --------- ------ --- -----
+// internal/count/boundary-words-holmes regex/automata/pikevm 18.6 MB/s 12.9 MB/s
+//
+// Which is a nice improvement.
+//
+// UPDATE: It turns out that it takes approximately 22ms to build the reverse
+// DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in
+// the grand scheme things, but that is a significant latency cost. So I'm not
+// sure that's a good idea. I then tried using a lazy DFA instead, and that
+// eliminated the overhead, but since the lazy DFA requires mutable working
+// memory, that requires introducing a 'Cache' for every simultaneous call.
+//
+// I ended up deciding for now to just keep the "UTF-8 decode and check the
+// table" approach. The DFA and lazy DFA approaches are still below, but
+// commented out.
+//
+// [1]: https://github.com/BurntSushi/ucd-generate/issues/11
+
+/*
+/// A module that looks for word codepoints using lazy DFAs.
+#[cfg(all(
+ feature = "unicode-word-boundary",
+ feature = "syntax",
+ feature = "unicode-perl",
+ feature = "hybrid"
+))]
+mod is_word_char {
+ use alloc::vec::Vec;
+
+ use crate::{
+ hybrid::dfa::{Cache, DFA},
+ nfa::thompson::NFA,
+ util::{lazy::Lazy, pool::Pool, primitives::StateID},
+ Anchored, Input,
+ };
+
+ pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
+ Ok(())
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(super) fn fwd(
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Result<bool, super::UnicodeWordBoundaryError> {
+ static WORD: Lazy<DFA> = Lazy::new(|| DFA::new(r"\w").unwrap());
+ static CACHE: Lazy<Pool<Cache>> =
+ Lazy::new(|| Pool::new(|| WORD.create_cache()));
+ let dfa = Lazy::get(&WORD);
+ let mut cache = Lazy::get(&CACHE).get();
+ let mut sid = dfa
+ .start_state_forward(
+ &mut cache,
+ &Input::new("").anchored(Anchored::Yes),
+ )
+ .unwrap();
+ while at < haystack.len() {
+ let byte = haystack[at];
+ sid = dfa.next_state(&mut cache, sid, byte).unwrap();
+ at += 1;
+ if sid.is_tagged() {
+ if sid.is_match() {
+ return Ok(true);
+ } else if sid.is_dead() {
+ return Ok(false);
+ }
+ }
+ }
+ Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(super) fn rev(
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Result<bool, super::UnicodeWordBoundaryError> {
+ static WORD: Lazy<DFA> = Lazy::new(|| {
+ DFA::builder()
+ .thompson(NFA::config().reverse(true))
+ .build(r"\w")
+ .unwrap()
+ });
+ static CACHE: Lazy<Pool<Cache>> =
+ Lazy::new(|| Pool::new(|| WORD.create_cache()));
+ let dfa = Lazy::get(&WORD);
+ let mut cache = Lazy::get(&CACHE).get();
+ let mut sid = dfa
+ .start_state_reverse(
+ &mut cache,
+ &Input::new("").anchored(Anchored::Yes),
+ )
+ .unwrap();
+ while at > 0 {
+ at -= 1;
+ let byte = haystack[at];
+ sid = dfa.next_state(&mut cache, sid, byte).unwrap();
+ if sid.is_tagged() {
+ if sid.is_match() {
+ return Ok(true);
+ } else if sid.is_dead() {
+ return Ok(false);
+ }
+ }
+ }
+ Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())
+ }
+}
+*/
+
+/*
+/// A module that looks for word codepoints using fully compiled DFAs.
+#[cfg(all(
+ feature = "unicode-word-boundary",
+ feature = "syntax",
+ feature = "unicode-perl",
+ feature = "dfa-build"
+))]
+mod is_word_char {
+ use alloc::vec::Vec;
+
+ use crate::{
+ dfa::{dense::DFA, Automaton, StartKind},
+ nfa::thompson::NFA,
+ util::{lazy::Lazy, primitives::StateID},
+ Anchored, Input,
+ };
+
+ pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
+ Ok(())
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(super) fn fwd(
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Result<bool, super::UnicodeWordBoundaryError> {
+ static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| {
+ let dfa = DFA::builder()
+ .configure(DFA::config().start_kind(StartKind::Anchored))
+ .build(r"\w")
+ .unwrap();
+ // OK because our regex has no look-around.
+ let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();
+ (dfa, start_id)
+ });
+ let &(ref dfa, mut sid) = Lazy::get(&WORD);
+ while at < haystack.len() {
+ let byte = haystack[at];
+ sid = dfa.next_state(sid, byte);
+ at += 1;
+ if dfa.is_special_state(sid) {
+ if dfa.is_match_state(sid) {
+ return Ok(true);
+ } else if dfa.is_dead_state(sid) {
+ return Ok(false);
+ }
+ }
+ }
+ Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(super) fn rev(
+ haystack: &[u8],
+ mut at: usize,
+ ) -> Result<bool, super::UnicodeWordBoundaryError> {
+ static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| {
+ let dfa = DFA::builder()
+ .configure(DFA::config().start_kind(StartKind::Anchored))
+ // From ad hoc measurements, it looks like setting
+ // shrink==false is slightly faster than shrink==true. I kind
+ // of feel like this indicates that shrinking is probably a
+ // failure, although it can help in some cases. Sigh.
+ .thompson(NFA::config().reverse(true).shrink(false))
+ .build(r"\w")
+ .unwrap();
+ // OK because our regex has no look-around.
+ let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();
+ (dfa, start_id)
+ });
+ let &(ref dfa, mut sid) = Lazy::get(&WORD);
+ while at > 0 {
+ at -= 1;
+ let byte = haystack[at];
+ sid = dfa.next_state(sid, byte);
+ if dfa.is_special_state(sid) {
+ if dfa.is_match_state(sid) {
+ return Ok(true);
+ } else if dfa.is_dead_state(sid) {
+ return Ok(false);
+ }
+ }
+ }
+ Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))
+ }
+}
+*/
+
+/// A module that looks for word codepoints using regex-syntax's data tables.
+#[cfg(all(
+ feature = "unicode-word-boundary",
+ feature = "syntax",
+ feature = "unicode-perl",
+))]
+mod is_word_char {
+ use regex_syntax::try_is_word_character;
+
+ use crate::util::utf8;
+
+ pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
+ Ok(())
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(super) fn fwd(
+ haystack: &[u8],
+ at: usize,
+ ) -> Result<bool, super::UnicodeWordBoundaryError> {
+ Ok(match utf8::decode(&haystack[at..]) {
+ None | Some(Err(_)) => false,
+ Some(Ok(ch)) => try_is_word_character(ch).expect(
+ "since unicode-word-boundary, syntax and unicode-perl \
+ are all enabled, it is expected that \
+ try_is_word_character succeeds",
+ ),
+ })
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(super) fn rev(
+ haystack: &[u8],
+ at: usize,
+ ) -> Result<bool, super::UnicodeWordBoundaryError> {
+ Ok(match utf8::decode_last(&haystack[..at]) {
+ None | Some(Err(_)) => false,
+ Some(Ok(ch)) => try_is_word_character(ch).expect(
+ "since unicode-word-boundary, syntax and unicode-perl \
+ are all enabled, it is expected that \
+ try_is_word_character succeeds",
+ ),
+ })
+ }
+}
+
+/// A module that looks for word codepoints using regex-automata's data tables
+/// (which are only compiled when regex-syntax's tables aren't available).
+///
+/// Note that the cfg should match the one in src/util/unicode_data/mod.rs for
+/// perl_word.
+#[cfg(all(
+ feature = "unicode-word-boundary",
+ not(all(feature = "syntax", feature = "unicode-perl")),
+))]
+mod is_word_char {
+ use crate::util::utf8;
+
+ pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
+ Ok(())
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(super) fn fwd(
+ haystack: &[u8],
+ at: usize,
+ ) -> Result<bool, super::UnicodeWordBoundaryError> {
+ Ok(match utf8::decode(&haystack[at..]) {
+ None | Some(Err(_)) => false,
+ Some(Ok(ch)) => is_word_character(ch),
+ })
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(super) fn rev(
+ haystack: &[u8],
+ at: usize,
+ ) -> Result<bool, super::UnicodeWordBoundaryError> {
+ Ok(match utf8::decode_last(&haystack[..at]) {
+ None | Some(Err(_)) => false,
+ Some(Ok(ch)) => is_word_character(ch),
+ })
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_word_character(c: char) -> bool {
+ use crate::util::{unicode_data::perl_word::PERL_WORD, utf8};
+
+ // MSRV(1.59): Use 'u8::try_from(c)' instead.
+ if u8::try_from(u32::from(c)).map_or(false, utf8::is_word_byte) {
+ return true;
+ }
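+ // PERL_WORD is assumed here to be a sorted sequence of non-overlapping,
+ // inclusive codepoint ranges, so a binary search over range containment
+ // determines whether 'c' belongs to any of them.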
+ PERL_WORD
+ .binary_search_by(|&(start, end)| {
+ use core::cmp::Ordering;
+
+ if start <= c && c <= end {
+ Ordering::Equal
+ } else if start > c {
+ Ordering::Greater
+ } else {
+ Ordering::Less
+ }
+ })
+ .is_ok()
+ }
+}
+
+/// A module that always returns an error if Unicode word boundaries are
+/// disabled. When this feature is disabled, then regex-automata will not
+/// include its own data tables even if regex-syntax is disabled.
+#[cfg(not(feature = "unicode-word-boundary"))]
+mod is_word_char {
+ pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
+ Err(super::UnicodeWordBoundaryError::new())
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(super) fn fwd(
+ _bytes: &[u8],
+ _at: usize,
+ ) -> Result<bool, super::UnicodeWordBoundaryError> {
+ Err(super::UnicodeWordBoundaryError::new())
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(super) fn rev(
+ _bytes: &[u8],
+ _at: usize,
+ ) -> Result<bool, super::UnicodeWordBoundaryError> {
+ Err(super::UnicodeWordBoundaryError::new())
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ macro_rules! testlook {
+ ($look:expr, $haystack:expr, $at:expr) => {
+ LookMatcher::default().matches($look, $haystack.as_bytes(), $at)
+ };
+ }
+
+ #[test]
+ fn look_matches_start_line() {
+ let look = Look::StartLF;
+
+ assert!(testlook!(look, "", 0));
+ assert!(testlook!(look, "\n", 0));
+ assert!(testlook!(look, "\n", 1));
+ assert!(testlook!(look, "a", 0));
+ assert!(testlook!(look, "\na", 1));
+
+ assert!(!testlook!(look, "a", 1));
+ assert!(!testlook!(look, "a\na", 1));
+ }
+
+ #[test]
+ fn look_matches_end_line() {
+ let look = Look::EndLF;
+
+ assert!(testlook!(look, "", 0));
+ assert!(testlook!(look, "\n", 1));
+ assert!(testlook!(look, "\na", 0));
+ assert!(testlook!(look, "\na", 2));
+ assert!(testlook!(look, "a\na", 1));
+
+ assert!(!testlook!(look, "a", 0));
+ assert!(!testlook!(look, "\na", 1));
+ assert!(!testlook!(look, "a\na", 0));
+ assert!(!testlook!(look, "a\na", 2));
+ }
+
+ #[test]
+ fn look_matches_start_text() {
+ let look = Look::Start;
+
+ assert!(testlook!(look, "", 0));
+ assert!(testlook!(look, "\n", 0));
+ assert!(testlook!(look, "a", 0));
+
+ assert!(!testlook!(look, "\n", 1));
+ assert!(!testlook!(look, "\na", 1));
+ assert!(!testlook!(look, "a", 1));
+ assert!(!testlook!(look, "a\na", 1));
+ }
+
+ #[test]
+ fn look_matches_end_text() {
+ let look = Look::End;
+
+ assert!(testlook!(look, "", 0));
+ assert!(testlook!(look, "\n", 1));
+ assert!(testlook!(look, "\na", 2));
+
+ assert!(!testlook!(look, "\na", 0));
+ assert!(!testlook!(look, "a\na", 1));
+ assert!(!testlook!(look, "a", 0));
+ assert!(!testlook!(look, "\na", 1));
+ assert!(!testlook!(look, "a\na", 0));
+ assert!(!testlook!(look, "a\na", 2));
+ }
+
+ #[test]
+ #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
+ fn look_matches_word_unicode() {
+ let look = Look::WordUnicode;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(testlook!(look, "a", 0));
+ assert!(testlook!(look, "a", 1));
+ assert!(testlook!(look, "a ", 1));
+ assert!(testlook!(look, " a ", 1));
+ assert!(testlook!(look, " a ", 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint.
+ assert!(testlook!(look, "𝛃", 0));
+ assert!(testlook!(look, "𝛃", 4));
+ assert!(testlook!(look, "𝛃 ", 4));
+ assert!(testlook!(look, " 𝛃 ", 1));
+ assert!(testlook!(look, " 𝛃 ", 5));
+
+ // Unicode word boundaries between non-ASCII codepoints.
+ assert!(testlook!(look, "𝛃𐆀", 0));
+ assert!(testlook!(look, "𝛃𐆀", 4));
+
+ // Non word boundaries for ASCII.
+ assert!(!testlook!(look, "", 0));
+ assert!(!testlook!(look, "ab", 1));
+ assert!(!testlook!(look, "a ", 2));
+ assert!(!testlook!(look, " a ", 0));
+ assert!(!testlook!(look, " a ", 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(!testlook!(look, "𝛃b", 4));
+ assert!(!testlook!(look, "𝛃 ", 5));
+ assert!(!testlook!(look, " 𝛃 ", 0));
+ assert!(!testlook!(look, " 𝛃 ", 6));
+ assert!(!testlook!(look, "𝛃", 1));
+ assert!(!testlook!(look, "𝛃", 2));
+ assert!(!testlook!(look, "𝛃", 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(!testlook!(look, "𝛃𐆀", 1));
+ assert!(!testlook!(look, "𝛃𐆀", 2));
+ assert!(!testlook!(look, "𝛃𐆀", 3));
+ assert!(!testlook!(look, "𝛃𐆀", 5));
+ assert!(!testlook!(look, "𝛃𐆀", 6));
+ assert!(!testlook!(look, "𝛃𐆀", 7));
+ assert!(!testlook!(look, "𝛃𐆀", 8));
+ }
+
+ #[test]
+ fn look_matches_word_ascii() {
+ let look = Look::WordAscii;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(testlook!(look, "a", 0));
+ assert!(testlook!(look, "a", 1));
+ assert!(testlook!(look, "a ", 1));
+ assert!(testlook!(look, " a ", 1));
+ assert!(testlook!(look, " a ", 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint. Since this is
+ // an ASCII word boundary, none of these match.
+ assert!(!testlook!(look, "𝛃", 0));
+ assert!(!testlook!(look, "𝛃", 4));
+ assert!(!testlook!(look, "𝛃 ", 4));
+ assert!(!testlook!(look, " 𝛃 ", 1));
+ assert!(!testlook!(look, " 𝛃 ", 5));
+
+ // Unicode word boundaries between non-ASCII codepoints. Again, since
+ // this is an ASCII word boundary, none of these match.
+ assert!(!testlook!(look, "𝛃𐆀", 0));
+ assert!(!testlook!(look, "𝛃𐆀", 4));
+
+ // Non word boundaries for ASCII.
+ assert!(!testlook!(look, "", 0));
+ assert!(!testlook!(look, "ab", 1));
+ assert!(!testlook!(look, "a ", 2));
+ assert!(!testlook!(look, " a ", 0));
+ assert!(!testlook!(look, " a ", 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(testlook!(look, "𝛃b", 4));
+ assert!(!testlook!(look, "𝛃 ", 5));
+ assert!(!testlook!(look, " 𝛃 ", 0));
+ assert!(!testlook!(look, " 𝛃 ", 6));
+ assert!(!testlook!(look, "𝛃", 1));
+ assert!(!testlook!(look, "𝛃", 2));
+ assert!(!testlook!(look, "𝛃", 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(!testlook!(look, "𝛃𐆀", 1));
+ assert!(!testlook!(look, "𝛃𐆀", 2));
+ assert!(!testlook!(look, "𝛃𐆀", 3));
+ assert!(!testlook!(look, "𝛃𐆀", 5));
+ assert!(!testlook!(look, "𝛃𐆀", 6));
+ assert!(!testlook!(look, "𝛃𐆀", 7));
+ assert!(!testlook!(look, "𝛃𐆀", 8));
+ }
+
+ #[test]
+ #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
+ fn look_matches_word_unicode_negate() {
+ let look = Look::WordUnicodeNegate;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(!testlook!(look, "a", 0));
+ assert!(!testlook!(look, "a", 1));
+ assert!(!testlook!(look, "a ", 1));
+ assert!(!testlook!(look, " a ", 1));
+ assert!(!testlook!(look, " a ", 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint.
+ assert!(!testlook!(look, "𝛃", 0));
+ assert!(!testlook!(look, "𝛃", 4));
+ assert!(!testlook!(look, "𝛃 ", 4));
+ assert!(!testlook!(look, " 𝛃 ", 1));
+ assert!(!testlook!(look, " 𝛃 ", 5));
+
+ // Unicode word boundaries between non-ASCII codepoints.
+ assert!(!testlook!(look, "𝛃𐆀", 0));
+ assert!(!testlook!(look, "𝛃𐆀", 4));
+
+ // Non word boundaries for ASCII.
+ assert!(testlook!(look, "", 0));
+ assert!(testlook!(look, "ab", 1));
+ assert!(testlook!(look, "a ", 2));
+ assert!(testlook!(look, " a ", 0));
+ assert!(testlook!(look, " a ", 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(testlook!(look, "𝛃b", 4));
+ assert!(testlook!(look, "𝛃 ", 5));
+ assert!(testlook!(look, " 𝛃 ", 0));
+ assert!(testlook!(look, " 𝛃 ", 6));
+ // These don't match because they could otherwise return an offset that
+ // splits the UTF-8 encoding of a codepoint.
+ assert!(!testlook!(look, "𝛃", 1));
+ assert!(!testlook!(look, "𝛃", 2));
+ assert!(!testlook!(look, "𝛃", 3));
+
+ // Non word boundaries with non-ASCII codepoints. These also don't
+ // match because they could otherwise return an offset that splits the
+ // UTF-8 encoding of a codepoint.
+ assert!(!testlook!(look, "𝛃𐆀", 1));
+ assert!(!testlook!(look, "𝛃𐆀", 2));
+ assert!(!testlook!(look, "𝛃𐆀", 3));
+ assert!(!testlook!(look, "𝛃𐆀", 5));
+ assert!(!testlook!(look, "𝛃𐆀", 6));
+ assert!(!testlook!(look, "𝛃𐆀", 7));
+ // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end
+ // of the haystack. So the "end" of the haystack isn't a word and 𐆀
+ // isn't a word, thus, \B matches.
+ assert!(testlook!(look, "𝛃𐆀", 8));
+ }
+
+ #[test]
+ fn look_matches_word_ascii_negate() {
+ let look = Look::WordAsciiNegate;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(!testlook!(look, "a", 0));
+ assert!(!testlook!(look, "a", 1));
+ assert!(!testlook!(look, "a ", 1));
+ assert!(!testlook!(look, " a ", 1));
+ assert!(!testlook!(look, " a ", 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint. Since this is
+ // a negated ASCII word boundary, all of these match.
+ assert!(testlook!(look, "𝛃", 0));
+ assert!(testlook!(look, "𝛃", 4));
+ assert!(testlook!(look, "𝛃 ", 4));
+ assert!(testlook!(look, " 𝛃 ", 1));
+ assert!(testlook!(look, " 𝛃 ", 5));
+
+ // Unicode word boundaries between non-ASCII codepoints. Again, since
+ // this is a negated ASCII word boundary, all of these match.
+ assert!(testlook!(look, "𝛃𐆀", 0));
+ assert!(testlook!(look, "𝛃𐆀", 4));
+
+ // Non word boundaries for ASCII.
+ assert!(testlook!(look, "", 0));
+ assert!(testlook!(look, "ab", 1));
+ assert!(testlook!(look, "a ", 2));
+ assert!(testlook!(look, " a ", 0));
+ assert!(testlook!(look, " a ", 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(!testlook!(look, "𝛃b", 4));
+ assert!(testlook!(look, "𝛃 ", 5));
+ assert!(testlook!(look, " 𝛃 ", 0));
+ assert!(testlook!(look, " 𝛃 ", 6));
+ assert!(testlook!(look, "𝛃", 1));
+ assert!(testlook!(look, "𝛃", 2));
+ assert!(testlook!(look, "𝛃", 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(testlook!(look, "𝛃𐆀", 1));
+ assert!(testlook!(look, "𝛃𐆀", 2));
+ assert!(testlook!(look, "𝛃𐆀", 3));
+ assert!(testlook!(look, "𝛃𐆀", 5));
+ assert!(testlook!(look, "𝛃𐆀", 6));
+ assert!(testlook!(look, "𝛃𐆀", 7));
+ assert!(testlook!(look, "𝛃𐆀", 8));
+ }
+
+ #[test]
+ fn look_set() {
+ let mut f = LookSet::default();
+ assert!(!f.contains(Look::Start));
+ assert!(!f.contains(Look::End));
+ assert!(!f.contains(Look::StartLF));
+ assert!(!f.contains(Look::EndLF));
+ assert!(!f.contains(Look::WordUnicode));
+ assert!(!f.contains(Look::WordUnicodeNegate));
+ assert!(!f.contains(Look::WordAscii));
+ assert!(!f.contains(Look::WordAsciiNegate));
+
+ f = f.insert(Look::Start);
+ assert!(f.contains(Look::Start));
+ f = f.remove(Look::Start);
+ assert!(!f.contains(Look::Start));
+
+ f = f.insert(Look::End);
+ assert!(f.contains(Look::End));
+ f = f.remove(Look::End);
+ assert!(!f.contains(Look::End));
+
+ f = f.insert(Look::StartLF);
+ assert!(f.contains(Look::StartLF));
+ f = f.remove(Look::StartLF);
+ assert!(!f.contains(Look::StartLF));
+
+ f = f.insert(Look::EndLF);
+ assert!(f.contains(Look::EndLF));
+ f = f.remove(Look::EndLF);
+ assert!(!f.contains(Look::EndLF));
+
+ f = f.insert(Look::StartCRLF);
+ assert!(f.contains(Look::StartCRLF));
+ f = f.remove(Look::StartCRLF);
+ assert!(!f.contains(Look::StartCRLF));
+
+ f = f.insert(Look::EndCRLF);
+ assert!(f.contains(Look::EndCRLF));
+ f = f.remove(Look::EndCRLF);
+ assert!(!f.contains(Look::EndCRLF));
+
+ f = f.insert(Look::WordUnicode);
+ assert!(f.contains(Look::WordUnicode));
+ f = f.remove(Look::WordUnicode);
+ assert!(!f.contains(Look::WordUnicode));
+
+ f = f.insert(Look::WordUnicodeNegate);
+ assert!(f.contains(Look::WordUnicodeNegate));
+ f = f.remove(Look::WordUnicodeNegate);
+ assert!(!f.contains(Look::WordUnicodeNegate));
+
+ f = f.insert(Look::WordAscii);
+ assert!(f.contains(Look::WordAscii));
+ f = f.remove(Look::WordAscii);
+ assert!(!f.contains(Look::WordAscii));
+
+ f = f.insert(Look::WordAsciiNegate);
+ assert!(f.contains(Look::WordAsciiNegate));
+ f = f.remove(Look::WordAsciiNegate);
+ assert!(!f.contains(Look::WordAsciiNegate));
+ }
+
+ #[test]
+ fn look_set_iter() {
+ let set = LookSet::empty();
+ assert_eq!(0, set.iter().count());
+
+ let set = LookSet::full();
+ assert_eq!(10, set.iter().count());
+
+ let set =
+ LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
+ assert_eq!(2, set.iter().count());
+
+ let set = LookSet::empty().insert(Look::StartLF);
+ assert_eq!(1, set.iter().count());
+
+ let set = LookSet::empty().insert(Look::WordAsciiNegate);
+ assert_eq!(1, set.iter().count());
+ }
+
+ #[test]
+ #[cfg(feature = "alloc")]
+ fn look_set_debug() {
+ let res = alloc::format!("{:?}", LookSet::empty());
+ assert_eq!("∅", res);
+ let res = alloc::format!("{:?}", LookSet::full());
+ assert_eq!("Az^$rRbB𝛃𝚩", res);
+ }
+}
diff --git a/vendor/regex-automata/src/util/matchtypes.rs b/vendor/regex-automata/src/util/matchtypes.rs
deleted file mode 100644
index de0fa65bf..000000000
--- a/vendor/regex-automata/src/util/matchtypes.rs
+++ /dev/null
@@ -1,356 +0,0 @@
-use crate::util::id::PatternID;
-
-/// The kind of match semantics to use for a DFA.
-///
-/// The default match kind is `LeftmostFirst`.
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-pub enum MatchKind {
- /// Report all possible matches.
- All,
- /// Report only the leftmost matches. When multiple leftmost matches exist,
- /// report the match corresponding to the part of the regex that appears
- /// first in the syntax.
- LeftmostFirst,
- /// Hints that destructuring should not be exhaustive.
- ///
- /// This enum may grow additional variants, so this makes sure clients
- /// don't count on exhaustive matching. (Otherwise, adding a new variant
- /// could break existing code.)
- #[doc(hidden)]
- __Nonexhaustive,
- // There is prior art in RE2 that shows that we should be able to add
- // LeftmostLongest too. The tricky part of it is supporting ungreedy
- // repetitions. Instead of treating all NFA states as having equivalent
- // priority (as in 'All') or treating all NFA states as having distinct
- // priority based on order (as in 'LeftmostFirst'), we instead group NFA
- // states into sets, and treat members of each set as having equivalent
- // priority, but having greater priority than all following members
- // of different sets.
- //
- // However, it's not clear whether it's really worth adding this. After
- // all, leftmost-longest can be emulated when using literals by using
- // leftmost-first and sorting the literals by length in descending order.
- // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
- // always match `a` in `ab` when using leftmost-first, but leftmost-longest
- // would match `ab`.
-}
-
-impl MatchKind {
- #[cfg(feature = "alloc")]
- pub(crate) fn continue_past_first_match(&self) -> bool {
- *self == MatchKind::All
- }
-}
-
-impl Default for MatchKind {
- fn default() -> MatchKind {
- MatchKind::LeftmostFirst
- }
-}
-
-/// A representation of a match reported by a regex engine.
-///
-/// A match records the start and end offsets of the match in the haystack.
-///
-/// Every match guarantees that `start <= end`.
-#[derive(Clone, Debug, Eq, Hash, PartialEq)]
-pub struct Match {
- /// The start offset of the match, inclusive.
- start: usize,
- /// The end offset of the match, exclusive.
- end: usize,
-}
-
-impl Match {
- /// Create a new match from a byte offset span.
- ///
- /// # Panics
- ///
- /// This panics if `end < start`.
- #[inline]
- pub fn new(start: usize, end: usize) -> Match {
- assert!(start <= end);
- Match { start, end }
- }
-
- /// The starting position of the match.
- #[inline]
- pub fn start(&self) -> usize {
- self.start
- }
-
- /// The ending position of the match.
- #[inline]
- pub fn end(&self) -> usize {
- self.end
- }
-
- /// Returns the match location as a range.
- #[inline]
- pub fn range(&self) -> core::ops::Range<usize> {
- self.start..self.end
- }
-
- /// Returns true if and only if this match is empty. That is, when
- /// `start() == end()`.
- ///
- /// An empty match can only be returned when the empty string was among
- /// the patterns used to build the Aho-Corasick automaton.
- #[inline]
- pub fn is_empty(&self) -> bool {
- self.start == self.end
- }
-}
-
-/// A representation of a match reported by a DFA.
-///
-/// This is called a "half" match because it only includes the end location
-/// (or start location for a reverse match) of a match. This corresponds to the
-/// information that a single DFA scan can report. Getting the other half of
-/// the match requires a second scan with a reversed DFA.
-///
-/// A half match also includes the pattern that matched. The pattern is
-/// identified by an ID, which corresponds to its position (starting from `0`)
-/// relative to other patterns used to construct the corresponding DFA. If only
-/// a single pattern is provided to the DFA, then all matches are guaranteed to
-/// have a pattern ID of `0`.
-#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
-pub struct HalfMatch {
- /// The pattern ID.
- pub(crate) pattern: PatternID,
- /// The offset of the match.
- ///
- /// For forward searches, the offset is exclusive. For reverse searches,
- /// the offset is inclusive.
- pub(crate) offset: usize,
-}
-
-impl HalfMatch {
- /// Create a new half match from a pattern ID and a byte offset.
- #[inline]
- pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
- HalfMatch { pattern, offset }
- }
-
- /// Create a new half match from a pattern ID and a byte offset.
- ///
- /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
- /// [`PatternID`]. This panics if the given `usize` is not representable
- /// as a `PatternID`.
- #[inline]
- pub fn must(pattern: usize, offset: usize) -> HalfMatch {
- HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
- }
-
- /// Returns the ID of the pattern that matched.
- ///
- /// The ID of a pattern is derived from the position in which it was
- /// originally inserted into the corresponding DFA. The first pattern has
- /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
- #[inline]
- pub fn pattern(&self) -> PatternID {
- self.pattern
- }
-
- /// The position of the match.
- ///
- /// If this match was produced by a forward search, then the offset is
- /// exclusive. If this match was produced by a reverse search, then the
- /// offset is inclusive.
- #[inline]
- pub fn offset(&self) -> usize {
- self.offset
- }
-}
-
-/// A representation of a multi match reported by a regex engine.
-///
-/// A multi match has two essential pieces of information: the identifier of
-/// the pattern that matched, along with the start and end offsets of the match
-/// in the haystack.
-///
-/// The pattern is identified by an ID, which corresponds to its position
-/// (starting from `0`) relative to other patterns used to construct the
-/// corresponding regex engine. If only a single pattern is provided, then all
-/// multi matches are guaranteed to have a pattern ID of `0`.
-///
-/// Every multi match guarantees that `start <= end`.
-#[derive(Clone, Debug, Eq, Hash, PartialEq)]
-pub struct MultiMatch {
- /// The pattern ID.
- pattern: PatternID,
- /// The start offset of the match, inclusive.
- start: usize,
- /// The end offset of the match, exclusive.
- end: usize,
-}
-
-impl MultiMatch {
- /// Create a new match from a pattern ID and a byte offset span.
- ///
- /// # Panics
- ///
- /// This panics if `end < start`.
- #[inline]
- pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch {
- assert!(start <= end);
- MultiMatch { pattern, start, end }
- }
-
- /// Create a new match from a pattern ID and a byte offset span.
- ///
- /// This is like [`MultiMatch::new`], but accepts a `usize` instead of a
- /// [`PatternID`]. This panics if the given `usize` is not representable
- /// as a `PatternID`.
- ///
- /// # Panics
- ///
- /// This panics if `end < start` or if `pattern > PatternID::MAX`.
- #[inline]
- pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch {
- MultiMatch::new(PatternID::new(pattern).unwrap(), start, end)
- }
-
- /// Returns the ID of the pattern that matched.
- ///
- /// The ID of a pattern is derived from the position in which it was
- /// originally inserted into the corresponding regex engine. The first
- /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
- /// so on.
- #[inline]
- pub fn pattern(&self) -> PatternID {
- self.pattern
- }
-
- /// The starting position of the match.
- #[inline]
- pub fn start(&self) -> usize {
- self.start
- }
-
- /// The ending position of the match.
- #[inline]
- pub fn end(&self) -> usize {
- self.end
- }
-
- /// Returns the match location as a range.
- #[inline]
- pub fn range(&self) -> core::ops::Range<usize> {
- self.start..self.end
- }
-
- /// Returns true if and only if this match is empty. That is, when
- /// `start() == end()`.
- ///
- /// An empty match can only be returned when the empty string was among
- /// the patterns used to build the Aho-Corasick automaton.
- #[inline]
- pub fn is_empty(&self) -> bool {
- self.start == self.end
- }
-}
-
-/// An error type indicating that a search stopped prematurely without finding
-/// a match.
-///
-/// This error type implies that one cannot assume that no matches occur, since
-/// the search stopped before completing.
-///
-/// Normally, when one searches for something, the response is either an
-/// affirmative "it was found at this location" or a negative "not found at
-/// all." However, in some cases, a regex engine can be configured to stop its
-/// search before concluding whether a match exists or not. When this happens,
-/// it may be important for the caller to know why the regex engine gave up and
- /// where in the input it gave up. This error type exposes the 'why' and the
-/// 'where.'
-///
-/// For example, the DFAs provided by this library generally cannot correctly
-/// implement Unicode word boundaries. Instead, they provide an option to
-/// eagerly support them on ASCII text (since Unicode word boundaries are
-/// equivalent to ASCII word boundaries when searching ASCII text), but will
-/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
-/// required to either report the failure to the caller (unergonomic) or
-/// otherwise fall back to some other regex engine (ergonomic, but potentially
-/// costly).
-///
-/// More generally, some regex engines offer the ability for callers to specify
-/// certain bytes that will trigger the regex engine to automatically quit if
-/// they are seen.
-///
-/// Still yet, there may be other reasons for a failed match. For example,
-/// the hybrid DFA provided by this crate can be configured to give up if it
-/// believes that it is not efficient. This in turn permits callers to choose a
-/// different regex engine.
-///
-/// # Advice
-///
-/// While this form of error reporting adds complexity, it is generally
-/// possible for callers to configure regex engines to never give up a search,
-/// and thus never return an error. Indeed, the default configuration for every
-/// regex engine in this crate is such that they will never stop searching
-/// early. Therefore, the only way to get a match error is if the regex engine
-/// is explicitly configured to do so. Options that enable this behavior
-/// document the new error conditions they imply.
-///
-/// Regex engines for which no errors are possible for any configuration will
-/// return the normal `Option<Match>` and not use this error type at all.
-///
-/// For example, regex engines in the `dfa` sub-module will only report
-/// `MatchError::Quit` if instructed by either
-/// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary)
-/// or by
-/// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit).
-#[derive(Clone, Debug, Eq, Hash, PartialEq)]
-pub enum MatchError {
- // Note that the first version of this type was called `SearchError` and it
- // included a third `None` variant to indicate that the search completed
- // and no match was found. However, this was problematic for iterator
- // APIs where the `None` sentinel for stopping iteration corresponds
- // precisely to the "match not found" case. The fact that the `None`
- // variant was buried inside this type was in turn quite awkward. So
- // instead, I removed the `None` variant, renamed the type and used
- // `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the
- // conceptually simpler `Result<Match, MatchError>`. However, we "regain"
- // ergonomics by only putting the more complex API in the `try_` variants
- // ("fallible") of search methods. The infallible APIs will instead just
- // return `Option<Match>` and panic on error.
- /// The search saw a "quit" byte at which it was instructed to stop
- /// searching.
- Quit {
- /// The "quit" byte that was observed that caused the search to stop.
- byte: u8,
- /// The offset at which the quit byte was observed.
- offset: usize,
- },
- /// The search, based on heuristics, determined that it would be better
- /// to stop, typically to provide the caller an opportunity to use an
- /// alternative regex engine.
- ///
- /// Currently, the only way for this to occur is via the lazy DFA and
- /// only when it is configured to do so (it will not return this error by
- /// default).
- GaveUp {
- /// The offset at which the search stopped. This corresponds to the
- /// position immediately following the last byte scanned.
- offset: usize,
- },
-}
-
-#[cfg(feature = "std")]
-impl std::error::Error for MatchError {}
-
-impl core::fmt::Display for MatchError {
- fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
- match *self {
- MatchError::Quit { byte, offset } => write!(
- f,
- "quit search after observing byte \\x{:02X} at offset {}",
- byte, offset,
- ),
- MatchError::GaveUp { offset } => {
- write!(f, "gave up searching at offset {}", offset)
- }
- }
- }
-}
diff --git a/vendor/regex-automata/src/util/memchr.rs b/vendor/regex-automata/src/util/memchr.rs
new file mode 100644
index 000000000..a2cbb0732
--- /dev/null
+++ b/vendor/regex-automata/src/util/memchr.rs
@@ -0,0 +1,93 @@
+/*!
+This module defines simple wrapper routines for the memchr functions from the
+`memchr` crate. Basically, when the `memchr` crate is available, we use it,
+otherwise we use a naive implementation which is still pretty fast.
+*/
+
+pub(crate) use self::inner::*;
+
+#[cfg(feature = "perf-literal-substring")]
+pub(super) mod inner {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+ memchr::memchr(n1, haystack)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+ memchr::memchr2(n1, n2, haystack)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn memchr3(
+ n1: u8,
+ n2: u8,
+ n3: u8,
+ haystack: &[u8],
+ ) -> Option<usize> {
+ memchr::memchr3(n1, n2, n3, haystack)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+ memchr::memrchr(n1, haystack)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+ memchr::memrchr2(n1, n2, haystack)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn memrchr3(
+ n1: u8,
+ n2: u8,
+ n3: u8,
+ haystack: &[u8],
+ ) -> Option<usize> {
+ memchr::memrchr3(n1, n2, n3, haystack)
+ }
+}
+
+#[cfg(not(feature = "perf-literal-substring"))]
+pub(super) mod inner {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+ haystack.iter().position(|&b| b == n1)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+ haystack.iter().position(|&b| b == n1 || b == n2)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn memchr3(
+ n1: u8,
+ n2: u8,
+ n3: u8,
+ haystack: &[u8],
+ ) -> Option<usize> {
+ haystack.iter().position(|&b| b == n1 || b == n2 || b == n3)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+ haystack.iter().rposition(|&b| b == n1)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+ haystack.iter().rposition(|&b| b == n1 || b == n2)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn memrchr3(
+ n1: u8,
+ n2: u8,
+ n3: u8,
+ haystack: &[u8],
+ ) -> Option<usize> {
+ haystack.iter().rposition(|&b| b == n1 || b == n2 || b == n3)
+ }
+}
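Whichever `inner` module is compiled in, callers observe the same results. A minimal sketch of that contract, using a made-up haystack:

```
// Both the `memchr`-crate-backed and naive implementations must agree
// on results like these. The haystack is purely illustrative.
fn memchr_contract_sketch() {
    let haystack = b"xxabcyy";
    assert_eq!(Some(2), memchr(b'a', haystack)); // first 'a'
    assert_eq!(Some(3), memchr2(b'z', b'b', haystack)); // first 'z' or 'b'
    assert_eq!(Some(6), memrchr(b'y', haystack)); // last 'y'
    assert_eq!(None, memchr3(b'q', b'r', b's', haystack)); // absent needles
}
```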
diff --git a/vendor/regex-automata/src/util/mod.rs b/vendor/regex-automata/src/util/mod.rs
index 798507da2..bb739df1d 100644
--- a/vendor/regex-automata/src/util/mod.rs
+++ b/vendor/regex-automata/src/util/mod.rs
@@ -1,275 +1,57 @@
/*!
-TODO
+A collection of modules that provide APIs that are useful across many regex
+engines.
+
+While one should explore the sub-modules directly to get a sense of what's
+there, here are some highlights that tie the sub-modules to higher level
+use cases:
+
+* `alphabet` contains APIs that are useful if you're doing low level things
+with the DFAs in this crate. For example, implementing determinization or
+walking its state graph directly.
+* `captures` contains APIs for dealing with capture group matches and their
+mapping to "slots" used inside an NFA graph. This is also where you can find
+iterators over capture group names.
+* `escape` contains types for pretty-printing raw byte slices as strings.
+* `iter` contains API helpers for writing regex iterators.
+* `lazy` contains a no-std and no-alloc variant of `lazy_static!` and
+`once_cell`.
+* `look` contains APIs for matching and configuring look-around assertions.
+* `pool` provides a way to reuse mutable memory allocated in a thread safe
+manner; a short sketch follows below.
+* `prefilter` provides APIs for building prefilters and using them in searches.
+* `primitives` are what you might use if you're doing lower level work on
+automata, such as walking an NFA state graph.
+* `syntax` provides some higher level convenience functions for interacting
+with the `regex-syntax` crate.
+* `wire` is useful if you're working with DFA serialization.
*/
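A short sketch of the `lazy` and `pool` sub-modules cooperating, modeled on the `Pool` example later in this patch; the buffer type and capacity are made up:

```
use regex_automata::util::{lazy::Lazy, pool::Pool};

// One lazily-initialized, globally shared pool of scratch buffers.
static SCRATCH: Lazy<Pool<Vec<u8>>> =
    Lazy::new(|| Pool::new(|| Vec::with_capacity(1024)));

fn with_scratch() -> usize {
    // Exclusive access to one buffer; it returns to the pool on drop.
    let guard = SCRATCH.get();
    guard.capacity()
}
```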
-use core::{ascii, fmt, str};
-
-#[cfg(feature = "alloc")]
-use alloc::vec::Vec;
-
pub mod alphabet;
-pub(crate) mod bytes;
#[cfg(feature = "alloc")]
-pub(crate) mod determinize;
-pub mod id;
+pub mod captures;
+pub mod escape;
#[cfg(feature = "alloc")]
-pub(crate) mod lazy;
-pub(crate) mod matchtypes;
+pub mod interpolate;
+pub mod iter;
+pub mod lazy;
+pub mod look;
+#[cfg(feature = "alloc")]
+pub mod pool;
pub mod prefilter;
+pub mod primitives;
+#[cfg(feature = "syntax")]
+pub mod syntax;
+pub mod wire;
+
+#[cfg(any(feature = "dfa-build", feature = "hybrid"))]
+pub(crate) mod determinize;
+pub(crate) mod empty;
+pub(crate) mod int;
+pub(crate) mod memchr;
+pub(crate) mod search;
#[cfg(feature = "alloc")]
pub(crate) mod sparse_set;
pub(crate) mod start;
-#[cfg(feature = "alloc")]
-pub(crate) mod syntax;
-
-/// The offset, in bytes, that a match is delayed by in the DFAs generated by
-/// this crate. (This includes lazy DFAs.)
-///
-/// The purpose of this delay is to support look-ahead such as \b (ASCII-only)
-/// and $. In particular, both of these operators may require the
-/// identification of the end of input in order to confirm a match. Not only
-/// does this mean that all matches must therefore be delayed by a single byte,
-/// but that a special EOI value is added to the alphabet of all DFAs. (Which
-/// means that even though the alphabet of a DFA is typically all byte values,
-/// the actual maximum alphabet size is 257 due to the extra EOI value.)
-///
-/// Since we delay matches by only 1 byte, this can't fully support a
-/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed,
-/// DFAs in this crate do not support it. (It's not as simple as just
-/// increasing the match offset to do it---otherwise we would---but building
-/// the full Unicode-aware word boundary detection into an automaton is quite
-/// tricky.)
-pub(crate) const MATCH_OFFSET: usize = 1;
-
-/// A type that wraps a single byte with a convenient fmt::Debug impl that
-/// escapes the byte.
-pub(crate) struct DebugByte(pub u8);
-
-impl fmt::Debug for DebugByte {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- // 10 bytes is enough to cover any output from ascii::escape_default.
- let mut bytes = [0u8; 10];
- let mut len = 0;
- for (i, mut b) in ascii::escape_default(self.0).enumerate() {
- // capitalize \xab to \xAB
- if i >= 2 && b'a' <= b && b <= b'f' {
- b -= 32;
- }
- bytes[len] = b;
- len += 1;
- }
- write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap())
- }
-}
-
-/// Returns the smallest possible index of the next valid UTF-8 sequence
-/// starting after `i`.
-///
-/// For all inputs, including invalid UTF-8 and any value of `i`, the return
-/// value is guaranteed to be greater than `i`.
-///
-/// Generally speaking, this should only be called on `text` when it is
-/// permitted to assume that it is valid UTF-8 and where either `i >=
-/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
-#[inline(always)]
-pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize {
- let b = match text.get(i) {
- None => return i.checked_add(1).unwrap(),
- Some(&b) => b,
- };
- // For cases where we see an invalid UTF-8 byte, there isn't much we can do
- // other than just start at the next byte.
- let inc = utf8_len(b).unwrap_or(1);
- i.checked_add(inc).unwrap()
-}
-
-/// Returns true if and only if the given byte is considered a word character.
-/// This only applies to ASCII.
-///
-/// This was copied from regex-syntax so that we can use it to determine the
-/// starting DFA state while searching without depending on regex-syntax. The
-/// definition is never going to change, so there's no maintenance/bit-rot
-/// hazard here.
-#[inline(always)]
-pub(crate) fn is_word_byte(b: u8) -> bool {
- match b {
- b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
- _ => false,
- }
-}
-
-/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
-///
-/// If no valid encoding of a codepoint exists at the beginning of the given
-/// byte slice, then the first byte is returned instead.
-///
-/// This returns `None` if and only if `bytes` is empty.
-#[inline(always)]
-pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
- if bytes.is_empty() {
- return None;
- }
- let len = match utf8_len(bytes[0]) {
- None => return Some(Err(bytes[0])),
- Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
- Some(1) => return Some(Ok(bytes[0] as char)),
- Some(len) => len,
- };
- match str::from_utf8(&bytes[..len]) {
- Ok(s) => Some(Ok(s.chars().next().unwrap())),
- Err(_) => Some(Err(bytes[0])),
- }
-}
-
-/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
-///
-/// If no valid encoding of a codepoint exists at the end of the given byte
-/// slice, then the last byte is returned instead.
-///
-/// This returns `None` if and only if `bytes` is empty.
-#[inline(always)]
-pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
- if bytes.is_empty() {
- return None;
- }
- let mut start = bytes.len() - 1;
- let limit = bytes.len().saturating_sub(4);
- while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) {
- start -= 1;
- }
- match decode_utf8(&bytes[start..]) {
- None => None,
- Some(Ok(ch)) => Some(Ok(ch)),
- Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
- }
-}
-
-/// Given a UTF-8 leading byte, this returns the total number of code units
-/// in the following encoded codepoint.
-///
-/// If the given byte is not a valid UTF-8 leading byte, then this returns
-/// `None`.
-#[inline(always)]
-fn utf8_len(byte: u8) -> Option<usize> {
- if byte <= 0x7F {
- return Some(1);
- } else if byte & 0b1100_0000 == 0b1000_0000 {
- return None;
- } else if byte <= 0b1101_1111 {
- Some(2)
- } else if byte <= 0b1110_1111 {
- Some(3)
- } else if byte <= 0b1111_0111 {
- Some(4)
- } else {
- None
- }
-}
-
-/// Returns true if and only if the given byte is either a valid leading UTF-8
-/// byte, or is otherwise an invalid byte that can never appear anywhere in a
-/// valid UTF-8 sequence.
-#[inline(always)]
-fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
- // In the ASCII case, the most significant bit is never set. The leading
- // byte of a 2/3/4-byte sequence always has the top two most significant
- // bits set. For bytes that can never appear anywhere in valid UTF-8, this
- // also returns true, since every such byte has its two most significant
- // bits set:
- //
- // \xC0 :: 11000000
- // \xC1 :: 11000001
- // \xF5 :: 11110101
- // \xF6 :: 11110110
- // \xF7 :: 11110111
- // \xF8 :: 11111000
- // \xF9 :: 11111001
- // \xFA :: 11111010
- // \xFB :: 11111011
- // \xFC :: 11111100
- // \xFD :: 11111101
- // \xFE :: 11111110
- // \xFF :: 11111111
- (b & 0b1100_0000) != 0b1000_0000
-}
-
-#[cfg(feature = "alloc")]
-#[inline(always)]
-pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool {
- use core::{ptr, sync::atomic::AtomicPtr};
-
- use crate::{
- dfa::{
- dense::{self, DFA},
- Automaton,
- },
- util::lazy,
- };
-
- static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
-
- let dfa = lazy::get_or_init(&WORD, || {
- // TODO: Should we use a lazy DFA here instead? It does complicate
- // things somewhat, since we then need a mutable cache, which probably
- // means a thread local.
- dense::Builder::new()
- .configure(dense::Config::new().anchored(true))
- .build(r"\w")
- .unwrap()
- });
- // This is OK since '\w' contains no look-around.
- let mut sid = dfa.universal_start_state();
- while at < bytes.len() {
- let byte = bytes[at];
- sid = dfa.next_state(sid, byte);
- at += 1;
- if dfa.is_special_state(sid) {
- if dfa.is_match_state(sid) {
- return true;
- } else if dfa.is_dead_state(sid) {
- return false;
- }
- }
- }
- dfa.is_match_state(dfa.next_eoi_state(sid))
-}
-
-#[cfg(feature = "alloc")]
-#[inline(always)]
-pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool {
- use core::{ptr, sync::atomic::AtomicPtr};
-
- use crate::{
- dfa::{
- dense::{self, DFA},
- Automaton,
- },
- nfa::thompson::NFA,
- };
-
- static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
-
- let dfa = lazy::get_or_init(&WORD, || {
- dense::Builder::new()
- .configure(dense::Config::new().anchored(true))
- .thompson(NFA::config().reverse(true).shrink(true))
- .build(r"\w")
- .unwrap()
- });
-
- // This is OK since '\w' contains no look-around.
- let mut sid = dfa.universal_start_state();
- while at > 0 {
- at -= 1;
- let byte = bytes[at];
- sid = dfa.next_state(sid, byte);
- if dfa.is_special_state(sid) {
- if dfa.is_match_state(sid) {
- return true;
- } else if dfa.is_dead_state(sid) {
- return false;
- }
- }
- }
- dfa.is_match_state(dfa.next_eoi_state(sid))
-}
+pub(crate) mod unicode_data;
+pub(crate) mod utf8;
diff --git a/vendor/regex-automata/src/util/pool.rs b/vendor/regex-automata/src/util/pool.rs
new file mode 100644
index 000000000..c03d7b013
--- /dev/null
+++ b/vendor/regex-automata/src/util/pool.rs
@@ -0,0 +1,1142 @@
+// This module provides a relatively simple thread-safe pool of reusable
+// objects. For the most part, it's implemented by a stack represented by a
+// Mutex<Vec<T>>. It has one small trick: because unlocking a mutex is somewhat
+// costly, in the case where a pool is accessed by the first thread that tried
+// to get a value, we bypass the mutex. Here are some benchmarks showing the
+// difference.
+//
+// 2022-10-15: These benchmarks are from the old regex crate and they aren't
+// easy to reproduce because some rely on older implementations of Pool that
+// are no longer around. I've left the results here for posterity, but any
+// enterprising individual should feel encouraged to re-litigate the way Pool
+// works. I am not at all certain it is the best approach.
+//
+// 1) misc::anchored_literal_long_non_match 21 (18571 MB/s)
+// 2) misc::anchored_literal_long_non_match 107 (3644 MB/s)
+// 3) misc::anchored_literal_long_non_match 45 (8666 MB/s)
+// 4) misc::anchored_literal_long_non_match 19 (20526 MB/s)
+//
+// (1) represents our baseline: the master branch at the time of writing when
+// using the 'thread_local' crate to implement the pool below.
+//
+// (2) represents a naive pool implemented completely via Mutex<Vec<T>>. There
+// is no special trick for bypassing the mutex.
+//
+// (3) is the same as (2), except it uses Mutex<Vec<Box<T>>>. It is twice as
+// fast because a Box<T> is much smaller than the T we use with a Pool in this
+// crate. So pushing and popping a Box<T> from a Vec is quite a bit faster
+// than for T.
+//
+// (4) is the same as (3), but with the trick for bypassing the mutex in the
+// case of the first-to-get thread.
+//
+// Why move off of thread_local? Even though (4) is a hair faster than (1)
+// above, this was not the main goal. The main goal was to move off of
+// thread_local and find a way to *simply* re-capture some of its speed for
+// regex's specific case. So again, why move off of it? The *primary* reason is
+// because of memory leaks. See https://github.com/rust-lang/regex/issues/362
+// for example. (Why do I want it to be simple? Well, I suppose what I mean is,
+// "use as much safe code as possible to minimize risk and be as sure as I can
+// be that it is correct.")
+//
+// My guess is that the thread_local design is probably not appropriate for
+// regex since its memory usage scales to the number of active threads that
+// have used a regex, whereas the pool below scales to the number of threads
+// that simultaneously use a regex. While neither case permits contraction,
+// since we own the pool data structure below, we can add contraction if a
+// clear use case pops up in the wild. More pressingly though, it seems that
+// there are at least some use case patterns where one might have many threads
+// sitting around that might have used a regex at one point. While thread_local
+// does try to reuse space previously used by a thread that has since stopped,
+// its maximal memory usage still scales with the total number of active
+// threads. In contrast, the pool below scales with the total number of threads
+// *simultaneously* using the pool. The hope is that this uses less memory
+// overall. And if it doesn't, we can hopefully tune it somehow.
+//
+// It seems that these sorts of conditions happen frequently
+// in FFI inside of other more "managed" languages. This was
+// mentioned in the issue linked above, and also mentioned here:
+// https://github.com/BurntSushi/rure-go/issues/3. And in particular, users
+// confirm that disabling the use of thread_local resolves the leak.
+//
+// There were other weaker reasons for moving off of thread_local as well.
+// Namely, at the time, I was looking to reduce dependencies. And for something
+// like regex, maintenance can be simpler when we own the full dependency tree.
+//
+// Note that I am not entirely happy with this pool. It has some subtle
+// implementation details and is overall still observable (even with the
+// thread owner optimization) in benchmarks. If someone wants to take a crack
+// at building something better, please file an issue. Even if it means a
+// different API. The API exposed by this pool is not the minimal thing that
+// something like a 'Regex' actually needs. It could adapt to, for example,
+// an API more like what is found in the 'thread_local' crate. However, we do
+// really need to support the no-std alloc-only context, or else the regex
+// crate wouldn't be able to support no-std alloc-only. That said, I'm generally
+// okay with making the alloc-only context slower (as it is here), although I
+// do find it unfortunate.
+
+/*!
+A thread safe memory pool.
+
+The principal type in this module is a [`Pool`]. Its main use case is for
+holding a thread safe collection of mutable scratch spaces (usually called
+`Cache` in this crate) that regex engines need to execute a search. This then
+permits sharing the same read-only regex object across multiple threads while
+having a quick way of reusing scratch space in a thread safe way. This avoids
+needing to re-create the scratch space for every search, which could wind up
+being quite expensive.
+*/
+
+/// A thread safe pool that works in an `alloc`-only context.
+///
+/// Getting a value out comes with a guard. When that guard is dropped, the
+/// value is automatically put back in the pool. The guard provides both a
+/// `Deref` and a `DerefMut` implementation for easy access to an underlying
+/// `T`.
+///
+/// A `Pool` impls `Sync` when `T` is `Send` (even if `T` is not `Sync`). This
+/// is possible because a pool is guaranteed to provide a value to exactly one
+/// thread at any time.
+///
+/// Currently, a pool never contracts in size. Its size is proportional to the
+/// maximum number of simultaneous uses. This may change in the future.
+///
+/// A `Pool` is a particularly useful data structure for this crate because
+/// many of the regex engines require a mutable "cache" in order to execute
+/// a search. Since regexes themselves tend to be global, the problem is then:
+/// how do you get a mutable cache to execute a search? You could:
+///
+/// 1. Use a `thread_local!`, which requires the standard library and requires
+/// that the regex pattern be statically known.
+/// 2. Use a `Pool`.
+/// 3. Make the cache an explicit dependency in your code and pass it around.
+/// 4. Put the cache state in a `Mutex`, but this means only one search can
+/// execute at a time.
+/// 5. Create a new cache for every search.
+///
+/// A `thread_local!` is perhaps the best choice if it works for your use case.
+/// Putting the cache in a mutex or creating a new cache for every search are
+/// perhaps the worst choices. Of the remaining two choices, whether you use
+/// this `Pool` or thread through a cache explicitly in your code is a matter
+/// of taste and depends on your code architecture.
+///
+/// # Warning: may use a spin lock
+///
+/// When this crate is compiled _without_ the `std` feature, then this type
+/// may use a spin lock internally. This can have subtle effects that may
+/// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more
+/// thorough treatment of this topic.
+///
+/// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
+///
+/// # Example
+///
+/// This example shows how to share a single hybrid regex among multiple
+/// threads, while also safely getting exclusive access to a hybrid's
+/// [`Cache`](crate::hybrid::regex::Cache) without preventing other searches
+/// from running while your thread uses the `Cache`.
+///
+/// ```
+/// use regex_automata::{
+/// hybrid::regex::{Cache, Regex},
+/// util::{lazy::Lazy, pool::Pool},
+/// Match,
+/// };
+///
+/// static RE: Lazy<Regex> =
+/// Lazy::new(|| Regex::new("foo[0-9]+bar").unwrap());
+/// static CACHE: Lazy<Pool<Cache>> =
+/// Lazy::new(|| Pool::new(|| RE.create_cache()));
+///
+/// let expected = Some(Match::must(0, 3..14));
+/// assert_eq!(expected, RE.find(&mut CACHE.get(), b"zzzfoo12345barzzz"));
+/// ```
+pub struct Pool<T, F = fn() -> T>(alloc::boxed::Box<inner::Pool<T, F>>);
+
+impl<T, F> Pool<T, F> {
+ /// Create a new pool. The given closure is used to create values in
+ /// the pool when necessary.
+ pub fn new(create: F) -> Pool<T, F> {
+ Pool(alloc::boxed::Box::new(inner::Pool::new(create)))
+ }
+}
+
+impl<T: Send, F: Fn() -> T> Pool<T, F> {
+ /// Get a value from the pool. The caller is guaranteed to have
+ /// exclusive access to the given value. Namely, it is guaranteed that
+ /// this will never return a value that was returned by another call to
+ /// `get` but was not put back into the pool.
+ ///
+ /// When the guard goes out of scope and its destructor is called, then
+ /// it will automatically be put back into the pool. Alternatively,
+ /// [`PoolGuard::put`] may be used to explicitly put it back in the pool
+ /// without relying on its destructor.
+ ///
+ /// Note that there is no guarantee provided about which value in the
+ /// pool is returned. That is, calling get, dropping the guard (causing
+ /// the value to go back into the pool) and then calling get again is
+ /// *not* guaranteed to return the same value received in the first `get`
+ /// call.
+ pub fn get(&self) -> PoolGuard<'_, T, F> {
+ PoolGuard(self.0.get())
+ }
+}
+
+impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple("Pool").field(&self.0).finish()
+ }
+}
+
+/// A guard that is returned when a caller requests a value from the pool.
+///
+/// The purpose of the guard is to use RAII to automatically put the value
+/// back in the pool once it's dropped.
+pub struct PoolGuard<'a, T: Send, F: Fn() -> T>(inner::PoolGuard<'a, T, F>);
+
+impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
+ /// Consumes this guard and puts it back into the pool.
+ ///
+ /// This circumvents the guard's `Drop` implementation. This can be useful
+ /// in circumstances where the automatic `Drop` results in poorer codegen,
+ /// such as calling non-inlined functions.
+ pub fn put(this: PoolGuard<'_, T, F>) {
+ inner::PoolGuard::put(this.0);
+ }
+}
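A minimal sketch of both return paths, implicit drop and explicit `put`; the pooled `Vec<u8>` is made up for illustration:

```
let pool: Pool<Vec<u8>> = Pool::new(|| Vec::with_capacity(1024));
{
    let mut buf = pool.get();
    buf.push(0x61); // DerefMut gives direct access to the Vec
} // dropped here: the Vec goes back into the pool automatically
let buf = pool.get();
PoolGuard::put(buf); // explicit return, bypassing the Drop impl
```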
+
+impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> {
+ type Target = T;
+
+ fn deref(&self) -> &T {
+ self.0.value()
+ }
+}
+
+impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> {
+ fn deref_mut(&mut self) -> &mut T {
+ self.0.value_mut()
+ }
+}
+
+impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug
+ for PoolGuard<'a, T, F>
+{
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple("PoolGuard").field(&self.0).finish()
+ }
+}
+
+#[cfg(feature = "std")]
+mod inner {
+ use core::{
+ cell::UnsafeCell,
+ panic::{RefUnwindSafe, UnwindSafe},
+ sync::atomic::{AtomicUsize, Ordering},
+ };
+
+ use alloc::{boxed::Box, vec, vec::Vec};
+
+ use std::{sync::Mutex, thread_local};
+
+ /// An atomic counter used to allocate thread IDs.
+ ///
+ /// We specifically start our counter at 3 so that we can use the values
+ /// less than it as sentinels.
+ static COUNTER: AtomicUsize = AtomicUsize::new(3);
+
+ /// A thread ID indicating that there is no owner. This is the initial
+ /// state of a pool. Once a pool has an owner, there is no way to change
+ /// it.
+ static THREAD_ID_UNOWNED: usize = 0;
+
+ /// A thread ID indicating that the special owner value is in use and not
+ /// available. This state is useful for avoiding a case where the owner
+ /// of a pool calls `get` before putting the result of a previous `get`
+ /// call back into the pool.
+ static THREAD_ID_INUSE: usize = 1;
+
+ /// This sentinel is used to indicate that a guard has already been dropped
+ /// and should not be re-dropped. We use this because our drop code can be
+ /// called outside of Drop and thus there could be a bug in the internal
+ /// implementation that results in trying to put the same guard back into
+ /// the same pool multiple times, and *that* could result in UB if we
+ /// didn't mark the guard as already having been put back in the pool.
+ ///
+ /// So this isn't strictly necessary, but this lets us define some
+ /// routines as safe (like PoolGuard::put_imp) that we couldn't otherwise
+ /// do.
+ static THREAD_ID_DROPPED: usize = 2;
+
+ /// The number of stacks we use inside of the pool. These are only used for
+ /// non-owners. That is, these represent the "slow" path.
+ ///
+ /// In the original implementation of this pool, we only used a single
+ /// stack. While this might be okay for a couple threads, the prevalence of
+ /// 32, 64 and even 128 core CPUs has made it untenable. The contention
+ /// such an environment introduces when threads are doing a lot of searches
+ /// on short haystacks (a not uncommon use case) is palpable and leads to
+ /// huge slowdowns.
+ ///
+ /// This constant reflects a change from using one stack to the number of
+ /// stacks that this constant is set to. The stack for a particular thread
+ /// is simply chosen by `thread_id % MAX_POOL_STACKS`. The idea behind
+ /// this setup is that there should be a good chance that accesses to the
+ /// pool will be distributed over several stacks instead of all of them
+ /// converging to one.
+ ///
+ /// This is not a particularly smart or dynamic strategy. Fixing this to a
+ /// specific number has at least two downsides. First is that it will help,
+ /// say, an 8 core CPU more than it will a 128 core CPU. (But, crucially,
+ /// it will still help the 128 core case.) Second is that this may wind
+ /// up being a little wasteful with respect to memory usage. Namely, if a
+ /// regex is used on one thread and then moved to another thread, then it
+ /// could result in creating a new copy of the data in the pool even though
+ /// only one is actually needed.
+ ///
+ /// And that memory usage bit is why this is set to 8 and not, say, 64.
+ /// Keeping it at 8 limits, to an extent, how much unnecessary memory can
+ /// be allocated.
+ ///
+ /// In an ideal world, we'd be able to have something like this:
+ ///
+ /// * Grow the number of stacks as the number of concurrent callers
+ /// increases. I spent a little time trying this, but even just adding an
+ /// atomic addition/subtraction for each pop/push for tracking concurrent
+ /// callers led to a big perf hit. Since even more work would seemingly be
+ /// required than just an addition/subtraction, I abandoned this approach.
+ /// * The maximum amount of memory used should scale with respect to the
+ /// number of concurrent callers and *not* the total number of existing
+ /// threads. This is primarily why the `thread_local` crate isn't used, as
+ /// some environments spin up a lot of threads. This led to multiple
+ /// reports of extremely high memory usage (often described as memory
+ /// leaks).
+ /// * Even more ideally, the pool should contract in size. That is, it
+ /// should grow with bursts and then shrink. But this is a pretty thorny
+ /// issue to tackle and it might be better to just not.
+ /// * It would be nice to explore the use of, say, a lock-free stack
+ /// instead of using a mutex to guard a `Vec` that is ultimately just
+ /// treated as a stack. The main thing preventing me from exploring this
+ /// is the ABA problem. The `crossbeam` crate has tools for dealing with
+ /// this sort of problem (via its epoch based memory reclamation strategy),
+ /// but I can't justify bringing in all of `crossbeam` as a dependency of
+ /// `regex` for this.
+ ///
+ /// See this issue for more context and discussion:
+ /// https://github.com/rust-lang/regex/issues/934
+ const MAX_POOL_STACKS: usize = 8;
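A sketch of the sharding rule just described; the thread IDs are made up:

```
// This mirrors how `get_slow` and `put_value` below pick a stack.
fn stack_for(thread_id: usize) -> usize {
    thread_id % MAX_POOL_STACKS
}
// Thread IDs 3, 11 and 12 map to stacks 3, 3 and 4: contention only
// arises between threads that collide modulo 8.
```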
+
+ thread_local!(
+ /// A thread local used to assign an ID to a thread.
+ static THREAD_ID: usize = {
+ let next = COUNTER.fetch_add(1, Ordering::Relaxed);
+ // SAFETY: We cannot permit the reuse of thread IDs since reusing a
+ // thread ID might result in more than one thread "owning" a pool,
+ // and thus, permit accessing a mutable value from multiple threads
+ // simultaneously without synchronization. The intent of this panic
+ // is to be a sanity check. It is not expected that the thread ID
+ // space will actually be exhausted in practice. Even on a 32-bit
+ // system, it would require spawning 2^32 threads (although they
+ // wouldn't all need to run simultaneously, so it is in theory
+ // possible).
+ //
+ // This checks that the counter never wraps around, since atomic
+ // addition wraps around on overflow.
+ if next == 0 {
+ panic!("regex: thread ID allocation space exhausted");
+ }
+ next
+ };
+ );
+
+ /// This puts each stack in the pool below into its own cache line. This is
+ /// an absolutely critical optimization that tends to have the most impact
+ /// in high contention workloads. Without forcing each mutex-protected
+ /// stack into its own cache line, high contention exacerbates the
+ /// performance problem by causing "false sharing." By putting each mutex
+ /// in its own cache line, we avoid the false sharing problem and the
+ /// effects of contention are greatly reduced.
+ #[derive(Debug)]
+ #[repr(C, align(64))]
+ struct CacheLine<T>(T);
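A sketch of what that `repr` guarantees, assuming the conventional 64-byte cache line:

```
// CacheLine<T> is aligned to 64 bytes and padded to a multiple of 64
// bytes, so two adjacent stacks can never share a cache line.
const _: () = {
    assert!(core::mem::align_of::<CacheLine<u8>>() == 64);
    assert!(core::mem::size_of::<CacheLine<u8>>() == 64);
};
```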
+
+ /// A thread safe pool utilizing std-only features.
+ ///
+ /// The main difference between this and the simplistic alloc-only pool is
+ /// the use of std::sync::Mutex and an "owner thread" optimization that
+ /// makes accesses by the owner of a pool faster than all other threads.
+ /// This makes the common case of running a regex within a single thread
+ /// faster by avoiding mutex unlocking.
+ pub(super) struct Pool<T, F> {
+ /// A function to create more T values when stack is empty and a caller
+ /// has requested a T.
+ create: F,
+ /// Multiple stacks of T values to hand out. These are used when a Pool
+ /// is accessed by a thread that didn't create it.
+ ///
+ /// Conceptually this is `Mutex<Vec<Box<T>>>`, but sharded out to make
+ /// it scale better under high contention work-loads. We index into
+ /// this sequence via `thread_id % stacks.len()`.
+ stacks: Vec<CacheLine<Mutex<Vec<Box<T>>>>>,
+ /// The ID of the thread that owns this pool. The owner is the thread
+ /// that makes the first call to 'get'. When the owner calls 'get', it
+ /// gets 'owner_val' directly instead of returning a T from 'stack'.
+ /// See comments elsewhere for details, but this is intended to be an
+ /// optimization for the common case that makes getting a T faster.
+ ///
+ /// It is initialized to a value of zero (an impossible thread ID) as a
+ /// sentinel to indicate that it is unowned.
+ owner: AtomicUsize,
+ /// A value to return when the caller is in the same thread that
+ /// first called `Pool::get`.
+ ///
+ /// This is set to None when a Pool is first created, and set to Some
+ /// once the first thread calls Pool::get.
+ owner_val: UnsafeCell<Option<T>>,
+ }
+
+ // SAFETY: Since we want to use a Pool from multiple threads simultaneously
+ // behind an Arc, we need for it to be Sync. In cases where T is sync,
+ // Pool<T> would be Sync. However, since we use a Pool to store mutable
+ // scratch space, we wind up using a T that has interior mutability and is
+ // thus itself not Sync. So what we *really* want is for our Pool<T> to be
+ // Sync even when T is not Sync (but is at least Send).
+ //
+ // The only non-sync aspect of a Pool is its 'owner_val' field, which is
+ // used to implement faster access to a pool value in the common case of
+ // a pool being accessed in the same thread in which it was created. The
+ // 'stacks' field is also shared, but a Mutex<T> where T: Send is already
+ // Sync. So we only need to worry about 'owner_val'.
+ //
+ // The key is to guarantee that 'owner_val' can only ever be accessed from
+ // one thread. In our implementation below, we guarantee this by only
+ // returning the 'owner_val' when the ID of the current thread matches the
+ // ID of the thread that first called 'Pool::get'. Since this can only ever
+ // be one thread, it follows that only one thread can access 'owner_val' at
+ // any point in time. Thus, it is safe to declare that Pool<T> is Sync when
+ // T is Send.
+ //
+ // If there is a way to achieve our performance goals using safe code, then
+ // I would very much welcome a patch. As it stands, the implementation
+ // below tries to balance safety with performance. The case where a Regex
+ // is used from multiple threads simultaneously will suffer a bit since
+ // getting a value out of the pool will require unlocking a mutex.
+ //
+ // We require `F: Send + Sync` because we call `F` at any point on demand,
+ // potentially from multiple threads simultaneously.
+ unsafe impl<T: Send, F: Send + Sync> Sync for Pool<T, F> {}
+
+ // If T is UnwindSafe, then since we provide exclusive access to any
+ // particular value in the pool, the pool should therefore also be
+ // considered UnwindSafe.
+ //
+ // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any
+ // point on demand, so it needs to be unwind safe on both dimensions for
+ // the entire Pool to be unwind safe.
+ impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> UnwindSafe for Pool<T, F> {}
+
+ // If T is UnwindSafe, then since we provide exclusive access to any
+ // particular value in the pool, the pool should therefore also be
+ // considered RefUnwindSafe.
+ //
+ // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any
+ // point on demand, so it needs to be unwind safe on both dimensions for
+ // the entire Pool to be unwind safe.
+ impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> RefUnwindSafe
+ for Pool<T, F>
+ {
+ }
+
+ impl<T, F> Pool<T, F> {
+ /// Create a new pool. The given closure is used to create values in
+ /// the pool when necessary.
+ pub(super) fn new(create: F) -> Pool<T, F> {
+ // MSRV(1.63): Mark this function as 'const'. I've arranged the
+ // code such that it should "just work." Then mark the public
+ // 'Pool::new' method as 'const' too. (The alloc-only Pool::new
+ // is already 'const', so that should "just work" too.) The only
+ // thing we're waiting for is Mutex::new to be const.
+ let mut stacks = Vec::with_capacity(MAX_POOL_STACKS);
+ for _ in 0..stacks.capacity() {
+ stacks.push(CacheLine(Mutex::new(vec![])));
+ }
+ let owner = AtomicUsize::new(THREAD_ID_UNOWNED);
+ let owner_val = UnsafeCell::new(None); // init'd on first access
+ Pool { create, stacks, owner, owner_val }
+ }
+ }
+
+ impl<T: Send, F: Fn() -> T> Pool<T, F> {
+ /// Get a value from the pool. This may block if another thread is also
+ /// attempting to retrieve a value from the pool.
+ pub(super) fn get(&self) -> PoolGuard<'_, T, F> {
+ // Our fast path checks if the caller is the thread that "owns"
+ // this pool. Or stated differently, whether it is the first thread
+ // that tried to extract a value from the pool. If it is, then we
+ // can return a T to the caller without going through a mutex.
+ //
+ // SAFETY: We must guarantee that only one thread gets access
+ // to this value. Since a thread is uniquely identified by the
+ // THREAD_ID thread local, it follows that if the caller's thread
+ // ID is equal to the owner, then only one thread may receive this
+ // value. This is also why we can get away with what looks like a
+ // racy load and a store. We know that if 'owner == caller', then
+ // only one thread can be here, so we don't need to worry about any
+ // other thread setting the owner to something else.
+ let caller = THREAD_ID.with(|id| *id);
+ let owner = self.owner.load(Ordering::Acquire);
+ if caller == owner {
+ // N.B. We could also do a CAS here instead of a load/store,
+ // but ad hoc benchmarking suggests it is slower. And a lot
+ // slower in the case where `get_slow` is common.
+ self.owner.store(THREAD_ID_INUSE, Ordering::Release);
+ return self.guard_owned(caller);
+ }
+ self.get_slow(caller, owner)
+ }
+
+ /// This is the "slow" version that goes through a mutex to pop an
+ /// allocated value off a stack to return to the caller. (Or, if the
+ /// stack is empty, a new value is created.)
+ ///
+ /// If the pool has no owner, then this will set the owner.
+ #[cold]
+ fn get_slow(
+ &self,
+ caller: usize,
+ owner: usize,
+ ) -> PoolGuard<'_, T, F> {
+ if owner == THREAD_ID_UNOWNED {
+ // This sentinel means this pool is not yet owned. We try to
+ // atomically set the owner. If we do, then this thread becomes
+ // the owner and we can return a guard that represents the
+ // special T for the owner.
+ //
+ // Note that we set the owner to a different sentinel that
+ // indicates that the owned value is in use. The owner ID will
+ // get updated to the actual ID of this thread once the guard
+ // returned by this function is put back into the pool.
+ let res = self.owner.compare_exchange(
+ THREAD_ID_UNOWNED,
+ THREAD_ID_INUSE,
+ Ordering::AcqRel,
+ Ordering::Acquire,
+ );
+ if res.is_ok() {
+ // SAFETY: A successful CAS above implies this thread is
+ // the owner and that this is the only such thread that
+ // can reach here. Thus, there is no data race.
+ unsafe {
+ *self.owner_val.get() = Some((self.create)());
+ }
+ return self.guard_owned(caller);
+ }
+ }
+ let stack_id = caller % self.stacks.len();
+ // We try to acquire exclusive access to this thread's stack, and
+ // if so, grab a value from it if we can. We put this in a loop so
+ // that it's easy to tweak and experiment with a different number
+ // of tries. In the end, I couldn't see anything obviously better
+ // than one attempt in ad hoc testing.
+ for _ in 0..1 {
+ let mut stack = match self.stacks[stack_id].0.try_lock() {
+ Err(_) => continue,
+ Ok(stack) => stack,
+ };
+ if let Some(value) = stack.pop() {
+ return self.guard_stack(value);
+ }
+ // Unlock the mutex guarding the stack before creating a fresh
+ // value since we no longer need the stack.
+ drop(stack);
+ let value = Box::new((self.create)());
+ return self.guard_stack(value);
+ }
+ // We're only here if we couldn't get access to our stack, so just
+ // create a new value. This seems like it could be wasteful, but
+ // waiting for exclusive access to a stack when there's high
+ // contention is brutal for perf.
+ self.guard_stack_transient(Box::new((self.create)()))
+ }
+
+ /// Puts a value back into the pool. Callers don't need to call this.
+ /// Once the guard that's returned by 'get' is dropped, it is put back
+ /// into the pool automatically.
+ fn put_value(&self, value: Box<T>) {
+ let caller = THREAD_ID.with(|id| *id);
+ let stack_id = caller % self.stacks.len();
+ // As with trying to pop a value from this thread's stack, we
+ // merely attempt to get access to push this value back on the
+ // stack. If there's too much contention, we just give up and throw
+ // the value away.
+ //
+ // Interestingly, in ad hoc benchmarking, it is beneficial to
+ // attempt to push the value back more than once, unlike when
+ // popping the value. I don't have a good theory for why this is.
+ // I guess if we drop too many values then that winds up forcing
+ // the pop operation to create new fresh values and thus leads to
+ // less reuse. There's definitely a balancing act here.
+ for _ in 0..10 {
+ let mut stack = match self.stacks[stack_id].0.try_lock() {
+ Err(_) => continue,
+ Ok(stack) => stack,
+ };
+ stack.push(value);
+ return;
+ }
+ }
+
+ /// Create a guard that represents the special owned T.
+ fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> {
+ PoolGuard { pool: self, value: Err(caller), discard: false }
+ }
+
+ /// Create a guard that contains a value from the pool's stack.
+ fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T, F> {
+ PoolGuard { pool: self, value: Ok(value), discard: false }
+ }
+
+ /// Create a guard that contains a value from the pool's stack with an
+ /// instruction to throw away the value instead of putting it back
+ /// into the pool.
+ fn guard_stack_transient(&self, value: Box<T>) -> PoolGuard<'_, T, F> {
+ PoolGuard { pool: self, value: Ok(value), discard: true }
+ }
+ }
+
+ impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ f.debug_struct("Pool")
+ .field("stacks", &self.stacks)
+ .field("owner", &self.owner)
+ .field("owner_val", &self.owner_val)
+ .finish()
+ }
+ }
+
+ /// A guard that is returned when a caller requests a value from the pool.
+ pub(super) struct PoolGuard<'a, T: Send, F: Fn() -> T> {
+ /// The pool that this guard is attached to.
+ pool: &'a Pool<T, F>,
+ /// This is Err when the guard represents the special "owned" value.
+ /// In which case, the value is retrieved from 'pool.owner_val'. And
+ /// in the special case of `Err(THREAD_ID_DROPPED)`, it means the
+ /// guard has been put back into the pool and should no longer be used.
+ value: Result<Box<T>, usize>,
+ /// When true, the value should be discarded instead of being pushed
+ /// back into the pool. We tend to use this under high contention, and
+ /// this allows us to avoid inflating the size of the pool. (Because
+ /// under contention, we tend to create more values instead of waiting
+ /// for access to a stack of existing values.)
+ discard: bool,
+ }
+
+ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
+ /// Return the underlying value.
+ pub(super) fn value(&self) -> &T {
+ match self.value {
+ Ok(ref v) => &**v,
+ // SAFETY: This is safe because the only way a PoolGuard gets
+ // created for self.value=Err is when the current thread
+ // corresponds to the owning thread, of which there can only
+ // be one. Thus, we are guaranteed to be providing exclusive
+ // access here which makes this safe.
+ //
+ // Also, since 'owner_val' is guaranteed to be initialized
+ // before an owned PoolGuard is created, the unchecked unwrap
+ // is safe.
+ Err(id) => unsafe {
+ // This assert is *not* necessary for safety, since we
+ // should never be here if the guard had been put back into
+ // the pool. This is a sanity check to make sure we didn't
+ // break an internal invariant.
+ debug_assert_ne!(THREAD_ID_DROPPED, id);
+ (*self.pool.owner_val.get()).as_ref().unwrap_unchecked()
+ },
+ }
+ }
+
+ /// Return the underlying value as a mutable borrow.
+ pub(super) fn value_mut(&mut self) -> &mut T {
+ match self.value {
+ Ok(ref mut v) => &mut **v,
+ // SAFETY: This is safe because the only way a PoolGuard gets
+ // created for self.value=Err is when the current thread
+ // corresponds to the owning thread, of which there can only
+ // be one. Thus, we are guaranteed to be providing exclusive
+ // access here which makes this safe.
+ //
+ // Also, since 'owner_val' is guaranteed to be initialized
+ // before an owned PoolGuard is created, the unwrap_unchecked
+ // is safe.
+ Err(id) => unsafe {
+ // This assert is *not* necessary for safety, since we
+ // should never be here if the guard had been put back into
+ // the pool. This is a sanity check to make sure we didn't
+ // break an internal invariant.
+ debug_assert_ne!(THREAD_ID_DROPPED, id);
+ (*self.pool.owner_val.get()).as_mut().unwrap_unchecked()
+ },
+ }
+ }
+
+ /// Consumes this guard and puts it back into the pool.
+ pub(super) fn put(this: PoolGuard<'_, T, F>) {
+ // Since this is effectively consuming the guard and putting the
+ // value back into the pool, there's no reason to run its Drop
+ // impl after doing this. I don't believe there is a correctness
+ // problem with doing so, but there's definitely a perf problem
+ // by redoing this work. So we avoid it.
+ let mut this = core::mem::ManuallyDrop::new(this);
+ this.put_imp();
+ }
+
+ /// Puts this guard back into the pool by only borrowing the guard as
+ /// mutable. This should be called at most once.
+ #[inline(always)]
+ fn put_imp(&mut self) {
+ match core::mem::replace(&mut self.value, Err(THREAD_ID_DROPPED)) {
+ Ok(value) => {
+ // If we were told to discard this value then don't bother
+ // trying to put it back into the pool. This occurs when
+ // the pop operation failed to acquire a lock and we
+ // decided to create a new value in lieu of contending for
+ // the lock.
+ if self.discard {
+ return;
+ }
+ self.pool.put_value(value);
+ }
+ // If this guard has a value "owned" by the thread, then
+ // the Pool guarantees that this is the ONLY such guard.
+ // Therefore, in order to place it back into the pool and make
+ // it available, we need to change the owner back to the owning
+ // thread's ID. But note that we use the ID that was stored in
+ // the guard, since a guard can be moved to another thread and
+ // dropped. (A previous iteration of this code read from the
+ // THREAD_ID thread local, which uses the ID of the current
+ // thread which may not be the ID of the owning thread! This
+ // also avoids the TLS access, which is likely a hair faster.)
+ Err(owner) => {
+ // If we hit this point, it implies 'put_imp' has been
+ // called multiple times for the same guard which in turn
+ // corresponds to a bug in this implementation.
+ assert_ne!(THREAD_ID_DROPPED, owner);
+ self.pool.owner.store(owner, Ordering::Release);
+ }
+ }
+ }
+ }
+
+ impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> {
+ fn drop(&mut self) {
+ self.put_imp();
+ }
+ }
+
+ impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug
+ for PoolGuard<'a, T, F>
+ {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_struct("PoolGuard")
+ .field("pool", &self.pool)
+ .field("value", &self.value)
+ .finish()
+ }
+ }
+}
+
+// FUTURE: We should consider using Mara Bos's nearly-lock-free version of this
+// here: https://gist.github.com/m-ou-se/5fdcbdf7dcf4585199ce2de697f367a4.
+//
+// One reason why I did things with a "mutex" below is that it isolates the
+// safety concerns to just the Mutex, whereas the safety of Mara's pool is a
+// bit more sprawling. I also expect this code to not be used that much, and
+// so it is unlikely to get as much real world usage with which to test it. That
+// means the "obviously correct" lever is an important one.
+//
+// The specific reason to use Mara's pool is that it is likely faster and also
+// less likely to hit problems with spin-locks, although it is not completely
+// impervious to them.
+//
+// The best solution to this problem, probably, is a truly lock free pool. That
+// could be done with a lock free linked list. The issue is the ABA problem. It
+// is difficult to avoid, and doing so is complex. BUT, the upshot of that is
+// that if we had a truly lock free pool, then we could also use it above in
+// the 'std' pool instead of a Mutex because it should be completely free of the
+// problems that come from spin-locks.
+#[cfg(not(feature = "std"))]
+mod inner {
+ use core::{
+ cell::UnsafeCell,
+ panic::{RefUnwindSafe, UnwindSafe},
+ sync::atomic::{AtomicBool, Ordering},
+ };
+
+ use alloc::{boxed::Box, vec, vec::Vec};
+
+ /// A thread safe pool utilizing alloc-only features.
+ ///
+ /// Unlike the std version, it doesn't seem possible(?) to implement the
+ /// "thread owner" optimization because alloc-only doesn't have any concept
+ /// of threads. So the best we can do is just a normal stack. This will
+ /// increase latency in alloc-only environments.
+ pub(super) struct Pool<T, F> {
+ /// A stack of T values to hand out. These are used when a Pool is
+ /// accessed by a thread that didn't create it.
+ stack: Mutex<Vec<Box<T>>>,
+ /// A function to create more T values when stack is empty and a caller
+ /// has requested a T.
+ create: F,
+ }
+
+ // If T is UnwindSafe, then since we provide exclusive access to any
+ // particular value in the pool, it should therefore also be considered
+ // RefUnwindSafe.
+ impl<T: UnwindSafe, F: UnwindSafe> RefUnwindSafe for Pool<T, F> {}
+
+ impl<T, F> Pool<T, F> {
+ /// Create a new pool. The given closure is used to create values in
+ /// the pool when necessary.
+ pub(super) const fn new(create: F) -> Pool<T, F> {
+ Pool { stack: Mutex::new(vec![]), create }
+ }
+ }
+
+ impl<T: Send, F: Fn() -> T> Pool<T, F> {
+ /// Get a value from the pool. This may block if another thread is also
+ /// attempting to retrieve a value from the pool.
+ pub(super) fn get(&self) -> PoolGuard<'_, T, F> {
+ let mut stack = self.stack.lock();
+ let value = match stack.pop() {
+ None => Box::new((self.create)()),
+ Some(value) => value,
+ };
+ PoolGuard { pool: self, value: Some(value) }
+ }
+
+ fn put(&self, guard: PoolGuard<'_, T, F>) {
+ let mut guard = core::mem::ManuallyDrop::new(guard);
+ if let Some(value) = guard.value.take() {
+ self.put_value(value);
+ }
+ }
+
+ /// Puts a value back into the pool. Callers don't need to call this.
+ /// Once the guard that's returned by 'get' is dropped, it is put back
+ /// into the pool automatically.
+ fn put_value(&self, value: Box<T>) {
+ let mut stack = self.stack.lock();
+ stack.push(value);
+ }
+ }
+
+ impl<T: core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ f.debug_struct("Pool").field("stack", &self.stack).finish()
+ }
+ }
+
+ /// A guard that is returned when a caller requests a value from the pool.
+ pub(super) struct PoolGuard<'a, T: Send, F: Fn() -> T> {
+ /// The pool that this guard is attached to.
+ pool: &'a Pool<T, F>,
+ /// This is None after the guard has been put back into the pool.
+ value: Option<Box<T>>,
+ }
+
+ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
+ /// Return the underlying value.
+ pub(super) fn value(&self) -> &T {
+ self.value.as_deref().unwrap()
+ }
+
+ /// Return the underlying value as a mutable borrow.
+ pub(super) fn value_mut(&mut self) -> &mut T {
+ self.value.as_deref_mut().unwrap()
+ }
+
+ /// Consumes this guard and puts it back into the pool.
+ pub(super) fn put(this: PoolGuard<'_, T, F>) {
+ // Since this is effectively consuming the guard and putting the
+ // value back into the pool, there's no reason to run its Drop
+ // impl after doing this. I don't believe there is a correctness
+ // problem with doing so, but there's definitely a perf problem
+ // by redoing this work. So we avoid it.
+ let mut this = core::mem::ManuallyDrop::new(this);
+ this.put_imp();
+ }
+
+ /// Puts this guard back into the pool by only borrowing the guard as
+ /// mutable. This should be called at most once.
+ #[inline(always)]
+ fn put_imp(&mut self) {
+ if let Some(value) = self.value.take() {
+ self.pool.put_value(value);
+ }
+ }
+ }
+
+ impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> {
+ fn drop(&mut self) {
+ self.put_imp();
+ }
+ }
+
+ impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug
+ for PoolGuard<'a, T, F>
+ {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_struct("PoolGuard")
+ .field("pool", &self.pool)
+ .field("value", &self.value)
+ .finish()
+ }
+ }
+
+ /// A spin-lock based mutex. Yes, I have read spinlocks considered
+ /// harmful[1], and if there's a reasonable alternative choice, I'll
+ /// happily take it.
+ ///
+ /// I suspect the most likely alternative here is a Treiber stack, but
+ /// implementing one correctly in a way that avoids the ABA problem looks
+ /// subtle enough that I'm not sure I want to attempt that. But otherwise,
+ /// we only need a mutex in order to implement our pool, so if there's
+ /// something simpler we can use that works for our `Pool` use case, then
+ /// that would be great.
+ ///
+ /// Note that this mutex does not do poisoning.
+ ///
+ /// [1]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
+ #[derive(Debug)]
+ struct Mutex<T> {
+ locked: AtomicBool,
+ data: UnsafeCell<T>,
+ }
+
+ // SAFETY: Since a Mutex guarantees exclusive access, as long as we can
+ // send it across threads, it must also be Sync.
+ unsafe impl<T: Send> Sync for Mutex<T> {}
+
+ impl<T> Mutex<T> {
+ /// Create a new mutex for protecting access to the given value across
+ /// multiple threads simultaneously.
+ const fn new(value: T) -> Mutex<T> {
+ Mutex {
+ locked: AtomicBool::new(false),
+ data: UnsafeCell::new(value),
+ }
+ }
+
+ /// Lock this mutex and return a guard providing exclusive access to
+ /// `T`. This blocks if some other thread has already locked this
+ /// mutex.
+ fn lock(&self) -> MutexGuard<'_, T> {
+ while self
+ .locked
+ .compare_exchange(
+ false,
+ true,
+ Ordering::AcqRel,
+ Ordering::Acquire,
+ )
+ .is_err()
+ {
+ core::hint::spin_loop();
+ }
+ // SAFETY: The only way we're here is if we successfully set
+ // 'locked' to true, which implies we must be the only thread here
+ // and thus have exclusive access to 'data'.
+ let data = unsafe { &mut *self.data.get() };
+ MutexGuard { locked: &self.locked, data }
+ }
+ }
+
+ /// A guard that derefs to &T and &mut T. When it's dropped, the lock is
+ /// released.
+ #[derive(Debug)]
+ struct MutexGuard<'a, T> {
+ locked: &'a AtomicBool,
+ data: &'a mut T,
+ }
+
+ impl<'a, T> core::ops::Deref for MutexGuard<'a, T> {
+ type Target = T;
+
+ fn deref(&self) -> &T {
+ self.data
+ }
+ }
+
+ impl<'a, T> core::ops::DerefMut for MutexGuard<'a, T> {
+ fn deref_mut(&mut self) -> &mut T {
+ self.data
+ }
+ }
+
+ impl<'a, T> Drop for MutexGuard<'a, T> {
+ fn drop(&mut self) {
+ // Drop means 'data' is no longer accessible, so we can unlock
+ // the mutex.
+ self.locked.store(false, Ordering::Release);
+ }
+ }
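+
+    // Illustrative sketch, not part of the vendored source: how the
+    // spin-lock Mutex above is used. 'lock' spins until its
+    // compare-exchange wins, the returned guard derefs to the data, and
+    // the lock is released in the guard's Drop impl.
+    fn _mutex_usage_sketch() {
+        let m = Mutex::new(vec![1, 2, 3]);
+        {
+            let mut guard = m.lock();
+            guard.push(4); // exclusive access via DerefMut
+        } // guard dropped here => 'locked' is stored back to false
+        assert_eq!(4, m.lock().len());
+    }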
+}
+
+#[cfg(test)]
+mod tests {
+ use core::panic::{RefUnwindSafe, UnwindSafe};
+
+ use alloc::{boxed::Box, vec, vec::Vec};
+
+ use super::*;
+
+ #[test]
+ fn oibits() {
+        fn assert_oibits<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {}
+        assert_oibits::<Pool<Vec<u32>>>();
+        assert_oibits::<Pool<core::cell::RefCell<Vec<u32>>>>();
+        assert_oibits::<
+ Pool<
+ Vec<u32>,
+ Box<
+ dyn Fn() -> Vec<u32>
+ + Send
+ + Sync
+ + UnwindSafe
+ + RefUnwindSafe,
+ >,
+ >,
+ >();
+ }
+
+ // Tests that Pool implements the "single owner" optimization. That is, the
+ // thread that first accesses the pool gets its own copy, while all other
+ // threads get distinct copies.
+ #[cfg(feature = "std")]
+ #[test]
+ fn thread_owner_optimization() {
+ use std::{cell::RefCell, sync::Arc, vec};
+
+ let pool: Arc<Pool<RefCell<Vec<char>>>> =
+ Arc::new(Pool::new(|| RefCell::new(vec!['a'])));
+ pool.get().borrow_mut().push('x');
+
+ let pool1 = pool.clone();
+ let t1 = std::thread::spawn(move || {
+ let guard = pool1.get();
+ guard.borrow_mut().push('y');
+ });
+
+ let pool2 = pool.clone();
+ let t2 = std::thread::spawn(move || {
+ let guard = pool2.get();
+ guard.borrow_mut().push('z');
+ });
+
+ t1.join().unwrap();
+ t2.join().unwrap();
+
+ // If we didn't implement the single owner optimization, then one of
+ // the threads above is likely to have mutated the [a, x] vec that
+ // we stuffed in the pool before spawning the threads. But since
+ // neither thread was first to access the pool, and because of the
+ // optimization, we should be guaranteed that neither thread mutates
+ // the special owned pool value.
+ //
+ // (Technically this is an implementation detail and not a contract of
+ // Pool's API.)
+ assert_eq!(vec!['a', 'x'], *pool.get().borrow());
+ }
+
+ // This tests that if the "owner" of a pool asks for two values, then it
+ // gets two distinct values and not the same one. This test failed in the
+ // course of developing the pool, which in turn resulted in UB because it
+ // permitted getting aliasing &mut borrows to the same place in memory.
+ #[test]
+ fn thread_owner_distinct() {
+ let pool = Pool::new(|| vec!['a']);
+
+ {
+ let mut g1 = pool.get();
+ let v1 = &mut *g1;
+ let mut g2 = pool.get();
+ let v2 = &mut *g2;
+ v1.push('b');
+ v2.push('c');
+ assert_eq!(&mut vec!['a', 'b'], v1);
+ assert_eq!(&mut vec!['a', 'c'], v2);
+ }
+        // This isn't technically guaranteed, but we expect to get the
+        // "owned" value (from the first call to 'get()' above) now that
+        // it's back in the pool.
+ assert_eq!(&mut vec!['a', 'b'], &mut *pool.get());
+ }
+
+ // This tests that we can share a guard with another thread, mutate the
+ // underlying value and everything works. This failed in the course of
+ // developing a pool since the pool permitted 'get()' to return the same
+ // value to the owner thread, even before the previous value was put back
+ // into the pool. This in turn resulted in this test producing a data race.
+ #[cfg(feature = "std")]
+ #[test]
+ fn thread_owner_sync() {
+ let pool = Pool::new(|| vec!['a']);
+ {
+ let mut g1 = pool.get();
+ let mut g2 = pool.get();
+ std::thread::scope(|s| {
+ s.spawn(|| {
+ g1.push('b');
+ });
+ s.spawn(|| {
+ g2.push('c');
+ });
+ });
+
+ let v1 = &mut *g1;
+ let v2 = &mut *g2;
+ assert_eq!(&mut vec!['a', 'b'], v1);
+ assert_eq!(&mut vec!['a', 'c'], v2);
+ }
+
+    // This isn't technically guaranteed, but we expect to get the
+    // "owned" value (from the first call to 'get()' above) now that
+    // it's back in the pool.
+ assert_eq!(&mut vec!['a', 'b'], &mut *pool.get());
+ }
+
+ // This tests that if we move a PoolGuard that is owned by the current
+ // thread to another thread and drop it, then the thread owner doesn't
+ // change. During development of the pool, this test failed because the
+ // PoolGuard assumed it was dropped in the same thread from which it was
+ // created, and thus used the current thread's ID as the owner, which could
+ // be different than the actual owner of the pool.
+ #[cfg(feature = "std")]
+ #[test]
+ fn thread_owner_send_drop() {
+ let pool = Pool::new(|| vec!['a']);
+ // Establishes this thread as the owner.
+ {
+ pool.get().push('b');
+ }
+ std::thread::scope(|s| {
+ // Sanity check that we get the same value back.
+ // (Not technically guaranteed.)
+ let mut g = pool.get();
+ assert_eq!(&vec!['a', 'b'], &*g);
+ // Now push it to another thread and drop it.
+ s.spawn(move || {
+ g.push('c');
+ })
+ .join()
+ .unwrap();
+ });
+ // Now check that we're still the owner. This is not technically
+ // guaranteed by the API, but is true in practice given the thread
+ // owner optimization.
+ assert_eq!(&vec!['a', 'b', 'c'], &*pool.get());
+ }
+}
diff --git a/vendor/regex-automata/src/util/prefilter.rs b/vendor/regex-automata/src/util/prefilter.rs
deleted file mode 100644
index 5fe151524..000000000
--- a/vendor/regex-automata/src/util/prefilter.rs
+++ /dev/null
@@ -1,281 +0,0 @@
-use crate::Match;
-
-/// A candidate is the result of running a prefilter on a haystack at a
-/// particular position. The result is one of no match, a confirmed match or
-/// a possible match.
-///
-/// When no match is returned, the prefilter is guaranteeing that no possible
-/// match can be found in the haystack, and the caller may trust this. That is,
-/// all correct prefilters must never report false negatives.
-///
-/// In some cases, a prefilter can confirm a match very quickly, in which case,
-/// the caller may use this to stop what it's doing and report the match. In
-/// this case, prefilter implementations must never report a false positive.
-/// In other cases, the prefilter can only report a potential match, in which
-/// case the callers must attempt to confirm the match. In this case, prefilter
-/// implementations are permitted to return false positives.
-#[derive(Clone, Debug)]
-pub enum Candidate {
- /// The prefilter reports that no match is possible. Prefilter
- /// implementations will never report false negatives.
- None,
- /// The prefilter reports that a match has been confirmed at the provided
- /// byte offsets. When this variant is reported, the prefilter is
- /// guaranteeing a match. No false positives are permitted.
- Match(Match),
- /// The prefilter reports that a match *may* start at the given position.
- /// When this variant is reported, it may correspond to a false positive.
- PossibleStartOfMatch(usize),
-}
-
-impl Candidate {
- /// Convert this candidate into an option. This is useful when callers do
- /// not distinguish between true positives and false positives (i.e., the
- /// caller must always confirm the match in order to update some other
- /// state).
- ///
- /// The byte offset in the option returned corresponds to the starting
- /// position of the possible match.
- pub fn into_option(self) -> Option<usize> {
- match self {
- Candidate::None => None,
- Candidate::Match(ref m) => Some(m.start()),
- Candidate::PossibleStartOfMatch(start) => Some(start),
- }
- }
-}
-
-/// A prefilter describes the behavior of fast literal scanners for quickly
-/// skipping past bytes in the haystack that we know cannot possibly
-/// participate in a match.
-pub trait Prefilter: core::fmt::Debug {
- /// Returns the next possible match candidate. This may yield false
- /// positives, so callers must confirm a match starting at the position
- /// returned. This, however, must never produce false negatives. That is,
- /// this must, at minimum, return the starting position of the next match
- /// in the given haystack after or at the given position.
- fn next_candidate(
- &self,
- state: &mut State,
- haystack: &[u8],
- at: usize,
- ) -> Candidate;
-
- /// Returns the approximate total amount of heap used by this prefilter, in
- /// units of bytes.
- fn heap_bytes(&self) -> usize;
-
- /// Returns true if and only if this prefilter may return false positives
- /// via the `Candidate::PossibleStartOfMatch` variant. This is most useful
-    /// when false positives are not possible (in which case, implementations
- /// should return false), which may allow completely avoiding heavier regex
- /// machinery when the prefilter can quickly confirm its own matches.
- ///
- /// By default, this returns true, which is conservative; it is always
- /// correct to return `true`. Returning `false` here and reporting a false
- /// positive will result in incorrect searches.
- fn reports_false_positives(&self) -> bool {
- true
- }
-}
-
-impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
- #[inline]
- fn next_candidate(
- &self,
- state: &mut State,
- haystack: &[u8],
- at: usize,
- ) -> Candidate {
- (**self).next_candidate(state, haystack, at)
- }
-
- fn heap_bytes(&self) -> usize {
- (**self).heap_bytes()
- }
-
- fn reports_false_positives(&self) -> bool {
- (**self).reports_false_positives()
- }
-}
-
-#[derive(Clone)]
-pub struct Scanner<'p> {
- prefilter: &'p dyn Prefilter,
- state: State,
-}
-
-impl<'p> Scanner<'p> {
- pub fn new(prefilter: &'p dyn Prefilter) -> Scanner<'p> {
- Scanner { prefilter, state: State::new() }
- }
-
- pub(crate) fn is_effective(&mut self, at: usize) -> bool {
- self.state.is_effective(at)
- }
-
- pub(crate) fn reports_false_positives(&self) -> bool {
- self.prefilter.reports_false_positives()
- }
-
- pub(crate) fn next_candidate(
- &mut self,
- bytes: &[u8],
- at: usize,
- ) -> Candidate {
- let cand = self.prefilter.next_candidate(&mut self.state, bytes, at);
- match cand {
- Candidate::None => {
- self.state.update_skipped_bytes(bytes.len() - at);
- }
- Candidate::Match(ref m) => {
- self.state.update_skipped_bytes(m.start() - at);
- }
- Candidate::PossibleStartOfMatch(i) => {
- self.state.update_skipped_bytes(i - at);
- }
- }
- cand
- }
-}
-
-impl<'p> core::fmt::Debug for Scanner<'p> {
- fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
- f.debug_struct("Scanner").field("state", &self.state).finish()
- }
-}
-
-/// State tracks state associated with the effectiveness of a
-/// prefilter. It is used to track how many bytes, on average, are skipped by
-/// the prefilter. If this average dips below a certain threshold over time,
-/// then the state renders the prefilter inert and stops using it.
-///
-/// A prefilter state should be created for each search. (Where creating an
-/// iterator via, e.g., `find_iter`, is treated as a single search.)
-#[derive(Clone, Debug)]
-pub struct State {
- /// The number of skips that has been executed.
- skips: usize,
- /// The total number of bytes that have been skipped.
- skipped: usize,
- /// Once this heuristic has been deemed permanently ineffective, it will be
- /// inert throughout the rest of its lifetime. This serves as a cheap way
- /// to check inertness.
- inert: bool,
- /// The last (absolute) position at which a prefilter scanned to.
- /// Prefilters can use this position to determine whether to re-scan or
- /// not.
- ///
- /// Unlike other things that impact effectiveness, this is a fleeting
- /// condition. That is, a prefilter can be considered ineffective if it is
- /// at a position before `last_scan_at`, but can become effective again
- /// once the search moves past `last_scan_at`.
- ///
- /// The utility of this is to both avoid additional overhead from calling
- /// the prefilter and to avoid quadratic behavior. This ensures that a
- /// prefilter will scan any particular byte at most once. (Note that some
- /// prefilters, like the start-byte prefilter, do not need to use this
- /// field at all, since it only looks for starting bytes.)
- last_scan_at: usize,
-}
-
-impl State {
- /// The minimum number of skip attempts to try before considering whether
- /// a prefilter is effective or not.
- const MIN_SKIPS: usize = 40;
-
- /// The minimum amount of bytes that skipping must average.
- ///
- /// That is, after MIN_SKIPS have occurred, if the average number of bytes
- /// skipped ever falls below MIN_AVG_SKIP, then the prefilter will be
- /// rendered inert.
- const MIN_AVG_SKIP: usize = 16;
-
- /// Create a fresh prefilter state.
- pub fn new() -> State {
- State { skips: 0, skipped: 0, inert: false, last_scan_at: 0 }
- }
-
- /// Updates the position at which the last scan stopped. This may be
- /// greater than the position of the last candidate reported. For example,
- /// searching for the byte `z` in `abczdef` for the pattern `abcz` will
- /// report a candidate at position `0`, but the end of its last scan will
- /// be at position `3`.
- ///
- /// This position factors into the effectiveness of this prefilter. If the
- /// current position is less than the last position at which a scan ended,
- /// then the prefilter should not be re-run until the search moves past
- /// that position.
- ///
- /// It is always correct to never update the last scan position. In fact,
- /// it is also always correct to set the last scan position to an arbitrary
- /// value. The key is setting it to a position in the future at which it
- /// makes sense to restart the prefilter.
- pub fn update_last_scan(&mut self, at: usize) {
- if at > self.last_scan_at {
- self.last_scan_at = at;
- }
- }
-
- /// Return true if and only if this state indicates that a prefilter is
- /// still effective. If the prefilter is not effective, then this state
- /// is rendered "inert." At which point, all subsequent calls to
- /// `is_effective` on this state will return `false`.
- ///
- /// `at` should correspond to the current starting position of the search.
- ///
- /// Callers typically do not need to use this, as it represents the
- /// default implementation of
- /// [`Prefilter::is_effective`](trait.Prefilter.html#tymethod.is_effective).
- fn is_effective(&mut self, at: usize) -> bool {
- if self.inert {
- return false;
- }
- if at < self.last_scan_at {
- return false;
- }
- if self.skips < State::MIN_SKIPS {
- return true;
- }
-
- if self.skipped >= State::MIN_AVG_SKIP * self.skips {
- return true;
- }
-
- // We're inert.
- self.inert = true;
- false
- }
-
- /// Update this state with the number of bytes skipped on the last
- /// invocation of the prefilter.
- fn update_skipped_bytes(&mut self, skipped: usize) {
- self.skips += 1;
- self.skipped += skipped;
- }
-}
-
-/// A `Prefilter` implementation that reports a possible match at every
-/// position.
-///
-/// This should generally not be used as an actual prefilter. It is only
-/// useful when one needs to represent the absence of a prefilter in a generic
-/// context. For example, a [`dfa::regex::Regex`](crate::dfa::regex::Regex)
-/// uses this prefilter by default to indicate that no prefilter should be
-/// used.
-///
-/// A `None` prefilter value cannot be constructed.
-#[derive(Clone, Debug)]
-pub struct None {
- _priv: (),
-}
-
-impl Prefilter for None {
- fn next_candidate(&self, _: &mut State, _: &[u8], at: usize) -> Candidate {
- Candidate::PossibleStartOfMatch(at)
- }
-
- fn heap_bytes(&self) -> usize {
- 0
- }
-}
diff --git a/vendor/regex-automata/src/util/prefilter/aho_corasick.rs b/vendor/regex-automata/src/util/prefilter/aho_corasick.rs
new file mode 100644
index 000000000..50cce827e
--- /dev/null
+++ b/vendor/regex-automata/src/util/prefilter/aho_corasick.rs
@@ -0,0 +1,149 @@
+use crate::util::{
+ prefilter::PrefilterI,
+ search::{MatchKind, Span},
+};
+
+#[derive(Clone, Debug)]
+pub(crate) struct AhoCorasick {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ _unused: (),
+ #[cfg(feature = "perf-literal-multisubstring")]
+ ac: aho_corasick::AhoCorasick,
+}
+
+impl AhoCorasick {
+ pub(crate) fn new<B: AsRef<[u8]>>(
+ kind: MatchKind,
+ needles: &[B],
+ ) -> Option<AhoCorasick> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ None
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ // We used to use `aho_corasick::MatchKind::Standard` here when
+ // `kind` was `MatchKind::All`, but this is not correct. The
+ // "standard" Aho-Corasick match semantics are to report a match
+            // as soon as it is seen, but `All` isn't like that.
+ // In particular, with "standard" semantics, given the needles
+ // "abc" and "b" and the haystack "abc," it would report a match
+ // at offset 1 before a match at offset 0. This is never what we
+ // want in the context of the regex engine, regardless of whether
+ // we have leftmost-first or 'all' semantics. Namely, we always
+ // want the leftmost match.
+ let ac_match_kind = match kind {
+ MatchKind::LeftmostFirst | MatchKind::All => {
+ aho_corasick::MatchKind::LeftmostFirst
+ }
+ };
+            // This is kind of just an arbitrary number, but basically, if we
+            // have a small enough set of literals, then we try to use the
+            // VERY memory-hungry DFA. Otherwise, we wimp out and use an NFA.
+            // The upshot is that the NFA is quite lean and decently fast.
+            // Faster than a naive Aho-Corasick NFA anyway.
+ let ac_kind = if needles.len() <= 500 {
+ aho_corasick::AhoCorasickKind::DFA
+ } else {
+ aho_corasick::AhoCorasickKind::ContiguousNFA
+ };
+ let result = aho_corasick::AhoCorasick::builder()
+ .kind(Some(ac_kind))
+ .match_kind(ac_match_kind)
+ .start_kind(aho_corasick::StartKind::Both)
+ // We try to handle all of the prefilter cases in the super
+ // module, and only use Aho-Corasick for the actual automaton.
+ // The aho-corasick crate does have some extra prefilters,
+ // namely, looking for rare bytes to feed to memchr{,2,3}
+ // instead of just the first byte. If we end up wanting
+ // those---and they are somewhat tricky to implement---then
+ // we could port them to this crate.
+ //
+ // The main reason for doing things this way is so we have a
+ // complete and easy to understand picture of which prefilters
+ // are available and how they work. Otherwise it seems too
+ // easy to get into a situation where we have a prefilter
+ // layered on top of prefilter, and that might have unintended
+ // consequences.
+ .prefilter(false)
+ .build(needles);
+ let ac = match result {
+ Ok(ac) => ac,
+ Err(_err) => {
+ debug!("aho-corasick prefilter failed to build: {}", _err);
+ return None;
+ }
+ };
+ Some(AhoCorasick { ac })
+ }
+ }
+}
+
+impl PrefilterI for AhoCorasick {
+ fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ let input =
+ aho_corasick::Input::new(haystack).span(span.start..span.end);
+ self.ac
+ .find(input)
+ .map(|m| Span { start: m.start(), end: m.end() })
+ }
+ }
+
+ fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ let input = aho_corasick::Input::new(haystack)
+ .anchored(aho_corasick::Anchored::Yes)
+ .span(span.start..span.end);
+ self.ac
+ .find(input)
+ .map(|m| Span { start: m.start(), end: m.end() })
+ }
+ }
+
+ fn memory_usage(&self) -> usize {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ self.ac.memory_usage()
+ }
+ }
+
+ fn is_fast(&self) -> bool {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ // Aho-Corasick is never considered "fast" because it's never
+ // going to be even close to an order of magnitude faster than the
+ // regex engine itself (assuming a DFA is used). In fact, it is
+ // usually slower. The magic of Aho-Corasick is that it can search
+ // a *large* number of literals with a relatively small amount of
+ // memory. The regex engines are far more wasteful.
+ //
+ // Aho-Corasick may be "fast" when the regex engine corresponds
+ // to, say, the PikeVM. That happens when the lazy DFA couldn't be
+ // built or used for some reason. But in these cases, the regex
+ // itself is likely quite big and we're probably hosed no matter
+ // what we do. (In this case, the best bet is for the caller to
+ // increase some of the memory limits on the hybrid cache capacity
+ // and hope that's enough.)
+ false
+ }
+ }
+}
diff --git a/vendor/regex-automata/src/util/prefilter/byteset.rs b/vendor/regex-automata/src/util/prefilter/byteset.rs
new file mode 100644
index 000000000..a669d6c9d
--- /dev/null
+++ b/vendor/regex-automata/src/util/prefilter/byteset.rs
@@ -0,0 +1,58 @@
+use crate::util::{
+ prefilter::PrefilterI,
+ search::{MatchKind, Span},
+};
+
+#[derive(Clone, Debug)]
+pub(crate) struct ByteSet([bool; 256]);
+
+impl ByteSet {
+ pub(crate) fn new<B: AsRef<[u8]>>(
+ _kind: MatchKind,
+ needles: &[B],
+ ) -> Option<ByteSet> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ None
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ let mut set = [false; 256];
+ for needle in needles.iter() {
+ let needle = needle.as_ref();
+ if needle.len() != 1 {
+ return None;
+ }
+ set[usize::from(needle[0])] = true;
+ }
+ Some(ByteSet(set))
+ }
+ }
+}
+
+impl PrefilterI for ByteSet {
+ fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ haystack[span].iter().position(|&b| self.0[usize::from(b)]).map(|i| {
+ let start = span.start + i;
+ let end = start + 1;
+ Span { start, end }
+ })
+ }
+
+ fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ let b = *haystack.get(span.start)?;
+ if self.0[usize::from(b)] {
+ Some(Span { start: span.start, end: span.start + 1 })
+ } else {
+ None
+ }
+ }
+
+ fn memory_usage(&self) -> usize {
+ 0
+ }
+
+ fn is_fast(&self) -> bool {
+ false
+ }
+}
diff --git a/vendor/regex-automata/src/util/prefilter/memchr.rs b/vendor/regex-automata/src/util/prefilter/memchr.rs
new file mode 100644
index 000000000..3d44b8372
--- /dev/null
+++ b/vendor/regex-automata/src/util/prefilter/memchr.rs
@@ -0,0 +1,186 @@
+use crate::util::{
+ prefilter::PrefilterI,
+ search::{MatchKind, Span},
+};
+
+#[derive(Clone, Debug)]
+pub(crate) struct Memchr(u8);
+
+impl Memchr {
+ pub(crate) fn new<B: AsRef<[u8]>>(
+ _kind: MatchKind,
+ needles: &[B],
+ ) -> Option<Memchr> {
+ #[cfg(not(feature = "perf-literal-substring"))]
+ {
+ None
+ }
+ #[cfg(feature = "perf-literal-substring")]
+ {
+ if needles.len() != 1 {
+ return None;
+ }
+ if needles[0].as_ref().len() != 1 {
+ return None;
+ }
+ Some(Memchr(needles[0].as_ref()[0]))
+ }
+ }
+}
+
+impl PrefilterI for Memchr {
+ fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "perf-literal-substring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-substring")]
+ {
+ memchr::memchr(self.0, &haystack[span]).map(|i| {
+ let start = span.start + i;
+ let end = start + 1;
+ Span { start, end }
+ })
+ }
+ }
+
+ fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ let b = *haystack.get(span.start)?;
+ if self.0 == b {
+ Some(Span { start: span.start, end: span.start + 1 })
+ } else {
+ None
+ }
+ }
+
+ fn memory_usage(&self) -> usize {
+ 0
+ }
+
+ fn is_fast(&self) -> bool {
+ true
+ }
+}
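+
+// Illustrative sketch, not part of the vendored source: the offset
+// arithmetic used by the memchr-based prefilters above. The search runs
+// over haystack[span] and yields a span-relative index, which is shifted
+// by span.start to get back into absolute haystack coordinates.
+#[cfg(feature = "perf-literal-substring")]
+fn _memchr_offset_sketch() {
+    let haystack = b"xxabc";
+    let span = Span::from(2..haystack.len());
+    let pre = Memchr::new(MatchKind::LeftmostFirst, &["b"]).unwrap();
+    // memchr reports 'b' at relative index 1 within "abc"; adding
+    // span.start (2) yields the absolute span 3..4.
+    assert_eq!(Some(Span::from(3..4)), pre.find(haystack, span));
+}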
+
+#[derive(Clone, Debug)]
+pub(crate) struct Memchr2(u8, u8);
+
+impl Memchr2 {
+ pub(crate) fn new<B: AsRef<[u8]>>(
+ _kind: MatchKind,
+ needles: &[B],
+ ) -> Option<Memchr2> {
+ #[cfg(not(feature = "perf-literal-substring"))]
+ {
+ None
+ }
+ #[cfg(feature = "perf-literal-substring")]
+ {
+ if needles.len() != 2 {
+ return None;
+ }
+ if !needles.iter().all(|n| n.as_ref().len() == 1) {
+ return None;
+ }
+ let b1 = needles[0].as_ref()[0];
+ let b2 = needles[1].as_ref()[0];
+ Some(Memchr2(b1, b2))
+ }
+ }
+}
+
+impl PrefilterI for Memchr2 {
+ fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "perf-literal-substring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-substring")]
+ {
+ memchr::memchr2(self.0, self.1, &haystack[span]).map(|i| {
+ let start = span.start + i;
+ let end = start + 1;
+ Span { start, end }
+ })
+ }
+ }
+
+ fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ let b = *haystack.get(span.start)?;
+ if self.0 == b || self.1 == b {
+ Some(Span { start: span.start, end: span.start + 1 })
+ } else {
+ None
+ }
+ }
+
+ fn memory_usage(&self) -> usize {
+ 0
+ }
+
+ fn is_fast(&self) -> bool {
+ true
+ }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct Memchr3(u8, u8, u8);
+
+impl Memchr3 {
+ pub(crate) fn new<B: AsRef<[u8]>>(
+ _kind: MatchKind,
+ needles: &[B],
+ ) -> Option<Memchr3> {
+ #[cfg(not(feature = "perf-literal-substring"))]
+ {
+ None
+ }
+ #[cfg(feature = "perf-literal-substring")]
+ {
+ if needles.len() != 3 {
+ return None;
+ }
+ if !needles.iter().all(|n| n.as_ref().len() == 1) {
+ return None;
+ }
+ let b1 = needles[0].as_ref()[0];
+ let b2 = needles[1].as_ref()[0];
+ let b3 = needles[2].as_ref()[0];
+ Some(Memchr3(b1, b2, b3))
+ }
+ }
+}
+
+impl PrefilterI for Memchr3 {
+ fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "perf-literal-substring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-substring")]
+ {
+ memchr::memchr3(self.0, self.1, self.2, &haystack[span]).map(|i| {
+ let start = span.start + i;
+ let end = start + 1;
+ Span { start, end }
+ })
+ }
+ }
+
+ fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ let b = *haystack.get(span.start)?;
+ if self.0 == b || self.1 == b || self.2 == b {
+ Some(Span { start: span.start, end: span.start + 1 })
+ } else {
+ None
+ }
+ }
+
+ fn memory_usage(&self) -> usize {
+ 0
+ }
+
+ fn is_fast(&self) -> bool {
+ true
+ }
+}
diff --git a/vendor/regex-automata/src/util/prefilter/memmem.rs b/vendor/regex-automata/src/util/prefilter/memmem.rs
new file mode 100644
index 000000000..deea17bd9
--- /dev/null
+++ b/vendor/regex-automata/src/util/prefilter/memmem.rs
@@ -0,0 +1,88 @@
+use crate::util::{
+ prefilter::PrefilterI,
+ search::{MatchKind, Span},
+};
+
+#[derive(Clone, Debug)]
+pub(crate) struct Memmem {
+ #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
+ _unused: (),
+ #[cfg(all(feature = "std", feature = "perf-literal-substring"))]
+ finder: memchr::memmem::Finder<'static>,
+}
+
+impl Memmem {
+ pub(crate) fn new<B: AsRef<[u8]>>(
+ _kind: MatchKind,
+ needles: &[B],
+ ) -> Option<Memmem> {
+ #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
+ {
+ None
+ }
+ #[cfg(all(feature = "std", feature = "perf-literal-substring"))]
+ {
+ if needles.len() != 1 {
+ return None;
+ }
+ let needle = needles[0].as_ref();
+ let finder = memchr::memmem::Finder::new(needle).into_owned();
+ Some(Memmem { finder })
+ }
+ }
+}
+
+impl PrefilterI for Memmem {
+ fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
+ {
+ unreachable!()
+ }
+ #[cfg(all(feature = "std", feature = "perf-literal-substring"))]
+ {
+ self.finder.find(&haystack[span]).map(|i| {
+ let start = span.start + i;
+ let end = start + self.finder.needle().len();
+ Span { start, end }
+ })
+ }
+ }
+
+ fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
+ {
+ unreachable!()
+ }
+ #[cfg(all(feature = "std", feature = "perf-literal-substring"))]
+ {
+ let needle = self.finder.needle();
+ if haystack[span].starts_with(needle) {
+ Some(Span { end: span.start + needle.len(), ..span })
+ } else {
+ None
+ }
+ }
+ }
+
+ fn memory_usage(&self) -> usize {
+ #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
+ {
+ unreachable!()
+ }
+ #[cfg(all(feature = "std", feature = "perf-literal-substring"))]
+ {
+ self.finder.needle().len()
+ }
+ }
+
+ fn is_fast(&self) -> bool {
+ #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
+ {
+ unreachable!()
+ }
+ #[cfg(all(feature = "std", feature = "perf-literal-substring"))]
+ {
+ true
+ }
+ }
+}
diff --git a/vendor/regex-automata/src/util/prefilter/mod.rs b/vendor/regex-automata/src/util/prefilter/mod.rs
new file mode 100644
index 000000000..51fc92233
--- /dev/null
+++ b/vendor/regex-automata/src/util/prefilter/mod.rs
@@ -0,0 +1,696 @@
+/*!
+Defines a prefilter for accelerating regex searches.
+
+A prefilter can be created by building a [`Prefilter`] value.
+
+A prefilter represents one of the most important optimizations available for
+accelerating regex searches. The idea of a prefilter is to very quickly find
+candidate locations in a haystack where a regex _could_ match. Once a candidate
+is found, it is then intended for the regex engine to run at that position to
+determine whether the candidate is a match or a false positive.
+
+The description above also hints at how the prefilter optimization can fail.
+Namely, if a prefilter has a high false positive rate and produces lots of
+candidates, then it can make a regex search slower overall.
+It can run more slowly because more time is spent ping-ponging between the
+prefilter search and the regex engine attempting to confirm each candidate as
+a match. This ping-ponging has overhead that adds up, and is exacerbated by
+a high false positive rate.
+
+Nevertheless, the optimization is still generally worth performing in most
+cases, particularly given just how much throughput can be improved. (It is not
+uncommon for prefilter optimizations to improve throughput by one or two
+orders of magnitude.)
+
+Typically a prefilter is used to find occurrences of literal prefixes from a
+regex pattern, but this isn't required. A prefilter can be used to look for
+suffixes or even inner literals.
+
+Note that as of now, prefilters throw away information about which pattern
+each literal comes from. In other words, when a prefilter finds a match,
+there's no way to know which pattern (or patterns) it came from. Therefore,
+in order to confirm a match, you'll have to check all of the patterns by
+running the full regex engine.
+*/
+
+mod aho_corasick;
+mod byteset;
+mod memchr;
+mod memmem;
+mod teddy;
+
+use core::{
+ borrow::Borrow,
+ fmt::Debug,
+ panic::{RefUnwindSafe, UnwindSafe},
+};
+
+#[cfg(feature = "alloc")]
+use alloc::sync::Arc;
+
+#[cfg(feature = "syntax")]
+use regex_syntax::hir::{literal, Hir};
+
+use crate::util::search::{MatchKind, Span};
+
+pub(crate) use crate::util::prefilter::{
+ aho_corasick::AhoCorasick,
+ byteset::ByteSet,
+ memchr::{Memchr, Memchr2, Memchr3},
+ memmem::Memmem,
+ teddy::Teddy,
+};
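+
+// Illustrative sketch, not part of the vendored source: the candidate/
+// confirm loop described in the module docs above. The prefilter proposes
+// candidate spans and a caller-supplied 'confirm' closure (standing in
+// for a real regex engine) verifies each one, since prefilters may report
+// false positives but never false negatives.
+#[cfg(feature = "alloc")]
+fn _candidate_confirm_sketch(
+    pre: &Prefilter,
+    haystack: &[u8],
+    confirm: impl Fn(usize) -> Option<Span>,
+) -> Option<Span> {
+    let mut span = Span::from(0..haystack.len());
+    while let Some(candidate) = pre.find(haystack, span) {
+        if let Some(m) = confirm(candidate.start) {
+            return Some(m);
+        }
+        // False positive: skip past it and keep scanning.
+        span.start = candidate.start + 1;
+        if span.start >= span.end {
+            break;
+        }
+    }
+    None
+}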
+
+/// A prefilter for accelerating regex searches.
+///
+/// If you already have your literals that you want to search with,
+/// then the vanilla [`Prefilter::new`] constructor is for you. But
+/// if you have an [`Hir`] value from the `regex-syntax` crate, then
+/// [`Prefilter::from_hir_prefix`] might be more convenient. Namely, it uses
+/// the [`regex-syntax::hir::literal`](regex_syntax::hir::literal) module to
+/// extract literal prefixes for you, optimize them and then select and build a
+/// prefilter matcher.
+///
+/// A prefilter must have **zero false negatives**. However, by its very
+/// nature, it may produce false positives. That is, a prefilter will never
+/// skip over a position in the haystack that corresponds to a match of the
+/// original regex pattern, but it *may* produce a match for a position
+/// in the haystack that does *not* correspond to a match of the original
+/// regex pattern. If you use either the [`Prefilter::from_hir_prefix`] or
+/// [`Prefilter::from_hirs_prefix`] constructors, then this guarantee is
+/// upheld for you automatically. This guarantee is not preserved if you use
+/// [`Prefilter::new`] though, since it is up to the caller to provide correct
+/// literal strings with respect to the original regex pattern.
+///
+/// # Cloning
+///
+/// It is an API guarantee that cloning a prefilter is cheap. That is, cloning
+/// it will not duplicate whatever heap memory is used to represent the
+/// underlying matcher.
+///
+/// # Example
+///
+/// This example shows how to attach a `Prefilter` to the
+/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) in order to accelerate
+/// searches.
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::pikevm::PikeVM,
+/// util::prefilter::Prefilter,
+/// Match, MatchKind,
+/// };
+///
+/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Bruce "])
+/// .expect("a prefilter");
+/// let re = PikeVM::builder()
+/// .configure(PikeVM::config().prefilter(Some(pre)))
+/// .build(r"Bruce \w+")?;
+/// let mut cache = re.create_cache();
+/// assert_eq!(
+/// Some(Match::must(0, 6..23)),
+/// re.find(&mut cache, "Hello Bruce Springsteen!"),
+/// );
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// But note that if you get your prefilter incorrect, it could lead to an
+/// incorrect result!
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::pikevm::PikeVM,
+/// util::prefilter::Prefilter,
+/// Match, MatchKind,
+/// };
+///
+/// // This prefilter is wrong!
+/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Patti "])
+/// .expect("a prefilter");
+/// let re = PikeVM::builder()
+/// .configure(PikeVM::config().prefilter(Some(pre)))
+/// .build(r"Bruce \w+")?;
+/// let mut cache = re.create_cache();
+/// // We find no match even though the regex does match.
+/// assert_eq!(
+/// None,
+/// re.find(&mut cache, "Hello Bruce Springsteen!"),
+/// );
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Prefilter {
+ #[cfg(not(feature = "alloc"))]
+ _unused: (),
+ #[cfg(feature = "alloc")]
+ pre: Arc<dyn PrefilterI>,
+ #[cfg(feature = "alloc")]
+ is_fast: bool,
+}
+
+impl Prefilter {
+ /// Create a new prefilter from a sequence of needles and a corresponding
+ /// match semantics.
+ ///
+ /// This may return `None` for a variety of reasons, for example, if
+ /// a suitable prefilter could not be constructed. That might occur
+ /// if they are unavailable (e.g., the `perf-literal-substring` and
+ /// `perf-literal-multisubstring` features aren't enabled), or it might
+ /// occur because of heuristics or other artifacts of how the prefilter
+ /// works.
+ ///
+ /// Note that if you have an [`Hir`] expression, it may be more convenient
+ /// to use [`Prefilter::from_hir_prefix`]. It will automatically handle the
+ /// task of extracting prefix literals for you.
+ ///
+ /// # Example
+ ///
+ /// This example shows how match semantics can impact the matching
+ /// algorithm used by the prefilter. For this reason, it is important to
+ /// ensure that the match semantics given here are consistent with the
+ /// match semantics intended for the regular expression that the literals
+ /// were extracted from.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// util::{prefilter::Prefilter, syntax},
+ /// MatchKind, Span,
+ /// };
+ ///
+ /// let hay = "Hello samwise";
+ ///
+ /// // With leftmost-first, we find 'samwise' here because it comes
+    /// // before 'sam' in the sequence we give it.
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["samwise", "sam"])
+ /// .expect("a prefilter");
+ /// assert_eq!(
+ /// Some(Span::from(6..13)),
+ /// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
+ /// );
+    /// // Still with leftmost-first but with the literals reversed, now 'sam'
+ /// // will match instead!
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["sam", "samwise"])
+ /// .expect("a prefilter");
+ /// assert_eq!(
+ /// Some(Span::from(6..9)),
+ /// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new<B: AsRef<[u8]>>(
+ kind: MatchKind,
+ needles: &[B],
+ ) -> Option<Prefilter> {
+ Choice::new(kind, needles).and_then(Prefilter::from_choice)
+ }
+
+    /// This turns a prefilter selection into a `Prefilter`. That is, it turns
+    /// the enum given into a trait object.
+ fn from_choice(choice: Choice) -> Option<Prefilter> {
+ #[cfg(not(feature = "alloc"))]
+ {
+ None
+ }
+ #[cfg(feature = "alloc")]
+ {
+ let pre: Arc<dyn PrefilterI> = match choice {
+ Choice::Memchr(p) => Arc::new(p),
+ Choice::Memchr2(p) => Arc::new(p),
+ Choice::Memchr3(p) => Arc::new(p),
+ Choice::Memmem(p) => Arc::new(p),
+ Choice::Teddy(p) => Arc::new(p),
+ Choice::ByteSet(p) => Arc::new(p),
+ Choice::AhoCorasick(p) => Arc::new(p),
+ };
+ let is_fast = pre.is_fast();
+ Some(Prefilter { pre, is_fast })
+ }
+ }
+
+ /// This attempts to extract prefixes from the given `Hir` expression for
+ /// the given match semantics, and if possible, builds a prefilter for
+ /// them.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a prefilter directly from an [`Hir`]
+    /// expression, and use it to find an occurrence of a prefix from the regex
+ /// pattern.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// util::{prefilter::Prefilter, syntax},
+ /// MatchKind, Span,
+ /// };
+ ///
+ /// let hir = syntax::parse(r"(Bruce|Patti) \w+")?;
+ /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
+ /// .expect("a prefilter");
+ /// let hay = "Hello Patti Scialfa!";
+ /// assert_eq!(
+ /// Some(Span::from(6..12)),
+ /// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn from_hir_prefix(kind: MatchKind, hir: &Hir) -> Option<Prefilter> {
+ Prefilter::from_hirs_prefix(kind, &[hir])
+ }
+
+ /// This attempts to extract prefixes from the given `Hir` expressions for
+ /// the given match semantics, and if possible, builds a prefilter for
+ /// them.
+ ///
+ /// Note that as of now, prefilters throw away information about which
+ /// pattern each literal comes from. In other words, when a prefilter finds
+ /// a match, there's no way to know which pattern (or patterns) it came
+ /// from. Therefore, in order to confirm a match, you'll have to check all
+ /// of the patterns by running the full regex engine.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a prefilter directly from multiple
+    /// `Hir` expressions, and use it to find an occurrence of a
+ /// prefix from the regex patterns.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// util::{prefilter::Prefilter, syntax},
+ /// MatchKind, Span,
+ /// };
+ ///
+ /// let hirs = syntax::parse_many(&[
+ /// r"(Bruce|Patti) \w+",
+ /// r"Mrs?\. Doubtfire",
+ /// ])?;
+ /// let pre = Prefilter::from_hirs_prefix(MatchKind::LeftmostFirst, &hirs)
+ /// .expect("a prefilter");
+ /// let hay = "Hello Mrs. Doubtfire";
+ /// assert_eq!(
+ /// Some(Span::from(6..20)),
+ /// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn from_hirs_prefix<H: Borrow<Hir>>(
+ kind: MatchKind,
+ hirs: &[H],
+ ) -> Option<Prefilter> {
+ prefixes(kind, hirs)
+ .literals()
+ .and_then(|lits| Prefilter::new(kind, lits))
+ }
+
+    /// Run this prefilter on `haystack[span.start..span.end]` and return a matching
+ /// span if one exists.
+ ///
+ /// The span returned is guaranteed to have a start position greater than
+ /// or equal to the one given, and an end position less than or equal to
+ /// the one given.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a prefilter directly from an [`Hir`]
+ /// expression, and use it to find an occurrence of a prefix from the regex
+ /// pattern.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// util::{prefilter::Prefilter, syntax},
+ /// MatchKind, Span,
+ /// };
+ ///
+ /// let hir = syntax::parse(r"Bruce \w+")?;
+ /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
+ /// .expect("a prefilter");
+ /// let hay = "Hello Bruce Springsteen!";
+ /// assert_eq!(
+ /// Some(Span::from(6..12)),
+ /// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "alloc"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "alloc")]
+ {
+ self.pre.find(haystack, span)
+ }
+ }
+
+ /// Returns the span of a prefix of `haystack[span.start..span.end]` if
+ /// the prefilter matches.
+ ///
+ /// The span returned is guaranteed to have a start position equivalent to
+ /// the one given, and an end position less than or equal to the one given.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a prefilter directly from an [`Hir`]
+ /// expression, and use it to find an occurrence of a prefix from the regex
+ /// pattern that begins at the start of a haystack only.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// util::{prefilter::Prefilter, syntax},
+ /// MatchKind, Span,
+ /// };
+ ///
+ /// let hir = syntax::parse(r"Bruce \w+")?;
+ /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
+ /// .expect("a prefilter");
+ /// let hay = "Hello Bruce Springsteen!";
+ /// // Nothing is found here because 'Bruce' does
+ /// // not occur at the beginning of our search.
+ /// assert_eq!(
+ /// None,
+ /// pre.prefix(hay.as_bytes(), Span::from(0..hay.len())),
+ /// );
+ /// // But if we change where we start the search
+ /// // to begin where 'Bruce ' begins, then a
+ /// // match will be found.
+ /// assert_eq!(
+ /// Some(Span::from(6..12)),
+ /// pre.prefix(hay.as_bytes(), Span::from(6..hay.len())),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "alloc"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "alloc")]
+ {
+ self.pre.prefix(haystack, span)
+ }
+ }
+
+ /// Returns the heap memory, in bytes, used by the underlying prefilter.
+ #[inline]
+ pub fn memory_usage(&self) -> usize {
+ #[cfg(not(feature = "alloc"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "alloc")]
+ {
+ self.pre.memory_usage()
+ }
+ }
+
+ /// Implementations might return true here if they believe themselves to
+ /// be "fast." The concept of "fast" is deliberately left vague, but in
+ /// practice this usually corresponds to whether it's believed that SIMD
+ /// will be used.
+ ///
+ /// Why do we care about this? Well, some prefilter tricks tend to come
+ /// with their own bits of overhead, and so might only make sense if we
+ /// know that a scan will be *much* faster than the regex engine itself.
+ /// Otherwise, the trick may not be worth doing. Whether something is
+ /// "much" faster than the regex engine generally boils down to whether
+ /// SIMD is used. (But not always. Even a SIMD matcher with a high false
+ /// positive rate can become quite slow.)
+ ///
+ /// Even if this returns true, it is still possible for the prefilter to
+ /// be "slow." Remember, prefilters are just heuristics. We can't really
+ /// *know* a prefilter will be fast without actually trying the prefilter.
+ /// (Which of course we cannot afford to do.)
+ #[inline]
+ pub(crate) fn is_fast(&self) -> bool {
+ #[cfg(not(feature = "alloc"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "alloc")]
+ {
+ self.is_fast
+ }
+ }
+}
+
+/// A trait for abstracting over prefilters. Basically, a prefilter is
+/// something that can do an unanchored *and* an anchored search in a haystack
+/// within a given span.
+///
+/// This exists pretty much only so that we can use prefilters as a trait
+/// object (which is what `Prefilter` is). If we ever move off of trait objects
+/// and to an enum, then it's likely this trait could be removed.
+pub(crate) trait PrefilterI:
+ Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static
+{
+    /// Run this prefilter on `haystack[span.start..span.end]` and return a matching
+ /// span if one exists.
+ ///
+ /// The span returned is guaranteed to have a start position greater than
+ /// or equal to the one given, and an end position less than or equal to
+ /// the one given.
+ fn find(&self, haystack: &[u8], span: Span) -> Option<Span>;
+
+ /// Returns the span of a prefix of `haystack[span.start..span.end]` if
+ /// the prefilter matches.
+ ///
+ /// The span returned is guaranteed to have a start position equivalent to
+ /// the one given, and an end position less than or equal to the one given.
+ fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span>;
+
+ /// Returns the heap memory, in bytes, used by the underlying prefilter.
+ fn memory_usage(&self) -> usize;
+
+ /// Implementations might return true here if they believe themselves to
+ /// be "fast." See [`Prefilter::is_fast`] for more details.
+ fn is_fast(&self) -> bool;
+}
+
+#[cfg(feature = "alloc")]
+impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ (&**self).find(haystack, span)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ (&**self).prefix(haystack, span)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn memory_usage(&self) -> usize {
+ (&**self).memory_usage()
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_fast(&self) -> bool {
+ (&**self).is_fast()
+ }
+}
+
+/// A type that encapsulates the selection of a prefilter algorithm from a
+/// sequence of needles.
+///
+/// The existence of this type is a little tricky, because we don't (currently)
+/// use it for performing a search. Instead, we really only consume it by
+/// converting the underlying prefilter into a trait object, whether that be
+/// `dyn PrefilterI` or `dyn Strategy` (for the meta regex engine). In order
+/// to avoid re-copying the prefilter selection logic, we isolate it here, and
+/// then force anything downstream that wants to convert it to a trait object
+/// to do trivial case analysis on it.
+///
+/// One wonders whether we *should* use an enum instead of a trait object.
+/// At time of writing, I chose trait objects based on instinct because 1) I
+/// knew I wasn't going to inline anything and 2) there would potentially be
+/// many different choices. However, as of time of writing, I haven't actually
+/// compared the trait object approach to the enum approach. That probably
+/// should be litigated, but I ran out of steam.
+///
+/// Note that if the `alloc` feature is disabled, then values of this type
+/// are (and should) never be constructed. Also, in practice, for any of the
+/// prefilters to be selected, you'll need at least one of the `perf-literal-*`
+/// features enabled.
+#[derive(Clone, Debug)]
+pub(crate) enum Choice {
+ Memchr(Memchr),
+ Memchr2(Memchr2),
+ Memchr3(Memchr3),
+ Memmem(Memmem),
+ Teddy(Teddy),
+ ByteSet(ByteSet),
+ AhoCorasick(AhoCorasick),
+}
+
+impl Choice {
+ /// Select what is believed to be the best prefilter algorithm for the
+ /// match semantics and sequence of needles given.
+ ///
+ /// This selection algorithm uses the needles as given without any
+ /// modification. For example, if `[bar]` is given, then this doesn't
+ /// try to select `memchr` for `b`. Instead, it would select `memmem`
+    /// for `bar`. If callers want `memchr` selected for `[bar]`, then
+    /// callers should massage the literals themselves. That is, callers are
+ /// responsible for heuristics surrounding which sequence of literals is
+ /// best.
+ ///
+ /// What this selection algorithm does is attempt to use the fastest
+    /// prefilter that works for the literals given. So if `[a, b]` is given,
+ /// then `memchr2` is selected.
+ ///
+ /// Of course, which prefilter is selected is also subject to what
+ /// is available. For example, if `alloc` isn't enabled, then
+ /// that limits which prefilters can be selected. Similarly, if
+ /// `perf-literal-substring` isn't enabled, then nothing from the `memchr`
+ /// crate can be returned.
+ pub(crate) fn new<B: AsRef<[u8]>>(
+ kind: MatchKind,
+ needles: &[B],
+ ) -> Option<Choice> {
+ // An empty set means the regex matches nothing, so no sense in
+ // building a prefilter.
+ if needles.len() == 0 {
+ debug!("prefilter building failed: found empty set of literals");
+ return None;
+ }
+ // If the regex can match the empty string, then the prefilter
+ // will by definition match at every position. This is obviously
+ // completely ineffective.
+ if needles.iter().any(|n| n.as_ref().is_empty()) {
+ debug!("prefilter building failed: literals match empty string");
+ return None;
+ }
+ // BREADCRUMBS: Perhaps the literal optimizer should special case
+ // sequences of length two or three if the leading bytes of each are
+ // "rare"? Or perhaps, if there are two or three total possible leading
+ // bytes, regardless of the number of literals, and all are rare...
+ // Then well, perhaps we should use memchr2 or memchr3 in those cases?
+ if let Some(pre) = Memchr::new(kind, needles) {
+ debug!("prefilter built: memchr");
+ return Some(Choice::Memchr(pre));
+ }
+ if let Some(pre) = Memchr2::new(kind, needles) {
+ debug!("prefilter built: memchr2");
+ return Some(Choice::Memchr2(pre));
+ }
+ if let Some(pre) = Memchr3::new(kind, needles) {
+ debug!("prefilter built: memchr3");
+ return Some(Choice::Memchr3(pre));
+ }
+ if let Some(pre) = Memmem::new(kind, needles) {
+ debug!("prefilter built: memmem");
+ return Some(Choice::Memmem(pre));
+ }
+ if let Some(pre) = Teddy::new(kind, needles) {
+ debug!("prefilter built: teddy");
+ return Some(Choice::Teddy(pre));
+ }
+ if let Some(pre) = ByteSet::new(kind, needles) {
+ debug!("prefilter built: byteset");
+ return Some(Choice::ByteSet(pre));
+ }
+ if let Some(pre) = AhoCorasick::new(kind, needles) {
+ debug!("prefilter built: aho-corasick");
+ return Some(Choice::AhoCorasick(pre));
+ }
+ debug!("prefilter building failed: no strategy could be found");
+ None
+ }
+}
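+
+// Illustrative sketch, not part of the vendored source: the selection
+// cascade above always prefers the cheapest applicable strategy, so two
+// single-byte needles select memchr2 before memmem, Teddy, or
+// Aho-Corasick is ever considered.
+#[cfg(feature = "perf-literal-substring")]
+fn _choice_selection_sketch() {
+    match Choice::new(MatchKind::LeftmostFirst, &["a", "b"]) {
+        // Two needles, each a single byte => memchr2 wins.
+        Some(Choice::Memchr2(_)) => {}
+        _ => panic!("expected memchr2 for two single-byte needles"),
+    }
+}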
+
+/// Extracts all of the prefix literals from the given HIR expressions into a
+/// single `Seq`. The literals in the sequence are ordered with respect to the
+/// order of the given HIR expressions and consistent with the match semantics
+/// given.
+///
+/// The sequence returned is "optimized." That is, they may be shrunk or even
+/// truncated according to heuristics with the intent of making them more
+/// useful as a prefilter. (Which translates to both using faster algorithms
+/// and minimizing the false positive rate.)
+///
+/// Note that this erases any connection between the literals and which pattern
+/// (or patterns) they came from.
+///
+/// The match kind given must correspond to the match semantics of the regex
+/// that is represented by the HIRs given. The match semantics may change the
+/// literal sequence returned.
+#[cfg(feature = "syntax")]
+pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq
+where
+ H: core::borrow::Borrow<Hir>,
+{
+ let mut extractor = literal::Extractor::new();
+ extractor.kind(literal::ExtractKind::Prefix);
+
+ let mut prefixes = literal::Seq::empty();
+ for hir in hirs {
+ prefixes.union(&mut extractor.extract(hir.borrow()));
+ }
+ debug!(
+ "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
+ prefixes.len(),
+ prefixes.is_exact(),
+ prefixes
+ );
+ match kind {
+ MatchKind::All => {
+ prefixes.sort();
+ prefixes.dedup();
+ }
+ MatchKind::LeftmostFirst => {
+ prefixes.optimize_for_prefix_by_preference();
+ }
+ }
+ debug!(
+ "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
+ prefixes.len(),
+ prefixes.is_exact(),
+ prefixes
+ );
+ prefixes
+}
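+
+// Illustrative sketch, not part of the vendored source: feeding the
+// extracted prefixes into Prefilter::new. For a pattern like
+// "(foo|bar)baz", prefix extraction yields literals such as "foobaz" and
+// "barbaz" (subject to the optimization heuristics above), which then
+// drive prefilter selection.
+#[cfg(feature = "syntax")]
+fn _prefixes_sketch() -> Option<Prefilter> {
+    let hir = crate::util::syntax::parse(r"(foo|bar)baz").ok()?;
+    let seq = prefixes(MatchKind::LeftmostFirst, &[&hir]);
+    // 'literals' returns None when the extracted sequence is infinite.
+    let lits = seq.literals()?;
+    Prefilter::new(MatchKind::LeftmostFirst, lits)
+}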
+
+/// Like `prefixes`, but for all suffixes of all matches for the given HIRs.
+#[cfg(feature = "syntax")]
+pub(crate) fn suffixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq
+where
+ H: core::borrow::Borrow<Hir>,
+{
+ let mut extractor = literal::Extractor::new();
+ extractor.kind(literal::ExtractKind::Suffix);
+
+ let mut suffixes = literal::Seq::empty();
+ for hir in hirs {
+ suffixes.union(&mut extractor.extract(hir.borrow()));
+ }
+ debug!(
+ "suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
+ suffixes.len(),
+ suffixes.is_exact(),
+ suffixes
+ );
+ match kind {
+ MatchKind::All => {
+ suffixes.sort();
+ suffixes.dedup();
+ }
+ MatchKind::LeftmostFirst => {
+ suffixes.optimize_for_suffix_by_preference();
+ }
+ }
+ debug!(
+ "suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
+ suffixes.len(),
+ suffixes.is_exact(),
+ suffixes
+ );
+ suffixes
+}
diff --git a/vendor/regex-automata/src/util/prefilter/teddy.rs b/vendor/regex-automata/src/util/prefilter/teddy.rs
new file mode 100644
index 000000000..fc79f2b2f
--- /dev/null
+++ b/vendor/regex-automata/src/util/prefilter/teddy.rs
@@ -0,0 +1,160 @@
+use crate::util::{
+ prefilter::PrefilterI,
+ search::{MatchKind, Span},
+};
+
+#[derive(Clone, Debug)]
+pub(crate) struct Teddy {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ _unused: (),
+ /// The actual Teddy searcher.
+ ///
+ /// Technically, it's possible that Teddy doesn't actually get used, since
+ /// Teddy does require its haystack to at least be of a certain size
+ /// (usually around the size of whatever vector is being used, so ~16
+ /// or ~32 bytes). For haystacks shorter than that, the implementation
+ /// currently uses Rabin-Karp.
+ #[cfg(feature = "perf-literal-multisubstring")]
+ searcher: aho_corasick::packed::Searcher,
+ /// When running an anchored search, the packed searcher can't handle it so
+ /// we defer to Aho-Corasick itself. Kind of sad, but changing the packed
+ /// searchers to support anchored search would be difficult at worst and
+ /// annoying at best. Since packed searchers only apply to small numbers of
+    /// literals, we content ourselves with the fact that this is not much of
+    /// an added cost.
+ /// (That packed searchers only work with a small number of literals is
+ /// also why we use a DFA here. Otherwise, the memory usage of a DFA would
+ /// likely be unacceptable.)
+ #[cfg(feature = "perf-literal-multisubstring")]
+ anchored_ac: aho_corasick::dfa::DFA,
+ /// The length of the smallest literal we look for.
+ ///
+ /// We use this as a heuristic to figure out whether this will be "fast" or
+ /// not. Generally, the longer the better, because longer needles are more
+ /// discriminating and thus reduce false positive rate.
+ #[cfg(feature = "perf-literal-multisubstring")]
+ minimum_len: usize,
+}
+
+impl Teddy {
+ pub(crate) fn new<B: AsRef<[u8]>>(
+ kind: MatchKind,
+ needles: &[B],
+ ) -> Option<Teddy> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ None
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ // We only really support leftmost-first semantics. In
+ // theory we could at least support leftmost-longest, as the
+ // aho-corasick crate does, but regex-automata doesn't know about
+ // leftmost-longest currently.
+ //
+ // And like the aho-corasick prefilter, if we're using `All`
+ // semantics, then we can still use leftmost semantics for a
+ // prefilter. (This might be a suspicious choice for the literal
+ // engine, which uses a prefilter as a regex engine directly, but
+ // that only happens when using leftmost-first semantics.)
+ let (packed_match_kind, ac_match_kind) = match kind {
+ MatchKind::LeftmostFirst | MatchKind::All => (
+ aho_corasick::packed::MatchKind::LeftmostFirst,
+ aho_corasick::MatchKind::LeftmostFirst,
+ ),
+ };
+ let minimum_len =
+ needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0);
+ let packed = aho_corasick::packed::Config::new()
+ .match_kind(packed_match_kind)
+ .builder()
+ .extend(needles)
+ .build()?;
+ let anchored_ac = aho_corasick::dfa::DFA::builder()
+ .match_kind(ac_match_kind)
+ .start_kind(aho_corasick::StartKind::Anchored)
+ .prefilter(false)
+ .build(needles)
+ .ok()?;
+ Some(Teddy { searcher: packed, anchored_ac, minimum_len })
+ }
+ }
+}
+
+impl PrefilterI for Teddy {
+ fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ let ac_span =
+ aho_corasick::Span { start: span.start, end: span.end };
+ self.searcher
+ .find_in(haystack, ac_span)
+ .map(|m| Span { start: m.start(), end: m.end() })
+ }
+ }
+
+ fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ use aho_corasick::automaton::Automaton;
+ let input = aho_corasick::Input::new(haystack)
+ .anchored(aho_corasick::Anchored::Yes)
+ .span(span.start..span.end);
+ self.anchored_ac
+ .try_find(&input)
+ // OK because we build the DFA with anchored support.
+ .expect("aho-corasick DFA should never fail")
+ .map(|m| Span { start: m.start(), end: m.end() })
+ }
+ }
+
+ fn memory_usage(&self) -> usize {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ use aho_corasick::automaton::Automaton;
+ self.searcher.memory_usage() + self.anchored_ac.memory_usage()
+ }
+ }
+
+ fn is_fast(&self) -> bool {
+ #[cfg(not(feature = "perf-literal-multisubstring"))]
+ {
+ unreachable!()
+ }
+ #[cfg(feature = "perf-literal-multisubstring")]
+ {
+ // Teddy is usually quite fast, but I have seen some cases where
+ // a large number of literals can overwhelm it and make it not so
+ // fast. We make an educated but conservative guess at a limit, at
+ // which point, we're not so comfortable thinking Teddy is "fast."
+ //
+ // Well... this used to incorporate a "limit" on the *number*
+ // of literals, but I have since changed it to a minimum on the
+ // *smallest* literal. Namely, when there is a very small literal
+ // (1 or 2 bytes), it is far more likely that it leads to a higher
+ // false positive rate. (Although, of course, not always. For
+ // example, 'zq' is likely to have a very low false positive rate.)
+ // But when we have 3 bytes, we have a really good chance of being
+ // quite discriminatory and thus fast.
+ //
+ // We may still want to add some kind of limit on the number of
+ // literals here, but keep in mind that Teddy already has its own
+ // somewhat small limit (64 at time of writing). The main issue
+ // here is that if 'is_fast' is false, it opens the door for the
+ // reverse inner optimization to kick in. We really only want to
+ // resort to the reverse inner optimization if we absolutely must.
+ self.minimum_len >= 3
+ }
+ }
+}
diff --git a/vendor/regex-automata/src/util/primitives.rs b/vendor/regex-automata/src/util/primitives.rs
new file mode 100644
index 000000000..5c5d187b0
--- /dev/null
+++ b/vendor/regex-automata/src/util/primitives.rs
@@ -0,0 +1,776 @@
+/*!
+Lower level primitive types that are useful in a variety of circumstances.
+
+# Overview
+
+This list represents the principal types in this module and briefly describes
+when you might want to use them.
+
+* [`PatternID`] - A type that represents the identifier of a regex pattern.
+This is probably the most widely used type in this module (which is why it's
+also re-exported in the crate root).
+* [`StateID`] - A type that represents the identifier of a finite automaton
+state. This is used for both NFAs and DFAs, with the notable exception of
+the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state
+identifier.)
+* [`SmallIndex`] - The internal representation of both a `PatternID` and a
+`StateID`. Its purpose is to serve as a type that can index memory without
+being as big as a `usize` on 64-bit targets. The main idea behind this type
+is that there are many things in regex engines that will, in practice, never
+overflow a 32-bit integer. (For example, the number of patterns in a regex
+or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index
+memory without peppering `as` casts everywhere. Moreover, it forces callers
+to handle errors in the case where, somehow, the value would otherwise overflow
+either a 32-bit integer or a `usize` (e.g., on 16-bit targets).
+* [`NonMaxUsize`] - Represents a `usize` that cannot be `usize::MAX`. As a
+result, `Option<NonMaxUsize>` has the same size in memory as a `usize`. This
+is useful, for example, when representing the offsets of submatches since it
+reduces memory usage by a factor of 2. It is a legal optimization since Rust
+guarantees that slices never have a length that exceeds `isize::MAX`.
+*/
+
+use core::num::NonZeroUsize;
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+use crate::util::int::{Usize, U16, U32, U64};
+
+/// A `usize` that can never be `usize::MAX`.
+///
+/// This is similar to `core::num::NonZeroUsize`, but instead of not permitting
+/// a zero value, this does not permit a max value.
+///
+/// This is useful in certain contexts where one wants to optimize the memory
+/// usage of things that contain match offsets. Namely, since Rust slices
+/// are guaranteed to never have a length exceeding `isize::MAX`, we can use
+/// `usize::MAX` as a sentinel to indicate that no match was found. Indeed,
+/// types like `Option<NonMaxUsize>` have exactly the same size in memory as a
+/// `usize`.
+///
+/// This type is defined to be `repr(transparent)` for
+/// `core::num::NonZeroUsize`, which is in turn defined to be
+/// `repr(transparent)` for `usize`.
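+///
+/// # Example
+///
+/// A minimal sketch of the niche optimization this type enables (using this
+/// module's public path):
+///
+/// ```
+/// use regex_automata::util::primitives::NonMaxUsize;
+///
+/// assert_eq!(Some(5), NonMaxUsize::new(5).map(|n| n.get()));
+/// // usize::MAX is the one forbidden value.
+/// assert_eq!(None, NonMaxUsize::new(usize::MAX));
+/// // The forbidden value acts as a niche, so the Option costs nothing.
+/// let a = core::mem::size_of::<usize>();
+/// let b = core::mem::size_of::<Option<NonMaxUsize>>();
+/// assert_eq!(a, b);
+/// ```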
+#[derive(Clone, Copy, Eq, Hash, PartialEq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub struct NonMaxUsize(NonZeroUsize);
+
+impl NonMaxUsize {
+ /// Create a new `NonMaxUsize` from the given value.
+ ///
+ /// This returns `None` only when the given value is equal to `usize::MAX`.
+ #[inline]
+ pub fn new(value: usize) -> Option<NonMaxUsize> {
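+ // A sketch of the encoding: we store `value + 1` in a `NonZeroUsize`.
+ // `usize::MAX` wraps to zero and is rejected by `NonZeroUsize::new`,
+ // while every other value maps to a distinct nonzero representation
+ // that `get` undoes below by subtracting one.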
+ NonZeroUsize::new(value.wrapping_add(1)).map(NonMaxUsize)
+ }
+
+ /// Return the underlying `usize` value. The returned value is guaranteed
+ /// to not equal `usize::MAX`.
+ #[inline]
+ pub fn get(self) -> usize {
+ self.0.get().wrapping_sub(1)
+ }
+}
+
+// We provide our own Debug impl because seeing the internal repr can be quite
+// surprising if you aren't expecting it, e.g., 'NonMaxUsize(5)' vs just '5'.
+impl core::fmt::Debug for NonMaxUsize {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "{:?}", self.get())
+ }
+}
+
+/// A type that represents a "small" index.
+///
+/// The main idea of this type is to provide something that can index memory,
+/// but uses less memory than `usize` on 64-bit systems. Specifically, its
+/// representation is always a `u32` and has `repr(transparent)` enabled. (So
+/// it is safe to transmute between a `u32` and a `SmallIndex`.)
+///
+/// A small index is typically useful in cases where there is no practical way
+/// that the index will overflow a 32-bit integer. A good example of this is
+/// an NFA state. If you could somehow build an NFA with `2^30` states, its
+/// memory usage would be exorbitant and its runtime execution would be so
+/// slow as to be completely worthless. Therefore, this crate generally deems
+/// it acceptable to return an error if it would otherwise build an NFA that
+/// requires a slice longer than what a 32-bit integer can index. In exchange,
+/// we can use 32-bit indices instead of 64-bit indices in various places.
+///
+/// This type ensures this by providing a constructor that will return an error
+/// if its argument cannot fit into the type. This makes it much easier to
+/// handle these sorts of boundary cases that are otherwise extremely subtle.
+///
+/// On all targets, this type guarantees that its value will fit in a `u32`,
+/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for
+/// example, this type's maximum value will never overflow an `isize`,
+/// which means it will never overflow an `i16` even though its internal
+/// representation is still a `u32`.
+///
+/// The purpose of making the type fit into even signed integer types like
+/// `isize` is to guarantee that the difference between any two small indices
+/// is itself also a small index. This is useful in certain contexts, e.g.,
+/// for delta encoding.
+///
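+/// For instance, a sketch of that delta property (purely illustrative):
+///
+/// ```
+/// use regex_automata::util::primitives::SmallIndex;
+///
+/// let a = SmallIndex::must(7);
+/// let b = SmallIndex::must(3);
+/// // Every small index fits in an i32, so this subtraction can't overflow.
+/// assert_eq!(4, a.as_i32() - b.as_i32());
+/// ```
+///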
+/// # Other types
+///
+/// The following types wrap `SmallIndex` to provide a more focused use case:
+///
+/// * [`PatternID`] is for representing the identifiers of patterns.
+/// * [`StateID`] is for representing the identifiers of states in finite
+/// automata. It is used for both NFAs and DFAs.
+///
+/// # Representation
+///
+/// This type is always represented internally by a `u32` and is marked as
+/// `repr(transparent)`. Thus, this type always has the same representation as
+/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`.
+///
+/// # Indexing
+///
+/// For convenience, callers may use a `SmallIndex` to index slices.
+///
+/// # Safety
+///
+/// While a `SmallIndex` is meant to guarantee that its value fits into `usize`
+/// without using as much space as a `usize` on all targets, callers must
+/// not rely on this property for safety. Callers may choose to rely on this
+/// property for correctness however. For example, creating a `SmallIndex` with
+/// an invalid value can be done in entirely safe code. This may in turn result
+/// in panics or silent logical errors.
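+///
+/// # Example
+///
+/// A brief sketch of fallible construction and slice indexing:
+///
+/// ```
+/// use regex_automata::util::primitives::SmallIndex;
+///
+/// let idx = SmallIndex::new(5).unwrap();
+/// let slice: &[u8] = &[10, 20, 30, 40, 50, 60];
+/// // A SmallIndex can index slices directly.
+/// assert_eq!(60, slice[idx]);
+/// // Out-of-range values are rejected with an error instead of a panic.
+/// assert!(SmallIndex::new(usize::MAX).is_err());
+/// ```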
+#[derive(
+ Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+#[repr(transparent)]
+pub struct SmallIndex(u32);
+
+impl SmallIndex {
+ /// The maximum index value.
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+ pub const MAX: SmallIndex =
+ // FIXME: Use as_usize() once const functions in traits are stable.
+ SmallIndex::new_unchecked(core::i32::MAX as usize - 1);
+
+ /// The maximum index value.
+ #[cfg(target_pointer_width = "16")]
+ pub const MAX: SmallIndex =
+ SmallIndex::new_unchecked(core::isize::MAX as usize - 1);
+
+ /// The total number of values that can be represented as a small index.
+ pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1;
+
+ /// The zero index value.
+ pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0);
+
+ /// The number of bytes that a single small index uses in memory.
+ pub const SIZE: usize = core::mem::size_of::<SmallIndex>();
+
+ /// Create a new small index.
+ ///
+ /// If the given index exceeds [`SmallIndex::MAX`], then this returns
+ /// an error.
+ #[inline]
+ pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> {
+ SmallIndex::try_from(index)
+ }
+
+ /// Create a new small index without checking whether the given value
+ /// exceeds [`SmallIndex::MAX`].
+ ///
+ /// Using this routine with an invalid index value will result in
+ /// unspecified behavior, but *not* undefined behavior. In particular, an
+ /// invalid index value is likely to cause panics or possibly even silent
+ /// logical errors.
+ ///
+ /// Callers must never rely on a `SmallIndex` to be within a certain range
+ /// for memory safety.
+ #[inline]
+ pub const fn new_unchecked(index: usize) -> SmallIndex {
+ // FIXME: Use as_u32() once const functions in traits are stable.
+ SmallIndex(index as u32)
+ }
+
+ /// Like [`SmallIndex::new`], but panics if the given index is not valid.
+ #[inline]
+ pub fn must(index: usize) -> SmallIndex {
+ SmallIndex::new(index).expect("invalid small index")
+ }
+
+ /// Return this small index as a `usize`. This is guaranteed to never
+ /// overflow `usize`.
+ #[inline]
+ pub const fn as_usize(&self) -> usize {
+ // FIXME: Use as_usize() once const functions in traits are stable.
+ self.0 as usize
+ }
+
+ /// Return this small index as a `u64`. This is guaranteed to never
+ /// overflow.
+ #[inline]
+ pub const fn as_u64(&self) -> u64 {
+ // FIXME: Use u64::from() once const functions in traits are stable.
+ self.0 as u64
+ }
+
+ /// Return the internal `u32` of this small index. This is guaranteed to
+ /// never overflow `u32`.
+ #[inline]
+ pub const fn as_u32(&self) -> u32 {
+ self.0
+ }
+
+ /// Return the internal `u32` of this small index represented as an `i32`.
+ /// This is guaranteed to never overflow an `i32`.
+ #[inline]
+ pub const fn as_i32(&self) -> i32 {
+ // This is OK because we guarantee that our max value is <= i32::MAX.
+ self.0 as i32
+ }
+
+ /// Returns one more than this small index as a usize.
+ ///
+ /// Since a small index has constraints on its maximum value, adding `1` to
+ /// it will always fit in a `usize`, `u32` and a `i32`.
+ #[inline]
+ pub fn one_more(&self) -> usize {
+ self.as_usize() + 1
+ }
+
+ /// Decode this small index from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// If the decoded integer is not representable as a small index for the
+ /// current target, then this returns an error.
+ #[inline]
+ pub fn from_ne_bytes(
+ bytes: [u8; 4],
+ ) -> Result<SmallIndex, SmallIndexError> {
+ let id = u32::from_ne_bytes(bytes);
+ if id > SmallIndex::MAX.as_u32() {
+ return Err(SmallIndexError { attempted: u64::from(id) });
+ }
+ Ok(SmallIndex::new_unchecked(id.as_usize()))
+ }
+
+ /// Decode this small index from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// This is analogous to [`SmallIndex::new_unchecked`] in that it does not
+ /// check whether the decoded integer is representable as a small index.
+ #[inline]
+ pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex {
+ SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize())
+ }
+
+ /// Return the underlying small index integer as raw bytes in native endian
+ /// format.
+ #[inline]
+ pub fn to_ne_bytes(&self) -> [u8; 4] {
+ self.0.to_ne_bytes()
+ }
+}
+
+impl<T> core::ops::Index<SmallIndex> for [T] {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: SmallIndex) -> &T {
+ &self[index.as_usize()]
+ }
+}
+
+impl<T> core::ops::IndexMut<SmallIndex> for [T] {
+ #[inline]
+ fn index_mut(&mut self, index: SmallIndex) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<T> core::ops::Index<SmallIndex> for Vec<T> {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: SmallIndex) -> &T {
+ &self[index.as_usize()]
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> {
+ #[inline]
+ fn index_mut(&mut self, index: SmallIndex) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+}
+
+impl From<u8> for SmallIndex {
+ fn from(index: u8) -> SmallIndex {
+ SmallIndex::new_unchecked(usize::from(index))
+ }
+}
+
+impl TryFrom<u16> for SmallIndex {
+ type Error = SmallIndexError;
+
+ fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> {
+ if u32::from(index) > SmallIndex::MAX.as_u32() {
+ return Err(SmallIndexError { attempted: u64::from(index) });
+ }
+ Ok(SmallIndex::new_unchecked(index.as_usize()))
+ }
+}
+
+impl TryFrom<u32> for SmallIndex {
+ type Error = SmallIndexError;
+
+ fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> {
+ if index > SmallIndex::MAX.as_u32() {
+ return Err(SmallIndexError { attempted: u64::from(index) });
+ }
+ Ok(SmallIndex::new_unchecked(index.as_usize()))
+ }
+}
+
+impl TryFrom<u64> for SmallIndex {
+ type Error = SmallIndexError;
+
+ fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> {
+ if index > SmallIndex::MAX.as_u64() {
+ return Err(SmallIndexError { attempted: index });
+ }
+ Ok(SmallIndex::new_unchecked(index.as_usize()))
+ }
+}
+
+impl TryFrom<usize> for SmallIndex {
+ type Error = SmallIndexError;
+
+ fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> {
+ if index > SmallIndex::MAX.as_usize() {
+ return Err(SmallIndexError { attempted: index.as_u64() });
+ }
+ Ok(SmallIndex::new_unchecked(index))
+ }
+}
+
+#[cfg(test)]
+impl quickcheck::Arbitrary for SmallIndex {
+ fn arbitrary(gen: &mut quickcheck::Gen) -> SmallIndex {
+ use core::cmp::max;
+
+ let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs();
+ if id > SmallIndex::MAX.as_i32() {
+ SmallIndex::MAX
+ } else {
+ SmallIndex::new(usize::try_from(id).unwrap()).unwrap()
+ }
+ }
+}
+
+/// This error occurs when a small index could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum small index value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct SmallIndexError {
+ attempted: u64,
+}
+
+impl SmallIndexError {
+ /// Returns the value that could not be converted to a small index.
+ pub fn attempted(&self) -> u64 {
+ self.attempted
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for SmallIndexError {}
+
+impl core::fmt::Display for SmallIndexError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to create small index from {:?}, which exceeds {:?}",
+ self.attempted(),
+ SmallIndex::MAX,
+ )
+ }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct SmallIndexIter {
+ rng: core::ops::Range<usize>,
+}
+
+impl Iterator for SmallIndexIter {
+ type Item = SmallIndex;
+
+ fn next(&mut self) -> Option<SmallIndex> {
+ if self.rng.start >= self.rng.end {
+ return None;
+ }
+ let next_id = self.rng.start + 1;
+ let id = core::mem::replace(&mut self.rng.start, next_id);
+ // new_unchecked is OK since we asserted that the number of
+ // elements in this iterator will fit in an ID at construction.
+ Some(SmallIndex::new_unchecked(id))
+ }
+}
+
+macro_rules! index_type_impls {
+ ($name:ident, $err:ident, $iter:ident, $withiter:ident) => {
+ impl $name {
+ /// The maximum value.
+ pub const MAX: $name = $name(SmallIndex::MAX);
+
+ /// The total number of values that can be represented.
+ pub const LIMIT: usize = SmallIndex::LIMIT;
+
+ /// The zero value.
+ pub const ZERO: $name = $name(SmallIndex::ZERO);
+
+ /// The number of bytes that a single value uses in memory.
+ pub const SIZE: usize = SmallIndex::SIZE;
+
+ /// Create a new value that is represented by a "small index."
+ ///
+ /// If the given index exceeds the maximum allowed value, then this
+ /// returns an error.
+ #[inline]
+ pub fn new(value: usize) -> Result<$name, $err> {
+ SmallIndex::new(value).map($name).map_err($err)
+ }
+
+ /// Create a new value without checking whether the given argument
+ /// exceeds the maximum.
+ ///
+ /// Using this routine with an invalid value will result in
+ /// unspecified behavior, but *not* undefined behavior. In
+ /// particular, an invalid ID value is likely to cause panics or
+ /// possibly even silent logical errors.
+ ///
+ /// Callers must never rely on this type to be within a certain
+ /// range for memory safety.
+ #[inline]
+ pub const fn new_unchecked(value: usize) -> $name {
+ $name(SmallIndex::new_unchecked(value))
+ }
+
+ /// Like `new`, but panics if the given value is not valid.
+ #[inline]
+ pub fn must(value: usize) -> $name {
+ $name::new(value).expect(concat!(
+ "invalid ",
+ stringify!($name),
+ " value"
+ ))
+ }
+
+ /// Return the internal value as a `usize`. This is guaranteed to
+ /// never overflow `usize`.
+ #[inline]
+ pub const fn as_usize(&self) -> usize {
+ self.0.as_usize()
+ }
+
+ /// Return the internal value as a `u64`. This is guaranteed to
+ /// never overflow.
+ #[inline]
+ pub const fn as_u64(&self) -> u64 {
+ self.0.as_u64()
+ }
+
+ /// Return the internal value as a `u32`. This is guaranteed to
+ /// never overflow `u32`.
+ #[inline]
+ pub const fn as_u32(&self) -> u32 {
+ self.0.as_u32()
+ }
+
+ /// Return the internal value as an `i32`. This is guaranteed to
+ /// never overflow an `i32`.
+ #[inline]
+ pub const fn as_i32(&self) -> i32 {
+ self.0.as_i32()
+ }
+
+ /// Returns one more than this value as a usize.
+ ///
+ /// Since values represented by a "small index" have constraints
+ /// on their maximum value, adding `1` to it will always fit in a
+ /// `usize`, `u32` and a `i32`.
+ #[inline]
+ pub fn one_more(&self) -> usize {
+ self.0.one_more()
+ }
+
+ /// Decode this value from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// If the decoded integer is not representable as a small index
+ /// for the current target, then this returns an error.
+ #[inline]
+ pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> {
+ SmallIndex::from_ne_bytes(bytes).map($name).map_err($err)
+ }
+
+ /// Decode this value from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// This is analogous to `new_unchecked` in that it does not check
+ /// whether the decoded integer is representable as a small index.
+ #[inline]
+ pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name {
+ $name(SmallIndex::from_ne_bytes_unchecked(bytes))
+ }
+
+ /// Return the underlying integer as raw bytes in native endian
+ /// format.
+ #[inline]
+ pub fn to_ne_bytes(&self) -> [u8; 4] {
+ self.0.to_ne_bytes()
+ }
+
+ /// Returns an iterator over all values from 0 up to and not
+ /// including the given length.
+ ///
+ /// If the given length exceeds this type's limit, then this
+ /// panics.
+ pub(crate) fn iter(len: usize) -> $iter {
+ $iter::new(len)
+ }
+ }
+
+ // We write our own Debug impl so that we get things like PatternID(5)
+ // instead of PatternID(SmallIndex(5)).
+ impl core::fmt::Debug for $name {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish()
+ }
+ }
+
+ impl<T> core::ops::Index<$name> for [T] {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: $name) -> &T {
+ &self[index.as_usize()]
+ }
+ }
+
+ impl<T> core::ops::IndexMut<$name> for [T] {
+ #[inline]
+ fn index_mut(&mut self, index: $name) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ impl<T> core::ops::Index<$name> for Vec<T> {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: $name) -> &T {
+ &self[index.as_usize()]
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ impl<T> core::ops::IndexMut<$name> for Vec<T> {
+ #[inline]
+ fn index_mut(&mut self, index: $name) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+ }
+
+ impl From<u8> for $name {
+ fn from(value: u8) -> $name {
+ $name(SmallIndex::from(value))
+ }
+ }
+
+ impl TryFrom<u16> for $name {
+ type Error = $err;
+
+ fn try_from(value: u16) -> Result<$name, $err> {
+ SmallIndex::try_from(value).map($name).map_err($err)
+ }
+ }
+
+ impl TryFrom<u32> for $name {
+ type Error = $err;
+
+ fn try_from(value: u32) -> Result<$name, $err> {
+ SmallIndex::try_from(value).map($name).map_err($err)
+ }
+ }
+
+ impl TryFrom<u64> for $name {
+ type Error = $err;
+
+ fn try_from(value: u64) -> Result<$name, $err> {
+ SmallIndex::try_from(value).map($name).map_err($err)
+ }
+ }
+
+ impl TryFrom<usize> for $name {
+ type Error = $err;
+
+ fn try_from(value: usize) -> Result<$name, $err> {
+ SmallIndex::try_from(value).map($name).map_err($err)
+ }
+ }
+
+ #[cfg(test)]
+ impl quickcheck::Arbitrary for $name {
+ fn arbitrary(gen: &mut quickcheck::Gen) -> $name {
+ $name(SmallIndex::arbitrary(gen))
+ }
+ }
+
+ /// This error occurs when a value could not be constructed.
+ ///
+ /// This occurs when given an integer exceeding the maximum allowed
+ /// value.
+ ///
+ /// When the `std` feature is enabled, this implements the `Error`
+ /// trait.
+ #[derive(Clone, Debug, Eq, PartialEq)]
+ pub struct $err(SmallIndexError);
+
+ impl $err {
+ /// Returns the value that could not be converted to an ID.
+ pub fn attempted(&self) -> u64 {
+ self.0.attempted()
+ }
+ }
+
+ #[cfg(feature = "std")]
+ impl std::error::Error for $err {}
+
+ impl core::fmt::Display for $err {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to create {} from {:?}, which exceeds {:?}",
+ stringify!($name),
+ self.attempted(),
+ $name::MAX,
+ )
+ }
+ }
+
+ #[derive(Clone, Debug)]
+ pub(crate) struct $iter(SmallIndexIter);
+
+ impl $iter {
+ fn new(len: usize) -> $iter {
+ assert!(
+ len <= $name::LIMIT,
+ "cannot create iterator for {} when number of \
+ elements exceed {:?}",
+ stringify!($name),
+ $name::LIMIT,
+ );
+ $iter(SmallIndexIter { rng: 0..len })
+ }
+ }
+
+ impl Iterator for $iter {
+ type Item = $name;
+
+ fn next(&mut self) -> Option<$name> {
+ self.0.next().map($name)
+ }
+ }
+
+ /// An iterator adapter that is like std::iter::Enumerate, but attaches
+ /// small index values instead. It requires `ExactSizeIterator`. At
+ /// construction, it ensures that the index of each element in the
+ /// iterator is representable in the corresponding small index type.
+ #[derive(Clone, Debug)]
+ pub(crate) struct $withiter<I> {
+ it: I,
+ ids: $iter,
+ }
+
+ impl<I: Iterator + ExactSizeIterator> $withiter<I> {
+ fn new(it: I) -> $withiter<I> {
+ let ids = $name::iter(it.len());
+ $withiter { it, ids }
+ }
+ }
+
+ impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> {
+ type Item = ($name, I::Item);
+
+ fn next(&mut self) -> Option<($name, I::Item)> {
+ let item = self.it.next()?;
+ // Number of elements in this iterator must match, according
+ // to the contract of ExactSizeIterator.
+ let id = self.ids.next().unwrap();
+ Some((id, item))
+ }
+ }
+ };
+}
+
+/// The identifier of a regex pattern, represented by a [`SmallIndex`].
+///
+/// The identifier for a pattern corresponds to its relative position among
+/// other patterns in a single finite state machine. Namely, when building
+/// a multi-pattern regex engine, one must supply a sequence of patterns to
+/// match. The position (starting at 0) of each pattern in that sequence
+/// represents its identifier. This identifier is in turn used to identify and
+/// report matches of that pattern in various APIs.
+///
+/// See the [`SmallIndex`] type for more information about what it means for
+/// a pattern ID to be a "small index."
+///
+/// Note that this type is defined in the
+/// [`util::primitives`](crate::util::primitives) module, but it is also
+/// re-exported at the crate root due to how common it is.
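+///
+/// # Example
+///
+/// A small sketch using the crate root re-export mentioned above:
+///
+/// ```
+/// use regex_automata::PatternID;
+///
+/// // The third pattern in a sequence has identifier 2.
+/// let pid = PatternID::must(2);
+/// assert_eq!(2, pid.as_usize());
+/// // Construction is fallible when the value is too big.
+/// assert!(PatternID::new(usize::MAX).is_err());
+/// ```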
+#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub struct PatternID(SmallIndex);
+
+/// The identifier of a finite automaton state, represented by a
+/// [`SmallIndex`].
+///
+/// Most regex engines in this crate are built on top of finite automata. Each
+/// state in a finite automaton defines transitions from its state to another.
+/// Those transitions point to other states via their identifiers, i.e., a
+/// `StateID`. Since finite automata tend to contain many transitions, it is
+/// much more memory efficient to define state IDs as small indices.
+///
+/// See the [`SmallIndex`] type for more information about what it means for
+/// a state ID to be a "small index."
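+///
+/// # Example
+///
+/// A short sketch of the native-endian byte round-trip these IDs support:
+///
+/// ```
+/// use regex_automata::util::primitives::StateID;
+///
+/// let sid = StateID::must(42);
+/// // State IDs serialize to and deserialize from 4 native-endian bytes.
+/// assert_eq!(sid, StateID::from_ne_bytes(sid.to_ne_bytes()).unwrap());
+/// ```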
+#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub struct StateID(SmallIndex);
+
+index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter);
+index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter);
+
+/// A utility trait that defines a couple of adapters for making it convenient
+/// to access indices as "small index" types. We require ExactSizeIterator so
+/// that iterator construction can do a single check to make sure the index of
+/// each element is representable by its small index type.
+pub(crate) trait IteratorIndexExt: Iterator {
+ fn with_pattern_ids(self) -> WithPatternIDIter<Self>
+ where
+ Self: Sized + ExactSizeIterator,
+ {
+ WithPatternIDIter::new(self)
+ }
+
+ fn with_state_ids(self) -> WithStateIDIter<Self>
+ where
+ Self: Sized + ExactSizeIterator,
+ {
+ WithStateIDIter::new(self)
+ }
+}
+
+impl<I: Iterator> IteratorIndexExt for I {}
diff --git a/vendor/regex-automata/src/util/search.rs b/vendor/regex-automata/src/util/search.rs
new file mode 100644
index 000000000..39aec522b
--- /dev/null
+++ b/vendor/regex-automata/src/util/search.rs
@@ -0,0 +1,1969 @@
+/*!
+Types and routines that support the search APIs of most regex engines.
+
+This sub-module isn't exposed directly, but rather, its contents are exported
+at the crate root due to the universality of most of the types and routines in
+this module.
+*/
+
+use core::ops::{Range, RangeBounds};
+
+use crate::util::{escape::DebugByte, primitives::PatternID, utf8};
+
+/// The parameters for a regex search including the haystack to search.
+///
+/// It turns out that regex searches have a few parameters, and in most cases,
+/// those parameters have defaults that work in the vast majority of cases.
+/// This `Input` type exists to make that common case seamless while also
+/// providing an avenue for changing the parameters of a search. In particular,
+/// this type enables doing so without a combinatorial explosion of different
+/// methods and/or superfluous parameters in the common cases.
+///
+/// An `Input` permits configuring the following things:
+///
+/// * Search only a substring of a haystack, while taking the broader context
+/// into account for resolving look-around assertions.
+/// * Whether to search for all patterns in a regex, or to search for only
+/// one pattern in particular.
+/// * Whether to perform an anchored or unanchored search.
+/// * Whether to report a match as early as possible.
+///
+/// All of these parameters, except for the haystack, have sensible default
+/// values. This means that the minimal search configuration is simply a call
+/// to [`Input::new`] with your haystack. Setting any other parameter is
+/// optional.
+///
+/// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a
+/// `From<H> for Input` implementation. This is useful because many of the
+/// search APIs in this crate accept an `Into<Input>`. This means you can
+/// provide string or byte strings to these routines directly, and they'll
+/// automatically get converted into an `Input` for you.
+///
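+/// For instance, a sketch of that conversion:
+///
+/// ```
+/// use regex_automata::Input;
+///
+/// // Anything that implements AsRef<[u8]>, like &str, converts directly.
+/// let input = Input::from("foobar");
+/// assert_eq!(b"foobar", input.haystack());
+/// ```
+///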
+/// The lifetime parameter `'h` refers to the lifetime of the haystack.
+///
+/// # Organization
+///
+/// The API of `Input` is split into a few different parts:
+///
+/// * A builder-like API that transforms an `Input` by value. Examples:
+/// [`Input::span`] and [`Input::anchored`].
+/// * A setter API that permits mutating parameters in place. Examples:
+/// [`Input::set_span`] and [`Input::set_anchored`].
+/// * A getter API that permits retrieving any of the search parameters.
+/// Examples: [`Input::get_span`] and [`Input::get_anchored`].
+/// * A few convenience getter routines that don't conform to the above naming
+/// pattern due to how common they are. Examples: [`Input::haystack`],
+/// [`Input::start`] and [`Input::end`].
+/// * Miscellaneous predicates and other helper routines that are useful
+/// in some contexts. Examples: [`Input::is_char_boundary`].
+///
+/// An `Input` exposes so much because it is meant to be used by both callers
+/// of regex engines _and_ implementors of regex engines. A constraining
+/// factor is that regex engines should accept a `&Input` as their lowest
+/// level API, which means that implementors should only use the "getter"
+/// APIs of an `Input`.
+///
+/// # Valid bounds and search termination
+///
+/// An `Input` permits setting the bounds of a search via either
+/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or
+/// else a panic will occur. Bounds are valid if and only if:
+///
+/// * The bounds represent a valid range into the input's haystack.
+/// * **or** the end bound is a valid ending bound for the haystack *and*
+/// the start bound is exactly one greater than the end bound.
+///
+/// In the latter case, [`Input::is_done`] will return true, which indicates
+/// that any search receiving such an input should immediately return with no
+/// match.
+///
+/// Note that while `Input` is used for reverse searches in this crate, the
+/// `Input::is_done` predicate assumes a forward search. Because unsigned
+/// offsets are used internally, there is no way to tell from only the offsets
+/// whether a reverse search is done or not.
+///
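+/// For example, a sketch of a terminated search configuration (the bounds
+/// are valid because the start is exactly one greater than the end):
+///
+/// ```
+/// use regex_automata::Input;
+///
+/// let input = Input::new("abc").span(3..2);
+/// assert!(input.is_done());
+/// ```
+///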
+/// # Regex engine support
+///
+/// Any regex engine accepting an `Input` must support at least the following
+/// things:
+///
+/// * Searching a `&[u8]` for matches.
+/// * Searching a substring of `&[u8]` for a match, such that any match
+/// reported must appear entirely within that substring.
+/// * For a forwards search, a match should never be reported when
+/// [`Input::is_done`] returns true. (For reverse searches, termination should
+/// be handled outside of `Input`.)
+///
+/// Supporting other aspects of an `Input` are optional, but regex engines
+/// should handle aspects they don't support gracefully. How this is done is
+/// generally up to the regex engine. This crate generally treats unsupported
+/// anchored modes as an error to report, for example, but for simplicity, in
+/// the meta regex engine, trying to search with an invalid pattern ID just
+/// results in no match being reported.
+#[derive(Clone)]
+pub struct Input<'h> {
+ haystack: &'h [u8],
+ span: Span,
+ anchored: Anchored,
+ earliest: bool,
+}
+
+impl<'h> Input<'h> {
+ /// Create a new search configuration for the given haystack.
+ #[inline]
+ pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> {
+ Input {
+ haystack: haystack.as_ref(),
+ span: Span { start: 0, end: haystack.as_ref().len() },
+ anchored: Anchored::No,
+ earliest: false,
+ }
+ }
+
+ /// Set the span for this search.
+ ///
+ /// The span given is validated eagerly: if it does not correspond to
+ /// valid bounds for this search's haystack (or to the termination of a
+ /// search), then this routine panics rather than deferring the failure to
+ /// search execution.
+ ///
+ /// This routine is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`. To provide anything supported by range
+ /// syntax, use the [`Input::range`] method.
+ ///
+ /// The default span is the entire haystack.
+ ///
+ /// Note that [`Input::range`] overrides this method and vice versa.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given span does not correspond to valid bounds in
+ /// the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// This example shows how the span of the search can impact whether a
+ /// match is reported or not. This is particularly relevant for look-around
+ /// operators, which might take things outside of the span into account
+ /// when determining whether they match.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// Match, Input,
+ /// };
+ ///
+ /// // Look for 'at', but as a distinct word.
+ /// let re = PikeVM::new(r"\bat\b")?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ ///
+ /// // Our haystack contains 'at', but not as a distinct word.
+ /// let haystack = "batter";
+ ///
+ /// // A standard search finds nothing, as expected.
+ /// let input = Input::new(haystack);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// // But if we wanted to search starting at position '1', we might
+ /// // slice the haystack. If we do this, it's impossible for the \b
+ /// // anchors to take the surrounding context into account! And thus,
+ /// // a match is produced.
+ /// let input = Input::new(&haystack[1..3]);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match());
+ ///
+ /// // But if we specify the span of the search instead of slicing the
+ /// // haystack, then the regex engine can "see" outside of the span
+ /// // and resolve the anchors correctly.
+ /// let input = Input::new(haystack).span(1..3);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// This may seem a little ham-fisted, but this scenario tends to come up
+ /// if some other regex engine found the match span and now you need to
+ /// re-process that span to look for capturing groups. (e.g., Run a faster
+ /// DFA first, find a match, then run the PikeVM on just the match span to
+ /// resolve capturing groups.) In order to implement that sort of logic
+ /// correctly, you need to set the span on the search instead of slicing
+ /// the haystack directly.
+ ///
+ /// The other advantage of using this routine to specify the bounds of the
+ /// search is that the match offsets are still reported in terms of the
+ /// original haystack. For example, the second search in the example above
+ /// reported a match at position `0`, even though `at` starts at offset
+ /// `1` because we sliced the haystack.
+ #[inline]
+ pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> {
+ self.set_span(span);
+ self
+ }
+
+ /// Like `Input::span`, but accepts any range instead.
+ ///
+ /// The range given is validated eagerly: if it does not correspond to
+ /// valid bounds for this search's haystack (or to the termination of a
+ /// search), then this routine panics rather than deferring the failure to
+ /// search execution.
+ ///
+ /// The default range is the entire haystack.
+ ///
+ /// Note that [`Input::span`] overrides this method and vice versa.
+ ///
+ /// # Panics
+ ///
+ /// This routine will panic if the given range could not be converted
+ /// to a valid [`Range`]. For example, this would panic when given
+ /// `0..=usize::MAX` since it cannot be represented using a half-open
+ /// interval in terms of `usize`.
+ ///
+ /// This also panics if the given range does not correspond to valid bounds
+ /// in the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ ///
+ /// let input = Input::new("foobar").range(2..=4);
+ /// assert_eq!(2..5, input.get_range());
+ /// ```
+ #[inline]
+ pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> {
+ self.set_range(range);
+ self
+ }
+
+ /// Sets the anchor mode of a search.
+ ///
+ /// When a search is anchored (so that's [`Anchored::Yes`] or
+ /// [`Anchored::Pattern`]), a match must begin at the start of a search.
+ /// When a search is not anchored (that's [`Anchored::No`]), regex engines
+ /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix
+ /// permits a match to appear anywhere.
+ ///
+ /// By default, the anchored mode is [`Anchored::No`].
+ ///
+ /// **WARNING:** this is subtly different than using a `^` at the start of
+ /// your regex. A `^` forces a regex to match exclusively at the start of
+ /// a haystack, regardless of where you begin your search. In contrast,
+ /// anchoring a search will allow your regex to match anywhere in your
+ /// haystack, but the match must start at the beginning of a search.
+ ///
+ /// For example, consider the haystack `aba` and the following searches:
+ ///
+ /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba`
+ /// starting at position `2`. Since `^` requires the match to start at
+ /// the beginning of the haystack and `2 > 0`, no match is found.
+ /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
+ /// starting at position `2`. This reports a match at `[2, 3]` since
+ /// the match starts where the search started. Since there is no `^`,
+ /// there is no requirement for the match to start at the beginning of
+ /// the haystack.
+ /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
+ /// starting at position `1`. Since `b` corresponds to position `1` and
+ /// since the search is anchored, it finds no match. While the regex
+ /// matches at other positions, configuring the search to be anchored
+ /// requires that it only report a match that begins at the same offset
+ /// as the beginning of the search.
+ /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba`
+ /// starting at position `1`. Since the search is not anchored and
+ /// the regex does not start with `^`, the search executes as if there
+ /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it
+ /// reports a match at `[2, 3]`.
+ ///
+ /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`,
+ /// except it only reports matches for a particular pattern.
+ ///
+ /// # Example
+ ///
+ /// This demonstrates the differences between an anchored search and
+ /// a pattern that begins with `^` (as described in the above warning
+ /// message).
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// Anchored, Match, Input,
+ /// };
+ ///
+ /// let haystack = "aba";
+ ///
+ /// let re = PikeVM::new(r"^a")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// // No match is found because 2 is not the beginning of the haystack,
+ /// // which is what ^ requires.
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// let re = PikeVM::new(r"a")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// // An anchored search can still match anywhere in the haystack; it just
+ /// // must begin at the start of the search, which is '2' in this case.
+ /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match());
+ ///
+ /// let re = PikeVM::new(r"a")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// // No match is found since we start searching at offset 1 which
+ /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
+ /// // is found.
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// let re = PikeVM::new(r"a")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the
+ /// // pattern. Even though the search starts at 'b', the 'match anything'
+ /// // prefix allows the search to match 'a'.
+ /// let expected = Some(Match::must(0, 2..3));
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn anchored(mut self, mode: Anchored) -> Input<'h> {
+ self.set_anchored(mode);
+ self
+ }
+
+ /// Whether to execute an "earliest" search or not.
+ ///
+ /// When running a non-overlapping search, an "earliest" search will return
+ /// the match location as early as possible. For example, given a pattern
+ /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search
+ /// will return `foo12345` as a match. But an "earliest" search for regex
+ /// engines that support "earliest" semantics will return `foo1` as a
+ /// match, since as soon as the first digit following `foo` is seen, the
+ /// regex engine knows that a match has been found.
+ ///
+ /// Note that "earliest" semantics generally depend on the regex engine.
+ /// Different regex engines may determine there is a match at different
+ /// points. So there is no guarantee that "earliest" matches will always
+ /// return the same offsets for all regex engines. The "earliest" notion
+ /// is really about when the particular regex engine determines there is
+ /// a match rather than a consistent semantic unto itself. This is often
+ /// useful for implementing "did a match occur or not" predicates, but
+ /// sometimes the offset is useful as well.
+ ///
+ /// This is disabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows the difference between "earliest" searching and
+ /// normal searching.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
+ ///
+ /// let re = PikeVM::new(r"foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ ///
+ /// // A normal search implements greediness like you expect.
+ /// let input = Input::new("foo12345");
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match());
+ ///
+ /// // When 'earliest' is enabled and the regex engine supports
+ /// // it, the search will bail once it knows a match has been
+ /// // found.
+ /// let input = Input::new("foo12345").earliest(true);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn earliest(mut self, yes: bool) -> Input<'h> {
+ self.set_earliest(yes);
+ self
+ }
+
+ /// Set the span for this search configuration.
+ ///
+ /// This is like the [`Input::span`] method, except this mutates the
+ /// span in place.
+ ///
+ /// This routine is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given span does not correspond to valid bounds in
+ /// the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_span(2..4);
+ /// assert_eq!(2..4, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_span<S: Into<Span>>(&mut self, span: S) {
+ let span = span.into();
+ assert!(
+ span.end <= self.haystack.len()
+ && span.start <= span.end.wrapping_add(1),
+ "invalid span {:?} for haystack of length {}",
+ span,
+ self.haystack.len(),
+ );
+ self.span = span;
+ }
+
+ /// Set the span for this search configuration given any range.
+ ///
+ /// This is like the [`Input::range`] method, except this mutates the
+ /// span in place.
+ ///
+ /// The range given is validated eagerly: if it does not correspond to
+ /// valid bounds for this search's haystack (or to the termination of a
+ /// search), then this routine panics rather than deferring the failure to
+ /// search execution.
+ ///
+ /// # Panics
+ ///
+ /// This routine will panic if the given range could not be converted
+ /// to a valid [`Range`]. For example, this would panic when given
+ /// `0..=usize::MAX` since it cannot be represented using a half-open
+ /// interval in terms of `usize`.
+ ///
+ /// This also panics if the given span does not correspond to valid bounds
+ /// in the haystack or the termination of a search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_range(2..=4);
+ /// assert_eq!(2..5, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) {
+ use core::ops::Bound;
+
+ // It's a little weird to convert ranges into spans, and then spans
+ // back into ranges when we actually slice the haystack. Because
+ // of that process, we always represent everything as a half-open
+ // interval. Therefore, handling things like m..=n is a little awkward.
+ let start = match range.start_bound() {
+ Bound::Included(&i) => i,
+ // Can this case ever happen? Range syntax doesn't support it...
+ Bound::Excluded(&i) => i.checked_add(1).unwrap(),
+ Bound::Unbounded => 0,
+ };
+ let end = match range.end_bound() {
+ Bound::Included(&i) => i.checked_add(1).unwrap(),
+ Bound::Excluded(&i) => i,
+ Bound::Unbounded => self.haystack().len(),
+ };
+ self.set_span(Span { start, end });
+ }
+
+ /// Set the starting offset for the span for this search configuration.
+ ///
+ /// This is a convenience routine for only mutating the start of a span
+ /// without having to set the entire span.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the span resulting from the new start position does not
+ /// correspond to valid bounds in the haystack or the termination of a
+ /// search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_start(5);
+ /// assert_eq!(5..6, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_start(&mut self, start: usize) {
+ self.set_span(Span { start, ..self.get_span() });
+ }
+
+ /// Set the ending offset for the span for this search configuration.
+ ///
+ /// This is a convenience routine for only mutating the end of a span
+ /// without having to set the entire span.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the span resulting from the new end position does not
+ /// correspond to valid bounds in the haystack or the termination of a
+ /// search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// input.set_end(5);
+ /// assert_eq!(0..5, input.get_range());
+ /// ```
+ #[inline]
+ pub fn set_end(&mut self, end: usize) {
+ self.set_span(Span { end, ..self.get_span() });
+ }
+
+ /// Set the anchor mode of a search.
+ ///
+ /// This is like [`Input::anchored`], except it mutates the search
+ /// configuration in place.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Anchored, Input, PatternID};
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(Anchored::No, input.get_anchored());
+ ///
+ /// let pid = PatternID::must(5);
+ /// input.set_anchored(Anchored::Pattern(pid));
+ /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
+ /// ```
+ #[inline]
+ pub fn set_anchored(&mut self, mode: Anchored) {
+ self.anchored = mode;
+ }
+
+ /// Set whether the search should execute in "earliest" mode or not.
+ ///
+ /// This is like [`Input::earliest`], except it mutates the search
+ /// configuration in place.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert!(!input.get_earliest());
+ /// input.set_earliest(true);
+ /// assert!(input.get_earliest());
+ /// ```
+ #[inline]
+ pub fn set_earliest(&mut self, yes: bool) {
+ self.earliest = yes;
+ }
+
+ /// Return a borrow of the underlying haystack as a slice of bytes.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(b"foobar", input.haystack());
+ /// ```
+ #[inline]
+ pub fn haystack(&self) -> &[u8] {
+ self.haystack
+ }
+
+ /// Return the start position of this search.
+ ///
+ /// This is a convenience routine for `search.get_span().start`.
+ ///
+ /// When [`Input::is_done`] is `false`, this is guaranteed to return
+ /// an offset that is less than or equal to [`Input::end`]. Otherwise,
+ /// the offset is one greater than [`Input::end`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(0, input.start());
+ ///
+ /// let input = Input::new("foobar").span(2..4);
+ /// assert_eq!(2, input.start());
+ /// ```
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.get_span().start
+ }
+
+ /// Return the end position of this search.
+ ///
+ /// This is a convenience routine for `search.get_span().end`.
+ ///
+ /// This is guaranteed to return an offset that is a valid exclusive end
+ /// bound for this input's haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(6, input.end());
+ ///
+ /// let input = Input::new("foobar").span(2..4);
+ /// assert_eq!(4, input.end());
+ /// ```
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.get_span().end
+ }
+
+ /// Return the span for this search configuration.
+ ///
+ /// If one was not explicitly set, then the span corresponds to the entire
+ /// range of the haystack.
+ ///
+ /// When [`Input::is_done`] is `false`, the span returned is guaranteed
+ /// to correspond to valid bounds for this input's haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Input, Span};
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(Span { start: 0, end: 6 }, input.get_span());
+ /// ```
+ #[inline]
+ pub fn get_span(&self) -> Span {
+ self.span
+ }
+
+ /// Return the span as a range for this search configuration.
+ ///
+ /// If one was not explicitly set, then the span corresponds to the entire
+ /// range of the haystack.
+ ///
+ /// When [`Input::is_done`] is `false`, the range returned is guaranteed
+ /// to correspond to valid bounds for this input's haystack.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert_eq!(0..6, input.get_range());
+ /// ```
+ #[inline]
+ pub fn get_range(&self) -> Range<usize> {
+ self.get_span().range()
+ }
+
+ /// Return the anchored mode for this search configuration.
+ ///
+ /// If no anchored mode was set, then it defaults to [`Anchored::No`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Anchored, Input, PatternID};
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert_eq!(Anchored::No, input.get_anchored());
+ ///
+ /// let pid = PatternID::must(5);
+ /// input.set_anchored(Anchored::Pattern(pid));
+ /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
+ /// ```
+ #[inline]
+ pub fn get_anchored(&self) -> Anchored {
+ self.anchored
+ }
+
+ /// Return whether this search should execute in "earliest" mode.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("foobar");
+ /// assert!(!input.get_earliest());
+ /// ```
+ #[inline]
+ pub fn get_earliest(&self) -> bool {
+ self.earliest
+ }
+
+ /// Return true if and only if this search can never return any other
+ /// matches.
+ ///
+ /// This occurs when the start position of this search is greater than the
+ /// end position of the search.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let mut input = Input::new("foobar");
+ /// assert!(!input.is_done());
+ /// input.set_start(6);
+ /// assert!(!input.is_done());
+ /// input.set_start(7);
+ /// assert!(input.is_done());
+ /// ```
+ #[inline]
+ pub fn is_done(&self) -> bool {
+ self.get_span().start > self.get_span().end
+ }
+
+ /// Returns true if and only if the given offset in this search's haystack
+ /// falls on a valid UTF-8 encoded codepoint boundary.
+ ///
+ /// If the haystack is not valid UTF-8, then the behavior of this routine
+ /// is unspecified.
+ ///
+ /// # Example
+ ///
+ /// This shows where codepoint boundaries do and don't exist in valid
+ /// UTF-8.
+ ///
+ /// ```
+ /// use regex_automata::Input;
+ ///
+ /// let input = Input::new("☃");
+ /// assert!(input.is_char_boundary(0));
+ /// assert!(!input.is_char_boundary(1));
+ /// assert!(!input.is_char_boundary(2));
+ /// assert!(input.is_char_boundary(3));
+ /// assert!(!input.is_char_boundary(4));
+ /// ```
+ #[inline]
+ pub fn is_char_boundary(&self, offset: usize) -> bool {
+ utf8::is_boundary(self.haystack(), offset)
+ }
+}
+
+impl<'h> core::fmt::Debug for Input<'h> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ use crate::util::escape::DebugHaystack;
+
+ f.debug_struct("Input")
+ .field("haystack", &DebugHaystack(self.haystack()))
+ .field("span", &self.span)
+ .field("anchored", &self.anchored)
+ .field("earliest", &self.earliest)
+ .finish()
+ }
+}
+
+impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> {
+ fn from(haystack: &'h H) -> Input<'h> {
+ Input::new(haystack)
+ }
+}
+
+/// A representation of a span reported by a regex engine.
+///
+/// A span corresponds to the starting and ending _byte offsets_ of a
+/// contiguous region of bytes. The starting offset is inclusive while the
+/// ending offset is exclusive. That is, a span is a half-open interval.
+///
+/// A span is used to report the offsets of a match, but it is also used to
+/// convey which region of a haystack should be searched via routines like
+/// [`Input::span`].
+///
+/// This is basically equivalent to a `std::ops::Range<usize>`, except this
+/// type implements `Copy` which makes it more ergonomic to use in the context
+/// of this crate. Like a range, this implements `Index` for `[u8]` and `str`,
+/// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`,
+/// which means things like `Span::from(5..10)` work.
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Span {
+ /// The start offset of the span, inclusive.
+ pub start: usize,
+ /// The end offset of the span, exclusive.
+ pub end: usize,
+}
+
+impl Span {
+ /// Returns this span as a range.
+ #[inline]
+ pub fn range(&self) -> Range<usize> {
+ Range::from(*self)
+ }
+
+ /// Returns true when this span is empty. That is, when `start >= end`.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start >= self.end
+ }
+
+ /// Returns the length of this span.
+ ///
+ /// This returns `0` in precisely the cases that `is_empty` returns `true`.
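+ ///
+ /// # Example
+ ///
+ /// A quick sketch of `len` and `is_empty` together:
+ ///
+ /// ```
+ /// use regex_automata::Span;
+ ///
+ /// assert_eq!(3, Span::from(2..5).len());
+ /// assert_eq!(0, Span::from(5..2).len());
+ /// assert!(Span::from(5..2).is_empty());
+ /// ```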
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end.saturating_sub(self.start)
+ }
+
+ /// Returns true when the given offset is contained within this span.
+ ///
+ /// Note that an empty span contains no offsets and will always return
+ /// false.
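+ ///
+ /// # Example
+ ///
+ /// A small sketch of the containment rules described above:
+ ///
+ /// ```
+ /// use regex_automata::Span;
+ ///
+ /// let span = Span::from(3..6);
+ /// assert!(span.contains(3));
+ /// assert!(span.contains(5));
+ /// // An empty span contains nothing, not even its start offset.
+ /// assert!(!Span::from(3..3).contains(3));
+ /// ```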
+ #[inline]
+ pub fn contains(&self, offset: usize) -> bool {
+ !self.is_empty() && self.start <= offset && offset <= self.end
+ }
+
+ /// Returns a new span with `offset` added to this span's `start` and `end`
+ /// values.
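+ ///
+ /// # Example
+ ///
+ /// A small sketch: translating a span found in a subslice back into
+ /// offsets for the original haystack.
+ ///
+ /// ```
+ /// use regex_automata::Span;
+ ///
+ /// let span = Span::from(2..4);
+ /// assert_eq!(Span::from(12..14), span.offset(10));
+ /// ```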
+ #[inline]
+ pub fn offset(&self, offset: usize) -> Span {
+ Span { start: self.start + offset, end: self.end + offset }
+ }
+}
+
+impl core::fmt::Debug for Span {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "{}..{}", self.start, self.end)
+ }
+}
+
+impl core::ops::Index<Span> for [u8] {
+ type Output = [u8];
+
+ #[inline]
+ fn index(&self, index: Span) -> &[u8] {
+ &self[index.range()]
+ }
+}
+
+impl core::ops::IndexMut<Span> for [u8] {
+ #[inline]
+ fn index_mut(&mut self, index: Span) -> &mut [u8] {
+ &mut self[index.range()]
+ }
+}
+
+impl core::ops::Index<Span> for str {
+ type Output = str;
+
+ #[inline]
+ fn index(&self, index: Span) -> &str {
+ &self[index.range()]
+ }
+}
+
+impl From<Range<usize>> for Span {
+ #[inline]
+ fn from(range: Range<usize>) -> Span {
+ Span { start: range.start, end: range.end }
+ }
+}
+
+impl From<Span> for Range<usize> {
+ #[inline]
+ fn from(span: Span) -> Range<usize> {
+ Range { start: span.start, end: span.end }
+ }
+}
+
+impl PartialEq<Range<usize>> for Span {
+ #[inline]
+ fn eq(&self, range: &Range<usize>) -> bool {
+ self.start == range.start && self.end == range.end
+ }
+}
+
+impl PartialEq<Span> for Range<usize> {
+ #[inline]
+ fn eq(&self, span: &Span) -> bool {
+ self.start == span.start && self.end == span.end
+ }
+}
+
+/// A representation of "half" of a match reported by a DFA.
+///
+/// This is called a "half" match because it only includes the end location (or
+/// start location for a reverse search) of a match. This corresponds to the
+/// information that a single DFA scan can report. Getting the other half of
+/// the match requires a second scan with a reversed DFA.
+///
+/// A half match also includes the pattern that matched. The pattern is
+/// identified by an ID, which corresponds to its position (starting from `0`)
+/// relative to other patterns used to construct the corresponding DFA. If only
+/// a single pattern is provided to the DFA, then all matches are guaranteed to
+/// have a pattern ID of `0`.
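+///
+/// # Example
+///
+/// A short sketch of constructing and inspecting a half match:
+///
+/// ```
+/// use regex_automata::HalfMatch;
+///
+/// let hm = HalfMatch::must(0, 5);
+/// assert_eq!(0, hm.pattern().as_usize());
+/// assert_eq!(5, hm.offset());
+/// ```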
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct HalfMatch {
+ /// The pattern ID.
+ pattern: PatternID,
+ /// The offset of the match.
+ ///
+ /// For forward searches, the offset is exclusive. For reverse searches,
+ /// the offset is inclusive.
+ offset: usize,
+}
+
+impl HalfMatch {
+ /// Create a new half match from a pattern ID and a byte offset.
+ #[inline]
+ pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
+ HalfMatch { pattern, offset }
+ }
+
+ /// Create a new half match from a pattern ID and a byte offset.
+ ///
+ /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
+ /// [`PatternID`]. This panics if the given `usize` is not representable
+ /// as a `PatternID`.
+ #[inline]
+ pub fn must(pattern: usize, offset: usize) -> HalfMatch {
+ HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
+ }
+
+ /// Returns the ID of the pattern that matched.
+ ///
+ /// The ID of a pattern is derived from the position in which it was
+ /// originally inserted into the corresponding DFA. The first pattern has
+ /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
+ #[inline]
+ pub fn pattern(&self) -> PatternID {
+ self.pattern
+ }
+
+ /// The position of the match.
+ ///
+ /// If this match was produced by a forward search, then the offset is
+ /// exclusive. If this match was produced by a reverse search, then the
+ /// offset is inclusive.
+ #[inline]
+ pub fn offset(&self) -> usize {
+ self.offset
+ }
+}
+
+/// A representation of a match reported by a regex engine.
+///
+/// A match has two essential pieces of information: the [`PatternID`] that
+/// matches, and the [`Span`] of the match in a haystack.
+///
+/// The pattern is identified by an ID, which corresponds to its position
+/// (starting from `0`) relative to other patterns used to construct the
+/// corresponding regex engine. If only a single pattern is provided, then all
+/// matches are guaranteed to have a pattern ID of `0`.
+///
+/// Every match reported by a regex engine guarantees that its span has its
+/// start offset as less than or equal to its end offset.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct Match {
+ /// The pattern ID.
+ pattern: PatternID,
+ /// The underlying match span.
+ span: Span,
+}
+
+impl Match {
+ /// Create a new match from a pattern ID and a span.
+ ///
+ /// This constructor is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to create a match for the first pattern in a regex
+ /// object using convenient range syntax.
+ ///
+ /// ```
+ /// use regex_automata::{Match, PatternID};
+ ///
+ /// let m = Match::new(PatternID::ZERO, 5..10);
+ /// assert_eq!(0, m.pattern().as_usize());
+ /// assert_eq!(5, m.start());
+ /// assert_eq!(10, m.end());
+ /// ```
+ #[inline]
+ pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match {
+ let span: Span = span.into();
+ assert!(span.start <= span.end, "invalid match span");
+ Match { pattern, span }
+ }
+
+ /// Create a new match from a pattern ID and a byte offset span.
+ ///
+ /// This constructor is generic over how a span is provided. While
+ /// a [`Span`] may be given directly, one may also provide a
+ /// `std::ops::Range<usize>`.
+ ///
+ /// This is like [`Match::new`], but accepts a `usize` instead of a
+ /// [`PatternID`]. This panics if the given `usize` is not representable
+ /// as a `PatternID`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start` or if `pattern > PatternID::MAX`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to create a match for the third pattern in a regex
+ /// object using convenient range syntax.
+ ///
+ /// ```
+ /// use regex_automata::Match;
+ ///
+ /// let m = Match::must(3, 5..10);
+ /// assert_eq!(3, m.pattern().as_usize());
+ /// assert_eq!(5, m.start());
+ /// assert_eq!(10, m.end());
+ /// ```
+ #[inline]
+ pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match {
+ Match::new(PatternID::must(pattern), span)
+ }
+
+ /// Returns the ID of the pattern that matched.
+ ///
+ /// The ID of a pattern is derived from the position in which it was
+ /// originally inserted into the corresponding regex engine. The first
+ /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
+ /// so on.
+ #[inline]
+ pub fn pattern(&self) -> PatternID {
+ self.pattern
+ }
+
+ /// The starting position of the match.
+ ///
+ /// This is a convenience routine for `Match::span().start`.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.span().start
+ }
+
+ /// The ending position of the match.
+ ///
+ /// This is a convenience routine for `Match::span().end`.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.span().end
+ }
+
+ /// Returns the match span as a range.
+ ///
+ /// This is a convenience routine for `Match::span().range()`.
+ #[inline]
+ pub fn range(&self) -> core::ops::Range<usize> {
+ self.span().range()
+ }
+
+ /// Returns the span for this match.
+ #[inline]
+ pub fn span(&self) -> Span {
+ self.span
+ }
+
+ /// Returns true when the span in this match is empty.
+ ///
+ /// An empty match can only be returned when the regex itself can match
+ /// the empty string.
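+ ///
+ /// # Example
+ ///
+ /// A quick sketch:
+ ///
+ /// ```
+ /// use regex_automata::Match;
+ ///
+ /// assert!(Match::must(0, 5..5).is_empty());
+ /// assert!(!Match::must(0, 5..10).is_empty());
+ /// ```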
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.span().is_empty()
+ }
+
+ /// Returns the length of this match.
+ ///
+ /// This returns `0` in precisely the cases that `is_empty` returns `true`.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.span().len()
+ }
+}
+
+/// A set of `PatternID`s.
+///
+/// A set of pattern identifiers is useful for recording which patterns have
+/// matched a particular haystack. A pattern set _only_ includes pattern
+/// identifiers. It does not include offset information.
+///
+/// # Example
+///
+/// This shows basic usage of a set.
+///
+/// ```
+/// use regex_automata::{PatternID, PatternSet};
+///
+/// let pid1 = PatternID::must(5);
+/// let pid2 = PatternID::must(8);
+/// // Create a new empty set.
+/// let mut set = PatternSet::new(10);
+/// // Insert pattern IDs.
+/// set.insert(pid1);
+/// set.insert(pid2);
+/// // Test membership.
+/// assert!(set.contains(pid1));
+/// assert!(set.contains(pid2));
+/// // Get all members.
+/// assert_eq!(
+/// vec![5, 8],
+/// set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(),
+/// );
+/// // Clear the set.
+/// set.clear();
+/// // Test that it is indeed empty.
+/// assert!(set.is_empty());
+/// ```
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct PatternSet {
+ /// The number of patterns set to 'true' in this set.
+ len: usize,
+ /// A map from PatternID to boolean of whether a pattern matches or not.
+ ///
+ /// This should probably be a bitset, but it's unlikely to matter
+ /// much in practice.
+ ///
+ /// The main downside of this representation (and similarly for a bitset)
+ /// is that iteration scales with the capacity of the set instead of
+ /// the length of the set. This doesn't seem likely to be a problem in
+ /// practice.
+ ///
+ /// Another alternative is to just use a 'SparseSet' for this. It does use
+ /// more memory (quite a bit more), but that seems acceptable compared to
+ /// the memory already used by the regex engine. The real hiccup with
+ /// it is that it yields pattern IDs in the order they were inserted.
+ /// Which is actually kind of nice, but at the time of writing, pattern
+ /// IDs are yielded in ascending order in the regex crate RegexSet API.
+ /// If we did change to 'SparseSet', we could provide an additional
+ /// 'iter_match_order' iterator, but keep the ascending order one for
+ /// compatibility.
+ which: alloc::boxed::Box<[bool]>,
+}
+
+#[cfg(feature = "alloc")]
+impl PatternSet {
+ /// Create a new set of pattern identifiers with the given capacity.
+ ///
+ /// The given capacity typically corresponds to (at least) the number of
+ /// patterns in a compiled regex object.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is
+ /// impossible if you use the `pattern_len()` method as defined on any of
+ /// the regex engines in this crate. Namely, a regex will fail to build by
+ /// returning an error if the number of patterns given to it exceeds the
+ /// limit. Therefore, the number of patterns in a valid regex is always
+ /// a correct capacity to provide here.
+ pub fn new(capacity: usize) -> PatternSet {
+ assert!(
+ capacity <= PatternID::LIMIT,
+ "pattern set capacity exceeds limit of {}",
+ PatternID::LIMIT,
+ );
+ PatternSet {
+ len: 0,
+ which: alloc::vec![false; capacity].into_boxed_slice(),
+ }
+ }
+
+ /// Clear this set such that it contains no pattern IDs.
+ pub fn clear(&mut self) {
+ self.len = 0;
+ for matched in self.which.iter_mut() {
+ *matched = false;
+ }
+ }
+
+ /// Return true if and only if the given pattern identifier is in this set.
+ pub fn contains(&self, pid: PatternID) -> bool {
+ pid.as_usize() < self.capacity() && self.which[pid]
+ }
+
+ /// Insert the given pattern identifier into this set and return `true` if
+ /// the given pattern ID was not previously in this set.
+ ///
+ /// If the pattern identifier is already in this set, then this is a no-op.
+ ///
+ /// Use [`PatternSet::try_insert`] for a fallible version of this routine.
+ ///
+ /// # Panics
+ ///
+ /// This panics if this pattern set has insufficient capacity to
+ /// store the given pattern ID.
+ pub fn insert(&mut self, pid: PatternID) -> bool {
+ self.try_insert(pid)
+ .expect("PatternSet should have sufficient capacity")
+ }
+
+ /// Insert the given pattern identifier into this set and return `true` if
+ /// the given pattern ID was not previously in this set.
+ ///
+ /// If the pattern identifier is already in this set, then this is a no-op.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if this pattern set has insufficient capacity to
+ /// store the given pattern ID.
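+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of both the success and failure cases:
+ ///
+ /// ```
+ /// use regex_automata::{PatternID, PatternSet};
+ ///
+ /// let mut set = PatternSet::new(2);
+ /// // The first insertion reports 'true'.
+ /// assert!(set.try_insert(PatternID::must(1)).unwrap());
+ /// // Inserting the same ID again reports 'false'.
+ /// assert!(!set.try_insert(PatternID::must(1)).unwrap());
+ /// // An ID beyond the capacity yields an error instead of a panic.
+ /// assert!(set.try_insert(PatternID::must(5)).is_err());
+ /// ```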
+ pub fn try_insert(
+ &mut self,
+ pid: PatternID,
+ ) -> Result<bool, PatternSetInsertError> {
+ if pid.as_usize() >= self.capacity() {
+ return Err(PatternSetInsertError {
+ attempted: pid,
+ capacity: self.capacity(),
+ });
+ }
+ if self.which[pid] {
+ return Ok(false);
+ }
+ self.len += 1;
+ self.which[pid] = true;
+ Ok(true)
+ }
+
+ /*
+ // This is currently commented out because it is unused and it is unclear
+ // whether it's useful or not. What's the harm in having it? When, if
+ // we ever wanted to change our representation to a 'SparseSet', then
+ // supporting this method would be a bit tricky. So in order to keep some
+ // API evolution flexibility, we leave it out for now.
+
+ /// Remove the given pattern identifier from this set.
+ ///
+ /// If the pattern identifier was not previously in this set, then this
+ /// does not change the set and returns `false`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `pid` exceeds the capacity of this set.
+ pub fn remove(&mut self, pid: PatternID) -> bool {
+ if !self.which[pid] {
+ return false;
+ }
+ self.len -= 1;
+ self.which[pid] = false;
+ true
+ }
+ */
+
+ /// Return true if and only if this set has no pattern identifiers in it.
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Return true if and only if this set has the maximum number of pattern
+ /// identifiers in the set. This occurs precisely when `PatternSet::len()
+ /// == PatternSet::capacity()`.
+ ///
+ /// This particular property is useful to test because it may allow one to
+ /// stop a search earlier than you might otherwise. Namely, if a search is
+ /// only reporting which patterns match a haystack and if you know all of
+ /// the patterns match at a given point, then there's no new information
+ /// that can be learned by continuing the search. (Because a pattern set
+ /// does not keep track of offset information.)
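+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of the early-stopping check described above:
+ ///
+ /// ```
+ /// use regex_automata::{PatternID, PatternSet};
+ ///
+ /// let mut set = PatternSet::new(2);
+ /// set.insert(PatternID::must(0));
+ /// assert!(!set.is_full());
+ /// set.insert(PatternID::must(1));
+ /// assert!(set.is_full());
+ /// ```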
+ pub fn is_full(&self) -> bool {
+ self.len() == self.capacity()
+ }
+
+ /// Returns the total number of pattern identifiers in this set.
+ pub fn len(&self) -> usize {
+ self.len
+ }
+
+ /// Returns the total number of pattern identifiers that may be stored
+ /// in this set.
+ ///
+ /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`].
+ ///
+ /// Typically, the capacity of a pattern set matches the number of patterns
+ /// in a regex object with which you are searching.
+ pub fn capacity(&self) -> usize {
+ self.which.len()
+ }
+
+ /// Returns an iterator over all pattern identifiers in this set.
+ ///
+ /// The iterator yields pattern identifiers in ascending order, starting
+ /// at zero.
+ pub fn iter(&self) -> PatternSetIter<'_> {
+ PatternSetIter { it: self.which.iter().enumerate() }
+ }
+}
+
+/// An error that occurs when a `PatternID` failed to insert into a
+/// `PatternSet`.
+///
+/// An insert fails when the given `PatternID` exceeds the configured capacity
+/// of the `PatternSet`.
+///
+/// This error is created by the [`PatternSet::try_insert`] routine.
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub struct PatternSetInsertError {
+ attempted: PatternID,
+ capacity: usize,
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for PatternSetInsertError {}
+
+#[cfg(feature = "alloc")]
+impl core::fmt::Display for PatternSetInsertError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to insert pattern ID {} into pattern set \
+ with insufficient capacity of {}",
+ self.attempted.as_usize(),
+ self.capacity,
+ )
+ }
+}
+
+/// An iterator over all pattern identifiers in a [`PatternSet`].
+///
+/// The lifetime parameter `'a` refers to the lifetime of the pattern set being
+/// iterated over.
+///
+/// This iterator is created by the [`PatternSet::iter`] method.
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub struct PatternSetIter<'a> {
+ it: core::iter::Enumerate<core::slice::Iter<'a, bool>>,
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> Iterator for PatternSetIter<'a> {
+ type Item = PatternID;
+
+ fn next(&mut self) -> Option<PatternID> {
+ while let Some((index, &yes)) = self.it.next() {
+ if yes {
+ // Only valid 'PatternID' values can be inserted into the set
+ // and construction of the set panics if the capacity would
+ // permit storing invalid pattern IDs. Thus, 'yes' is only true
+ // precisely when 'index' corresponds to a valid 'PatternID'.
+ return Some(PatternID::new_unchecked(index));
+ }
+ }
+ None
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.it.size_hint()
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> DoubleEndedIterator for PatternSetIter<'a> {
+ fn next_back(&mut self) -> Option<PatternID> {
+ while let Some((index, &yes)) = self.it.next_back() {
+ if yes {
+ // Only valid 'PatternID' values can be inserted into the set
+ // and construction of the set panics if the capacity would
+ // permit storing invalid pattern IDs. Thus, 'yes' is only true
+ // precisely when 'index' corresponds to a valid 'PatternID'.
+ return Some(PatternID::new_unchecked(index));
+ }
+ }
+ None
+ }
+}
+
+/// The type of anchored search to perform.
+///
+/// This is *almost* a boolean option. That is, you can either do an unanchored
+/// search for any pattern in a regex, or you can do an anchored search for any
+/// pattern in a regex.
+///
+/// A third option exists that, assuming the regex engine supports it, permits
+/// you to do an anchored search for a specific pattern.
+///
+/// Note that there is no way to run an unanchored search for a specific
+/// pattern. If you need that, you'll need to build separate regexes for each
+/// pattern.
+///
+/// # Errors
+///
+/// If a regex engine does not support the anchored mode selected, then the
+/// regex engine will return an error. While any non-trivial regex engine
+/// should support at least one of the available anchored modes, there is no
+/// singular mode that is guaranteed to be universally supported. Some regex
+/// engines might only support unanchored searches (DFAs compiled without
+/// anchored starting states) and some regex engines might only support
+/// anchored searches (like the one-pass DFA).
+///
+/// The specific error returned is a [`MatchError`] with a
+/// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the
+/// `Anchored` value given that is unsupported.
+///
+/// Note that regex engines should report "no match" if, for example, an
+/// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where
+/// anchored searches for a specific pattern are supported. This smooths out
+/// behavior such that it's possible to guarantee that an error never occurs
+/// based on how the regex engine is configured. All regex engines in this
+/// crate report "no match" when searching for an invalid pattern ID, but where
+/// searching for a valid pattern ID is otherwise supported.
+///
+/// # Example
+///
+/// This example shows how to use the various `Anchored` modes to run a
+/// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)
+/// because it supports all modes unconditionally. Some regex engines, like
+/// the [`onepass::DFA`](crate::dfa::onepass::DFA) cannot support unanchored
+/// searches.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{
+/// nfa::thompson::pikevm::PikeVM,
+/// Anchored, Input, Match, PatternID,
+/// };
+///
+/// let re = PikeVM::new_many(&[
+/// r"Mrs. \w+",
+/// r"Miss \w+",
+/// r"Mr. \w+",
+/// r"Ms. \w+",
+/// ])?;
+/// let mut cache = re.create_cache();
+/// let hay = "Hello Mr. Springsteen!";
+///
+/// // The default is to do an unanchored search.
+/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay));
+/// // Explicitly ask for an unanchored search. Same as above.
+/// let input = Input::new(hay).anchored(Anchored::No);
+/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
+///
+/// // Now try an anchored search. Since the match doesn't start at the
+/// // beginning of the haystack, no match is found!
+/// let input = Input::new(hay).anchored(Anchored::Yes);
+/// assert_eq!(None, re.find(&mut cache, input));
+///
+/// // We can try an anchored search again, but move the location of where
+/// // we start the search. Note that the offsets reported are still in
+/// // terms of the overall haystack and not relative to where we started
+/// // the search.
+/// let input = Input::new(hay).anchored(Anchored::Yes).range(6..);
+/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
+///
+/// // Now try an anchored search for a specific pattern. We specifically
+/// // choose a pattern that we know doesn't match to prove that the search
+/// // only looks for the pattern we provide.
+/// let input = Input::new(hay)
+/// .anchored(Anchored::Pattern(PatternID::must(1)))
+/// .range(6..);
+/// assert_eq!(None, re.find(&mut cache, input));
+///
+/// // But if we switch it to the pattern that we know matches, then we find
+/// // the match.
+/// let input = Input::new(hay)
+/// .anchored(Anchored::Pattern(PatternID::must(2)))
+/// .range(6..);
+/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Anchored {
+ /// Run an unanchored search. This means a match may occur anywhere at or
+ /// after the start position of the search.
+ ///
+ /// This search can return a match for any pattern in the regex.
+ No,
+ /// Run an anchored search. This means that a match must begin at the
+ /// start position of the search.
+ ///
+ /// This search can return a match for any pattern in the regex.
+ Yes,
+ /// Run an anchored search for a specific pattern. This means that a match
+ /// must be for the given pattern and must begin at the start position of
+ /// the search.
+ Pattern(PatternID),
+}
+
+impl Anchored {
+ /// Returns true if and only if this anchor mode corresponds to any kind of
+ /// anchored search.
+ ///
+ /// # Example
+ ///
+ /// This example shows that both `Anchored::Yes` and `Anchored::Pattern`
+ /// are considered anchored searches.
+ ///
+ /// ```
+ /// use regex_automata::{Anchored, PatternID};
+ ///
+ /// assert!(!Anchored::No.is_anchored());
+ /// assert!(Anchored::Yes.is_anchored());
+ /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored());
+ /// ```
+ #[inline]
+ pub fn is_anchored(&self) -> bool {
+ matches!(*self, Anchored::Yes | Anchored::Pattern(_))
+ }
+
+ /// Returns the pattern ID associated with this configuration if it is an
+ /// anchored search for a specific pattern. Otherwise `None` is returned.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Anchored, PatternID};
+ ///
+ /// assert_eq!(None, Anchored::No.pattern());
+ /// assert_eq!(None, Anchored::Yes.pattern());
+ ///
+ /// let pid = PatternID::must(5);
+ /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern());
+ /// ```
+ #[inline]
+ pub fn pattern(&self) -> Option<PatternID> {
+ match *self {
+ Anchored::Pattern(pid) => Some(pid),
+ _ => None,
+ }
+ }
+}
+
+/// The kind of match semantics to use for a regex pattern.
+///
+/// The default match kind is `LeftmostFirst`, and this corresponds to the
+/// match semantics used by most backtracking engines, such as Perl.
+///
+/// # Leftmost first or "preference order" match semantics
+///
+/// Leftmost-first semantics determine which match to report when there are
+/// multiple paths through a regex that match at the same position. The tie is
+/// essentially broken by how a backtracker would behave. For example, consider
+/// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this
+/// case, both the `foofoo` and `foo` branches match at position `0`. So should
+/// the end of the match be `3` or `6`?
+///
+/// A backtracker will conceptually work by trying `foofoofoo` and failing.
+/// Then it will try `foofoo`, find the match and stop there. Thus, the
+/// leftmost-first match position is `6`. This is called "leftmost-first" or
+/// "preference order" because the order of the branches as written in the
+/// regex pattern is what determines how to break the tie.
+///
+/// (Note that leftmost-longest match semantics, which break ties by always
+/// taking the longest matching string, are not currently supported by this
+/// crate. These match semantics tend to be found in POSIX regex engines.)
+///
+/// This example shows how leftmost-first semantics work, and how they even
+/// apply to multi-pattern regexes:
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::pikevm::PikeVM,
+/// Match,
+/// };
+///
+/// let re = PikeVM::new_many(&[
+/// r"foofoofoo",
+/// r"foofoo",
+/// r"foo",
+/// ])?;
+/// let mut cache = re.create_cache();
+/// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo").collect();
+/// let expected = vec![Match::must(1, 0..6)];
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # All matches
+///
+/// The `All` match semantics report any and all matches, and generally will
+/// attempt to match as much as possible. It doesn't respect any sort of match
+/// priority at all, so things like non-greedy matching don't work in this
+/// mode.
+///
+/// The fact that non-greedy matching doesn't work generally makes most forms
+/// of unanchored non-overlapping searches have unintuitive behavior. Namely,
+/// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the
+/// beginning of the pattern, which is specifically non-greedy. Since it will
+/// be treated as greedy in `All` match semantics, this generally means that
+/// it will first attempt to consume all of the haystack and is likely to wind
+/// up skipping matches.
+///
+/// Generally speaking, `All` should only be used in two circumstances:
+///
+/// * When running an anchored search and there is a desire to match as much as
+/// possible. For example, when building a reverse regex matcher to find the
+/// start of a match after finding the end. In this case, the reverse search
+/// is anchored to the end of the match found by the forward search.
+/// * When running overlapping searches. Since `All` encodes all possible
+/// matches, this is generally what you want for an overlapping search. If you
+/// try to use leftmost-first in an overlapping search, it is likely to produce
+/// counter-intuitive results since leftmost-first specifically excludes some
+/// matches from its underlying finite state machine.
+///
+/// This example demonstrates the counter-intuitive behavior of `All` semantics
+/// when using a standard leftmost unanchored search:
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::pikevm::PikeVM,
+/// Match, MatchKind,
+/// };
+///
+/// let re = PikeVM::builder()
+/// .configure(PikeVM::config().match_kind(MatchKind::All))
+/// .build("foo")?;
+/// let hay = "first foo second foo wat";
+/// let mut cache = re.create_cache();
+/// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect();
+/// // Notice that it completely skips the first 'foo'!
+/// let expected = vec![Match::must(0, 17..20)];
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// This second example shows how `All` semantics are useful for an overlapping
+/// search. Note that we use lower level lazy DFA APIs here since the NFA
+/// engines only currently support a very limited form of overlapping search.
+///
+/// ```
+/// use regex_automata::{
+/// hybrid::dfa::{DFA, OverlappingState},
+/// HalfMatch, Input, MatchKind,
+/// };
+///
+/// let re = DFA::builder()
+/// // If we didn't set 'All' semantics here, then the regex would only
+/// // match 'foo' at offset 3 and nothing else. Why? Because the state
+/// // machine implements preference order and knows that the 'foofoo' and
+/// // 'foofoofoo' branches can never match since 'foo' will always match
+/// // when they match and take priority.
+/// .configure(DFA::config().match_kind(MatchKind::All))
+/// .build(r"foo|foofoo|foofoofoo")?;
+/// let mut cache = re.create_cache();
+/// let mut state = OverlappingState::start();
+/// let input = Input::new("foofoofoo");
+/// let mut got = vec![];
+/// loop {
+/// re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?;
+/// let m = match state.get_match() {
+/// None => break,
+/// Some(m) => m,
+/// };
+/// got.push(m);
+/// }
+/// let expected = vec![
+/// HalfMatch::must(0, 3),
+/// HalfMatch::must(0, 6),
+/// HalfMatch::must(0, 9),
+/// ];
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[non_exhaustive]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+ /// Report all possible matches.
+ All,
+ /// Report only the leftmost matches. When multiple leftmost matches exist,
+ /// report the match corresponding to the part of the regex that appears
+ /// first in the syntax.
+ LeftmostFirst,
+ // There is prior art in RE2 that shows that we should be able to add
+ // LeftmostLongest too. The tricky part of it is supporting ungreedy
+ // repetitions. Instead of treating all NFA states as having equivalent
+ // priority (as in 'All') or treating all NFA states as having distinct
+ // priority based on order (as in 'LeftmostFirst'), we instead group NFA
+ // states into sets, and treat members of each set as having equivalent
+ // priority, but having greater priority than all following members
+ // of different sets.
+ //
+ // However, it's not clear whether it's really worth adding this. After
+ // all, leftmost-longest can be emulated when using literals by using
+ // leftmost-first and sorting the literals by length in descending order.
+ // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
+ // always match `a` in `ab` when using leftmost-first, but leftmost-longest
+ // would match `ab`.
+}
+
+impl MatchKind {
+ #[cfg(feature = "alloc")]
+ pub(crate) fn continue_past_first_match(&self) -> bool {
+ *self == MatchKind::All
+ }
+}
+
+impl Default for MatchKind {
+ fn default() -> MatchKind {
+ MatchKind::LeftmostFirst
+ }
+}
+
+/// An error indicating that a search stopped before reporting whether a
+/// match exists or not.
+///
+/// To be very clear, this error type implies that one cannot assume that no
+/// matches occur, since the search stopped before completing. That is, if
+/// you're looking for information about where a search determined that no
+/// match can occur, then this error type does *not* give you that. (Indeed, at
+/// the time of writing, if you need such a thing, you have to write your own
+/// search routine.)
+///
+/// Normally, when one searches for something, the response is either an
+/// affirmative "it was found at this location" or a negative "not found at
+/// all." However, in some cases, a regex engine can be configured to stop its
+/// search before concluding whether a match exists or not. When this happens,
+/// it may be important for the caller to know why the regex engine gave up and
+/// where in the input it gave up. This error type exposes the 'why' and the
+/// 'where.'
+///
+/// For example, the DFAs provided by this library generally cannot correctly
+/// implement Unicode word boundaries. Instead, they provide an option to
+/// eagerly support them on ASCII text (since Unicode word boundaries are
+/// equivalent to ASCII word boundaries when searching ASCII text), but will
+/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
+/// required to either report the failure to the caller (unergonomic) or
+/// otherwise fall back to some other regex engine (ergonomic, but potentially
+/// costly).
+///
+/// More generally, some regex engines offer the ability for callers to specify
+/// certain bytes that will trigger the regex engine to automatically quit if
+/// they are seen.
+///
+/// Still yet, there may be other reasons for a failed match. For example,
+/// the hybrid DFA provided by this crate can be configured to give up if it
+/// believes that it is not efficient. This in turn permits callers to choose a
+/// different regex engine.
+///
+/// (Note that DFAs are configured by default to never quit or give up in this
+/// fashion. For example, by default, a DFA will fail to build if the regex
+/// pattern contains a Unicode word boundary. One needs to opt into the "quit"
+/// behavior via options, like
+/// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).)
+///
+/// There are a couple other ways a search
+/// can fail. For example, when using the
+/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)
+/// with a haystack that is too long, or trying to run an unanchored search
+/// with a [one-pass DFA](crate::dfa::onepass).
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct MatchError(
+ #[cfg(feature = "alloc")] alloc::boxed::Box<MatchErrorKind>,
+ #[cfg(not(feature = "alloc"))] MatchErrorKind,
+);
+
+impl MatchError {
+ /// Create a new error value with the given kind.
+ ///
+ /// This is a more verbose version of the kind-specific constructors,
+ /// e.g., `MatchError::quit`.
+ pub fn new(kind: MatchErrorKind) -> MatchError {
+ #[cfg(feature = "alloc")]
+ {
+ MatchError(alloc::boxed::Box::new(kind))
+ }
+ #[cfg(not(feature = "alloc"))]
+ {
+ MatchError(kind)
+ }
+ }
+
+ /// Returns a reference to the underlying error kind.
+ pub fn kind(&self) -> &MatchErrorKind {
+ &self.0
+ }
+
+ /// Create a new "quit" error. The given `byte` corresponds to the value
+ /// that tripped a search's quit condition, and `offset` corresponds to the
+ /// location in the haystack at which the search quit.
+ ///
+ /// This is the same as calling `MatchError::new` with a
+ /// [`MatchErrorKind::Quit`] kind.
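+ ///
+ /// # Example
+ ///
+ /// A small sketch of constructing the error and inspecting its kind:
+ ///
+ /// ```
+ /// use regex_automata::{MatchError, MatchErrorKind};
+ ///
+ /// let err = MatchError::quit(0xFF, 5);
+ /// assert_eq!(err.kind(), &MatchErrorKind::Quit { byte: 0xFF, offset: 5 });
+ /// ```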
+ pub fn quit(byte: u8, offset: usize) -> MatchError {
+ MatchError::new(MatchErrorKind::Quit { byte, offset })
+ }
+
+ /// Create a new "gave up" error. The given `offset` corresponds to the
+ /// location in the haystack at which the search gave up.
+ ///
+ /// This is the same as calling `MatchError::new` with a
+ /// [`MatchErrorKind::GaveUp`] kind.
+ pub fn gave_up(offset: usize) -> MatchError {
+ MatchError::new(MatchErrorKind::GaveUp { offset })
+ }
+
+ /// Create a new "haystack too long" error. The given `len` corresponds to
+ /// the length of the haystack that was problematic.
+ ///
+ /// This is the same as calling `MatchError::new` with a
+ /// [`MatchErrorKind::HaystackTooLong`] kind.
+ pub fn haystack_too_long(len: usize) -> MatchError {
+ MatchError::new(MatchErrorKind::HaystackTooLong { len })
+ }
+
+ /// Create a new "unsupported anchored" error. This occurs when the caller
+ /// requests a search with an anchor mode that is not supported by the
+ /// regex engine.
+ ///
+ /// This is the same as calling `MatchError::new` with a
+ /// [`MatchErrorKind::UnsupportedAnchored`] kind.
+ pub fn unsupported_anchored(mode: Anchored) -> MatchError {
+ MatchError::new(MatchErrorKind::UnsupportedAnchored { mode })
+ }
+}
+
+/// The underlying kind of a [`MatchError`].
+///
+/// This is a **non-exhaustive** enum. That means new variants may be added in
+/// a semver-compatible release.
+#[non_exhaustive]
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum MatchErrorKind {
+ /// The search saw a "quit" byte at which it was instructed to stop
+ /// searching.
+ Quit {
+ /// The "quit" byte that was observed that caused the search to stop.
+ byte: u8,
+ /// The offset at which the quit byte was observed.
+ offset: usize,
+ },
+ /// The search, based on heuristics, determined that it would be better
+ /// to stop, typically to provide the caller an opportunity to use an
+ /// alternative regex engine.
+ ///
+ /// Currently, the only way for this to occur is via the lazy DFA and
+ /// only when it is configured to do so (it will not return this error by
+ /// default).
+ GaveUp {
+ /// The offset at which the search stopped. This corresponds to the
+ /// position immediately following the last byte scanned.
+ offset: usize,
+ },
+ /// This error occurs if the haystack given to the regex engine was too
+ /// long to be searched. This occurs, for example, with regex engines
+ /// like the bounded backtracker that have a configurable fixed amount of
+ /// capacity that is tied to the length of the haystack. Anything beyond
+ /// that configured limit will result in an error at search time.
+ HaystackTooLong {
+ /// The length of the haystack that exceeded the limit.
+ len: usize,
+ },
+ /// An error indicating that a particular type of anchored search was
+ /// requested, but that the regex engine does not support it.
+ ///
+ /// Note that this error should not be returned by a regex engine simply
+ /// because the pattern ID is invalid (i.e., equal to or exceeds the number
+ /// of patterns in the regex). In that case, the regex engine should report
+ /// a non-match.
+ UnsupportedAnchored {
+ /// The anchored mode given that is unsupported.
+ mode: Anchored,
+ },
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for MatchError {}
+
+impl core::fmt::Display for MatchError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ match *self.kind() {
+ MatchErrorKind::Quit { byte, offset } => write!(
+ f,
+ "quit search after observing byte {:?} at offset {}",
+ DebugByte(byte),
+ offset,
+ ),
+ MatchErrorKind::GaveUp { offset } => {
+ write!(f, "gave up searching at offset {}", offset)
+ }
+ MatchErrorKind::HaystackTooLong { len } => {
+ write!(f, "haystack of length {} is too long", len)
+ }
+ MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => {
+ write!(f, "anchored searches are not supported or enabled")
+ }
+ MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => {
+ write!(f, "unanchored searches are not supported or enabled")
+ }
+ MatchErrorKind::UnsupportedAnchored {
+ mode: Anchored::Pattern(pid),
+ } => {
+ write!(
+ f,
+ "anchored searches for a specific pattern ({}) are \
+ not supported or enabled",
+ pid.as_usize(),
+ )
+ }
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // We test that our 'MatchError' type is the size we expect. This isn't an
+ // API guarantee, but if the size increases, we really want to make sure we
+ // decide to do that intentionally. So this should be a speed bump. And in
+ // general, we should not increase the size without a very good reason.
+ //
+ // Why? Because low level search APIs return Result<.., MatchError>. When
+ // MatchError gets bigger, so too does the Result type.
+ //
+ // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes
+ // the importance of keeping a small error type. But without 'alloc', we
+ // still want things to be small.
+ #[test]
+ fn match_error_size() {
+ let expected_size = if cfg!(feature = "alloc") {
+ core::mem::size_of::<usize>()
+ } else {
+ 2 * core::mem::size_of::<usize>()
+ };
+ assert_eq!(expected_size, core::mem::size_of::<MatchError>());
+ }
+
+ // Same as above, but for the underlying match error kind.
+ #[cfg(target_pointer_width = "64")]
+ #[test]
+ fn match_error_kind_size() {
+ let expected_size = 2 * core::mem::size_of::<usize>();
+ assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
+ }
+
+ #[cfg(target_pointer_width = "32")]
+ #[test]
+ fn match_error_kind_size() {
+ let expected_size = 3 * core::mem::size_of::<usize>();
+ assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
+ }
+}
diff --git a/vendor/regex-automata/src/util/sparse_set.rs b/vendor/regex-automata/src/util/sparse_set.rs
index bf59e4469..cbaa0b6f4 100644
--- a/vendor/regex-automata/src/util/sparse_set.rs
+++ b/vendor/regex-automata/src/util/sparse_set.rs
@@ -1,6 +1,23 @@
-use alloc::{boxed::Box, vec, vec::Vec};
+/*!
+This module defines a sparse set data structure. Its most interesting
+properties are:
-use crate::util::id::StateID;
+* They preserve insertion order.
+* Set membership testing is done in constant time.
+* Set insertion is done in constant time.
+* Clearing the set is done in constant time.
+
+The cost for doing this is that the capacity of the set needs to be known up
+front, and the elements in the set are limited to state identifiers.
+
+These sets are principally used when traversing an NFA state graph. This
+happens at search time, for example, in the PikeVM. It also happens during DFA
+determinization.
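+
+A hedged sketch of typical crate-internal usage (the set is a crate-private
+type, so this is illustrative rather than a runnable doc test; the names
+'nfa_state_count' and 'state_id' are placeholders):
+
+```text
+let mut set = SparseSet::new(nfa_state_count); // capacity fixed up front
+if set.insert(state_id) {
+    // First time seeing 'state_id'; push it onto the work list.
+}
+for sid in set.iter() {
+    // Members are yielded in insertion order.
+}
+set.clear(); // Constant time, regardless of how many members were set.
+```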
+*/
+
+use alloc::{vec, vec::Vec};
+
+use crate::util::primitives::StateID;
/// A pair of sparse sets.
///
@@ -79,7 +96,12 @@ pub(crate) struct SparseSet {
/// Sparse maps ids to their location in dense.
///
/// A state ID is in the set if and only if
- /// sparse[id] < dense.len() && id == dense[sparse[id]].
+ /// sparse[id] < len && id == dense[sparse[id]].
+ ///
+ /// Note that these are indices into 'dense'. It's a little weird to use
+ /// StateID here, but we know our length can never exceed the bounds of
+ /// StateID (enforced by 'resize') and StateID will be at most 4 bytes
+ /// whereas a usize is likely double that in most cases.
sparse: Vec<StateID>,
}
@@ -146,9 +168,9 @@ impl SparseSet {
///
/// This is marked as inline(always) since the compiler won't inline it
/// otherwise, and it's a fairly hot piece of code in DFA determinization.
- #[inline(always)]
- pub(crate) fn insert(&mut self, value: StateID) -> bool {
- if self.contains(value) {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn insert(&mut self, id: StateID) -> bool {
+ if self.contains(id) {
return false;
}
@@ -158,30 +180,22 @@ impl SparseSet {
"{:?} exceeds capacity of {:?} when inserting {:?}",
i,
self.capacity(),
- value,
+ id,
);
// OK since i < self.capacity() and self.capacity() is guaranteed to
// be <= StateID::LIMIT.
- let id = StateID::new_unchecked(i);
- self.dense[id] = value;
- self.sparse[value] = id;
+ let index = StateID::new_unchecked(i);
+ self.dense[index] = id;
+ self.sparse[id] = index;
self.len += 1;
true
}
/// Returns true if and only if this set contains the given value.
#[inline]
- pub(crate) fn contains(&self, value: StateID) -> bool {
- let i = self.sparse[value];
- i.as_usize() < self.len() && self.dense[i] == value
- }
-
- /// Returns the ith inserted element from this set.
- ///
- /// Panics when i >= self.len().
- #[inline]
- pub(crate) fn get(&self, i: usize) -> StateID {
- self.dense[i]
+ pub(crate) fn contains(&self, id: StateID) -> bool {
+ let index = self.sparse[id];
+ index.as_usize() < self.len() && self.dense[index] == id
}
/// Clear this set such that it has no members.
@@ -190,16 +204,21 @@ impl SparseSet {
self.len = 0;
}
+ #[inline]
+ pub(crate) fn iter(&self) -> SparseSetIter<'_> {
+ SparseSetIter(self.dense[..self.len()].iter())
+ }
+
/// Returns the heap memory usage, in bytes, used by this sparse set.
#[inline]
pub(crate) fn memory_usage(&self) -> usize {
- 2 * self.dense.len() * StateID::SIZE
+ self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE
}
}
impl core::fmt::Debug for SparseSet {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
- let elements: Vec<StateID> = self.into_iter().collect();
+ let elements: Vec<StateID> = self.iter().collect();
f.debug_tuple("SparseSet").field(&elements).finish()
}
}
@@ -210,20 +229,11 @@ impl core::fmt::Debug for SparseSet {
#[derive(Debug)]
pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
-impl<'a> IntoIterator for &'a SparseSet {
- type Item = StateID;
- type IntoIter = SparseSetIter<'a>;
-
- fn into_iter(self) -> Self::IntoIter {
- SparseSetIter(self.dense[..self.len()].iter())
- }
-}
-
impl<'a> Iterator for SparseSetIter<'a> {
type Item = StateID;
- #[inline(always)]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
fn next(&mut self) -> Option<StateID> {
- self.0.next().map(|value| *value)
+ self.0.next().map(|&id| id)
}
}
diff --git a/vendor/regex-automata/src/util/start.rs b/vendor/regex-automata/src/util/start.rs
index 3c756fc26..4e360d083 100644
--- a/vendor/regex-automata/src/util/start.rs
+++ b/vendor/regex-automata/src/util/start.rs
@@ -1,21 +1,186 @@
-/// Represents the four possible starting configurations of a DFA search.
+/*!
+Provides some helpers for dealing with start state configurations in DFAs.
+
+[`Start`] represents the possible starting configurations, while
+[`StartByteMap`] represents a way to retrieve the `Start` configuration for a
+given position in a haystack.
+*/
+
+use crate::util::{
+ look::LookMatcher,
+ search::Input,
+ wire::{self, DeserializeError, SerializeError},
+};
+
+/// A map from every possible byte value to its corresponding starting
+/// configuration.
///
-/// The starting configuration is determined by inspecting the the beginning of
-/// the haystack (up to 1 byte). Ultimately, this along with a pattern ID (if
-/// specified) is what selects the start state to use in a DFA.
+/// This map is used to look up the start configuration for a particular
+/// position in a haystack. This start configuration is then used in
+/// combination with things like the anchored mode and pattern ID to fully
+/// determine the start state.
///
-/// In a DFA that doesn't have starting states for each pattern, then it will
-/// have a maximum of four DFA start states. If the DFA was compiled with start
-/// states for each pattern, then it will have a maximum of four DFA start
-/// states for searching for any pattern, and then another maximum of four DFA
-/// start states for executing an anchored search for each pattern.
+/// Generally speaking, this map is only used for fully compiled DFAs and lazy
+/// DFAs. For NFAs (including the one-pass DFA), the start state is generally
+/// selected by virtue of traversing the NFA state graph. DFAs do the same
+/// thing, but at build time and not search time. (Well, technically the lazy
+/// DFA does it at search time, but it does enough work to cache the full
+/// result of the epsilon closure that the NFA engines tend to need to do.)
+#[derive(Clone)]
+pub(crate) struct StartByteMap {
+ map: [Start; 256],
+}
+
+impl StartByteMap {
+ /// Create a new map from byte values to their corresponding starting
+ /// configurations. The map is determined, in part, by how look-around
+ /// assertions are matched via the matcher given.
+ pub(crate) fn new(lookm: &LookMatcher) -> StartByteMap {
+ let mut map = [Start::NonWordByte; 256];
+ map[usize::from(b'\n')] = Start::LineLF;
+ map[usize::from(b'\r')] = Start::LineCR;
+ map[usize::from(b'_')] = Start::WordByte;
+
+ let mut byte = b'0';
+ while byte <= b'9' {
+ map[usize::from(byte)] = Start::WordByte;
+ byte += 1;
+ }
+ byte = b'A';
+ while byte <= b'Z' {
+ map[usize::from(byte)] = Start::WordByte;
+ byte += 1;
+ }
+ byte = b'a';
+ while byte <= b'z' {
+ map[usize::from(byte)] = Start::WordByte;
+ byte += 1;
+ }
+
+ let lineterm = lookm.get_line_terminator();
+ // If our line terminator is normal, then it is already handled by
+ // the LineLF and LineCR configurations. But if it's weird, then we
+ // overwrite whatever was there before for that terminator with a
+ // special configuration. The trick here is that if the terminator
+ // is, say, a word byte like `a`, then callers seeing this start
+ // configuration need to account for that and build their DFA state as
+ // if it *also* came from a word byte.
+ if lineterm != b'\r' && lineterm != b'\n' {
+ map[usize::from(lineterm)] = Start::CustomLineTerminator;
+ }
+ StartByteMap { map }
+ }
+
+ /// Return the forward starting configuration for the given `input`.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn fwd(&self, input: &Input) -> Start {
+ match input
+ .start()
+ .checked_sub(1)
+ .and_then(|i| input.haystack().get(i))
+ {
+ None => Start::Text,
+ Some(&byte) => self.get(byte),
+ }
+ }
+
+ /// Return the reverse starting configuration for the given `input`.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn rev(&self, input: &Input) -> Start {
+ match input.haystack().get(input.end()) {
+ None => Start::Text,
+ Some(&byte) => self.get(byte),
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn get(&self, byte: u8) -> Start {
+ self.map[usize::from(byte)]
+ }
+
+ /// Deserializes a start byte map from the given slice. If the slice is of
+ /// insufficient length or otherwise contains an impossible mapping, then
+ /// an error is returned. Upon success, the number of bytes read along with
+ /// the map are returned. The number of bytes read is always a multiple of
+ /// 8.
+ pub(crate) fn from_bytes(
+ slice: &[u8],
+ ) -> Result<(StartByteMap, usize), DeserializeError> {
+ wire::check_slice_len(slice, 256, "start byte map")?;
+ let mut map = [Start::NonWordByte; 256];
+ for (i, &repr) in slice[..256].iter().enumerate() {
+ map[i] = match Start::from_usize(usize::from(repr)) {
+ Some(start) => start,
+ None => {
+ return Err(DeserializeError::generic(
+ "found invalid starting configuration",
+ ))
+ }
+ };
+ }
+ Ok((StartByteMap { map }, 256))
+ }
+
+ /// Writes this map to the given byte buffer. If the given buffer is too
+ /// small, then an error is returned. Upon success, the total number of
+ /// bytes written is returned. The number of bytes written is guaranteed to
+ /// be a multiple of 8.
+ pub(crate) fn write_to(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("start byte map"));
+ }
+ for (i, &start) in self.map.iter().enumerate() {
+ dst[i] = start.as_u8();
+ }
+ Ok(nwrite)
+ }
+
+ /// Returns the total number of bytes written by `write_to`.
+ pub(crate) fn write_to_len(&self) -> usize {
+ 256
+ }
+}
+
+impl core::fmt::Debug for StartByteMap {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ use crate::util::escape::DebugByte;
+
+ write!(f, "StartByteMap{{")?;
+ for byte in 0..=255 {
+ if byte > 0 {
+ write!(f, ", ")?;
+ }
+ let start = self.map[usize::from(byte)];
+ write!(f, "{:?} => {:?}", DebugByte(byte), start)?;
+ }
+ write!(f, "}}")?;
+ Ok(())
+ }
+}
+
+/// Represents the six possible starting configurations of a DFA search.
+///
+/// The starting configuration is determined by inspecting the beginning
+/// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID
+/// (if specified) and the type of search (anchored or not) is what selects the
+/// start state to use in a DFA.
///
-/// This ends up being represented as a table in the DFA (whether lazy or fully
-/// built) where the stride of that table is 4, and each entry is an index into
-/// the state transition table. Note though that multiple entries in the table
-/// might point to the same state if the states would otherwise be equivalent.
-/// (This is guaranteed by DFA minimization and may even be accomplished by
-/// normal determinization, since it attempts to reuse equivalent states too.)
+/// As one example, if a DFA only supports unanchored searches and does not
+/// support anchored searches for each pattern, then it will have at most 6
+/// distinct start states. (Some start states may be reused if determinization
+/// can determine that they will be equivalent.) If the DFA supports both
+/// anchored and unanchored searches, then it will have a maximum of 12
+/// distinct start states. Finally, if the DFA also supports anchored searches
+/// for each pattern, then it can have up to `12 + (N * 6)` start states, where
+/// `N` is the number of patterns.
+///
+/// Handling each of these starting configurations in the context of DFA
+/// determinization can be *quite* tricky and subtle. But the code is small
+/// and can be found at `crate::util::determinize::set_lookbehind_from_start`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum Start {
/// This occurs when the starting position is not any of the ones below.
@@ -28,7 +193,20 @@ pub(crate) enum Start {
Text = 2,
/// This occurs when the byte immediately preceding the start of the search
/// is a line terminator. Specifically, `\n`.
- Line = 3,
+ LineLF = 3,
+ /// This occurs when the byte immediately preceding the start of the search
+ /// is a line terminator. Specifically, `\r`.
+ LineCR = 4,
+ /// This occurs when a custom line terminator has been set via a
+ /// `LookMatcher`, and when that line terminator is neither a `\r` nor a
+ /// `\n`.
+ ///
+ /// If the custom line terminator is a word byte, then this start
+ /// configuration is still selected. DFAs that implement word boundary
+ /// assertions will likely need to check whether the custom line terminator
+ /// is a word byte, in which case, it should behave as if the byte
+ /// satisfies `\b` in addition to multi-line anchors.
+ CustomLineTerminator = 5,
}
impl Start {
@@ -39,71 +217,90 @@ impl Start {
0 => Some(Start::NonWordByte),
1 => Some(Start::WordByte),
2 => Some(Start::Text),
- 3 => Some(Start::Line),
+ 3 => Some(Start::LineLF),
+ 4 => Some(Start::LineCR),
+ 5 => Some(Start::CustomLineTerminator),
_ => None,
}
}
/// Returns the total number of starting state configurations.
- pub(crate) fn count() -> usize {
- 4
- }
-
- /// Returns the starting state configuration for the given search
- /// parameters. If the given offset range is not valid, then this panics.
- #[inline(always)]
- pub(crate) fn from_position_fwd(
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Start {
- assert!(
- bytes.get(start..end).is_some(),
- "{}..{} is invalid",
- start,
- end
- );
- if start == 0 {
- Start::Text
- } else if bytes[start - 1] == b'\n' {
- Start::Line
- } else if crate::util::is_word_byte(bytes[start - 1]) {
- Start::WordByte
- } else {
- Start::NonWordByte
- }
+ pub(crate) fn len() -> usize {
+ 6
}
- /// Returns the starting state configuration for a reverse search with the
- /// given search parameters. If the given offset range is not valid, then
- /// this panics.
- #[inline(always)]
- pub(crate) fn from_position_rev(
- bytes: &[u8],
- start: usize,
- end: usize,
- ) -> Start {
- assert!(
- bytes.get(start..end).is_some(),
- "{}..{} is invalid",
- start,
- end
- );
- if end == bytes.len() {
- Start::Text
- } else if bytes[end] == b'\n' {
- Start::Line
- } else if crate::util::is_word_byte(bytes[end]) {
- Start::WordByte
- } else {
- Start::NonWordByte
- }
+ /// Return this starting configuration as a `u8` integer. It is guaranteed to
+ /// be less than `Start::len()`.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub(crate) fn as_u8(&self) -> u8 {
+ // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
+ // actual int.
+ *self as u8
}
- /// Return this starting configuration as an integer. It is guaranteed to
- /// be less than `Start::count()`.
- #[inline(always)]
+ /// Return this starting configuration as a `usize` integer. It is
+ /// guaranteed to be less than `Start::len()`.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn as_usize(&self) -> usize {
- *self as usize
+ usize::from(self.as_u8())
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn start_fwd_done_range() {
+ let smap = StartByteMap::new(&LookMatcher::default());
+ assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0)));
+ }
+
+ #[test]
+ fn start_rev_done_range() {
+ let smap = StartByteMap::new(&LookMatcher::default());
+ assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0)));
+ }
+
+ #[test]
+ fn start_fwd() {
+ let f = |haystack, start, end| {
+ let smap = StartByteMap::new(&LookMatcher::default());
+ let input = &Input::new(haystack).range(start..end);
+ smap.fwd(input)
+ };
+
+ assert_eq!(Start::Text, f("", 0, 0));
+ assert_eq!(Start::Text, f("abc", 0, 3));
+ assert_eq!(Start::Text, f("\nabc", 0, 3));
+
+ assert_eq!(Start::LineLF, f("\nabc", 1, 3));
+
+ assert_eq!(Start::LineCR, f("\rabc", 1, 3));
+
+ assert_eq!(Start::WordByte, f("abc", 1, 3));
+
+ assert_eq!(Start::NonWordByte, f(" abc", 1, 3));
+ }
+
+ #[test]
+ fn start_rev() {
+ let f = |haystack, start, end| {
+ let smap = StartByteMap::new(&LookMatcher::default());
+ let input = &Input::new(haystack).range(start..end);
+ smap.rev(input)
+ };
+
+ assert_eq!(Start::Text, f("", 0, 0));
+ assert_eq!(Start::Text, f("abc", 0, 3));
+ assert_eq!(Start::Text, f("abc\n", 0, 4));
+
+ assert_eq!(Start::LineLF, f("abc\nz", 0, 3));
+
+ assert_eq!(Start::LineCR, f("abc\rz", 0, 3));
+
+ assert_eq!(Start::WordByte, f("abc", 0, 2));
+
+ assert_eq!(Start::NonWordByte, f("abc ", 0, 3));
}
}
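Taken together, these tests pin down the classification rule: offset 0 maps to `Start::Text`, and otherwise the single byte immediately before the search start (or at the search end, for reverse searches) decides among LF, CR, word and non-word. A minimal standalone sketch of the forward case follows, with local copies of the enum and an ASCII-only `is_word_byte` so it runs outside the crate; the real `StartByteMap` precomputes a 256-entry byte-to-`Start` table rather than branching on every search.

```rust
// Hedged sketch: not the crate's implementation, just the classification
// rule implied by the tests above.
#[derive(Debug, PartialEq)]
enum Start {
    NonWordByte,
    WordByte,
    Text,
    LineLF,
    LineCR,
}

// ASCII-only word byte test, mirroring util::utf8::is_word_byte.
fn is_word_byte(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_')
}

fn classify_fwd(haystack: &[u8], start: usize) -> Start {
    if start == 0 {
        return Start::Text;
    }
    match haystack[start - 1] {
        b'\n' => Start::LineLF,
        b'\r' => Start::LineCR,
        b if is_word_byte(b) => Start::WordByte,
        _ => Start::NonWordByte,
    }
}

fn main() {
    assert_eq!(Start::Text, classify_fwd(b"abc", 0));
    assert_eq!(Start::LineLF, classify_fwd(b"\nabc", 1));
    assert_eq!(Start::LineCR, classify_fwd(b"\rabc", 1));
    assert_eq!(Start::WordByte, classify_fwd(b"abc", 1));
    assert_eq!(Start::NonWordByte, classify_fwd(b" abc", 1));
}
```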
diff --git a/vendor/regex-automata/src/util/syntax.rs b/vendor/regex-automata/src/util/syntax.rs
index 88beeee75..78e3cf9a1 100644
--- a/vendor/regex-automata/src/util/syntax.rs
+++ b/vendor/regex-automata/src/util/syntax.rs
@@ -1,4 +1,132 @@
-use regex_syntax::ParserBuilder;
+/*!
+Utilities for dealing with the syntax of a regular expression.
+
+This module exposes a [`Config`] type that represents a wrapper around the
+configuration for a
+[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder), along with a
+few convenience routines for parsing patterns into HIR values. The purpose of
+the wrapper is to make configuring syntax options very similar to how other
+configuration is done throughout this crate. Namely, instead of duplicating
+syntax options across every builder (of which there are many), we instead
+create small config objects like this one that can be passed around and
+composed.
+*/
+
+use alloc::{vec, vec::Vec};
+
+use regex_syntax::{
+ ast,
+ hir::{self, Hir},
+ Error, ParserBuilder,
+};
+
+/// A convenience routine for parsing a pattern into an HIR value with the
+/// default configuration.
+///
+/// # Example
+///
+/// This shows how to parse a pattern into an HIR value:
+///
+/// ```
+/// use regex_automata::util::syntax;
+///
+/// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?;
+/// assert_eq!(Some(1), hir.properties().static_explicit_captures_len());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+pub fn parse(pattern: &str) -> Result<Hir, Error> {
+ parse_with(pattern, &Config::default())
+}
+
+/// A convenience routine for parsing many patterns into HIR values with the
+/// default configuration.
+///
+/// # Example
+///
+/// This shows how to parse many patterns into corresponding HIR values:
+///
+/// ```
+/// use {
+/// regex_automata::util::syntax,
+/// regex_syntax::hir::Properties,
+/// };
+///
+/// let hirs = syntax::parse_many(&[
+/// r"([a-z]+)|([0-9]+)",
+/// r"foo(A-Z]+)bar",
+/// ])?;
+/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
+/// assert_eq!(Some(1), props.static_explicit_captures_len());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> {
+ parse_many_with(patterns, &Config::default())
+}
+
+/// A convenience routine for parsing a pattern into an HIR value using a
+/// `Config`.
+///
+/// # Example
+///
+/// This shows how to parse a pattern into an HIR value with a non-default
+/// configuration:
+///
+/// ```
+/// use regex_automata::util::syntax;
+///
+/// let hir = syntax::parse_with(
+/// r"^[a-z]+$",
+/// &syntax::Config::new().multi_line(true).crlf(true),
+/// )?;
+/// assert!(hir.properties().look_set().contains_anchor_crlf());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> {
+ let mut builder = ParserBuilder::new();
+ config.apply(&mut builder);
+ builder.build().parse(pattern)
+}
+
+/// A convenience routine for parsing many patterns into HIR values using a
+/// `Config`.
+///
+/// # Example
+///
+/// This shows how to parse many patterns into corresponding HIR values
+/// with a non-default configuration:
+///
+/// ```
+/// use {
+/// regex_automata::util::syntax,
+/// regex_syntax::hir::Properties,
+/// };
+///
+/// let patterns = &[
+/// r"([a-z]+)|([0-9]+)",
+/// r"\W",
+/// r"foo(A-Z]+)bar",
+/// ];
+/// let config = syntax::Config::new().unicode(false).utf8(false);
+/// let hirs = syntax::parse_many_with(patterns, &config)?;
+/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
+/// assert!(!props.is_utf8());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+pub fn parse_many_with<P: AsRef<str>>(
+ patterns: &[P],
+ config: &Config,
+) -> Result<Vec<Hir>, Error> {
+ let mut builder = ParserBuilder::new();
+ config.apply(&mut builder);
+ let mut hirs = vec![];
+ for p in patterns.iter() {
+ hirs.push(builder.build().parse(p.as_ref())?);
+ }
+ Ok(hirs)
+}
/// A common set of configuration options that apply to the syntax of a regex.
///
@@ -14,10 +142,12 @@ use regex_syntax::ParserBuilder;
/// in this crate. Instead of re-defining them on every engine's builder, they
/// are instead provided here as one cohesive unit.
#[derive(Clone, Copy, Debug)]
-pub struct SyntaxConfig {
+pub struct Config {
case_insensitive: bool,
multi_line: bool,
dot_matches_new_line: bool,
+ crlf: bool,
+ line_terminator: u8,
swap_greed: bool,
ignore_whitespace: bool,
unicode: bool,
@@ -26,14 +156,16 @@ pub struct SyntaxConfig {
octal: bool,
}
-impl SyntaxConfig {
+impl Config {
/// Return a new default syntax configuration.
- pub fn new() -> SyntaxConfig {
+ pub fn new() -> Config {
// These defaults match the ones used in regex-syntax.
- SyntaxConfig {
+ Config {
case_insensitive: false,
multi_line: false,
dot_matches_new_line: false,
+ crlf: false,
+ line_terminator: b'\n',
swap_greed: false,
ignore_whitespace: false,
unicode: true,
@@ -51,7 +183,7 @@ impl SyntaxConfig {
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `i` flag.
- pub fn case_insensitive(mut self, yes: bool) -> SyntaxConfig {
+ pub fn case_insensitive(mut self, yes: bool) -> Config {
self.case_insensitive = yes;
self
}
@@ -66,7 +198,7 @@ impl SyntaxConfig {
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `m` flag.
- pub fn multi_line(mut self, yes: bool) -> SyntaxConfig {
+ pub fn multi_line(mut self, yes: bool) -> Config {
self.multi_line = yes;
self
}
@@ -77,7 +209,7 @@ impl SyntaxConfig {
/// then `.` will match any character except for a new line character.
///
/// Note that `.` is impacted by whether the "unicode" setting is enabled
- /// or not. When Unicode is enabled (the defualt), `.` will match any UTF-8
+ /// or not. When Unicode is enabled (the default), `.` will match any UTF-8
/// encoding of any Unicode scalar value (sans a new line, depending on
/// whether this "dot matches new line" option is enabled). When Unicode
/// mode is disabled, `.` will match any byte instead. Because of this,
@@ -87,11 +219,53 @@ impl SyntaxConfig {
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `s` flag.
- pub fn dot_matches_new_line(mut self, yes: bool) -> SyntaxConfig {
+ pub fn dot_matches_new_line(mut self, yes: bool) -> Config {
self.dot_matches_new_line = yes;
self
}
+ /// Enable or disable the "CRLF mode" flag by default.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `R` flag.
+ ///
+ /// When CRLF mode is enabled, the following happens:
+ ///
+ /// * Unless `dot_matches_new_line` is enabled, `.` will match any character
+ /// except for `\r` and `\n`.
+ /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,
+ /// `\r` and `\n` as line terminators. And in particular, neither will
+ /// match between a `\r` and a `\n`.
+ pub fn crlf(mut self, yes: bool) -> Config {
+ self.crlf = yes;
+ self
+ }
+
+ /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
+ ///
+ /// Namely, instead of `.` (by default) matching everything except for `\n`,
+ /// this will cause `.` to match everything except for the byte given.
+ ///
+    /// If `.` is used in a context where Unicode mode is enabled and this byte
+    /// isn't ASCII, then an error will be returned. When Unicode mode is
+    /// disabled, any byte is permitted, but parsing will return an error if
+    /// UTF-8 mode is enabled and the byte is non-ASCII.
+ ///
+ /// In short, any ASCII value for a line terminator is always okay. But a
+ /// non-ASCII byte might result in an error depending on whether Unicode
+ /// mode or UTF-8 mode are enabled.
+ ///
+ /// Note that if `R` mode is enabled then it always takes precedence and
+ /// the line terminator will be treated as `\r` and `\n` simultaneously.
+ ///
+ /// Note also that this *doesn't* impact the look-around assertions
+ /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
+ /// configuration in the regex engine itself.
+ pub fn line_terminator(mut self, byte: u8) -> Config {
+ self.line_terminator = byte;
+ self
+ }
+
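The behavior described above is easy to probe through `parse_with`. A hedged sketch using only items introduced in this diff (it assumes the crate is built with its `syntax` feature enabled; the NUL terminator is an arbitrary illustrative choice, safe because any ASCII byte is accepted in every mode):

```rust
use regex_automata::util::syntax;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // With the `s` flag off, `.` matches everything except the configured
    // line terminator; here the default `\n` is swapped for NUL, so `.`
    // now matches `\n` but not `\x00`.
    let config = syntax::Config::new().line_terminator(b'\x00');
    let hir = syntax::parse_with(r".", &config)?;
    println!("{:?}", hir);
    Ok(())
}
```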
/// Enable or disable the "swap greed" flag by default.
///
/// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
@@ -99,7 +273,7 @@ impl SyntaxConfig {
///
/// By default this is disabled. It may alternatively be selectively
/// enabled in the regular expression itself via the `U` flag.
- pub fn swap_greed(mut self, yes: bool) -> SyntaxConfig {
+ pub fn swap_greed(mut self, yes: bool) -> Config {
self.swap_greed = yes;
self
}
@@ -112,7 +286,7 @@ impl SyntaxConfig {
///
/// By default, this is disabled. It may be selectively enabled in the
/// regular expression by using the `x` flag regardless of this setting.
- pub fn ignore_whitespace(mut self, yes: bool) -> SyntaxConfig {
+ pub fn ignore_whitespace(mut self, yes: bool) -> Config {
self.ignore_whitespace = yes;
self
}
@@ -131,7 +305,7 @@ impl SyntaxConfig {
/// time. This is especially noticeable if your regex contains character
/// classes like `\w` that are impacted by whether Unicode is enabled or
/// not. If Unicode is not necessary, you are encouraged to disable it.
- pub fn unicode(mut self, yes: bool) -> SyntaxConfig {
+ pub fn unicode(mut self, yes: bool) -> Config {
self.unicode = yes;
self
}
@@ -139,7 +313,7 @@ impl SyntaxConfig {
/// When disabled, the builder will permit the construction of a regular
/// expression that may match invalid UTF-8.
///
- /// For example, when [`SyntaxConfig::unicode`] is disabled, then
+ /// For example, when [`Config::unicode`] is disabled, then
/// expressions like `[^a]` may match invalid UTF-8 since they can match
/// any single byte that is not `a`. By default, these sub-expressions
/// are disallowed to avoid returning offsets that split a UTF-8
@@ -150,7 +324,7 @@ impl SyntaxConfig {
/// When enabled (the default), the builder is guaranteed to produce a
/// regex that will only ever match valid UTF-8 (otherwise, the builder
/// will return an error).
- pub fn utf8(mut self, yes: bool) -> SyntaxConfig {
+ pub fn utf8(mut self, yes: bool) -> Config {
self.utf8 = yes;
self
}
@@ -171,7 +345,7 @@ impl SyntaxConfig {
/// if callers want to put a limit on the amount of heap space used, then
/// they should impose a limit on the length, in bytes, of the concrete
/// pattern string. In particular, this is viable since the parser will
- /// limit itself to heap space proportional to the lenth of the pattern
+ /// limit itself to heap space proportional to the length of the pattern
/// string.
///
/// Note that a nest limit of `0` will return a nest limit error for most
@@ -180,7 +354,7 @@ impl SyntaxConfig {
/// in a nest depth of `1`. In general, a nest limit is not something that
/// manifests in an obvious way in the concrete syntax, therefore, it
/// should not be used in a granular way.
- pub fn nest_limit(mut self, limit: u32) -> SyntaxConfig {
+ pub fn nest_limit(mut self, limit: u32) -> Config {
self.nest_limit = limit;
self
}
@@ -200,7 +374,7 @@ impl SyntaxConfig {
/// message will explicitly mention that backreferences aren't supported.
///
/// Octal syntax is disabled by default.
- pub fn octal(mut self, yes: bool) -> SyntaxConfig {
+ pub fn octal(mut self, yes: bool) -> Config {
self.octal = yes;
self
}
@@ -225,6 +399,16 @@ impl SyntaxConfig {
self.dot_matches_new_line
}
+ /// Returns whether "CRLF" mode is enabled.
+ pub fn get_crlf(&self) -> bool {
+ self.crlf
+ }
+
+ /// Returns the line terminator in this syntax configuration.
+ pub fn get_line_terminator(&self) -> u8 {
+ self.line_terminator
+ }
+
/// Returns whether "swap greed" mode is enabled.
pub fn get_swap_greed(&self) -> bool {
self.swap_greed
@@ -257,16 +441,42 @@ impl SyntaxConfig {
.case_insensitive(self.case_insensitive)
.multi_line(self.multi_line)
.dot_matches_new_line(self.dot_matches_new_line)
+ .crlf(self.crlf)
+ .line_terminator(self.line_terminator)
.swap_greed(self.swap_greed)
.ignore_whitespace(self.ignore_whitespace)
- .allow_invalid_utf8(!self.utf8)
+ .utf8(self.utf8)
+ .nest_limit(self.nest_limit)
+ .octal(self.octal);
+ }
+
+ /// Applies this configuration to the given AST parser.
+ pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) {
+ builder
+ .ignore_whitespace(self.ignore_whitespace)
.nest_limit(self.nest_limit)
.octal(self.octal);
}
+
+ /// Applies this configuration to the given AST-to-HIR translator.
+ pub(crate) fn apply_hir(
+ &self,
+ builder: &mut hir::translate::TranslatorBuilder,
+ ) {
+ builder
+ .unicode(self.unicode)
+ .case_insensitive(self.case_insensitive)
+ .multi_line(self.multi_line)
+ .crlf(self.crlf)
+ .dot_matches_new_line(self.dot_matches_new_line)
+ .line_terminator(self.line_terminator)
+ .swap_greed(self.swap_greed)
+ .utf8(self.utf8);
+ }
}
-impl Default for SyntaxConfig {
- fn default() -> SyntaxConfig {
- SyntaxConfig::new()
+impl Default for Config {
+ fn default() -> Config {
+ Config::new()
}
}
diff --git a/vendor/regex-automata/src/util/unicode_data/mod.rs b/vendor/regex-automata/src/util/unicode_data/mod.rs
new file mode 100644
index 000000000..fc7b1c738
--- /dev/null
+++ b/vendor/regex-automata/src/util/unicode_data/mod.rs
@@ -0,0 +1,17 @@
+// This cfg should match the one in src/util/look.rs that uses perl_word.
+#[cfg(all(
+ // We have to explicitly want to support Unicode word boundaries.
+ feature = "unicode-word-boundary",
+ not(all(
+ // If we don't have regex-syntax at all, then we definitely need to
+ // bring our own \w data table.
+ feature = "syntax",
+ // If unicode-perl is enabled, then regex-syntax/unicode-perl is
+ // also enabled, which in turn means we can use regex-syntax's
+ // is_word_character routine (and thus use its data tables). But if
+ // unicode-perl is not enabled, even if syntax is, then we need to
+ // bring our own.
+ feature = "unicode-perl",
+ )),
+))]
+pub(crate) mod perl_word;
diff --git a/vendor/regex-automata/src/util/unicode_data/perl_word.rs b/vendor/regex-automata/src/util/unicode_data/perl_word.rs
new file mode 100644
index 000000000..74d62656f
--- /dev/null
+++ b/vendor/regex-automata/src/util/unicode_data/perl_word.rs
@@ -0,0 +1,781 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// ucd-generate perl-word tmp/ucd-15.0.0/ --chars
+//
+// Unicode version: 15.0.0.
+//
+// ucd-generate 0.2.15 is available on crates.io.
+
+pub const PERL_WORD: &'static [(char, char)] = &[
+ ('0', '9'),
+ ('A', 'Z'),
+ ('_', '_'),
+ ('a', 'z'),
+ ('ª', 'ª'),
+ ('µ', 'µ'),
+ ('º', 'º'),
+ ('À', 'Ö'),
+ ('Ø', 'ö'),
+ ('ø', 'ˁ'),
+ ('ˆ', 'ˑ'),
+ ('ˠ', 'ˤ'),
+ ('ˬ', 'ˬ'),
+ ('ˮ', 'ˮ'),
+ ('\u{300}', 'ʹ'),
+ ('Ͷ', 'ͷ'),
+ ('ͺ', 'ͽ'),
+ ('Ϳ', 'Ϳ'),
+ ('Ά', 'Ά'),
+ ('Έ', 'Ί'),
+ ('Ό', 'Ό'),
+ ('Ύ', 'Ρ'),
+ ('Σ', 'ϵ'),
+ ('Ϸ', 'ҁ'),
+ ('\u{483}', 'ԯ'),
+ ('Ա', 'Ֆ'),
+ ('ՙ', 'ՙ'),
+ ('ՠ', 'ֈ'),
+ ('\u{591}', '\u{5bd}'),
+ ('\u{5bf}', '\u{5bf}'),
+ ('\u{5c1}', '\u{5c2}'),
+ ('\u{5c4}', '\u{5c5}'),
+ ('\u{5c7}', '\u{5c7}'),
+ ('א', 'ת'),
+ ('ׯ', 'ײ'),
+ ('\u{610}', '\u{61a}'),
+ ('ؠ', '٩'),
+ ('ٮ', 'ۓ'),
+ ('ە', '\u{6dc}'),
+ ('\u{6df}', '\u{6e8}'),
+ ('\u{6ea}', 'ۼ'),
+ ('ۿ', 'ۿ'),
+ ('ܐ', '\u{74a}'),
+ ('ݍ', 'ޱ'),
+ ('߀', 'ߵ'),
+ ('ߺ', 'ߺ'),
+ ('\u{7fd}', '\u{7fd}'),
+ ('ࠀ', '\u{82d}'),
+ ('ࡀ', '\u{85b}'),
+ ('ࡠ', 'ࡪ'),
+ ('ࡰ', 'ࢇ'),
+ ('ࢉ', 'ࢎ'),
+ ('\u{898}', '\u{8e1}'),
+ ('\u{8e3}', '\u{963}'),
+ ('०', '९'),
+ ('ॱ', 'ঃ'),
+ ('অ', 'ঌ'),
+ ('এ', 'ঐ'),
+ ('ও', 'ন'),
+ ('প', 'র'),
+ ('ল', 'ল'),
+ ('শ', 'হ'),
+ ('\u{9bc}', '\u{9c4}'),
+ ('ে', 'ৈ'),
+ ('ো', 'ৎ'),
+ ('\u{9d7}', '\u{9d7}'),
+ ('ড়', 'ঢ়'),
+ ('য়', '\u{9e3}'),
+ ('০', 'ৱ'),
+ ('ৼ', 'ৼ'),
+ ('\u{9fe}', '\u{9fe}'),
+ ('\u{a01}', 'ਃ'),
+ ('ਅ', 'ਊ'),
+ ('ਏ', 'ਐ'),
+ ('ਓ', 'ਨ'),
+ ('ਪ', 'ਰ'),
+ ('ਲ', 'ਲ਼'),
+ ('ਵ', 'ਸ਼'),
+ ('ਸ', 'ਹ'),
+ ('\u{a3c}', '\u{a3c}'),
+ ('ਾ', '\u{a42}'),
+ ('\u{a47}', '\u{a48}'),
+ ('\u{a4b}', '\u{a4d}'),
+ ('\u{a51}', '\u{a51}'),
+ ('ਖ਼', 'ੜ'),
+ ('ਫ਼', 'ਫ਼'),
+ ('੦', '\u{a75}'),
+ ('\u{a81}', 'ઃ'),
+ ('અ', 'ઍ'),
+ ('એ', 'ઑ'),
+ ('ઓ', 'ન'),
+ ('પ', 'ર'),
+ ('લ', 'ળ'),
+ ('વ', 'હ'),
+ ('\u{abc}', '\u{ac5}'),
+ ('\u{ac7}', 'ૉ'),
+ ('ો', '\u{acd}'),
+ ('ૐ', 'ૐ'),
+ ('ૠ', '\u{ae3}'),
+ ('૦', '૯'),
+ ('ૹ', '\u{aff}'),
+ ('\u{b01}', 'ଃ'),
+ ('ଅ', 'ଌ'),
+ ('ଏ', 'ଐ'),
+ ('ଓ', 'ନ'),
+ ('ପ', 'ର'),
+ ('ଲ', 'ଳ'),
+ ('ଵ', 'ହ'),
+ ('\u{b3c}', '\u{b44}'),
+ ('େ', 'ୈ'),
+ ('ୋ', '\u{b4d}'),
+ ('\u{b55}', '\u{b57}'),
+ ('ଡ଼', 'ଢ଼'),
+ ('ୟ', '\u{b63}'),
+ ('୦', '୯'),
+ ('ୱ', 'ୱ'),
+ ('\u{b82}', 'ஃ'),
+ ('அ', 'ஊ'),
+ ('எ', 'ஐ'),
+ ('ஒ', 'க'),
+ ('ங', 'ச'),
+ ('ஜ', 'ஜ'),
+ ('ஞ', 'ட'),
+ ('ண', 'த'),
+ ('ந', 'ப'),
+ ('ம', 'ஹ'),
+ ('\u{bbe}', 'ூ'),
+ ('ெ', 'ை'),
+ ('ொ', '\u{bcd}'),
+ ('ௐ', 'ௐ'),
+ ('\u{bd7}', '\u{bd7}'),
+ ('௦', '௯'),
+ ('\u{c00}', 'ఌ'),
+ ('ఎ', 'ఐ'),
+ ('ఒ', 'న'),
+ ('ప', 'హ'),
+ ('\u{c3c}', 'ౄ'),
+ ('\u{c46}', '\u{c48}'),
+ ('\u{c4a}', '\u{c4d}'),
+ ('\u{c55}', '\u{c56}'),
+ ('ౘ', 'ౚ'),
+ ('ౝ', 'ౝ'),
+ ('ౠ', '\u{c63}'),
+ ('౦', '౯'),
+ ('ಀ', 'ಃ'),
+ ('ಅ', 'ಌ'),
+ ('ಎ', 'ಐ'),
+ ('ಒ', 'ನ'),
+ ('ಪ', 'ಳ'),
+ ('ವ', 'ಹ'),
+ ('\u{cbc}', 'ೄ'),
+ ('\u{cc6}', 'ೈ'),
+ ('ೊ', '\u{ccd}'),
+ ('\u{cd5}', '\u{cd6}'),
+ ('ೝ', 'ೞ'),
+ ('ೠ', '\u{ce3}'),
+ ('೦', '೯'),
+ ('ೱ', 'ೳ'),
+ ('\u{d00}', 'ഌ'),
+ ('എ', 'ഐ'),
+ ('ഒ', '\u{d44}'),
+ ('െ', 'ൈ'),
+ ('ൊ', 'ൎ'),
+ ('ൔ', '\u{d57}'),
+ ('ൟ', '\u{d63}'),
+ ('൦', '൯'),
+ ('ൺ', 'ൿ'),
+ ('\u{d81}', 'ඃ'),
+ ('අ', 'ඖ'),
+ ('ක', 'න'),
+ ('ඳ', 'ර'),
+ ('ල', 'ල'),
+ ('ව', 'ෆ'),
+ ('\u{dca}', '\u{dca}'),
+ ('\u{dcf}', '\u{dd4}'),
+ ('\u{dd6}', '\u{dd6}'),
+ ('ෘ', '\u{ddf}'),
+ ('෦', '෯'),
+ ('ෲ', 'ෳ'),
+ ('ก', '\u{e3a}'),
+ ('เ', '\u{e4e}'),
+ ('๐', '๙'),
+ ('ກ', 'ຂ'),
+ ('ຄ', 'ຄ'),
+ ('ຆ', 'ຊ'),
+ ('ຌ', 'ຣ'),
+ ('ລ', 'ລ'),
+ ('ວ', 'ຽ'),
+ ('ເ', 'ໄ'),
+ ('ໆ', 'ໆ'),
+ ('\u{ec8}', '\u{ece}'),
+ ('໐', '໙'),
+ ('ໜ', 'ໟ'),
+ ('ༀ', 'ༀ'),
+ ('\u{f18}', '\u{f19}'),
+ ('༠', '༩'),
+ ('\u{f35}', '\u{f35}'),
+ ('\u{f37}', '\u{f37}'),
+ ('\u{f39}', '\u{f39}'),
+ ('༾', 'ཇ'),
+ ('ཉ', 'ཬ'),
+ ('\u{f71}', '\u{f84}'),
+ ('\u{f86}', '\u{f97}'),
+ ('\u{f99}', '\u{fbc}'),
+ ('\u{fc6}', '\u{fc6}'),
+ ('က', '၉'),
+ ('ၐ', '\u{109d}'),
+ ('Ⴀ', 'Ⴥ'),
+ ('Ⴧ', 'Ⴧ'),
+ ('Ⴭ', 'Ⴭ'),
+ ('ა', 'ჺ'),
+ ('ჼ', 'ቈ'),
+ ('ቊ', 'ቍ'),
+ ('ቐ', 'ቖ'),
+ ('ቘ', 'ቘ'),
+ ('ቚ', 'ቝ'),
+ ('በ', 'ኈ'),
+ ('ኊ', 'ኍ'),
+ ('ነ', 'ኰ'),
+ ('ኲ', 'ኵ'),
+ ('ኸ', 'ኾ'),
+ ('ዀ', 'ዀ'),
+ ('ዂ', 'ዅ'),
+ ('ወ', 'ዖ'),
+ ('ዘ', 'ጐ'),
+ ('ጒ', 'ጕ'),
+ ('ጘ', 'ፚ'),
+ ('\u{135d}', '\u{135f}'),
+ ('ᎀ', 'ᎏ'),
+ ('Ꭰ', 'Ᏽ'),
+ ('ᏸ', 'ᏽ'),
+ ('ᐁ', 'ᙬ'),
+ ('ᙯ', 'ᙿ'),
+ ('ᚁ', 'ᚚ'),
+ ('ᚠ', 'ᛪ'),
+ ('ᛮ', 'ᛸ'),
+ ('ᜀ', '᜕'),
+ ('ᜟ', '᜴'),
+ ('ᝀ', '\u{1753}'),
+ ('ᝠ', 'ᝬ'),
+ ('ᝮ', 'ᝰ'),
+ ('\u{1772}', '\u{1773}'),
+ ('ក', '\u{17d3}'),
+ ('ៗ', 'ៗ'),
+ ('ៜ', '\u{17dd}'),
+ ('០', '៩'),
+ ('\u{180b}', '\u{180d}'),
+ ('\u{180f}', '᠙'),
+ ('ᠠ', 'ᡸ'),
+ ('ᢀ', 'ᢪ'),
+ ('ᢰ', 'ᣵ'),
+ ('ᤀ', 'ᤞ'),
+ ('\u{1920}', 'ᤫ'),
+ ('ᤰ', '\u{193b}'),
+ ('᥆', 'ᥭ'),
+ ('ᥰ', 'ᥴ'),
+ ('ᦀ', 'ᦫ'),
+ ('ᦰ', 'ᧉ'),
+ ('᧐', '᧙'),
+ ('ᨀ', '\u{1a1b}'),
+ ('ᨠ', '\u{1a5e}'),
+ ('\u{1a60}', '\u{1a7c}'),
+ ('\u{1a7f}', '᪉'),
+ ('᪐', '᪙'),
+ ('ᪧ', 'ᪧ'),
+ ('\u{1ab0}', '\u{1ace}'),
+ ('\u{1b00}', 'ᭌ'),
+ ('᭐', '᭙'),
+ ('\u{1b6b}', '\u{1b73}'),
+ ('\u{1b80}', '᯳'),
+ ('ᰀ', '\u{1c37}'),
+ ('᱀', '᱉'),
+ ('ᱍ', 'ᱽ'),
+ ('ᲀ', 'ᲈ'),
+ ('Ა', 'Ჺ'),
+ ('Ჽ', 'Ჿ'),
+ ('\u{1cd0}', '\u{1cd2}'),
+ ('\u{1cd4}', 'ᳺ'),
+ ('ᴀ', 'ἕ'),
+ ('Ἐ', 'Ἕ'),
+ ('ἠ', 'ὅ'),
+ ('Ὀ', 'Ὅ'),
+ ('ὐ', 'ὗ'),
+ ('Ὑ', 'Ὑ'),
+ ('Ὓ', 'Ὓ'),
+ ('Ὕ', 'Ὕ'),
+ ('Ὗ', 'ώ'),
+ ('ᾀ', 'ᾴ'),
+ ('ᾶ', 'ᾼ'),
+ ('ι', 'ι'),
+ ('ῂ', 'ῄ'),
+ ('ῆ', 'ῌ'),
+ ('ῐ', 'ΐ'),
+ ('ῖ', 'Ί'),
+ ('ῠ', 'Ῥ'),
+ ('ῲ', 'ῴ'),
+ ('ῶ', 'ῼ'),
+ ('\u{200c}', '\u{200d}'),
+ ('‿', '⁀'),
+ ('⁔', '⁔'),
+ ('ⁱ', 'ⁱ'),
+ ('ⁿ', 'ⁿ'),
+ ('ₐ', 'ₜ'),
+ ('\u{20d0}', '\u{20f0}'),
+ ('ℂ', 'ℂ'),
+ ('ℇ', 'ℇ'),
+ ('ℊ', 'ℓ'),
+ ('ℕ', 'ℕ'),
+ ('ℙ', 'ℝ'),
+ ('ℤ', 'ℤ'),
+ ('Ω', 'Ω'),
+ ('ℨ', 'ℨ'),
+ ('K', 'ℭ'),
+ ('ℯ', 'ℹ'),
+ ('ℼ', 'ℿ'),
+ ('ⅅ', 'ⅉ'),
+ ('ⅎ', 'ⅎ'),
+ ('Ⅰ', 'ↈ'),
+ ('Ⓐ', 'ⓩ'),
+ ('Ⰰ', 'ⳤ'),
+ ('Ⳬ', 'ⳳ'),
+ ('ⴀ', 'ⴥ'),
+ ('ⴧ', 'ⴧ'),
+ ('ⴭ', 'ⴭ'),
+ ('ⴰ', 'ⵧ'),
+ ('ⵯ', 'ⵯ'),
+ ('\u{2d7f}', 'ⶖ'),
+ ('ⶠ', 'ⶦ'),
+ ('ⶨ', 'ⶮ'),
+ ('ⶰ', 'ⶶ'),
+ ('ⶸ', 'ⶾ'),
+ ('ⷀ', 'ⷆ'),
+ ('ⷈ', 'ⷎ'),
+ ('ⷐ', 'ⷖ'),
+ ('ⷘ', 'ⷞ'),
+ ('\u{2de0}', '\u{2dff}'),
+ ('ⸯ', 'ⸯ'),
+ ('々', '〇'),
+ ('〡', '\u{302f}'),
+ ('〱', '〵'),
+ ('〸', '〼'),
+ ('ぁ', 'ゖ'),
+ ('\u{3099}', '\u{309a}'),
+ ('ゝ', 'ゟ'),
+ ('ァ', 'ヺ'),
+ ('ー', 'ヿ'),
+ ('ㄅ', 'ㄯ'),
+ ('ㄱ', 'ㆎ'),
+ ('ㆠ', 'ㆿ'),
+ ('ㇰ', 'ㇿ'),
+ ('㐀', '䶿'),
+ ('一', 'ꒌ'),
+ ('ꓐ', 'ꓽ'),
+ ('ꔀ', 'ꘌ'),
+ ('ꘐ', 'ꘫ'),
+ ('Ꙁ', '\u{a672}'),
+ ('\u{a674}', '\u{a67d}'),
+ ('ꙿ', '\u{a6f1}'),
+ ('ꜗ', 'ꜟ'),
+ ('Ꜣ', 'ꞈ'),
+ ('Ꞌ', 'ꟊ'),
+ ('Ꟑ', 'ꟑ'),
+ ('ꟓ', 'ꟓ'),
+ ('ꟕ', 'ꟙ'),
+ ('ꟲ', 'ꠧ'),
+ ('\u{a82c}', '\u{a82c}'),
+ ('ꡀ', 'ꡳ'),
+ ('ꢀ', '\u{a8c5}'),
+ ('꣐', '꣙'),
+ ('\u{a8e0}', 'ꣷ'),
+ ('ꣻ', 'ꣻ'),
+ ('ꣽ', '\u{a92d}'),
+ ('ꤰ', '꥓'),
+ ('ꥠ', 'ꥼ'),
+ ('\u{a980}', '꧀'),
+ ('ꧏ', '꧙'),
+ ('ꧠ', 'ꧾ'),
+ ('ꨀ', '\u{aa36}'),
+ ('ꩀ', 'ꩍ'),
+ ('꩐', '꩙'),
+ ('ꩠ', 'ꩶ'),
+ ('ꩺ', 'ꫂ'),
+ ('ꫛ', 'ꫝ'),
+ ('ꫠ', 'ꫯ'),
+ ('ꫲ', '\u{aaf6}'),
+ ('ꬁ', 'ꬆ'),
+ ('ꬉ', 'ꬎ'),
+ ('ꬑ', 'ꬖ'),
+ ('ꬠ', 'ꬦ'),
+ ('ꬨ', 'ꬮ'),
+ ('ꬰ', 'ꭚ'),
+ ('ꭜ', 'ꭩ'),
+ ('ꭰ', 'ꯪ'),
+ ('꯬', '\u{abed}'),
+ ('꯰', '꯹'),
+ ('가', '힣'),
+ ('ힰ', 'ퟆ'),
+ ('ퟋ', 'ퟻ'),
+ ('豈', '舘'),
+ ('並', '龎'),
+ ('ff', 'st'),
+ ('ﬓ', 'ﬗ'),
+ ('יִ', 'ﬨ'),
+ ('שׁ', 'זּ'),
+ ('טּ', 'לּ'),
+ ('מּ', 'מּ'),
+ ('נּ', 'סּ'),
+ ('ףּ', 'פּ'),
+ ('צּ', 'ﮱ'),
+ ('ﯓ', 'ﴽ'),
+ ('ﵐ', 'ﶏ'),
+ ('ﶒ', 'ﷇ'),
+ ('ﷰ', 'ﷻ'),
+ ('\u{fe00}', '\u{fe0f}'),
+ ('\u{fe20}', '\u{fe2f}'),
+ ('︳', '︴'),
+ ('﹍', '﹏'),
+ ('ﹰ', 'ﹴ'),
+ ('ﹶ', 'ﻼ'),
+ ('0', '9'),
+ ('A', 'Z'),
+ ('_', '_'),
+ ('a', 'z'),
+ ('ヲ', 'ᄒ'),
+ ('ᅡ', 'ᅦ'),
+ ('ᅧ', 'ᅬ'),
+ ('ᅭ', 'ᅲ'),
+ ('ᅳ', 'ᅵ'),
+ ('𐀀', '𐀋'),
+ ('𐀍', '𐀦'),
+ ('𐀨', '𐀺'),
+ ('𐀼', '𐀽'),
+ ('𐀿', '𐁍'),
+ ('𐁐', '𐁝'),
+ ('𐂀', '𐃺'),
+ ('𐅀', '𐅴'),
+ ('\u{101fd}', '\u{101fd}'),
+ ('𐊀', '𐊜'),
+ ('𐊠', '𐋐'),
+ ('\u{102e0}', '\u{102e0}'),
+ ('𐌀', '𐌟'),
+ ('𐌭', '𐍊'),
+ ('𐍐', '\u{1037a}'),
+ ('𐎀', '𐎝'),
+ ('𐎠', '𐏃'),
+ ('𐏈', '𐏏'),
+ ('𐏑', '𐏕'),
+ ('𐐀', '𐒝'),
+ ('𐒠', '𐒩'),
+ ('𐒰', '𐓓'),
+ ('𐓘', '𐓻'),
+ ('𐔀', '𐔧'),
+ ('𐔰', '𐕣'),
+ ('𐕰', '𐕺'),
+ ('𐕼', '𐖊'),
+ ('𐖌', '𐖒'),
+ ('𐖔', '𐖕'),
+ ('𐖗', '𐖡'),
+ ('𐖣', '𐖱'),
+ ('𐖳', '𐖹'),
+ ('𐖻', '𐖼'),
+ ('𐘀', '𐜶'),
+ ('𐝀', '𐝕'),
+ ('𐝠', '𐝧'),
+ ('𐞀', '𐞅'),
+ ('𐞇', '𐞰'),
+ ('𐞲', '𐞺'),
+ ('𐠀', '𐠅'),
+ ('𐠈', '𐠈'),
+ ('𐠊', '𐠵'),
+ ('𐠷', '𐠸'),
+ ('𐠼', '𐠼'),
+ ('𐠿', '𐡕'),
+ ('𐡠', '𐡶'),
+ ('𐢀', '𐢞'),
+ ('𐣠', '𐣲'),
+ ('𐣴', '𐣵'),
+ ('𐤀', '𐤕'),
+ ('𐤠', '𐤹'),
+ ('𐦀', '𐦷'),
+ ('𐦾', '𐦿'),
+ ('𐨀', '\u{10a03}'),
+ ('\u{10a05}', '\u{10a06}'),
+ ('\u{10a0c}', '𐨓'),
+ ('𐨕', '𐨗'),
+ ('𐨙', '𐨵'),
+ ('\u{10a38}', '\u{10a3a}'),
+ ('\u{10a3f}', '\u{10a3f}'),
+ ('𐩠', '𐩼'),
+ ('𐪀', '𐪜'),
+ ('𐫀', '𐫇'),
+ ('𐫉', '\u{10ae6}'),
+ ('𐬀', '𐬵'),
+ ('𐭀', '𐭕'),
+ ('𐭠', '𐭲'),
+ ('𐮀', '𐮑'),
+ ('𐰀', '𐱈'),
+ ('𐲀', '𐲲'),
+ ('𐳀', '𐳲'),
+ ('𐴀', '\u{10d27}'),
+ ('𐴰', '𐴹'),
+ ('𐺀', '𐺩'),
+ ('\u{10eab}', '\u{10eac}'),
+ ('𐺰', '𐺱'),
+ ('\u{10efd}', '𐼜'),
+ ('𐼧', '𐼧'),
+ ('𐼰', '\u{10f50}'),
+ ('𐽰', '\u{10f85}'),
+ ('𐾰', '𐿄'),
+ ('𐿠', '𐿶'),
+ ('𑀀', '\u{11046}'),
+ ('𑁦', '𑁵'),
+ ('\u{1107f}', '\u{110ba}'),
+ ('\u{110c2}', '\u{110c2}'),
+ ('𑃐', '𑃨'),
+ ('𑃰', '𑃹'),
+ ('\u{11100}', '\u{11134}'),
+ ('𑄶', '𑄿'),
+ ('𑅄', '𑅇'),
+ ('𑅐', '\u{11173}'),
+ ('𑅶', '𑅶'),
+ ('\u{11180}', '𑇄'),
+ ('\u{111c9}', '\u{111cc}'),
+ ('𑇎', '𑇚'),
+ ('𑇜', '𑇜'),
+ ('𑈀', '𑈑'),
+ ('𑈓', '\u{11237}'),
+ ('\u{1123e}', '\u{11241}'),
+ ('𑊀', '𑊆'),
+ ('𑊈', '𑊈'),
+ ('𑊊', '𑊍'),
+ ('𑊏', '𑊝'),
+ ('𑊟', '𑊨'),
+ ('𑊰', '\u{112ea}'),
+ ('𑋰', '𑋹'),
+ ('\u{11300}', '𑌃'),
+ ('𑌅', '𑌌'),
+ ('𑌏', '𑌐'),
+ ('𑌓', '𑌨'),
+ ('𑌪', '𑌰'),
+ ('𑌲', '𑌳'),
+ ('𑌵', '𑌹'),
+ ('\u{1133b}', '𑍄'),
+ ('𑍇', '𑍈'),
+ ('𑍋', '𑍍'),
+ ('𑍐', '𑍐'),
+ ('\u{11357}', '\u{11357}'),
+ ('𑍝', '𑍣'),
+ ('\u{11366}', '\u{1136c}'),
+ ('\u{11370}', '\u{11374}'),
+ ('𑐀', '𑑊'),
+ ('𑑐', '𑑙'),
+ ('\u{1145e}', '𑑡'),
+ ('𑒀', '𑓅'),
+ ('𑓇', '𑓇'),
+ ('𑓐', '𑓙'),
+ ('𑖀', '\u{115b5}'),
+ ('𑖸', '\u{115c0}'),
+ ('𑗘', '\u{115dd}'),
+ ('𑘀', '\u{11640}'),
+ ('𑙄', '𑙄'),
+ ('𑙐', '𑙙'),
+ ('𑚀', '𑚸'),
+ ('𑛀', '𑛉'),
+ ('𑜀', '𑜚'),
+ ('\u{1171d}', '\u{1172b}'),
+ ('𑜰', '𑜹'),
+ ('𑝀', '𑝆'),
+ ('𑠀', '\u{1183a}'),
+ ('𑢠', '𑣩'),
+ ('𑣿', '𑤆'),
+ ('𑤉', '𑤉'),
+ ('𑤌', '𑤓'),
+ ('𑤕', '𑤖'),
+ ('𑤘', '𑤵'),
+ ('𑤷', '𑤸'),
+ ('\u{1193b}', '\u{11943}'),
+ ('𑥐', '𑥙'),
+ ('𑦠', '𑦧'),
+ ('𑦪', '\u{119d7}'),
+ ('\u{119da}', '𑧡'),
+ ('𑧣', '𑧤'),
+ ('𑨀', '\u{11a3e}'),
+ ('\u{11a47}', '\u{11a47}'),
+ ('𑩐', '\u{11a99}'),
+ ('𑪝', '𑪝'),
+ ('𑪰', '𑫸'),
+ ('𑰀', '𑰈'),
+ ('𑰊', '\u{11c36}'),
+ ('\u{11c38}', '𑱀'),
+ ('𑱐', '𑱙'),
+ ('𑱲', '𑲏'),
+ ('\u{11c92}', '\u{11ca7}'),
+ ('𑲩', '\u{11cb6}'),
+ ('𑴀', '𑴆'),
+ ('𑴈', '𑴉'),
+ ('𑴋', '\u{11d36}'),
+ ('\u{11d3a}', '\u{11d3a}'),
+ ('\u{11d3c}', '\u{11d3d}'),
+ ('\u{11d3f}', '\u{11d47}'),
+ ('𑵐', '𑵙'),
+ ('𑵠', '𑵥'),
+ ('𑵧', '𑵨'),
+ ('𑵪', '𑶎'),
+ ('\u{11d90}', '\u{11d91}'),
+ ('𑶓', '𑶘'),
+ ('𑶠', '𑶩'),
+ ('𑻠', '𑻶'),
+ ('\u{11f00}', '𑼐'),
+ ('𑼒', '\u{11f3a}'),
+ ('𑼾', '\u{11f42}'),
+ ('𑽐', '𑽙'),
+ ('𑾰', '𑾰'),
+ ('𒀀', '𒎙'),
+ ('𒐀', '𒑮'),
+ ('𒒀', '𒕃'),
+ ('𒾐', '𒿰'),
+ ('𓀀', '𓐯'),
+ ('\u{13440}', '\u{13455}'),
+ ('𔐀', '𔙆'),
+ ('𖠀', '𖨸'),
+ ('𖩀', '𖩞'),
+ ('𖩠', '𖩩'),
+ ('𖩰', '𖪾'),
+ ('𖫀', '𖫉'),
+ ('𖫐', '𖫭'),
+ ('\u{16af0}', '\u{16af4}'),
+ ('𖬀', '\u{16b36}'),
+ ('𖭀', '𖭃'),
+ ('𖭐', '𖭙'),
+ ('𖭣', '𖭷'),
+ ('𖭽', '𖮏'),
+ ('𖹀', '𖹿'),
+ ('𖼀', '𖽊'),
+ ('\u{16f4f}', '𖾇'),
+ ('\u{16f8f}', '𖾟'),
+ ('𖿠', '𖿡'),
+ ('𖿣', '\u{16fe4}'),
+ ('𖿰', '𖿱'),
+ ('𗀀', '𘟷'),
+ ('𘠀', '𘳕'),
+ ('𘴀', '𘴈'),
+ ('𚿰', '𚿳'),
+ ('𚿵', '𚿻'),
+ ('𚿽', '𚿾'),
+ ('𛀀', '𛄢'),
+ ('𛄲', '𛄲'),
+ ('𛅐', '𛅒'),
+ ('𛅕', '𛅕'),
+ ('𛅤', '𛅧'),
+ ('𛅰', '𛋻'),
+ ('𛰀', '𛱪'),
+ ('𛱰', '𛱼'),
+ ('𛲀', '𛲈'),
+ ('𛲐', '𛲙'),
+ ('\u{1bc9d}', '\u{1bc9e}'),
+ ('\u{1cf00}', '\u{1cf2d}'),
+ ('\u{1cf30}', '\u{1cf46}'),
+ ('\u{1d165}', '\u{1d169}'),
+ ('𝅭', '\u{1d172}'),
+ ('\u{1d17b}', '\u{1d182}'),
+ ('\u{1d185}', '\u{1d18b}'),
+ ('\u{1d1aa}', '\u{1d1ad}'),
+ ('\u{1d242}', '\u{1d244}'),
+ ('𝐀', '𝑔'),
+ ('𝑖', '𝒜'),
+ ('𝒞', '𝒟'),
+ ('𝒢', '𝒢'),
+ ('𝒥', '𝒦'),
+ ('𝒩', '𝒬'),
+ ('𝒮', '𝒹'),
+ ('𝒻', '𝒻'),
+ ('𝒽', '𝓃'),
+ ('𝓅', '𝔅'),
+ ('𝔇', '𝔊'),
+ ('𝔍', '𝔔'),
+ ('𝔖', '𝔜'),
+ ('𝔞', '𝔹'),
+ ('𝔻', '𝔾'),
+ ('𝕀', '𝕄'),
+ ('𝕆', '𝕆'),
+ ('𝕊', '𝕐'),
+ ('𝕒', '𝚥'),
+ ('𝚨', '𝛀'),
+ ('𝛂', '𝛚'),
+ ('𝛜', '𝛺'),
+ ('𝛼', '𝜔'),
+ ('𝜖', '𝜴'),
+ ('𝜶', '𝝎'),
+ ('𝝐', '𝝮'),
+ ('𝝰', '𝞈'),
+ ('𝞊', '𝞨'),
+ ('𝞪', '𝟂'),
+ ('𝟄', '𝟋'),
+ ('𝟎', '𝟿'),
+ ('\u{1da00}', '\u{1da36}'),
+ ('\u{1da3b}', '\u{1da6c}'),
+ ('\u{1da75}', '\u{1da75}'),
+ ('\u{1da84}', '\u{1da84}'),
+ ('\u{1da9b}', '\u{1da9f}'),
+ ('\u{1daa1}', '\u{1daaf}'),
+ ('𝼀', '𝼞'),
+ ('𝼥', '𝼪'),
+ ('\u{1e000}', '\u{1e006}'),
+ ('\u{1e008}', '\u{1e018}'),
+ ('\u{1e01b}', '\u{1e021}'),
+ ('\u{1e023}', '\u{1e024}'),
+ ('\u{1e026}', '\u{1e02a}'),
+ ('𞀰', '𞁭'),
+ ('\u{1e08f}', '\u{1e08f}'),
+ ('𞄀', '𞄬'),
+ ('\u{1e130}', '𞄽'),
+ ('𞅀', '𞅉'),
+ ('𞅎', '𞅎'),
+ ('𞊐', '\u{1e2ae}'),
+ ('𞋀', '𞋹'),
+ ('𞓐', '𞓹'),
+ ('𞟠', '𞟦'),
+ ('𞟨', '𞟫'),
+ ('𞟭', '𞟮'),
+ ('𞟰', '𞟾'),
+ ('𞠀', '𞣄'),
+ ('\u{1e8d0}', '\u{1e8d6}'),
+ ('𞤀', '𞥋'),
+ ('𞥐', '𞥙'),
+ ('𞸀', '𞸃'),
+ ('𞸅', '𞸟'),
+ ('𞸡', '𞸢'),
+ ('𞸤', '𞸤'),
+ ('𞸧', '𞸧'),
+ ('𞸩', '𞸲'),
+ ('𞸴', '𞸷'),
+ ('𞸹', '𞸹'),
+ ('𞸻', '𞸻'),
+ ('𞹂', '𞹂'),
+ ('𞹇', '𞹇'),
+ ('𞹉', '𞹉'),
+ ('𞹋', '𞹋'),
+ ('𞹍', '𞹏'),
+ ('𞹑', '𞹒'),
+ ('𞹔', '𞹔'),
+ ('𞹗', '𞹗'),
+ ('𞹙', '𞹙'),
+ ('𞹛', '𞹛'),
+ ('𞹝', '𞹝'),
+ ('𞹟', '𞹟'),
+ ('𞹡', '𞹢'),
+ ('𞹤', '𞹤'),
+ ('𞹧', '𞹪'),
+ ('𞹬', '𞹲'),
+ ('𞹴', '𞹷'),
+ ('𞹹', '𞹼'),
+ ('𞹾', '𞹾'),
+ ('𞺀', '𞺉'),
+ ('𞺋', '𞺛'),
+ ('𞺡', '𞺣'),
+ ('𞺥', '𞺩'),
+ ('𞺫', '𞺻'),
+ ('🄰', '🅉'),
+ ('🅐', '🅩'),
+ ('🅰', '🆉'),
+ ('🯰', '🯹'),
+ ('𠀀', '𪛟'),
+ ('𪜀', '𫜹'),
+ ('𫝀', '𫠝'),
+ ('𫠠', '𬺡'),
+ ('𬺰', '𮯠'),
+ ('丽', '𪘀'),
+ ('𰀀', '𱍊'),
+ ('𱍐', '𲎯'),
+ ('\u{e0100}', '\u{e01ef}'),
+];
diff --git a/vendor/regex-automata/src/util/utf8.rs b/vendor/regex-automata/src/util/utf8.rs
new file mode 100644
index 000000000..91b27efe0
--- /dev/null
+++ b/vendor/regex-automata/src/util/utf8.rs
@@ -0,0 +1,196 @@
+/*!
+Utilities for dealing with UTF-8.
+
+This module provides some UTF-8 related helper routines, including an
+incremental decoder.
+*/
+
+/// Returns true if and only if the given byte is considered a word character.
+/// This only applies to ASCII.
+///
+/// This was copied from regex-syntax so that we can use it to determine the
+/// starting DFA state while searching without depending on regex-syntax. The
+/// definition is never going to change, so there's no maintenance/bit-rot
+/// hazard here.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+pub(crate) fn is_word_byte(b: u8) -> bool {
+ const fn mkwordset() -> [bool; 256] {
+ // FIXME: Use as_usize() once const functions in traits are stable.
+ let mut set = [false; 256];
+ set[b'_' as usize] = true;
+
+ let mut byte = b'0';
+ while byte <= b'9' {
+ set[byte as usize] = true;
+ byte += 1;
+ }
+ byte = b'A';
+ while byte <= b'Z' {
+ set[byte as usize] = true;
+ byte += 1;
+ }
+ byte = b'a';
+ while byte <= b'z' {
+ set[byte as usize] = true;
+ byte += 1;
+ }
+ set
+ }
+ const WORD: [bool; 256] = mkwordset();
+ WORD[b as usize]
+}
+
+/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
+///
+/// If no valid encoding of a codepoint exists at the beginning of the given
+/// byte slice, then the first byte is returned instead.
+///
+/// This returns `None` if and only if `bytes` is empty.
+///
+/// This never panics.
+///
+/// *WARNING*: This is not designed for performance. If you're looking for a
+/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
+/// crate, then please file an issue and discuss your use case.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
+ if bytes.is_empty() {
+ return None;
+ }
+ let len = match len(bytes[0]) {
+ None => return Some(Err(bytes[0])),
+ Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
+ Some(1) => return Some(Ok(char::from(bytes[0]))),
+ Some(len) => len,
+ };
+ match core::str::from_utf8(&bytes[..len]) {
+ Ok(s) => Some(Ok(s.chars().next().unwrap())),
+ Err(_) => Some(Err(bytes[0])),
+ }
+}
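A quick standalone check of the contract just described; the `len` and `decode` bodies below are copied from this diff so the assertions can run outside the crate (both are `pub(crate)` here):

```rust
fn len(byte: u8) -> Option<usize> {
    if byte <= 0x7F {
        return Some(1);
    } else if byte & 0b1100_0000 == 0b1000_0000 {
        return None;
    } else if byte <= 0b1101_1111 {
        Some(2)
    } else if byte <= 0b1110_1111 {
        Some(3)
    } else if byte <= 0b1111_0111 {
        Some(4)
    } else {
        None
    }
}

fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
    if bytes.is_empty() {
        return None;
    }
    let len = match len(bytes[0]) {
        None => return Some(Err(bytes[0])),
        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
        Some(1) => return Some(Ok(char::from(bytes[0]))),
        Some(len) => len,
    };
    match core::str::from_utf8(&bytes[..len]) {
        Ok(s) => Some(Ok(s.chars().next().unwrap())),
        Err(_) => Some(Err(bytes[0])),
    }
}

fn main() {
    assert_eq!(Some(Ok('a')), decode(b"abc"));
    assert_eq!(Some(Ok('☃')), decode("☃".as_bytes())); // E2 98 83
    assert_eq!(Some(Err(0xFF)), decode(&[0xFF, b'a'])); // never a leading byte
    assert_eq!(Some(Err(0xE2)), decode(&[0xE2])); // truncated 3-byte sequence
    assert_eq!(None, decode(&[]));
}
```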
+
+/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
+///
+/// If no valid encoding of a codepoint exists at the end of the given byte
+/// slice, then the last byte is returned instead.
+///
+/// This returns `None` if and only if `bytes` is empty.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> {
+ if bytes.is_empty() {
+ return None;
+ }
+ let mut start = bytes.len() - 1;
+ let limit = bytes.len().saturating_sub(4);
+ while start > limit && !is_leading_or_invalid_byte(bytes[start]) {
+ start -= 1;
+ }
+ match decode(&bytes[start..]) {
+ None => None,
+ Some(Ok(ch)) => Some(Ok(ch)),
+ Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
+ }
+}
+
+/// Given a UTF-8 leading byte, this returns the total number of code units
+/// in the following encoded codepoint.
+///
+/// If the given byte is not a valid UTF-8 leading byte, then this returns
+/// `None`.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn len(byte: u8) -> Option<usize> {
+ if byte <= 0x7F {
+ return Some(1);
+ } else if byte & 0b1100_0000 == 0b1000_0000 {
+ return None;
+ } else if byte <= 0b1101_1111 {
+ Some(2)
+ } else if byte <= 0b1110_1111 {
+ Some(3)
+ } else if byte <= 0b1111_0111 {
+ Some(4)
+ } else {
+ None
+ }
+}
+
+/// Returns true if and only if the given offset in the given bytes falls on a
+/// valid UTF-8 encoded codepoint boundary.
+///
+/// If `bytes` is not valid UTF-8, then the behavior of this routine is
+/// unspecified.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
+ match bytes.get(i) {
+ // The position at the end of the bytes always represents an empty
+ // string, which is a valid boundary. But anything after that doesn't
+        // make much sense to call a valid boundary.
+ None => i == bytes.len(),
+ // Other than ASCII (where the most significant bit is never set),
+ // valid starting bytes always have their most significant two bits
+        // set, whereas continuation bytes never have their second most
+ // significant bit set. Therefore, this only returns true when bytes[i]
+ // corresponds to a byte that begins a valid UTF-8 encoding of a
+ // Unicode scalar value.
+ Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000,
+ }
+}
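Concretely, for the three-byte encoding of `☃` (`E2 98 83`) only offsets 0 and 3 are boundaries. A standalone check reusing the body above:

```rust
fn is_boundary(bytes: &[u8], i: usize) -> bool {
    match bytes.get(i) {
        None => i == bytes.len(),
        Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000,
    }
}

fn main() {
    let snowman = "☃".as_bytes(); // [0xE2, 0x98, 0x83]
    assert!(is_boundary(snowman, 0)); // leading byte
    assert!(!is_boundary(snowman, 1)); // continuation byte
    assert!(!is_boundary(snowman, 2)); // continuation byte
    assert!(is_boundary(snowman, 3)); // end-of-slice counts
    assert!(!is_boundary(snowman, 4)); // past the end does not
}
```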
+
+/// Returns true if and only if the given byte is either a valid leading UTF-8
+/// byte, or is otherwise an invalid byte that can never appear anywhere in a
+/// valid UTF-8 sequence.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn is_leading_or_invalid_byte(b: u8) -> bool {
+ // In the ASCII case, the most significant bit is never set. The leading
+ // byte of a 2/3/4-byte sequence always has the top two most significant
+ // bits set. For bytes that can never appear anywhere in valid UTF-8, this
+ // also returns true, since every such byte has its two most significant
+ // bits set:
+ //
+ // \xC0 :: 11000000
+ // \xC1 :: 11000001
+ // \xF5 :: 11110101
+ // \xF6 :: 11110110
+ // \xF7 :: 11110111
+ // \xF8 :: 11111000
+ // \xF9 :: 11111001
+ // \xFA :: 11111010
+ // \xFB :: 11111011
+ // \xFC :: 11111100
+ // \xFD :: 11111101
+ // \xFE :: 11111110
+ // \xFF :: 11111111
+ (b & 0b1100_0000) != 0b1000_0000
+}
+
+/*
+/// Returns the smallest possible index of the next valid UTF-8 sequence
+/// starting after `i`.
+///
+/// For all inputs, including invalid UTF-8 and any value of `i`, the return
+/// value is guaranteed to be greater than `i`. (If there is no value greater
+/// than `i` that fits in `usize`, then this panics.)
+///
+/// Generally speaking, this should only be called on `text` when it is
+/// permitted to assume that it is valid UTF-8 and where either `i >=
+/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
+///
+/// NOTE: This method was used in a previous conception of iterators where we
+/// specifically tried to skip over empty matches that split a codepoint by
+/// simply requiring that our next search begin at the beginning of codepoint.
+/// But we ended up changing that technique to always advance by 1 byte and
+/// then filter out matches that split a codepoint after-the-fact. Thus, we no
+/// longer use this method. But I've kept it around in case we want to switch
+/// back to this approach. Its guarantees are a little subtle, so I'd prefer
+/// not to rebuild it from whole cloth.
+pub(crate) fn next(text: &[u8], i: usize) -> usize {
+ let b = match text.get(i) {
+ None => return i.checked_add(1).unwrap(),
+ Some(&b) => b,
+ };
+ // For cases where we see an invalid UTF-8 byte, there isn't much we can do
+ // other than just start at the next byte.
+ let inc = len(b).unwrap_or(1);
+ i.checked_add(inc).unwrap()
+}
+*/
diff --git a/vendor/regex-automata/src/util/bytes.rs b/vendor/regex-automata/src/util/wire.rs
index 5877bb149..ecf4fd8c0 100644
--- a/vendor/regex-automata/src/util/bytes.rs
+++ b/vendor/regex-automata/src/util/wire.rs
@@ -1,3 +1,10 @@
+/*!
+Types and routines that support the wire format of finite automata.
+
+Currently, this module just exports a few error types and some small helpers
+for deserializing [dense DFAs](crate::dfa::dense::DFA) using correct alignment.
+*/
+
/*
A collection of helper functions, types and traits for serializing automata.
@@ -10,10 +17,10 @@ generally designed such that deserialization is cheap. More specifically, that
deserialization can be done in constant time. (The idea being that you can
embed it into your binary or mmap it, and then use it immediately.)
-In order to achieve this, most of the structures in this crate use an in-memory
-representation that very closely corresponds to its binary serialized form.
-This pervades and complicates everything, and in some cases, requires dealing
-with alignment and reasoning about safety.
+In order to achieve this, the dense and sparse DFAs in this crate use an
+in-memory representation that very closely corresponds to its binary serialized
+form. This pervades and complicates everything, and in some cases, requires
+dealing with alignment and reasoning about safety.
This technique does have major advantages. In particular, it permits doing
the potentially costly work of compiling a finite state machine in an offline
@@ -43,7 +50,29 @@ use core::{
#[cfg(feature = "alloc")]
use alloc::{vec, vec::Vec};
-use crate::util::id::{PatternID, PatternIDError, StateID, StateIDError};
+use crate::util::{
+ int::Pointer,
+ primitives::{PatternID, PatternIDError, StateID, StateIDError},
+};
+
+/// A hack to align a smaller type `B` with a bigger type `T`.
+///
+/// The usual use of this is with `B = [u8]` and `T = u32`. That is,
+/// it permits aligning a sequence of bytes on a 4-byte boundary. This
+/// is useful in contexts where one wants to embed a serialized [dense
+/// DFA](crate::dfa::dense::DFA) into a Rust program while guaranteeing the
+/// alignment required for the DFA.
+///
+/// See [`dense::DFA::from_bytes`](crate::dfa::dense::DFA::from_bytes) for an
+/// example of how to use this type.
+#[repr(C)]
+#[derive(Debug)]
+pub struct AlignAs<B: ?Sized, T> {
+ /// A zero-sized field indicating the alignment we want.
+ pub _align: [T; 0],
+ /// A possibly non-sized field containing a sequence of bytes.
+ pub bytes: B,
+}
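A hedged sketch of the intended usage, with a dummy four-byte payload standing in for a real serialized DFA (the crate's own docs use `include_bytes!` on a DFA file instead):

```rust
#[repr(C)]
struct AlignAs<B: ?Sized, T> {
    _align: [T; 0],
    bytes: B,
}

// Dummy payload; a real use would embed serialized DFA bytes here.
const BYTES: [u8; 4] = [0xFF, 0xFE, 0x00, 0x00];

// The reference coerces from AlignAs<[u8; 4], u32> to AlignAs<[u8], u32>.
static ALIGNED: &AlignAs<[u8], u32> = &AlignAs { _align: [], bytes: BYTES };

fn main() {
    // repr(C) plus the zero-sized [u32; 0] field forces 4-byte alignment.
    let addr = ALIGNED.bytes.as_ptr() as usize;
    assert_eq!(0, addr % core::mem::align_of::<u32>());
}
```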
/// An error that occurs when serializing an object from this crate.
///
@@ -117,7 +146,6 @@ enum DeserializeErrorKind {
Generic { msg: &'static str },
BufferTooSmall { what: &'static str },
InvalidUsize { what: &'static str },
- InvalidVarint { what: &'static str },
VersionMismatch { expected: u32, found: u32 },
EndianMismatch { expected: u32, found: u32 },
AlignmentMismatch { alignment: usize, address: usize },
@@ -136,14 +164,10 @@ impl DeserializeError {
DeserializeError(DeserializeErrorKind::BufferTooSmall { what })
}
- pub(crate) fn invalid_usize(what: &'static str) -> DeserializeError {
+ fn invalid_usize(what: &'static str) -> DeserializeError {
DeserializeError(DeserializeErrorKind::InvalidUsize { what })
}
- fn invalid_varint(what: &'static str) -> DeserializeError {
- DeserializeError(DeserializeErrorKind::InvalidVarint { what })
- }
-
fn version_mismatch(expected: u32, found: u32) -> DeserializeError {
DeserializeError(DeserializeErrorKind::VersionMismatch {
expected,
@@ -176,7 +200,7 @@ impl DeserializeError {
DeserializeError(DeserializeErrorKind::ArithmeticOverflow { what })
}
- pub(crate) fn pattern_id_error(
+ fn pattern_id_error(
err: PatternIDError,
what: &'static str,
) -> DeserializeError {
@@ -206,9 +230,6 @@ impl core::fmt::Display for DeserializeError {
InvalidUsize { what } => {
write!(f, "{} is too big to fit in a usize", what)
}
- InvalidVarint { what } => {
- write!(f, "could not decode valid varint for {}", what)
- }
VersionMismatch { expected, found } => write!(
f,
"unsupported version: \
@@ -248,14 +269,63 @@ impl core::fmt::Display for DeserializeError {
}
}
+/// Safely converts a `&[u32]` to `&[StateID]` with zero cost.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+pub(crate) fn u32s_to_state_ids(slice: &[u32]) -> &[StateID] {
+ // SAFETY: This is safe because StateID is defined to have the same memory
+ // representation as a u32 (it is repr(transparent)). While not every u32
+ // is a "valid" StateID, callers are not permitted to rely on the validity
+ // of StateIDs for memory safety. It can only lead to logical errors. (This
+ // is why StateID::new_unchecked is safe.)
+ unsafe {
+ core::slice::from_raw_parts(
+ slice.as_ptr().cast::<StateID>(),
+ slice.len(),
+ )
+ }
+}
+
+/// Safely converts a `&mut [u32]` to `&mut [StateID]` with zero cost.
+pub(crate) fn u32s_to_state_ids_mut(slice: &mut [u32]) -> &mut [StateID] {
+ // SAFETY: This is safe because StateID is defined to have the same memory
+ // representation as a u32 (it is repr(transparent)). While not every u32
+ // is a "valid" StateID, callers are not permitted to rely on the validity
+ // of StateIDs for memory safety. It can only lead to logical errors. (This
+ // is why StateID::new_unchecked is safe.)
+ unsafe {
+ core::slice::from_raw_parts_mut(
+ slice.as_mut_ptr().cast::<StateID>(),
+ slice.len(),
+ )
+ }
+}
+
+/// Safely converts a `&[u32]` to `&[PatternID]` with zero cost.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+pub(crate) fn u32s_to_pattern_ids(slice: &[u32]) -> &[PatternID] {
+ // SAFETY: This is safe because PatternID is defined to have the same
+ // memory representation as a u32 (it is repr(transparent)). While not
+ // every u32 is a "valid" PatternID, callers are not permitted to rely
+ // on the validity of PatternIDs for memory safety. It can only lead to
+ // logical errors. (This is why PatternID::new_unchecked is safe.)
+ unsafe {
+ core::slice::from_raw_parts(
+ slice.as_ptr().cast::<PatternID>(),
+ slice.len(),
+ )
+ }
+}
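The same `repr(transparent)` trick can be demonstrated standalone with a stand-in ID type (hypothetical, for illustration only; the real `StateID` and `PatternID` live in `util::primitives`):

```rust
#[repr(transparent)]
#[derive(Clone, Copy, Debug, PartialEq)]
struct Id(u32);

fn u32s_to_ids(slice: &[u32]) -> &[Id] {
    // SAFETY: Id is repr(transparent) over u32, so the slice layouts match.
    unsafe {
        core::slice::from_raw_parts(slice.as_ptr().cast::<Id>(), slice.len())
    }
}

fn main() {
    let raw = [0u32, 1, 2];
    let ids = u32s_to_ids(&raw);
    assert_eq!(Id(1), ids[1]);
    assert_eq!(raw.len(), ids.len());
}
```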
+
/// Checks that the given slice has an alignment that matches `T`.
///
/// This is useful for checking that a slice has an appropriate alignment
/// before casting it to a &[T]. Note though that alignment is not itself
/// sufficient to perform the cast for any `T`.
-pub fn check_alignment<T>(slice: &[u8]) -> Result<(), DeserializeError> {
+pub(crate) fn check_alignment<T>(
+ slice: &[u8],
+) -> Result<(), DeserializeError> {
let alignment = core::mem::align_of::<T>();
- let address = slice.as_ptr() as usize;
+ let address = slice.as_ptr().as_usize();
if address % alignment == 0 {
return Ok(());
}
@@ -271,7 +341,7 @@ pub fn check_alignment<T>(slice: &[u8]) -> Result<(), DeserializeError> {
/// before the label.
///
/// This returns the number of bytes read from the given slice.
-pub fn skip_initial_padding(slice: &[u8]) -> usize {
+pub(crate) fn skip_initial_padding(slice: &[u8]) -> usize {
let mut nread = 0;
while nread < 7 && nread < slice.len() && slice[nread] == 0 {
nread += 1;
@@ -296,33 +366,48 @@ pub fn skip_initial_padding(slice: &[u8]) -> usize {
/// practice, we never need anything bigger in this crate, and so this function
/// does some sanity asserts under the assumption of a max alignment of `8`.
#[cfg(feature = "alloc")]
-pub fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) {
- // FIXME: This is a kludge because there's no easy way to allocate a
- // Vec<u8> with an alignment guaranteed to be greater than 1. We could
- // create a Vec<u32>, but this cannot be safely transmuted to a Vec<u8>
- // without concern, since reallocing or dropping the Vec<u8> is UB
- // (different alignment than the initial allocation). We could define a
- // wrapper type to manage this for us, but it seems like more machinery
- // than it's worth.
- let mut buf = vec![0; size];
+pub(crate) fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) {
+ // NOTE: This is a kludge because there's no easy way to allocate a Vec<u8>
+ // with an alignment guaranteed to be greater than 1. We could create a
+ // Vec<u32>, but this cannot be safely transmuted to a Vec<u8> without
+ // concern, since reallocing or dropping the Vec<u8> is UB (different
+ // alignment than the initial allocation). We could define a wrapper type
+ // to manage this for us, but it seems like more machinery than it's worth.
+ let buf = vec![0; size];
let align = core::mem::align_of::<T>();
- let address = buf.as_ptr() as usize;
+ let address = buf.as_ptr().as_usize();
+ if address % align == 0 {
+ return (buf, 0);
+ }
+ // Let's try this again. We have to create a totally new alloc with
+ // the maximum amount of bytes we might need. We can't just extend our
+ // pre-existing 'buf' because that might create a new alloc with a
+ // different alignment.
+ let extra = align - 1;
+ let mut buf = vec![0; size + extra];
+ let address = buf.as_ptr().as_usize();
+ // The code below handles the case where 'address' is aligned to T, so if
+ // we got lucky and 'address' is now aligned to T (when it previously
+ // wasn't), then we're done.
if address % align == 0 {
+ buf.truncate(size);
return (buf, 0);
}
- // It's not quite clear how to robustly test this code, since the allocator
- // in my environment appears to always return addresses aligned to at
- // least 8 bytes, even when the alignment requirement is smaller. A feeble
- // attempt at ensuring correctness is provided with asserts.
- let padding = ((address & !0b111).checked_add(8).unwrap())
+ let padding = ((address & !(align - 1)).checked_add(align).unwrap())
.checked_sub(address)
.unwrap();
assert!(padding <= 7, "padding of {} is bigger than 7", padding);
- buf.extend(core::iter::repeat(0).take(padding));
+ assert!(
+ padding <= extra,
+ "padding of {} is bigger than extra {} bytes",
+ padding,
+ extra
+ );
+ buf.truncate(size + padding);
assert_eq!(size + padding, buf.len());
assert_eq!(
0,
- buf[padding..].as_ptr() as usize % align,
+ buf[padding..].as_ptr().as_usize() % align,
"expected end of initial padding to be aligned to {}",
align,
);
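The new padding arithmetic generalizes the old hardcoded 8-byte version: mask the address down to the previous multiple of `align`, step up by `align`, and subtract the original address. A worked standalone check (assuming, as the code does, a power-of-two alignment; `padding_for` is a hypothetical helper name):

```rust
fn padding_for(address: usize, align: usize) -> usize {
    assert!(align.is_power_of_two());
    ((address & !(align - 1)).checked_add(align).unwrap())
        .checked_sub(address)
        .unwrap()
}

fn main() {
    // 0x1003 masks down to 0x1000 and steps up to 0x1008: 5 bytes of padding.
    assert_eq!(5, padding_for(0x1003, 8));
    // An already-aligned address yields a full `align` of padding, which is
    // why the real routine returns early when the buffer starts out aligned.
    assert_eq!(8, padding_for(0x1000, 8));
}
```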
@@ -332,12 +417,12 @@ pub fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) {
/// Reads a NUL terminated label starting at the beginning of the given slice.
///
/// If a NUL terminated label could not be found, then an error is returned.
-/// Similary, if a label is found but doesn't match the expected label, then
+/// Similarly, if a label is found but doesn't match the expected label, then
/// an error is returned.
///
/// Upon success, the total number of bytes read (including padding bytes) is
/// returned.
-pub fn read_label(
+pub(crate) fn read_label(
slice: &[u8],
expected_label: &'static str,
) -> Result<usize, DeserializeError> {
@@ -376,7 +461,7 @@ pub fn read_label(
///
/// Upon success, the total number of bytes written (including padding) is
/// returned.
-pub fn write_label(
+pub(crate) fn write_label(
label: &str,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
@@ -396,7 +481,7 @@ pub fn write_label(
/// for the given label. This panics if the given label contains a NUL byte or
/// is longer than 255 bytes. (The size restriction exists so that searching
/// for a label during deserialization can be done in small bounded space.)
-pub fn write_label_len(label: &str) -> usize {
+pub(crate) fn write_label_len(label: &str) -> usize {
if label.len() > 255 {
panic!("label must not be longer than 255 bytes");
}
@@ -413,7 +498,9 @@ pub fn write_label_len(label: &str) -> usize {
/// this returns an error.
///
/// Upon success, the total number of bytes read is returned.
-pub fn read_endianness_check(slice: &[u8]) -> Result<usize, DeserializeError> {
+pub(crate) fn read_endianness_check(
+ slice: &[u8],
+) -> Result<usize, DeserializeError> {
let (n, nr) = try_read_u32(slice, "endianness check")?;
assert_eq!(nr, write_endianness_check_len());
if n != 0xFEFF {
@@ -429,7 +516,7 @@ pub fn read_endianness_check(slice: &[u8]) -> Result<usize, DeserializeError> {
/// endianness is used.
///
/// Upon success, the total number of bytes written is returned.
-pub fn write_endianness_check<E: Endian>(
+pub(crate) fn write_endianness_check<E: Endian>(
dst: &mut [u8],
) -> Result<usize, SerializeError> {
let nwrite = write_endianness_check_len();
@@ -441,7 +528,7 @@ pub fn write_endianness_check<E: Endian>(
}
/// Returns the number of bytes written by the endianness check.
-pub fn write_endianness_check_len() -> usize {
+pub(crate) fn write_endianness_check_len() -> usize {
size_of::<u32>()
}
@@ -454,7 +541,7 @@ pub fn write_endianness_check_len() -> usize {
/// N.B. Currently, we require that the version number is exactly equivalent.
/// In the future, if we bump the version number without a semver bump, then
/// we'll need to relax this a bit and support older versions.
-pub fn read_version(
+pub(crate) fn read_version(
slice: &[u8],
expected_version: u32,
) -> Result<usize, DeserializeError> {
@@ -473,7 +560,7 @@ pub fn read_version(
/// code supports the format of the serialized object.
///
/// Upon success, the total number of bytes written is returned.
-pub fn write_version<E: Endian>(
+pub(crate) fn write_version<E: Endian>(
version: u32,
dst: &mut [u8],
) -> Result<usize, SerializeError> {
@@ -486,7 +573,7 @@ pub fn write_version<E: Endian>(
}
/// Returns the number of bytes written by writing the version number.
-pub fn write_version_len() -> usize {
+pub(crate) fn write_version_len() -> usize {
size_of::<u32>()
}
@@ -495,7 +582,7 @@ pub fn write_version_len() -> usize {
/// ID limit for the current target, then this returns an error.
///
/// Upon success, this also returns the number of bytes read.
-pub fn read_pattern_id(
+pub(crate) fn read_pattern_id(
slice: &[u8],
what: &'static str,
) -> Result<(PatternID, usize), DeserializeError> {
@@ -511,7 +598,7 @@ pub fn read_pattern_id(
/// to be a valid pattern ID.
///
/// This also returns the number of bytes read.
-pub fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) {
+pub(crate) fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) {
let pid = PatternID::from_ne_bytes_unchecked(
slice[..PatternID::SIZE].try_into().unwrap(),
);
@@ -522,7 +609,10 @@ pub fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) {
/// using the specified endianness. The given slice must have length at least
/// `PatternID::SIZE`, or else this panics. Upon success, the total number of
/// bytes written is returned.
-pub fn write_pattern_id<E: Endian>(pid: PatternID, dst: &mut [u8]) -> usize {
+pub(crate) fn write_pattern_id<E: Endian>(
+ pid: PatternID,
+ dst: &mut [u8],
+) -> usize {
E::write_u32(pid.as_u32(), dst);
PatternID::SIZE
}
@@ -532,7 +622,7 @@ pub fn write_pattern_id<E: Endian>(pid: PatternID, dst: &mut [u8]) -> usize {
/// the current target, then this returns an error.
///
/// Upon success, this also returns the number of bytes read.
-pub fn try_read_state_id(
+pub(crate) fn try_read_state_id(
slice: &[u8],
what: &'static str,
) -> Result<(StateID, usize), DeserializeError> {
@@ -547,7 +637,7 @@ pub fn try_read_state_id(
/// limit for the current target, then this returns an error.
///
/// Upon success, this also returns the number of bytes read.
-pub fn read_state_id(
+pub(crate) fn read_state_id(
slice: &[u8],
what: &'static str,
) -> Result<(StateID, usize), DeserializeError> {
@@ -563,7 +653,7 @@ pub fn read_state_id(
/// to be a valid state ID.
///
/// This also returns the number of bytes read.
-pub fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) {
+pub(crate) fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) {
let sid = StateID::from_ne_bytes_unchecked(
slice[..StateID::SIZE].try_into().unwrap(),
);
@@ -574,7 +664,10 @@ pub fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) {
/// using the specified endianness. The given slice must have length at least
/// `StateID::SIZE`, or else this panics. Upon success, the total number of
/// bytes written is returned.
-pub fn write_state_id<E: Endian>(sid: StateID, dst: &mut [u8]) -> usize {
+pub(crate) fn write_state_id<E: Endian>(
+ sid: StateID,
+ dst: &mut [u8],
+) -> usize {
E::write_u32(sid.as_u32(), dst);
StateID::SIZE
}
@@ -587,7 +680,7 @@ pub fn write_state_id<E: Endian>(sid: StateID, dst: &mut [u8]) -> usize {
/// singular form.
///
/// Upon success, this also returns the number of bytes read.
-pub fn try_read_u16_as_usize(
+pub(crate) fn try_read_u16_as_usize(
slice: &[u8],
what: &'static str,
) -> Result<(usize, usize), DeserializeError> {
@@ -606,7 +699,7 @@ pub fn try_read_u16_as_usize(
/// singular form.
///
/// Upon success, this also returns the number of bytes read.
-pub fn try_read_u32_as_usize(
+pub(crate) fn try_read_u32_as_usize(
slice: &[u8],
what: &'static str,
) -> Result<(usize, usize), DeserializeError> {
@@ -624,13 +717,11 @@ pub fn try_read_u32_as_usize(
/// singular form.
///
/// Upon success, this also returns the number of bytes read.
-pub fn try_read_u16(
+pub(crate) fn try_read_u16(
slice: &[u8],
what: &'static str,
) -> Result<(u16, usize), DeserializeError> {
- if slice.len() < size_of::<u16>() {
- return Err(DeserializeError::buffer_too_small(what));
- }
+ check_slice_len(slice, size_of::<u16>(), what)?;
Ok((read_u16(slice), size_of::<u16>()))
}
@@ -641,23 +732,36 @@ pub fn try_read_u16(
/// singular form.
///
/// Upon success, this also returns the number of bytes read.
-pub fn try_read_u32(
+pub(crate) fn try_read_u32(
slice: &[u8],
what: &'static str,
) -> Result<(u32, usize), DeserializeError> {
- if slice.len() < size_of::<u32>() {
- return Err(DeserializeError::buffer_too_small(what));
- }
+ check_slice_len(slice, size_of::<u32>(), what)?;
Ok((read_u32(slice), size_of::<u32>()))
}
+/// Try to read a u128 from the beginning of the given slice in native endian
+/// format. If the slice has fewer than 16 bytes, then this returns an error.
+/// The error message will include the `what` description of what is being
+/// deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub(crate) fn try_read_u128(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(u128, usize), DeserializeError> {
+ check_slice_len(slice, size_of::<u128>(), what)?;
+ Ok((read_u128(slice), size_of::<u128>()))
+}
+
/// Read a u16 from the beginning of the given slice in native endian format.
/// If the slice has fewer than 2 bytes, then this panics.
///
/// Marked as inline to speed up sparse searching which decodes integers from
/// its automaton at search time.
-#[inline(always)]
-pub fn read_u16(slice: &[u8]) -> u16 {
+#[cfg_attr(feature = "perf-inline", inline(always))]
+pub(crate) fn read_u16(slice: &[u8]) -> u16 {
let bytes: [u8; 2] = slice[..size_of::<u16>()].try_into().unwrap();
u16::from_ne_bytes(bytes)
}
@@ -667,115 +771,23 @@ pub fn read_u16(slice: &[u8]) -> u16 {
///
/// Marked as inline to speed up sparse searching which decodes integers from
/// its automaton at search time.
-#[inline(always)]
-pub fn read_u32(slice: &[u8]) -> u32 {
+#[cfg_attr(feature = "perf-inline", inline(always))]
+pub(crate) fn read_u32(slice: &[u8]) -> u32 {
let bytes: [u8; 4] = slice[..size_of::<u32>()].try_into().unwrap();
u32::from_ne_bytes(bytes)
}
-/// Read a u64 from the beginning of the given slice in native endian format.
-/// If the slice has fewer than 8 bytes, then this panics.
-///
-/// Marked as inline to speed up sparse searching which decodes integers from
-/// its automaton at search time.
-#[inline(always)]
-pub fn read_u64(slice: &[u8]) -> u64 {
- let bytes: [u8; 8] = slice[..size_of::<u64>()].try_into().unwrap();
- u64::from_ne_bytes(bytes)
-}
-
-/// Write a variable sized integer and return the total number of bytes
-/// written. If the slice was not big enough to contain the bytes, then this
-/// returns an error including the "what" description in it. This does no
-/// padding.
-///
-/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints
-#[allow(dead_code)]
-pub fn write_varu64(
- mut n: u64,
- what: &'static str,
- dst: &mut [u8],
-) -> Result<usize, SerializeError> {
- let mut i = 0;
- while n >= 0b1000_0000 {
- if i >= dst.len() {
- return Err(SerializeError::buffer_too_small(what));
- }
- dst[i] = (n as u8) | 0b1000_0000;
- n >>= 7;
- i += 1;
- }
- if i >= dst.len() {
- return Err(SerializeError::buffer_too_small(what));
- }
- dst[i] = n as u8;
- Ok(i + 1)
-}
-
-/// Returns the total number of bytes that would be writen to encode n as a
-/// variable sized integer.
-///
-/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints
-#[allow(dead_code)]
-pub fn write_varu64_len(mut n: u64) -> usize {
- let mut i = 0;
- while n >= 0b1000_0000 {
- n >>= 7;
- i += 1;
- }
- i + 1
-}
-
-/// Like read_varu64, but attempts to cast the result to usize. If the integer
-/// cannot fit into a usize, then an error is returned.
-#[allow(dead_code)]
-pub fn read_varu64_as_usize(
- slice: &[u8],
- what: &'static str,
-) -> Result<(usize, usize), DeserializeError> {
- let (n, nread) = read_varu64(slice, what)?;
- let n = usize::try_from(n)
- .map_err(|_| DeserializeError::invalid_usize(what))?;
- Ok((n, nread))
-}
-
-/// Reads a variable sized integer from the beginning of slice, and returns the
-/// integer along with the total number of bytes read. If a valid variable
-/// sized integer could not be found, then an error is returned that includes
-/// the "what" description in it.
-///
-/// https://developers.google.com/protocol-buffers/docs/encoding#varints
-#[allow(dead_code)]
-pub fn read_varu64(
- slice: &[u8],
- what: &'static str,
-) -> Result<(u64, usize), DeserializeError> {
- let mut n: u64 = 0;
- let mut shift: u32 = 0;
- // The biggest possible value is u64::MAX, which needs all 64 bits and
- // thus 10 bytes (9 bytes of 7 payload bits each cover only 63 bits). We
- // use a limit to avoid reading an unnecessary number of bytes.
- let limit = cmp::min(slice.len(), 10);
- for (i, &b) in slice[..limit].iter().enumerate() {
- if b < 0b1000_0000 {
- return match (b as u64).checked_shl(shift) {
- None => Err(DeserializeError::invalid_varint(what)),
- Some(b) => Ok((n | b, i + 1)),
- };
- }
- match ((b as u64) & 0b0111_1111).checked_shl(shift) {
- None => return Err(DeserializeError::invalid_varint(what)),
- Some(b) => n |= b,
- }
- shift += 7;
- }
- Err(DeserializeError::invalid_varint(what))
+/// Read a u128 from the beginning of the given slice in native endian format.
+/// If the slice has fewer than 16 bytes, then this panics.
+pub(crate) fn read_u128(slice: &[u8]) -> u128 {
+ let bytes: [u8; 16] = slice[..size_of::<u128>()].try_into().unwrap();
+ u128::from_ne_bytes(bytes)
}
/// Checks that the given slice has some minimal length. If it's smaller than
/// the bound given, then a "buffer too small" error is returned with `what`
/// describing what the buffer represents.
-pub fn check_slice_len<T>(
+pub(crate) fn check_slice_len<T>(
slice: &[T],
at_least_len: usize,
what: &'static str,
@@ -790,7 +802,7 @@ pub fn check_slice_len<T>(
/// 'what' in the error message.
///
/// This is useful when doing arithmetic with untrusted data.
-pub fn mul(
+pub(crate) fn mul(
a: usize,
b: usize,
what: &'static str,
@@ -805,7 +817,7 @@ pub fn mul(
/// 'what' in the error message.
///
/// This is useful when doing arithmetic with untrusted data.
-pub fn add(
+pub(crate) fn add(
a: usize,
b: usize,
what: &'static str,
@@ -820,7 +832,7 @@ pub fn add(
/// 'what' in the error message.
///
/// This is useful when doing arithmetic with untrusted data.
-pub fn shl(
+pub(crate) fn shl(
a: usize,
b: usize,
what: &'static str,
@@ -833,11 +845,18 @@ pub fn shl(
}
}
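mul, add and shl share one contract: perform the checked operation and turn
any overflow into a DeserializeError naming the quantity being computed. A
short sketch of the intended usage on untrusted input (the table computation
here is hypothetical):

    fn transition_table_bytes(
        count: usize,
        stride: usize,
    ) -> Result<usize, DeserializeError> {
        // Overflow implies a corrupt or adversarial serialized input, so
        // it is surfaced as a deserialization error rather than a panic.
        mul(count, stride, "transition table length")
    }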
+/// Returns the number of additional bytes required to add to the given length
+/// in order to make the total length a multiple of 4. The return value is
+/// always less than 4.
+pub(crate) fn padding_len(non_padding_len: usize) -> usize {
+ (4 - (non_padding_len & 0b11)) & 0b11
+}
+
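The bit trick reads as: non_padding_len & 0b11 is the length modulo 4;
subtracting from 4 gives the bytes needed to reach the next multiple, and the
final & 0b11 maps the already-aligned case from 4 back to 0. A few
spot-checks (a sketch, not part of the patch):

    assert_eq!(padding_len(0), 0); // already aligned
    assert_eq!(padding_len(5), 3); // 5 + 3 == 8, a multiple of 4
    assert_eq!(padding_len(8), 0);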
/// A simple trait for writing code generic over endianness.
///
/// This is similar to what byteorder provides, but we only need a very small
/// subset.
-pub trait Endian {
+pub(crate) trait Endian {
/// Writes a u16 to the given destination buffer in a particular
/// endianness. If the destination buffer has a length smaller than 2, then
/// this panics.
@@ -852,17 +871,22 @@ pub trait Endian {
/// endianness. If the destination buffer has a length smaller than 8, then
/// this panics.
fn write_u64(n: u64, dst: &mut [u8]);
+
+ /// Writes a u128 to the given destination buffer in a particular
+ /// endianness. If the destination buffer has a length smaller than 16,
+ /// then this panics.
+ fn write_u128(n: u128, dst: &mut [u8]);
}
/// Little endian writing.
-pub enum LE {}
+pub(crate) enum LE {}
/// Big endian writing.
-pub enum BE {}
+pub(crate) enum BE {}
#[cfg(target_endian = "little")]
-pub type NE = LE;
+pub(crate) type NE = LE;
#[cfg(target_endian = "big")]
-pub type NE = BE;
+pub(crate) type NE = BE;
impl Endian for LE {
fn write_u16(n: u16, dst: &mut [u8]) {
@@ -876,6 +900,10 @@ impl Endian for LE {
fn write_u64(n: u64, dst: &mut [u8]) {
dst[..8].copy_from_slice(&n.to_le_bytes());
}
+
+ fn write_u128(n: u128, dst: &mut [u8]) {
+ dst[..16].copy_from_slice(&n.to_le_bytes());
+ }
}
impl Endian for BE {
@@ -890,13 +918,10 @@ impl Endian for BE {
fn write_u64(n: u64, dst: &mut [u8]) {
dst[..8].copy_from_slice(&n.to_be_bytes());
}
-}
-/// Returns the number of additional bytes required to add to the given length
-/// in order to make the total length a multiple of 4. The return value is
-/// always less than 4.
-pub fn padding_len(non_padding_len: usize) -> usize {
- (4 - (non_padding_len & 0b11)) & 0b11
+ fn write_u128(n: u128, dst: &mut [u8]) {
+ dst[..16].copy_from_slice(&n.to_be_bytes());
+ }
}
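With write_u128 added, the Endian trait now covers every integer width the
serializer uses, and NE statically aliases whichever impl matches the build
target. A hedged round-trip sketch pairing it with read_u128 above:

    // NE resolves to LE or BE via cfg(target_endian), so a value written
    // through NE must be recovered by the native-endian read_u128.
    let mut buf = [0u8; 16];
    NE::write_u128(0x1234_5678_9abc_def0, &mut buf);
    assert_eq!(read_u128(&buf), 0x1234_5678_9abc_def0);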
#[cfg(all(test, feature = "alloc"))]
diff --git a/vendor/regex-automata/test b/vendor/regex-automata/test
new file mode 100755
index 000000000..df3e5ae98
--- /dev/null
+++ b/vendor/regex-automata/test
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+# This is a script that attempts to *approximately* exhaustively run the test
+# suite for regex-automata. The main reason for why 'cargo test' isn't enough
+# is because of crate features. regex-automata has a ton of them. This script
+# tests many of those feature combinations (although not all) to try to get
+# decent coverage in a finite amount of time.
+
+set -e
+
+# cd to the directory containing this crate's Cargo.toml so that we don't need
+# to pass --manifest-path to every `cargo` command.
+cd "$(dirname "$0")"
+
+echo "===== ALL FEATURES TEST ==="
+cargo test --all-features
+
+# Man I don't *want* to have this many crate features, but... I really want
+# folks to be able to slim the crate down to just the things they want. But
+# the main downside is that I just can't feasibly test every combination of
+# features because there are too many of them. Sad, but I'm not sure if there
+# is a better alternative.
+features=(
+ ""
+ "unicode-word-boundary"
+ "unicode-word-boundary,syntax,unicode-perl"
+ "unicode-word-boundary,syntax,dfa-build"
+ "nfa"
+ "dfa"
+ "hybrid"
+ "nfa,dfa"
+ "nfa,hybrid"
+ "dfa,hybrid"
+ "dfa-onepass"
+ "nfa-pikevm"
+ "nfa-backtrack"
+ "std"
+ "alloc"
+ "syntax"
+ "syntax,nfa-pikevm"
+ "syntax,hybrid"
+ "perf-literal-substring"
+ "perf-literal-multisubstring"
+ "meta"
+ "meta,nfa-backtrack"
+ "meta,hybrid"
+ "meta,dfa-build"
+ "meta,dfa-onepass"
+ "meta,nfa,dfa,hybrid,nfa-backtrack"
+ "meta,nfa,dfa,hybrid,nfa-backtrack,perf-literal-substring"
+ "meta,nfa,dfa,hybrid,nfa-backtrack,perf-literal-multisubstring"
+)
+for f in "${features[@]}"; do
+ echo "===== LIB FEATURES: $f ==="
+ # It's actually important to do a standard 'cargo build' in addition to a
+ # 'cargo test'. In particular, in the latter case, the dev-dependencies may
+ # wind up enabling features in dependencies (like memchr) that make it look
+# like everything is fine when it actually isn't. For example, the 'regex-test'
+ # dev-dependency uses 'bstr' and enables its 'std' feature, which in turn
+ # unconditionally enables 'memchr's 'std' feature. Since we're specifically
+ # looking to test that certain feature combinations work as expected, this
+# can lead to a configuration that tests okay but fails to build on its own. Yikes.
+ cargo build --no-default-features --lib --features "$f"
+ cargo test --no-default-features --lib --features "$f"
+done
+
+# We can also run the integration test suite on stripped-down feature sets.
+# But the test suite doesn't do well with things like 'std' and 'unicode'
+# disabled, so we always enable them.
+features=(
+ "std,unicode,syntax,nfa-pikevm"
+ "std,unicode,syntax,nfa-backtrack"
+ "std,unicode,syntax,hybrid"
+ "std,unicode,syntax,dfa-onepass"
+ "std,unicode,syntax,dfa-search"
+ "std,unicode,syntax,dfa-build"
+ "std,unicode,meta"
+ # This one is a little tricky because it causes the backtracker to be used
+ # in more cases, which in turn makes the 'earliest' tests fail.
+ # The actual results are semantically consistent with the API guarantee
+ # (the backtracker tends to report greater offsets because it isn't an FSM),
+ # but our tests are less flexible than the API guarantee and demand offsets
+ # reported by FSM regex engines. (Which is... all of them except for the
+ # backtracker.)
+ # "std,unicode,meta,nfa-backtrack"
+ "std,unicode,meta,hybrid"
+ "std,unicode,meta,dfa-onepass"
+ "std,unicode,meta,dfa-build"
+ "std,unicode,meta,nfa,dfa-onepass,hybrid"
+)
+for f in "${features[@]}"; do
+ echo "===== INTEGRATION FEATURES: $f ==="
+ cargo build --no-default-features --lib --features "$f"
+ cargo test --no-default-features --test integration --features "$f"
+done
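The build-then-test pairing that both loops rely on is easy to reproduce by
hand when bisecting a single feature combination; an illustrative invocation
(not part of the script):

    # From the crate root, spot-check one combination manually:
    cargo build --no-default-features --lib --features "meta,hybrid"
    cargo test --no-default-features --lib --features "meta,hybrid"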
diff --git a/vendor/regex-automata/tests/data/bytes.toml b/vendor/regex-automata/tests/data/bytes.toml
deleted file mode 100644
index eb3a0942e..000000000
--- a/vendor/regex-automata/tests/data/bytes.toml
+++ /dev/null
@@ -1,235 +0,0 @@
-# These are tests specifically crafted for regexes that can match arbitrary
-# bytes. In some cases, we test the Unicode variant as well, simply because
-# it makes good sense to do so. But these tests aren't really about Unicode so
-# much as whether matches are only reported at valid UTF-8 boundaries. For most
-# tests in this entire collection, utf8 = true. But for these tests, we use
-# utf8 = false.
-
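For readers skimming these (now removed) tests: with Unicode mode disabled
via (?-u), matching is over raw bytes, so a haystack need not be valid UTF-8
at all. A hedged sketch of the idea using the top-level regex crate's bytes
API (an illustration only, not this test harness):

    use regex::bytes::Regex;
    // \w is ASCII-only here, and the haystack contains an invalid
    // UTF-8 byte; byte-oriented matching handles it fine.
    let re = Regex::new(r"(?-u)\w+").unwrap();
    assert!(re.is_match(b"a\xFFz"));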
-[[tests]]
-name = "word-boundary-ascii"
-regex = ' \b'
-input = " δ"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "word-boundary-unicode"
-regex = ' \b'
-input = " δ"
-matches = [[0, 1]]
-unicode = true
-utf8 = false
-
-[[tests]]
-name = "word-boundary-ascii-not"
-regex = ' \B'
-input = " δ"
-matches = [[0, 1]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "word-boundary-unicode-not"
-regex = ' \B'
-input = " δ"
-matches = []
-unicode = true
-utf8 = false
-
-[[tests]]
-name = "perl-word-ascii"
-regex = '\w+'
-input = "aδ"
-matches = [[0, 1]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "perl-word-unicode"
-regex = '\w+'
-input = "aδ"
-matches = [[0, 3]]
-unicode = true
-utf8 = false
-
-[[tests]]
-name = "perl-decimal-ascii"
-regex = '\d+'
-input = "1२३9"
-matches = [[0, 1], [7, 8]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "perl-decimal-unicode"
-regex = '\d+'
-input = "1२३9"
-matches = [[0, 8]]
-unicode = true
-utf8 = false
-
-[[tests]]
-name = "perl-whitespace-ascii"
-regex = '\s+'
-input = " \u1680"
-matches = [[0, 1]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "perl-whitespace-unicode"
-regex = '\s+'
-input = " \u1680"
-matches = [[0, 4]]
-unicode = true
-utf8 = false
-
-# The first `(.+)` matches two Unicode codepoints, but can't match the 5th
-# byte, which isn't valid UTF-8. The second (byte-based) `(.+)` takes over and
-# matches.
-[[tests]]
-name = "mixed-dot"
-regex = '(.+)(?-u)(.+)'
-input = '\xCE\x93\xCE\x94\xFF'
-captures = [
- [[0, 5], [0, 4], [4, 5]],
-]
-unescape = true
-unicode = true
-utf8 = false
-
-[[tests]]
-name = "case-one-ascii"
-regex = 'a'
-input = "A"
-matches = [[0, 1]]
-case_insensitive = true
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "case-one-unicode"
-regex = 'a'
-input = "A"
-matches = [[0, 1]]
-case_insensitive = true
-unicode = true
-utf8 = false
-
-[[tests]]
-name = "case-class-simple-ascii"
-regex = '[a-z]+'
-input = "AaAaA"
-matches = [[0, 5]]
-case_insensitive = true
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "case-class-ascii"
-regex = '[a-z]+'
-input = "aA\u212AaA"
-matches = [[0, 2], [5, 7]]
-case_insensitive = true
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "case-class-unicode"
-regex = '[a-z]+'
-input = "aA\u212AaA"
-matches = [[0, 7]]
-case_insensitive = true
-unicode = true
-utf8 = false
-
-[[tests]]
-name = "negate-ascii"
-regex = '[^a]'
-input = "δ"
-matches = [[0, 1], [1, 2]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "negate-unicode"
-regex = '[^a]'
-input = "δ"
-matches = [[0, 2]]
-unicode = true
-utf8 = false
-
-# When utf8=true, this won't match, because the implicit '.*?' prefix is
-# Unicode-aware and will refuse to match through invalid UTF-8 bytes.
-[[tests]]
-name = "dotstar-prefix-ascii"
-regex = 'a'
-input = '\xFFa'
-matches = [[1, 2]]
-unescape = true
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "dotstar-prefix-unicode"
-regex = 'a'
-input = '\xFFa'
-matches = [[1, 2]]
-unescape = true
-unicode = true
-utf8 = false
-
-[[tests]]
-name = "null-bytes"
-regex = '(?P<cstr>[^\x00]+)\x00'
-input = 'foo\x00'
-captures = [
- [[0, 4], [0, 3]],
-]
-unescape = true
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "invalid-utf8-anchor-100"
-regex = '\xCC?^'
-input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
-matches = [[0, 0]]
-unescape = true
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "invalid-utf8-anchor-200"
-regex = '^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$'
-input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
-matches = [[22, 22]]
-unescape = true
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "invalid-utf8-anchor-300"
-regex = '^|ddp\xff\xffdddddlQd@\x80'
-input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
-matches = [[0, 0]]
-unescape = true
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "word-boundary-ascii-100"
-regex = '\Bx\B'
-input = "áxβ"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "word-boundary-ascii-200"
-regex = '\B'
-input = "0\U0007EF5E"
-matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
-unicode = false
-utf8 = false
diff --git a/vendor/regex-automata/tests/data/crazy.toml b/vendor/regex-automata/tests/data/crazy.toml
deleted file mode 100644
index 549b86cca..000000000
--- a/vendor/regex-automata/tests/data/crazy.toml
+++ /dev/null
@@ -1,302 +0,0 @@
-# TODO: There are still a couple of manually written tests in crazy.rs.
-
-[[tests]]
-name = "ranges"
-regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b'
-input = "num: 255"
-matches = [[5, 8]]
-
-[[tests]]
-name = "ranges-not"
-regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b'
-input = "num: 256"
-matches = []
-
-[[tests]]
-name = "float1"
-regex = '[-+]?[0-9]*\.?[0-9]+'
-input = "0.1"
-matches = [[0, 3]]
-
-[[tests]]
-name = "float2"
-regex = '[-+]?[0-9]*\.?[0-9]+'
-input = "0.1.2"
-matches = [[0, 3]]
-match_limit = 1
-
-[[tests]]
-name = "float3"
-regex = '[-+]?[0-9]*\.?[0-9]+'
-input = "a1.2"
-matches = [[1, 4]]
-
-[[tests]]
-name = "float4"
-regex = '[-+]?[0-9]*\.?[0-9]+'
-input = "1.a"
-matches = [[0, 1]]
-
-[[tests]]
-name = "float5"
-regex = '^[-+]?[0-9]*\.?[0-9]+$'
-input = "1.a"
-matches = []
-
-[[tests]]
-name = "email"
-regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
-input = "mine is jam.slam@gmail.com "
-matches = [[8, 26]]
-
-[[tests]]
-name = "email-not"
-regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
-input = "mine is jam.slam@gmail "
-matches = []
-
-[[tests]]
-name = "email-big"
-regex = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?'''
-input = "mine is jam.slam@gmail.com "
-matches = [[8, 26]]
-
-[[tests]]
-name = "date1"
-regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$'
-input = "1900-01-01"
-matches = [[0, 10]]
-
-[[tests]]
-name = "date2"
-regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$'
-input = "1900-00-01"
-matches = []
-
-[[tests]]
-name = "date3"
-regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$'
-input = "1900-13-01"
-matches = []
-
-[[tests]]
-name = "start-end-empty"
-regex = '^$'
-input = ""
-matches = [[0, 0]]
-
-[[tests]]
-name = "start-end-empty-rev"
-regex = '$^'
-input = ""
-matches = [[0, 0]]
-
-[[tests]]
-name = "start-end-empty-many-1"
-regex = '^$^$^$'
-input = ""
-matches = [[0, 0]]
-
-[[tests]]
-name = "start-end-empty-many-2"
-regex = '^^^$$$'
-input = ""
-matches = [[0, 0]]
-
-[[tests]]
-name = "start-end-empty-rep"
-regex = '(?:^$)*'
-input = "a\nb\nc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
-
-[[tests]]
-name = "start-end-empty-rep-rev"
-regex = '(?:$^)*'
-input = "a\nb\nc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
-
-[[tests]]
-name = "neg-class-letter"
-regex = '[^ac]'
-input = "acx"
-matches = [[2, 3]]
-
-[[tests]]
-name = "neg-class-letter-comma"
-regex = '[^a,]'
-input = "a,x"
-matches = [[2, 3]]
-
-[[tests]]
-name = "neg-class-letter-space"
-regex = '[^a[:space:]]'
-input = "a x"
-matches = [[2, 3]]
-
-[[tests]]
-name = "neg-class-comma"
-regex = '[^,]'
-input = ",,x"
-matches = [[2, 3]]
-
-[[tests]]
-name = "neg-class-space"
-regex = '[^[:space:]]'
-input = " a"
-matches = [[1, 2]]
-
-[[tests]]
-name = "neg-class-space-comma"
-regex = '[^,[:space:]]'
-input = ", a"
-matches = [[2, 3]]
-
-[[tests]]
-name = "neg-class-comma-space"
-regex = '[^[:space:],]'
-input = " ,a"
-matches = [[2, 3]]
-
-[[tests]]
-name = "neg-class-ascii"
-regex = '[^[:alpha:]Z]'
-input = "A1"
-matches = [[1, 2]]
-
-[[tests]]
-name = "lazy-many-many"
-regex = '((?:.*)*?)='
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "lazy-many-optional"
-regex = '((?:.?)*?)='
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "lazy-one-many-many"
-regex = '((?:.*)+?)='
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "lazy-one-many-optional"
-regex = '((?:.?)+?)='
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "lazy-range-min-many"
-regex = '((?:.*){1,}?)='
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "lazy-range-many"
-regex = '((?:.*){1,2}?)='
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "greedy-many-many"
-regex = '((?:.*)*)='
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "greedy-many-optional"
-regex = '((?:.?)*)='
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "greedy-one-many-many"
-regex = '((?:.*)+)='
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "greedy-one-many-optional"
-regex = '((?:.?)+)='
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "greedy-range-min-many"
-regex = '((?:.*){1,})='
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "greedy-range-many"
-regex = '((?:.*){1,2})='
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "empty1"
-regex = ''
-input = ""
-matches = [[0, 0]]
-
-[[tests]]
-name = "empty2"
-regex = ''
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty3"
-regex = '()'
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty4"
-regex = '()*'
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty5"
-regex = '()+'
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty6"
-regex = '()?'
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty7"
-regex = '()()'
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty8"
-regex = '()+|z'
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty9"
-regex = 'z|()+'
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty10"
-regex = '()+|b'
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty11"
-regex = 'b|()+'
-input = "abc"
-matches = [[0, 0], [1, 2], [3, 3]]
diff --git a/vendor/regex-automata/tests/data/earliest.toml b/vendor/regex-automata/tests/data/earliest.toml
deleted file mode 100644
index 6714a850b..000000000
--- a/vendor/regex-automata/tests/data/earliest.toml
+++ /dev/null
@@ -1,48 +0,0 @@
-[[tests]]
-name = "no-greedy-100"
-regex = 'a+'
-input = "aaa"
-matches = [[0, 1], [1, 2], [2, 3]]
-search_kind = "earliest"
-
-[[tests]]
-name = "no-greedy-200"
-regex = 'abc+'
-input = "zzzabccc"
-matches = [[3, 6]]
-search_kind = "earliest"
-
-[[tests]]
-name = "is-ungreedy"
-regex = 'a+?'
-input = "aaa"
-matches = [[0, 1], [1, 2], [2, 3]]
-search_kind = "earliest"
-
-[[tests]]
-name = "look-start-test"
-regex = '^(abc|a)'
-input = "abc"
-matches = [[0, 1]]
-search_kind = "earliest"
-
-[[tests]]
-name = "look-end-test"
-regex = '(abc|a)$'
-input = "abc"
-matches = [[0, 3]]
-search_kind = "earliest"
-
-[[tests]]
-name = "no-leftmost-first-100"
-regex = 'abc|a'
-input = "abc"
-matches = [[0, 1]]
-search_kind = "earliest"
-
-[[tests]]
-name = "no-leftmost-first-200"
-regex = 'aba|a'
-input = "aba"
-matches = [[0, 1], [2, 3]]
-search_kind = "earliest"
diff --git a/vendor/regex-automata/tests/data/empty.toml b/vendor/regex-automata/tests/data/empty.toml
deleted file mode 100644
index ad703e601..000000000
--- a/vendor/regex-automata/tests/data/empty.toml
+++ /dev/null
@@ -1,113 +0,0 @@
-[[tests]]
-name = "100"
-regex = "|b"
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "110"
-regex = "b|"
-input = "abc"
-matches = [[0, 0], [1, 2], [3, 3]]
-
-[[tests]]
-name = "120"
-regex = "|z"
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "130"
-regex = "z|"
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "200"
-regex = "|"
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "210"
-regex = "||"
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "220"
-regex = "||b"
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "230"
-regex = "b||"
-input = "abc"
-matches = [[0, 0], [1, 2], [3, 3]]
-
-[[tests]]
-name = "240"
-regex = "||z"
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "300"
-regex = "(?:)|b"
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "310"
-regex = "b|(?:)"
-input = "abc"
-matches = [[0, 0], [1, 2], [3, 3]]
-
-[[tests]]
-name = "320"
-regex = "(?:|)"
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "330"
-regex = "(?:|)|z"
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "400"
-regex = "a(?:)|b"
-input = "abc"
-matches = [[0, 1], [1, 2]]
-
-[[tests]]
-name = "500"
-regex = ""
-input = ""
-matches = [[0, 0]]
-
-[[tests]]
-name = "510"
-regex = ""
-input = "a"
-matches = [[0, 0], [1, 1]]
-
-[[tests]]
-name = "520"
-regex = ""
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "600"
-regex = '(|a)*'
-input = "aaa"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "610"
-regex = '(|a)+'
-input = "aaa"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
diff --git a/vendor/regex-automata/tests/data/expensive.toml b/vendor/regex-automata/tests/data/expensive.toml
deleted file mode 100644
index e062e3902..000000000
--- a/vendor/regex-automata/tests/data/expensive.toml
+++ /dev/null
@@ -1,12 +0,0 @@
-# These represent tests that may be expensive to run on some regex engines. For
-# example, tests that build a full DFA ahead of time and minimize it can take a
-# horrendously long time on regexes that are large (or result in an explosion
-# in the number of states). We group these tests together so that such engines
-# can simply skip these tests.
-
-# See: https://github.com/rust-lang/regex/issues/98
-[[tests]]
-name = "regression-many-repeat-no-stack-overflow"
-regex = '^.{1,2500}'
-input = "a"
-matches = [[0, 1]]
diff --git a/vendor/regex-automata/tests/data/flags.toml b/vendor/regex-automata/tests/data/flags.toml
deleted file mode 100644
index 2b631ef23..000000000
--- a/vendor/regex-automata/tests/data/flags.toml
+++ /dev/null
@@ -1,67 +0,0 @@
-[[tests]]
-name = "1"
-regex = "(?i)abc"
-input = "ABC"
-matches = [[0, 3]]
-
-[[tests]]
-name = "2"
-regex = "(?i)a(?-i)bc"
-input = "Abc"
-matches = [[0, 3]]
-
-[[tests]]
-name = "3"
-regex = "(?i)a(?-i)bc"
-input = "ABC"
-matches = []
-
-[[tests]]
-name = "4"
-regex = "(?is)a."
-input = "A\n"
-matches = [[0, 2]]
-
-[[tests]]
-name = "5"
-regex = "(?is)a.(?-is)a."
-input = "A\nab"
-matches = [[0, 4]]
-
-[[tests]]
-name = "6"
-regex = "(?is)a.(?-is)a."
-input = "A\na\n"
-matches = []
-
-[[tests]]
-name = "7"
-regex = "(?is)a.(?-is:a.)?"
-input = "A\na\n"
-matches = [[0, 2]]
-match_limit = 1
-
-[[tests]]
-name = "8"
-regex = "(?U)a+"
-input = "aa"
-matches = [[0, 1]]
-match_limit = 1
-
-[[tests]]
-name = "9"
-regex = "(?U)a+?"
-input = "aa"
-matches = [[0, 2]]
-
-[[tests]]
-name = "10"
-regex = "(?U)(?-U)a+"
-input = "aa"
-matches = [[0, 2]]
-
-[[tests]]
-name = "11"
-regex = '(?m)(?:^\d+$\n?)+'
-input = "123\n456\n789"
-matches = [[0, 11]]
diff --git a/vendor/regex-automata/tests/data/fowler/basic.toml b/vendor/regex-automata/tests/data/fowler/basic.toml
deleted file mode 100644
index c965f26ff..000000000
--- a/vendor/regex-automata/tests/data/fowler/basic.toml
+++ /dev/null
@@ -1,1638 +0,0 @@
-# !!! DO NOT EDIT !!!
-# Automatically generated by scripts/fowler-to-toml.
-# Numbers in the test names correspond to the line number of the test from
-# the original dat file.
-
-[[tests]]
-name = "basic3"
-regex = '''abracadabra$'''
-input = '''abracadabracadabra'''
-captures = [[[7, 18]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic4"
-regex = '''a...b'''
-input = '''abababbb'''
-captures = [[[2, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic5"
-regex = '''XXXXXX'''
-input = '''..XXXXXX'''
-captures = [[[2, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic6"
-regex = '''\)'''
-input = '''()'''
-captures = [[[1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic7"
-regex = '''a]'''
-input = '''a]a'''
-captures = [[[0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic9"
-regex = '''\}'''
-input = '''}'''
-captures = [[[0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic10"
-regex = '''\]'''
-input = ''']'''
-captures = [[[0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic12"
-regex = ''']'''
-input = ''']'''
-captures = [[[0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic15"
-regex = '''^a'''
-input = '''ax'''
-captures = [[[0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic16"
-regex = '''\^a'''
-input = '''a^a'''
-captures = [[[1, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic17"
-regex = '''a\^'''
-input = '''a^'''
-captures = [[[0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic18"
-regex = '''a$'''
-input = '''aa'''
-captures = [[[1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic19"
-regex = '''a\$'''
-input = '''a$'''
-captures = [[[0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic20"
-regex = '''^$'''
-input = ''''''
-captures = [[[0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic21"
-regex = '''$^'''
-input = ''''''
-captures = [[[0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic22"
-regex = '''a($)'''
-input = '''aa'''
-captures = [[[1, 2], [2, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic23"
-regex = '''a*(^a)'''
-input = '''aa'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic24"
-regex = '''(..)*(...)*'''
-input = '''a'''
-captures = [[[0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic25"
-regex = '''(..)*(...)*'''
-input = '''abcd'''
-captures = [[[0, 4], [2, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic26"
-regex = '''(ab|a)(bc|c)'''
-input = '''abc'''
-captures = [[[0, 3], [0, 2], [2, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic27"
-regex = '''(ab)c|abc'''
-input = '''abc'''
-captures = [[[0, 3], [0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic28"
-regex = '''a{0}b'''
-input = '''ab'''
-captures = [[[1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic29"
-regex = '''(a*)(b?)(b+)b{3}'''
-input = '''aaabbbbbbb'''
-captures = [[[0, 10], [0, 3], [3, 4], [4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic30"
-regex = '''(a*)(b{0,1})(b{1,})b{3}'''
-input = '''aaabbbbbbb'''
-captures = [[[0, 10], [0, 3], [3, 4], [4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic32"
-regex = '''((a|a)|a)'''
-input = '''a'''
-captures = [[[0, 1], [0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic33"
-regex = '''(a*)(a|aa)'''
-input = '''aaaa'''
-captures = [[[0, 4], [0, 3], [3, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic34"
-regex = '''a*(a.|aa)'''
-input = '''aaaa'''
-captures = [[[0, 4], [2, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic35"
-regex = '''a(b)|c(d)|a(e)f'''
-input = '''aef'''
-captures = [[[0, 3], [], [], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic36"
-regex = '''(a|b)?.*'''
-input = '''b'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic37"
-regex = '''(a|b)c|a(b|c)'''
-input = '''ac'''
-captures = [[[0, 2], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic38"
-regex = '''(a|b)c|a(b|c)'''
-input = '''ab'''
-captures = [[[0, 2], [], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic39"
-regex = '''(a|b)*c|(a|ab)*c'''
-input = '''abc'''
-captures = [[[0, 3], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic40"
-regex = '''(a|b)*c|(a|ab)*c'''
-input = '''xc'''
-captures = [[[1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic41"
-regex = '''(.a|.b).*|.*(.a|.b)'''
-input = '''xa'''
-captures = [[[0, 2], [0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic42"
-regex = '''a?(ab|ba)ab'''
-input = '''abab'''
-captures = [[[0, 4], [0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic43"
-regex = '''a?(ac{0}b|ba)ab'''
-input = '''abab'''
-captures = [[[0, 4], [0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic44"
-regex = '''ab|abab'''
-input = '''abbabab'''
-captures = [[[0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic45"
-regex = '''aba|bab|bba'''
-input = '''baaabbbaba'''
-captures = [[[5, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic46"
-regex = '''aba|bab'''
-input = '''baaabbbaba'''
-captures = [[[6, 9]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic47"
-regex = '''(aa|aaa)*|(a|aaaaa)'''
-input = '''aa'''
-captures = [[[0, 2], [0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic48"
-regex = '''(a.|.a.)*|(a|.a...)'''
-input = '''aa'''
-captures = [[[0, 2], [0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic49"
-regex = '''ab|a'''
-input = '''xabc'''
-captures = [[[1, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic50"
-regex = '''ab|a'''
-input = '''xxabc'''
-captures = [[[2, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic51"
-regex = '''(Ab|cD)*'''
-input = '''aBcD'''
-captures = [[[0, 4], [2, 4]]]
-match_limit = 1
-unescape = true
-case_insensitive = true
-
-[[tests]]
-name = "basic52"
-regex = '''[^-]'''
-input = '''--a'''
-captures = [[[2, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic53"
-regex = '''[a-]*'''
-input = '''--a'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic54"
-regex = '''[a-m-]*'''
-input = '''--amoma--'''
-captures = [[[0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic55"
-regex = ''':::1:::0:|:::1:1:0:'''
-input = ''':::0:::1:::1:::0:'''
-captures = [[[8, 17]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic56"
-regex = ''':::1:::0:|:::1:1:1:'''
-input = ''':::0:::1:::1:::0:'''
-captures = [[[8, 17]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic57"
-regex = '''[[:upper:]]'''
-input = '''A'''
-captures = [[[0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic58"
-regex = '''[[:lower:]]+'''
-input = '''`az{'''
-captures = [[[1, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic59"
-regex = '''[[:upper:]]+'''
-input = '''@AZ['''
-captures = [[[1, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic65"
-regex = '''\n'''
-input = '''\n'''
-captures = [[[0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic66"
-regex = '''\n'''
-input = '''\n'''
-captures = [[[0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic67"
-regex = '''[^a]'''
-input = '''\n'''
-captures = [[[0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic68"
-regex = '''\na'''
-input = '''\na'''
-captures = [[[0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic69"
-regex = '''(a)(b)(c)'''
-input = '''abc'''
-captures = [[[0, 3], [0, 1], [1, 2], [2, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic70"
-regex = '''xxx'''
-input = '''xxx'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic71"
-regex = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
-input = '''feb 6,'''
-captures = [[[0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic72"
-regex = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
-input = '''2/7'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic73"
-regex = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
-input = '''feb 1,Feb 6'''
-captures = [[[5, 11]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic74"
-regex = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))'''
-input = '''x'''
-captures = [[[0, 1], [0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic75"
-regex = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*'''
-input = '''xx'''
-captures = [[[0, 2], [1, 2], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic76"
-regex = '''a?(ab|ba)*'''
-input = '''ababababababababababababababababababababababababababababababababababababababababa'''
-captures = [[[0, 81], [79, 81]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic77"
-regex = '''abaa|abbaa|abbbaa|abbbbaa'''
-input = '''ababbabbbabbbabbbbabbbbaa'''
-captures = [[[18, 25]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic78"
-regex = '''abaa|abbaa|abbbaa|abbbbaa'''
-input = '''ababbabbbabbbabbbbabaa'''
-captures = [[[18, 22]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic79"
-regex = '''aaac|aabc|abac|abbc|baac|babc|bbac|bbbc'''
-input = '''baaabbbabac'''
-captures = [[[7, 11]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic80"
-regex = '''.*'''
-input = '''\x01\x7f'''
-captures = [[[0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic81"
-regex = '''aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll'''
-input = '''XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa'''
-captures = [[[53, 57]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic83"
-regex = '''a*a*a*a*a*b'''
-input = '''aaaaaaaaab'''
-captures = [[[0, 10]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic84"
-regex = '''^'''
-input = ''''''
-captures = [[[0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic85"
-regex = '''$'''
-input = ''''''
-captures = [[[0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic86"
-regex = '''^$'''
-input = ''''''
-captures = [[[0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic87"
-regex = '''^a$'''
-input = '''a'''
-captures = [[[0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic88"
-regex = '''abc'''
-input = '''abc'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic89"
-regex = '''abc'''
-input = '''xabcy'''
-captures = [[[1, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic90"
-regex = '''abc'''
-input = '''ababc'''
-captures = [[[2, 5]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic91"
-regex = '''ab*c'''
-input = '''abc'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic92"
-regex = '''ab*bc'''
-input = '''abc'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic93"
-regex = '''ab*bc'''
-input = '''abbc'''
-captures = [[[0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic94"
-regex = '''ab*bc'''
-input = '''abbbbc'''
-captures = [[[0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic95"
-regex = '''ab+bc'''
-input = '''abbc'''
-captures = [[[0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic96"
-regex = '''ab+bc'''
-input = '''abbbbc'''
-captures = [[[0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic97"
-regex = '''ab?bc'''
-input = '''abbc'''
-captures = [[[0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic98"
-regex = '''ab?bc'''
-input = '''abc'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic99"
-regex = '''ab?c'''
-input = '''abc'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic100"
-regex = '''^abc$'''
-input = '''abc'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic101"
-regex = '''^abc'''
-input = '''abcc'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic102"
-regex = '''abc$'''
-input = '''aabc'''
-captures = [[[1, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic103"
-regex = '''^'''
-input = '''abc'''
-captures = [[[0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic104"
-regex = '''$'''
-input = '''abc'''
-captures = [[[3, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic105"
-regex = '''a.c'''
-input = '''abc'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic106"
-regex = '''a.c'''
-input = '''axc'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic107"
-regex = '''a.*c'''
-input = '''axyzc'''
-captures = [[[0, 5]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic108"
-regex = '''a[bc]d'''
-input = '''abd'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic109"
-regex = '''a[b-d]e'''
-input = '''ace'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic110"
-regex = '''a[b-d]'''
-input = '''aac'''
-captures = [[[1, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic111"
-regex = '''a[-b]'''
-input = '''a-'''
-captures = [[[0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic112"
-regex = '''a[b-]'''
-input = '''a-'''
-captures = [[[0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic113"
-regex = '''a]'''
-input = '''a]'''
-captures = [[[0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic114"
-regex = '''a[]]b'''
-input = '''a]b'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic115"
-regex = '''a[^bc]d'''
-input = '''aed'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic116"
-regex = '''a[^-b]c'''
-input = '''adc'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic117"
-regex = '''a[^]b]c'''
-input = '''adc'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic118"
-regex = '''ab|cd'''
-input = '''abc'''
-captures = [[[0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic119"
-regex = '''ab|cd'''
-input = '''abcd'''
-captures = [[[0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic120"
-regex = '''a\(b'''
-input = '''a(b'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic121"
-regex = '''a\(*b'''
-input = '''ab'''
-captures = [[[0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic122"
-regex = '''a\(*b'''
-input = '''a((b'''
-captures = [[[0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic123"
-regex = '''((a))'''
-input = '''abc'''
-captures = [[[0, 1], [0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic124"
-regex = '''(a)b(c)'''
-input = '''abc'''
-captures = [[[0, 3], [0, 1], [2, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic125"
-regex = '''a+b+c'''
-input = '''aabbabc'''
-captures = [[[4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic126"
-regex = '''a*'''
-input = '''aaa'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic128"
-regex = '''(a*)*'''
-input = '''-'''
-captures = [[[0, 0], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic129"
-regex = '''(a*)+'''
-input = '''-'''
-captures = [[[0, 0], [0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic131"
-regex = '''(a*|b)*'''
-input = '''-'''
-captures = [[[0, 0], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic132"
-regex = '''(a+|b)*'''
-input = '''ab'''
-captures = [[[0, 2], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic133"
-regex = '''(a+|b)+'''
-input = '''ab'''
-captures = [[[0, 2], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic134"
-regex = '''(a+|b)?'''
-input = '''ab'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic135"
-regex = '''[^ab]*'''
-input = '''cde'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic137"
-regex = '''(^)*'''
-input = '''-'''
-captures = [[[0, 0], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic138"
-regex = '''a*'''
-input = ''''''
-captures = [[[0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic139"
-regex = '''([abc])*d'''
-input = '''abbbcd'''
-captures = [[[0, 6], [4, 5]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic140"
-regex = '''([abc])*bcd'''
-input = '''abcd'''
-captures = [[[0, 4], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic141"
-regex = '''a|b|c|d|e'''
-input = '''e'''
-captures = [[[0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic142"
-regex = '''(a|b|c|d|e)f'''
-input = '''ef'''
-captures = [[[0, 2], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic144"
-regex = '''((a*|b))*'''
-input = '''-'''
-captures = [[[0, 0], [], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic145"
-regex = '''abcd*efg'''
-input = '''abcdefg'''
-captures = [[[0, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic146"
-regex = '''ab*'''
-input = '''xabyabbbz'''
-captures = [[[1, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic147"
-regex = '''ab*'''
-input = '''xayabbbz'''
-captures = [[[1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic148"
-regex = '''(ab|cd)e'''
-input = '''abcde'''
-captures = [[[2, 5], [2, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic149"
-regex = '''[abhgefdc]ij'''
-input = '''hij'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic150"
-regex = '''(a|b)c*d'''
-input = '''abcd'''
-captures = [[[1, 4], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic151"
-regex = '''(ab|ab*)bc'''
-input = '''abc'''
-captures = [[[0, 3], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic152"
-regex = '''a([bc]*)c*'''
-input = '''abc'''
-captures = [[[0, 3], [1, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic153"
-regex = '''a([bc]*)(c*d)'''
-input = '''abcd'''
-captures = [[[0, 4], [1, 3], [3, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic154"
-regex = '''a([bc]+)(c*d)'''
-input = '''abcd'''
-captures = [[[0, 4], [1, 3], [3, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic155"
-regex = '''a([bc]*)(c+d)'''
-input = '''abcd'''
-captures = [[[0, 4], [1, 2], [2, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic156"
-regex = '''a[bcd]*dcdcde'''
-input = '''adcdcde'''
-captures = [[[0, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic157"
-regex = '''(ab|a)b*c'''
-input = '''abc'''
-captures = [[[0, 3], [0, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic158"
-regex = '''((a)(b)c)(d)'''
-input = '''abcd'''
-captures = [[[0, 4], [0, 3], [0, 1], [1, 2], [3, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic159"
-regex = '''[A-Za-z_][A-Za-z0-9_]*'''
-input = '''alpha'''
-captures = [[[0, 5]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic160"
-regex = '''^a(bc+|b[eh])g|.h$'''
-input = '''abh'''
-captures = [[[1, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic161"
-regex = '''(bc+d$|ef*g.|h?i(j|k))'''
-input = '''effgz'''
-captures = [[[0, 5], [0, 5]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic162"
-regex = '''(bc+d$|ef*g.|h?i(j|k))'''
-input = '''ij'''
-captures = [[[0, 2], [0, 2], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic163"
-regex = '''(bc+d$|ef*g.|h?i(j|k))'''
-input = '''reffgz'''
-captures = [[[1, 6], [1, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic164"
-regex = '''(((((((((a)))))))))'''
-input = '''a'''
-captures = [[[0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic165"
-regex = '''multiple words'''
-input = '''multiple words yeah'''
-captures = [[[0, 14]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic166"
-regex = '''(.*)c(.*)'''
-input = '''abcde'''
-captures = [[[0, 5], [0, 2], [3, 5]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic167"
-regex = '''abcd'''
-input = '''abcd'''
-captures = [[[0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic168"
-regex = '''a(bc)d'''
-input = '''abcd'''
-captures = [[[0, 4], [1, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic169"
-regex = '''a[\x01-\x03]?c'''
-input = '''a\x02c'''
-captures = [[[0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic170"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Qaddafi'''
-captures = [[[0, 15], [], [10, 12]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic171"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Mo'ammar Gadhafi'''
-captures = [[[0, 16], [], [11, 13]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic172"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Kaddafi'''
-captures = [[[0, 15], [], [10, 12]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic173"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Qadhafi'''
-captures = [[[0, 15], [], [10, 12]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic174"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Gadafi'''
-captures = [[[0, 14], [], [10, 11]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic175"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Mu'ammar Qadafi'''
-captures = [[[0, 15], [], [11, 12]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic176"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Moamar Gaddafi'''
-captures = [[[0, 14], [], [9, 11]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic177"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Mu'ammar Qadhdhafi'''
-captures = [[[0, 18], [], [13, 15]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic178"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Khaddafi'''
-captures = [[[0, 16], [], [11, 13]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic179"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Ghaddafy'''
-captures = [[[0, 16], [], [11, 13]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic180"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Ghadafi'''
-captures = [[[0, 15], [], [11, 12]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic181"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Ghaddafi'''
-captures = [[[0, 16], [], [11, 13]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic182"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muamar Kaddafi'''
-captures = [[[0, 14], [], [9, 11]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic183"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Quathafi'''
-captures = [[[0, 16], [], [11, 13]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic184"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Gheddafi'''
-captures = [[[0, 16], [], [11, 13]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic185"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Moammar Khadafy'''
-captures = [[[0, 15], [], [11, 12]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic186"
-regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Moammar Qudhafi'''
-captures = [[[0, 15], [], [10, 12]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic187"
-regex = '''a+(b|c)*d+'''
-input = '''aabcdd'''
-captures = [[[0, 6], [3, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic188"
-regex = '''^.+$'''
-input = '''vivi'''
-captures = [[[0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic189"
-regex = '''^(.+)$'''
-input = '''vivi'''
-captures = [[[0, 4], [0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic190"
-regex = '''^([^!.]+).att.com!(.+)$'''
-input = '''gryphon.att.com!eby'''
-captures = [[[0, 19], [0, 7], [16, 19]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic191"
-regex = '''^([^!]+!)?([^!]+)$'''
-input = '''bas'''
-captures = [[[0, 3], [], [0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic192"
-regex = '''^([^!]+!)?([^!]+)$'''
-input = '''bar!bas'''
-captures = [[[0, 7], [0, 4], [4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic193"
-regex = '''^([^!]+!)?([^!]+)$'''
-input = '''foo!bas'''
-captures = [[[0, 7], [0, 4], [4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic194"
-regex = '''^.+!([^!]+!)([^!]+)$'''
-input = '''foo!bar!bas'''
-captures = [[[0, 11], [4, 8], [8, 11]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic195"
-regex = '''((foo)|(bar))!bas'''
-input = '''bar!bas'''
-captures = [[[0, 7], [0, 3], [], [0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic196"
-regex = '''((foo)|(bar))!bas'''
-input = '''foo!bar!bas'''
-captures = [[[4, 11], [4, 7], [], [4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic197"
-regex = '''((foo)|(bar))!bas'''
-input = '''foo!bas'''
-captures = [[[0, 7], [0, 3], [0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic198"
-regex = '''((foo)|bar)!bas'''
-input = '''bar!bas'''
-captures = [[[0, 7], [0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic199"
-regex = '''((foo)|bar)!bas'''
-input = '''foo!bar!bas'''
-captures = [[[4, 11], [4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic200"
-regex = '''((foo)|bar)!bas'''
-input = '''foo!bas'''
-captures = [[[0, 7], [0, 3], [0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic201"
-regex = '''(foo|(bar))!bas'''
-input = '''bar!bas'''
-captures = [[[0, 7], [0, 3], [0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic202"
-regex = '''(foo|(bar))!bas'''
-input = '''foo!bar!bas'''
-captures = [[[4, 11], [4, 7], [4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic203"
-regex = '''(foo|(bar))!bas'''
-input = '''foo!bas'''
-captures = [[[0, 7], [0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic204"
-regex = '''(foo|bar)!bas'''
-input = '''bar!bas'''
-captures = [[[0, 7], [0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic205"
-regex = '''(foo|bar)!bas'''
-input = '''foo!bar!bas'''
-captures = [[[4, 11], [4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic206"
-regex = '''(foo|bar)!bas'''
-input = '''foo!bas'''
-captures = [[[0, 7], [0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic207"
-regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
-input = '''foo!bar!bas'''
-captures = [[[0, 11], [0, 11], [], [], [4, 8], [8, 11]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic208"
-regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
-input = '''bas'''
-captures = [[[0, 3], [], [0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic209"
-regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
-input = '''bar!bas'''
-captures = [[[0, 7], [0, 4], [4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic210"
-regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
-input = '''foo!bar!bas'''
-captures = [[[0, 11], [], [], [4, 8], [8, 11]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic211"
-regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
-input = '''foo!bas'''
-captures = [[[0, 7], [0, 4], [4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic212"
-regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
-input = '''bas'''
-captures = [[[0, 3], [0, 3], [], [0, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic213"
-regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
-input = '''bar!bas'''
-captures = [[[0, 7], [0, 7], [0, 4], [4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic214"
-regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
-input = '''foo!bar!bas'''
-captures = [[[0, 11], [0, 11], [], [], [4, 8], [8, 11]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic215"
-regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
-input = '''foo!bas'''
-captures = [[[0, 7], [0, 7], [0, 4], [4, 7]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic216"
-regex = '''.*(/XXX).*'''
-input = '''/XXX'''
-captures = [[[0, 4], [0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic217"
-regex = '''.*(\\XXX).*'''
-input = '''\\XXX'''
-captures = [[[0, 4], [0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic218"
-regex = '''\\XXX'''
-input = '''\\XXX'''
-captures = [[[0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic219"
-regex = '''.*(/000).*'''
-input = '''/000'''
-captures = [[[0, 4], [0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic220"
-regex = '''.*(\\000).*'''
-input = '''\\000'''
-captures = [[[0, 4], [0, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "basic221"
-regex = '''\\000'''
-input = '''\\000'''
-captures = [[[0, 4]]]
-match_limit = 1
-unescape = true
-
diff --git a/vendor/regex-automata/tests/data/fowler/dat/README b/vendor/regex-automata/tests/data/fowler/dat/README
deleted file mode 100644
index e70072500..000000000
--- a/vendor/regex-automata/tests/data/fowler/dat/README
+++ /dev/null
@@ -1,24 +0,0 @@
-Test data was taken from the Go distribution, which was in turn taken from the
-testregex test suite:
-
- http://www2.research.att.com/~astopen/testregex/testregex.html
-
-Unfortunately, the above link is now dead, but the test data lives on.
-
-The LICENSE in this directory corresponds to the LICENSE that the data was
-originally released under.
-
-The tests themselves were modified for RE2/Go. A couple were modified further
-by me, Andrew Gallant (only in repetition.dat), so that RE2/Go would pass them.
-(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
-have been a bad idea, but I think being consistent with an established regex
-library is worth something.
-
-After some number of years, these tests were transformed into a TOML format
-using the fowler-to-toml script in the 'scripts' directory. To re-generate the
-TOML files, then run the following from the root of this repository:
-
- ./scripts/fowler-to-toml tests/data/fowler tests/data/fowler/dat/*.dat
-
-which brings them into a sensible structured format in which other tests can
-be written.
diff --git a/vendor/regex-automata/tests/data/fowler/dat/basic.dat b/vendor/regex-automata/tests/data/fowler/dat/basic.dat
deleted file mode 100644
index e55efaeec..000000000
--- a/vendor/regex-automata/tests/data/fowler/dat/basic.dat
+++ /dev/null
@@ -1,221 +0,0 @@
-NOTE all standard compliant implementations should pass these : 2002-05-31
-
-BE abracadabra$ abracadabracadabra (7,18)
-BE a...b abababbb (2,7)
-BE XXXXXX ..XXXXXX (2,8)
-E \) () (1,2)
-BE a] a]a (0,2)
-B } } (0,1)
-E \} } (0,1)
-BE \] ] (0,1)
-B ] ] (0,1)
-E ] ] (0,1)
-B { { (0,1)
-B } } (0,1)
-BE ^a ax (0,1)
-BE \^a a^a (1,3)
-BE a\^ a^ (0,2)
-BE a$ aa (1,2)
-BE a\$ a$ (0,2)
-BE ^$ NULL (0,0)
-E $^ NULL (0,0)
-E a($) aa (1,2)(2,2)
-E a*(^a) aa (0,1)(0,1)
-E (..)*(...)* a (0,0)
-E (..)*(...)* abcd (0,4)(2,4)
-E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
-E (ab)c|abc abc (0,3)(0,2)
-E a{0}b ab (1,2)
-E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
-E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
-E a{9876543210} NULL BADBR
-E ((a|a)|a) a (0,1)(0,1)(0,1)
-E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
-E a*(a.|aa) aaaa (0,4)(2,4)
-E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
-E (a|b)?.* b (0,1)(0,1)
-E (a|b)c|a(b|c) ac (0,2)(0,1)
-E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
-E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
-E (a|b)*c|(a|ab)*c xc (1,2)
-E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
-E a?(ab|ba)ab abab (0,4)(0,2)
-E a?(ac{0}b|ba)ab abab (0,4)(0,2)
-E ab|abab abbabab (0,2)
-E aba|bab|bba baaabbbaba (5,8)
-E aba|bab baaabbbaba (6,9)
-E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
-E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
-E ab|a xabc (1,3)
-E ab|a xxabc (2,4)
-Ei (Ab|cD)* aBcD (0,4)(2,4)
-BE [^-] --a (2,3)
-BE [a-]* --a (0,3)
-BE [a-m-]* --amoma-- (0,4)
-E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
-E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
-{E [[:upper:]] A (0,1) [[<element>]] not supported
-E [[:lower:]]+ `az{ (1,3)
-E [[:upper:]]+ @AZ[ (1,3)
-# No collation in Go
-#BE [[-]] [[-]] (2,4)
-#BE [[.NIL.]] NULL ECOLLATE
-#BE [[=aleph=]] NULL ECOLLATE
-}
-BE$ \n \n (0,1)
-BEn$ \n \n (0,1)
-BE$ [^a] \n (0,1)
-BE$ \na \na (0,2)
-E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
-BE xxx xxx (0,3)
-E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
-E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
-E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
-E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
-E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
-E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
-E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
-E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
-E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
-BE$ .* \x01\x7f (0,2)
-E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
-L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
-E a*a*a*a*a*b aaaaaaaaab (0,10)
-BE ^ NULL (0,0)
-BE $ NULL (0,0)
-BE ^$ NULL (0,0)
-BE ^a$ a (0,1)
-BE abc abc (0,3)
-BE abc xabcy (1,4)
-BE abc ababc (2,5)
-BE ab*c abc (0,3)
-BE ab*bc abc (0,3)
-BE ab*bc abbc (0,4)
-BE ab*bc abbbbc (0,6)
-E ab+bc abbc (0,4)
-E ab+bc abbbbc (0,6)
-E ab?bc abbc (0,4)
-E ab?bc abc (0,3)
-E ab?c abc (0,3)
-BE ^abc$ abc (0,3)
-BE ^abc abcc (0,3)
-BE abc$ aabc (1,4)
-BE ^ abc (0,0)
-BE $ abc (3,3)
-BE a.c abc (0,3)
-BE a.c axc (0,3)
-BE a.*c axyzc (0,5)
-BE a[bc]d abd (0,3)
-BE a[b-d]e ace (0,3)
-BE a[b-d] aac (1,3)
-BE a[-b] a- (0,2)
-BE a[b-] a- (0,2)
-BE a] a] (0,2)
-BE a[]]b a]b (0,3)
-BE a[^bc]d aed (0,3)
-BE a[^-b]c adc (0,3)
-BE a[^]b]c adc (0,3)
-E ab|cd abc (0,2)
-E ab|cd abcd (0,2)
-E a\(b a(b (0,3)
-E a\(*b ab (0,2)
-E a\(*b a((b (0,4)
-E ((a)) abc (0,1)(0,1)(0,1)
-E (a)b(c) abc (0,3)(0,1)(2,3)
-E a+b+c aabbabc (4,7)
-E a* aaa (0,3)
-#E (a*)* - (0,0)(0,0)
-E (a*)* - (0,0)(?,?) RE2/Go
-E (a*)+ - (0,0)(0,0)
-#E (a*|b)* - (0,0)(0,0)
-E (a*|b)* - (0,0)(?,?) RE2/Go
-E (a+|b)* ab (0,2)(1,2)
-E (a+|b)+ ab (0,2)(1,2)
-E (a+|b)? ab (0,1)(0,1)
-BE [^ab]* cde (0,3)
-#E (^)* - (0,0)(0,0)
-E (^)* - (0,0)(?,?) RE2/Go
-BE a* NULL (0,0)
-E ([abc])*d abbbcd (0,6)(4,5)
-E ([abc])*bcd abcd (0,4)(0,1)
-E a|b|c|d|e e (0,1)
-E (a|b|c|d|e)f ef (0,2)(0,1)
-#E ((a*|b))* - (0,0)(0,0)(0,0)
-E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
-BE abcd*efg abcdefg (0,7)
-BE ab* xabyabbbz (1,3)
-BE ab* xayabbbz (1,2)
-E (ab|cd)e abcde (2,5)(2,4)
-BE [abhgefdc]ij hij (0,3)
-E (a|b)c*d abcd (1,4)(1,2)
-E (ab|ab*)bc abc (0,3)(0,1)
-E a([bc]*)c* abc (0,3)(1,3)
-E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
-E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
-E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
-E a[bcd]*dcdcde adcdcde (0,7)
-E (ab|a)b*c abc (0,3)(0,2)
-E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
-BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
-E ^a(bc+|b[eh])g|.h$ abh (1,3)
-E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
-E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
-E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
-E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
-BE multiple words multiple words yeah (0,14)
-E (.*)c(.*) abcde (0,5)(0,2)(3,5)
-BE abcd abcd (0,4)
-E a(bc)d abcd (0,4)(1,3)
-E a[-]?c ac (0,3)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
-E a+(b|c)*d+ aabcdd (0,6)(3,4)
-E ^.+$ vivi (0,4)
-E ^(.+)$ vivi (0,4)(0,4)
-E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
-E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
-E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
-E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
-E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
-E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
-E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
-E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
-E ((foo)|bar)!bas bar!bas (0,7)(0,3)
-E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
-E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
-E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
-E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
-E (foo|(bar))!bas foo!bas (0,7)(0,3)
-E (foo|bar)!bas bar!bas (0,7)(0,3)
-E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
-E (foo|bar)!bas foo!bas (0,7)(0,3)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
-E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
-E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
-E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
-E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
-E .*(/XXX).* /XXX (0,4)(0,4)
-E .*(\\XXX).* \XXX (0,4)(0,4)
-E \\XXX \XXX (0,4)
-E .*(/000).* /000 (0,4)(0,4)
-E .*(\\000).* \000 (0,4)(0,4)
-E \\000 \000 (0,4)
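A reading key for the entries above, per the testregex harness conventions (inferred from the data rather than stated in this file): the leading flags pick the dialect and options — B compiles the pattern as a POSIX BRE, E as an ERE, BE as both; a trailing $ unescapes sequences such as \n and \x01 in the pattern and input; i requests case-insensitive matching; L treats the pattern as a fixed literal; other single-letter flags select rarer harness options. NULL stands for an empty input string. The final column gives the expected (start,end) offsets for the overall match followed by each capture group, with (?,?) for a group that did not participate, NOMATCH when no match is expected, and error codes such as BADBR for patterns that must fail to compile.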
diff --git a/vendor/regex-automata/tests/data/fowler/dat/nullsubexpr.dat b/vendor/regex-automata/tests/data/fowler/dat/nullsubexpr.dat
deleted file mode 100644
index 2e18fbb91..000000000
--- a/vendor/regex-automata/tests/data/fowler/dat/nullsubexpr.dat
+++ /dev/null
@@ -1,79 +0,0 @@
-NOTE null subexpression matches : 2002-06-06
-
-E (a*)* a (0,1)(0,1)
-#E SAME x (0,0)(0,0)
-E SAME x (0,0)(?,?) RE2/Go
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E (a*)+ a (0,1)(0,1)
-E SAME x (0,0)(0,0)
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E (a+)* a (0,1)(0,1)
-E SAME x (0,0)
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E (a+)+ a (0,1)(0,1)
-E SAME x NOMATCH
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-
-E ([a]*)* a (0,1)(0,1)
-#E SAME x (0,0)(0,0)
-E SAME x (0,0)(?,?) RE2/Go
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E ([a]*)+ a (0,1)(0,1)
-E SAME x (0,0)(0,0)
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E ([^b]*)* a (0,1)(0,1)
-#E SAME b (0,0)(0,0)
-E SAME b (0,0)(?,?) RE2/Go
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaab (0,6)(0,6)
-E ([ab]*)* a (0,1)(0,1)
-E SAME aaaaaa (0,6)(0,6)
-E SAME ababab (0,6)(0,6)
-E SAME bababa (0,6)(0,6)
-E SAME b (0,1)(0,1)
-E SAME bbbbbb (0,6)(0,6)
-E SAME aaaabcde (0,5)(0,5)
-E ([^a]*)* b (0,1)(0,1)
-E SAME bbbbbb (0,6)(0,6)
-#E SAME aaaaaa (0,0)(0,0)
-E SAME aaaaaa (0,0)(?,?) RE2/Go
-E ([^ab]*)* ccccxx (0,6)(0,6)
-#E SAME ababab (0,0)(0,0)
-E SAME ababab (0,0)(?,?) RE2/Go
-
-E ((z)+|a)* zabcde (0,2)(1,2)
-
-#{E	a+?	aaaaaa	(0,1)	no *? +? minimal match ops
-#E (a) aaa (0,1)(0,1)
-#E (a*?) aaa (0,0)(0,0)
-#E (a)*? aaa (0,0)
-#E (a*?)*? aaa (0,0)
-#}
-
-B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
-B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
-B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
-B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
-B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
-B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
-B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
-B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
-
-#E (a*)*(x) x (0,1)(0,0)(0,1)
-E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
-E (a*)*(x) ax (0,2)(0,1)(1,2)
-E (a*)*(x) axa (0,2)(0,1)(1,2)
-
-E (a*)+(x) x (0,1)(0,0)(0,1)
-E (a*)+(x) ax (0,2)(0,1)(1,2)
-E (a*)+(x) axa (0,2)(0,1)(1,2)
-
-E (a*){2}(x) x (0,1)(0,0)(0,1)
-E (a*){2}(x) ax (0,2)(1,1)(1,2)
-E (a*){2}(x) axa (0,2)(1,1)(1,2)
diff --git a/vendor/regex-automata/tests/data/fowler/dat/repetition-expensive.dat b/vendor/regex-automata/tests/data/fowler/dat/repetition-expensive.dat
deleted file mode 100644
index c91580236..000000000
--- a/vendor/regex-automata/tests/data/fowler/dat/repetition-expensive.dat
+++ /dev/null
@@ -1,85 +0,0 @@
-NOTE implicit vs. explicit repetitions : 2009-02-02
-
-# Glenn Fowler <gsf@research.att.com>
-# conforming matches (column 4) must match one of the following BREs
-# NOMATCH
-# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
-# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
-# i.e., each 3-tuple has two identical elements and one (?,?)
-
-NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
-
-:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
-:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
-:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
-:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
-:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
-:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
-:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
-:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
-:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
-#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
-:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
-:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
-:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
-:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
-:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
-:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
-:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
-:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
-:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
-
-# These test a fixed bug in my regex-tdfa that did not keep the expanded
-# form properly grouped, so right association did the wrong thing with
-# these ambiguous patterns (crafted just to test my code when I became
-# suspicious of my implementation). The first subexpression should use
-# "ab" then "a" then "bcd".
-
-# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
-# results like (0,6)(4,5)(6,6).
-
-:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
-:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
-:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
-:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
-:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
-:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
-:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
-:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
-
-# The above worked on Linux/GLIBC but the following often fail.
-# They also trip up OS X / FreeBSD / NetBSD:
-
-#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
-#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
-#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
-:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
-:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
diff --git a/vendor/regex-automata/tests/data/fowler/dat/repetition.dat b/vendor/regex-automata/tests/data/fowler/dat/repetition.dat
deleted file mode 100644
index 2dac0823f..000000000
--- a/vendor/regex-automata/tests/data/fowler/dat/repetition.dat
+++ /dev/null
@@ -1,83 +0,0 @@
-NOTE implicit vs. explicit repetitions : 2009-02-02
-
-# Glenn Fowler <gsf@research.att.com>
-# conforming matches (column 4) must match one of the following BREs
-# NOMATCH
-# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
-# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
-# i.e., each 3-tuple has two identical elements and one (?,?)
-
-E ((..)|(.)) NULL NOMATCH
-E ((..)|(.))((..)|(.)) NULL NOMATCH
-E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
-
-E ((..)|(.)){1} NULL NOMATCH
-E ((..)|(.)){2} NULL NOMATCH
-E ((..)|(.)){3} NULL NOMATCH
-
-E ((..)|(.))* NULL (0,0)
-
-E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
-E ((..)|(.))((..)|(.)) a NOMATCH
-E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
-
-E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
-E ((..)|(.)){2} a NOMATCH
-E ((..)|(.)){3} a NOMATCH
-
-E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
-
-E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
-E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
-
-E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
-E ((..)|(.)){3} aa NOMATCH
-
-E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
-
-E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
-E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
-
-E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
-#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
-E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
-E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
-
-#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
-E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
-
-E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
-E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
-
-E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
-#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
-E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
-
-E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
-
-E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
-E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
-
-E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
-#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
-E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
-
-#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
-E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
-
-E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
-E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
-
-E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
-E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
-
-E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
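The "conforming matches" note at the top of this file is dense, so here is one worked instance: the final entry above expects (0,6)(4,6)(4,6)(?,?) for ((..)|(.))* against aaaaaa. After the overall span (0,6), the three reported groups form one tuple in which the outer group and the two-character branch both report (4,6) while the one-character branch is unset — exactly the shape of the second BRE, where two elements of the 3-tuple are identical and the remaining one is (?,?).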
diff --git a/vendor/regex-automata/tests/data/fowler/nullsubexpr.toml b/vendor/regex-automata/tests/data/fowler/nullsubexpr.toml
deleted file mode 100644
index 55d1d5b43..000000000
--- a/vendor/regex-automata/tests/data/fowler/nullsubexpr.toml
+++ /dev/null
@@ -1,405 +0,0 @@
-# !!! DO NOT EDIT !!!
-# Automatically generated by scripts/fowler-to-toml.
-# Numbers in the test names correspond to the line number of the test from
-# the original dat file.
-
-[[tests]]
-name = "nullsubexpr3"
-regex = '''(a*)*'''
-input = '''a'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr5"
-regex = '''(a*)*'''
-input = '''x'''
-captures = [[[0, 0], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr6"
-regex = '''(a*)*'''
-input = '''aaaaaa'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr7"
-regex = '''(a*)*'''
-input = '''aaaaaax'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr8"
-regex = '''(a*)+'''
-input = '''a'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr9"
-regex = '''(a*)+'''
-input = '''x'''
-captures = [[[0, 0], [0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr10"
-regex = '''(a*)+'''
-input = '''aaaaaa'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr11"
-regex = '''(a*)+'''
-input = '''aaaaaax'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr12"
-regex = '''(a+)*'''
-input = '''a'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr13"
-regex = '''(a+)*'''
-input = '''x'''
-captures = [[[0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr14"
-regex = '''(a+)*'''
-input = '''aaaaaa'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr15"
-regex = '''(a+)*'''
-input = '''aaaaaax'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr16"
-regex = '''(a+)+'''
-input = '''a'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr17"
-regex = '''(a+)+'''
-input = '''x'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr18"
-regex = '''(a+)+'''
-input = '''aaaaaa'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr19"
-regex = '''(a+)+'''
-input = '''aaaaaax'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr21"
-regex = '''([a]*)*'''
-input = '''a'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr23"
-regex = '''([a]*)*'''
-input = '''x'''
-captures = [[[0, 0], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr24"
-regex = '''([a]*)*'''
-input = '''aaaaaa'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr25"
-regex = '''([a]*)*'''
-input = '''aaaaaax'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr26"
-regex = '''([a]*)+'''
-input = '''a'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr27"
-regex = '''([a]*)+'''
-input = '''x'''
-captures = [[[0, 0], [0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr28"
-regex = '''([a]*)+'''
-input = '''aaaaaa'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr29"
-regex = '''([a]*)+'''
-input = '''aaaaaax'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr30"
-regex = '''([^b]*)*'''
-input = '''a'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr32"
-regex = '''([^b]*)*'''
-input = '''b'''
-captures = [[[0, 0], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr33"
-regex = '''([^b]*)*'''
-input = '''aaaaaa'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr34"
-regex = '''([^b]*)*'''
-input = '''aaaaaab'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr35"
-regex = '''([ab]*)*'''
-input = '''a'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr36"
-regex = '''([ab]*)*'''
-input = '''aaaaaa'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr37"
-regex = '''([ab]*)*'''
-input = '''ababab'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr38"
-regex = '''([ab]*)*'''
-input = '''bababa'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr39"
-regex = '''([ab]*)*'''
-input = '''b'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr40"
-regex = '''([ab]*)*'''
-input = '''bbbbbb'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr41"
-regex = '''([ab]*)*'''
-input = '''aaaabcde'''
-captures = [[[0, 5], [0, 5]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr42"
-regex = '''([^a]*)*'''
-input = '''b'''
-captures = [[[0, 1], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr43"
-regex = '''([^a]*)*'''
-input = '''bbbbbb'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr45"
-regex = '''([^a]*)*'''
-input = '''aaaaaa'''
-captures = [[[0, 0], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr46"
-regex = '''([^ab]*)*'''
-input = '''ccccxx'''
-captures = [[[0, 6], [0, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr48"
-regex = '''([^ab]*)*'''
-input = '''ababab'''
-captures = [[[0, 0], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr50"
-regex = '''((z)+|a)*'''
-input = '''zabcde'''
-captures = [[[0, 2], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr69"
-regex = '''(a*)*(x)'''
-input = '''x'''
-captures = [[[0, 1], [], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr70"
-regex = '''(a*)*(x)'''
-input = '''ax'''
-captures = [[[0, 2], [0, 1], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr71"
-regex = '''(a*)*(x)'''
-input = '''axa'''
-captures = [[[0, 2], [0, 1], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr73"
-regex = '''(a*)+(x)'''
-input = '''x'''
-captures = [[[0, 1], [0, 0], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr74"
-regex = '''(a*)+(x)'''
-input = '''ax'''
-captures = [[[0, 2], [0, 1], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr75"
-regex = '''(a*)+(x)'''
-input = '''axa'''
-captures = [[[0, 2], [0, 1], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr77"
-regex = '''(a*){2}(x)'''
-input = '''x'''
-captures = [[[0, 1], [0, 0], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr78"
-regex = '''(a*){2}(x)'''
-input = '''ax'''
-captures = [[[0, 2], [1, 1], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "nullsubexpr79"
-regex = '''(a*){2}(x)'''
-input = '''axa'''
-captures = [[[0, 2], [1, 1], [1, 2]]]
-match_limit = 1
-unescape = true
-
diff --git a/vendor/regex-automata/tests/data/fowler/repetition-expensive.toml b/vendor/regex-automata/tests/data/fowler/repetition-expensive.toml
deleted file mode 100644
index 81a896452..000000000
--- a/vendor/regex-automata/tests/data/fowler/repetition-expensive.toml
+++ /dev/null
@@ -1,341 +0,0 @@
-# !!! DO NOT EDIT !!!
-# Automatically generated by scripts/fowler-to-toml.
-# Numbers in the test names correspond to the line number of the test from
-# the original dat file.
-
-[[tests]]
-name = "repetition-expensive12"
-regex = '''X(.?){0,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive13"
-regex = '''X(.?){1,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive14"
-regex = '''X(.?){2,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive15"
-regex = '''X(.?){3,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive16"
-regex = '''X(.?){4,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive17"
-regex = '''X(.?){5,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive18"
-regex = '''X(.?){6,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive19"
-regex = '''X(.?){7,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive20"
-regex = '''X(.?){8,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive22"
-regex = '''X(.?){0,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive24"
-regex = '''X(.?){1,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive26"
-regex = '''X(.?){2,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive28"
-regex = '''X(.?){3,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive30"
-regex = '''X(.?){4,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive32"
-regex = '''X(.?){5,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive34"
-regex = '''X(.?){6,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive36"
-regex = '''X(.?){7,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive37"
-regex = '''X(.?){8,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive48"
-regex = '''(a|ab|c|bcd){0,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 1], [0, 1], [1, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive49"
-regex = '''(a|ab|c|bcd){1,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 1], [0, 1], [1, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive50"
-regex = '''(a|ab|c|bcd){2,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [3, 6], [6, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive51"
-regex = '''(a|ab|c|bcd){3,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [3, 6], [6, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive52"
-regex = '''(a|ab|c|bcd){4,}(d*)'''
-input = '''ababcd'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive53"
-regex = '''(a|ab|c|bcd){0,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 1], [0, 1], [1, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive54"
-regex = '''(a|ab|c|bcd){1,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 1], [0, 1], [1, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive55"
-regex = '''(a|ab|c|bcd){2,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [3, 6], [6, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive56"
-regex = '''(a|ab|c|bcd){3,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [3, 6], [6, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive57"
-regex = '''(a|ab|c|bcd){4,10}(d*)'''
-input = '''ababcd'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive58"
-regex = '''(a|ab|c|bcd)*(d*)'''
-input = '''ababcd'''
-captures = [[[0, 1], [0, 1], [1, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive59"
-regex = '''(a|ab|c|bcd)+(d*)'''
-input = '''ababcd'''
-captures = [[[0, 1], [0, 1], [1, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive65"
-regex = '''(ab|a|c|bcd){0,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive67"
-regex = '''(ab|a|c|bcd){1,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive69"
-regex = '''(ab|a|c|bcd){2,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive71"
-regex = '''(ab|a|c|bcd){3,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive72"
-regex = '''(ab|a|c|bcd){4,}(d*)'''
-input = '''ababcd'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive74"
-regex = '''(ab|a|c|bcd){0,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive76"
-regex = '''(ab|a|c|bcd){1,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive78"
-regex = '''(ab|a|c|bcd){2,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive80"
-regex = '''(ab|a|c|bcd){3,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive81"
-regex = '''(ab|a|c|bcd){4,10}(d*)'''
-input = '''ababcd'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive83"
-regex = '''(ab|a|c|bcd)*(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-expensive85"
-regex = '''(ab|a|c|bcd)+(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
diff --git a/vendor/regex-automata/tests/data/fowler/repetition-long.toml b/vendor/regex-automata/tests/data/fowler/repetition-long.toml
deleted file mode 100644
index fa24c834a..000000000
--- a/vendor/regex-automata/tests/data/fowler/repetition-long.toml
+++ /dev/null
@@ -1,341 +0,0 @@
-# !!! DO NOT EDIT !!!
-# Automatically generated by scripts/fowler-to-toml.
-# Numbers in the test names correspond to the line number of the test from
-# the original dat file.
-
-[[tests]]
-name = "repetition-long12"
-regex = '''X(.?){0,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long13"
-regex = '''X(.?){1,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long14"
-regex = '''X(.?){2,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long15"
-regex = '''X(.?){3,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long16"
-regex = '''X(.?){4,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long17"
-regex = '''X(.?){5,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long18"
-regex = '''X(.?){6,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long19"
-regex = '''X(.?){7,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [7, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long20"
-regex = '''X(.?){8,}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long22"
-regex = '''X(.?){0,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long24"
-regex = '''X(.?){1,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long26"
-regex = '''X(.?){2,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long28"
-regex = '''X(.?){3,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long30"
-regex = '''X(.?){4,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long32"
-regex = '''X(.?){5,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long34"
-regex = '''X(.?){6,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long36"
-regex = '''X(.?){7,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long37"
-regex = '''X(.?){8,8}Y'''
-input = '''X1234567Y'''
-captures = [[[0, 9], [8, 8]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long48"
-regex = '''(a|ab|c|bcd){0,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 1], [0, 1], [1, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long49"
-regex = '''(a|ab|c|bcd){1,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 1], [0, 1], [1, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long50"
-regex = '''(a|ab|c|bcd){2,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [3, 6], [6, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long51"
-regex = '''(a|ab|c|bcd){3,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [3, 6], [6, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long52"
-regex = '''(a|ab|c|bcd){4,}(d*)'''
-input = '''ababcd'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long53"
-regex = '''(a|ab|c|bcd){0,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 1], [0, 1], [1, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long54"
-regex = '''(a|ab|c|bcd){1,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 1], [0, 1], [1, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long55"
-regex = '''(a|ab|c|bcd){2,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [3, 6], [6, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long56"
-regex = '''(a|ab|c|bcd){3,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [3, 6], [6, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long57"
-regex = '''(a|ab|c|bcd){4,10}(d*)'''
-input = '''ababcd'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long58"
-regex = '''(a|ab|c|bcd)*(d*)'''
-input = '''ababcd'''
-captures = [[[0, 1], [0, 1], [1, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long59"
-regex = '''(a|ab|c|bcd)+(d*)'''
-input = '''ababcd'''
-captures = [[[0, 1], [0, 1], [1, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long65"
-regex = '''(ab|a|c|bcd){0,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long67"
-regex = '''(ab|a|c|bcd){1,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long69"
-regex = '''(ab|a|c|bcd){2,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long71"
-regex = '''(ab|a|c|bcd){3,}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long72"
-regex = '''(ab|a|c|bcd){4,}(d*)'''
-input = '''ababcd'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long74"
-regex = '''(ab|a|c|bcd){0,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long76"
-regex = '''(ab|a|c|bcd){1,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long78"
-regex = '''(ab|a|c|bcd){2,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long80"
-regex = '''(ab|a|c|bcd){3,10}(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long81"
-regex = '''(ab|a|c|bcd){4,10}(d*)'''
-input = '''ababcd'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long83"
-regex = '''(ab|a|c|bcd)*(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition-long85"
-regex = '''(ab|a|c|bcd)+(d*)'''
-input = '''ababcd'''
-captures = [[[0, 6], [4, 5], [5, 6]]]
-match_limit = 1
-unescape = true
-
diff --git a/vendor/regex-automata/tests/data/fowler/repetition.toml b/vendor/regex-automata/tests/data/fowler/repetition.toml
deleted file mode 100644
index fc8da8df4..000000000
--- a/vendor/regex-automata/tests/data/fowler/repetition.toml
+++ /dev/null
@@ -1,397 +0,0 @@
-# !!! DO NOT EDIT !!!
-# Automatically generated by scripts/fowler-to-toml.
-# Numbers in the test names correspond to the line number of the test from
-# the original dat file.
-
-[[tests]]
-name = "repetition10"
-regex = '''((..)|(.))'''
-input = ''''''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition11"
-regex = '''((..)|(.))((..)|(.))'''
-input = ''''''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition12"
-regex = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = ''''''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition14"
-regex = '''((..)|(.)){1}'''
-input = ''''''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition15"
-regex = '''((..)|(.)){2}'''
-input = ''''''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition16"
-regex = '''((..)|(.)){3}'''
-input = ''''''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition18"
-regex = '''((..)|(.))*'''
-input = ''''''
-captures = [[[0, 0]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition20"
-regex = '''((..)|(.))'''
-input = '''a'''
-captures = [[[0, 1], [0, 1], [], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition21"
-regex = '''((..)|(.))((..)|(.))'''
-input = '''a'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition22"
-regex = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = '''a'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition24"
-regex = '''((..)|(.)){1}'''
-input = '''a'''
-captures = [[[0, 1], [0, 1], [], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition25"
-regex = '''((..)|(.)){2}'''
-input = '''a'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition26"
-regex = '''((..)|(.)){3}'''
-input = '''a'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition28"
-regex = '''((..)|(.))*'''
-input = '''a'''
-captures = [[[0, 1], [0, 1], [], [0, 1]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition30"
-regex = '''((..)|(.))'''
-input = '''aa'''
-captures = [[[0, 2], [0, 2], [0, 2], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition31"
-regex = '''((..)|(.))((..)|(.))'''
-input = '''aa'''
-captures = [[[0, 2], [0, 1], [], [0, 1], [1, 2], [], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition32"
-regex = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = '''aa'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition34"
-regex = '''((..)|(.)){1}'''
-input = '''aa'''
-captures = [[[0, 2], [0, 2], [0, 2], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition35"
-regex = '''((..)|(.)){2}'''
-input = '''aa'''
-captures = [[[0, 2], [1, 2], [], [1, 2]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition36"
-regex = '''((..)|(.)){3}'''
-input = '''aa'''
-captures = []
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition38"
-regex = '''((..)|(.))*'''
-input = '''aa'''
-captures = [[[0, 2], [0, 2], [0, 2], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition40"
-regex = '''((..)|(.))'''
-input = '''aaa'''
-captures = [[[0, 2], [0, 2], [0, 2], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition41"
-regex = '''((..)|(.))((..)|(.))'''
-input = '''aaa'''
-captures = [[[0, 3], [0, 2], [0, 2], [], [2, 3], [], [2, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition42"
-regex = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = '''aaa'''
-captures = [[[0, 3], [0, 1], [], [0, 1], [1, 2], [], [1, 2], [2, 3], [], [2, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition44"
-regex = '''((..)|(.)){1}'''
-input = '''aaa'''
-captures = [[[0, 2], [0, 2], [0, 2], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition46"
-regex = '''((..)|(.)){2}'''
-input = '''aaa'''
-captures = [[[0, 3], [2, 3], [0, 2], [2, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition47"
-regex = '''((..)|(.)){3}'''
-input = '''aaa'''
-captures = [[[0, 3], [2, 3], [], [2, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition50"
-regex = '''((..)|(.))*'''
-input = '''aaa'''
-captures = [[[0, 3], [2, 3], [0, 2], [2, 3]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition52"
-regex = '''((..)|(.))'''
-input = '''aaaa'''
-captures = [[[0, 2], [0, 2], [0, 2], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition53"
-regex = '''((..)|(.))((..)|(.))'''
-input = '''aaaa'''
-captures = [[[0, 4], [0, 2], [0, 2], [], [2, 4], [2, 4], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition54"
-regex = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = '''aaaa'''
-captures = [[[0, 4], [0, 2], [0, 2], [], [2, 3], [], [2, 3], [3, 4], [], [3, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition56"
-regex = '''((..)|(.)){1}'''
-input = '''aaaa'''
-captures = [[[0, 2], [0, 2], [0, 2], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition57"
-regex = '''((..)|(.)){2}'''
-input = '''aaaa'''
-captures = [[[0, 4], [2, 4], [2, 4], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition59"
-regex = '''((..)|(.)){3}'''
-input = '''aaaa'''
-captures = [[[0, 4], [3, 4], [0, 2], [3, 4]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition61"
-regex = '''((..)|(.))*'''
-input = '''aaaa'''
-captures = [[[0, 4], [2, 4], [2, 4], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition63"
-regex = '''((..)|(.))'''
-input = '''aaaaa'''
-captures = [[[0, 2], [0, 2], [0, 2], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition64"
-regex = '''((..)|(.))((..)|(.))'''
-input = '''aaaaa'''
-captures = [[[0, 4], [0, 2], [0, 2], [], [2, 4], [2, 4], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition65"
-regex = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = '''aaaaa'''
-captures = [[[0, 5], [0, 2], [0, 2], [], [2, 4], [2, 4], [], [4, 5], [], [4, 5]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition67"
-regex = '''((..)|(.)){1}'''
-input = '''aaaaa'''
-captures = [[[0, 2], [0, 2], [0, 2], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition68"
-regex = '''((..)|(.)){2}'''
-input = '''aaaaa'''
-captures = [[[0, 4], [2, 4], [2, 4], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition70"
-regex = '''((..)|(.)){3}'''
-input = '''aaaaa'''
-captures = [[[0, 5], [4, 5], [2, 4], [4, 5]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition73"
-regex = '''((..)|(.))*'''
-input = '''aaaaa'''
-captures = [[[0, 5], [4, 5], [2, 4], [4, 5]]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition75"
-regex = '''((..)|(.))'''
-input = '''aaaaaa'''
-captures = [[[0, 2], [0, 2], [0, 2], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition76"
-regex = '''((..)|(.))((..)|(.))'''
-input = '''aaaaaa'''
-captures = [[[0, 4], [0, 2], [0, 2], [], [2, 4], [2, 4], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition77"
-regex = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = '''aaaaaa'''
-captures = [[[0, 6], [0, 2], [0, 2], [], [2, 4], [2, 4], [], [4, 6], [4, 6], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition79"
-regex = '''((..)|(.)){1}'''
-input = '''aaaaaa'''
-captures = [[[0, 2], [0, 2], [0, 2], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition80"
-regex = '''((..)|(.)){2}'''
-input = '''aaaaaa'''
-captures = [[[0, 4], [2, 4], [2, 4], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition81"
-regex = '''((..)|(.)){3}'''
-input = '''aaaaaa'''
-captures = [[[0, 6], [4, 6], [4, 6], []]]
-match_limit = 1
-unescape = true
-
-[[tests]]
-name = "repetition83"
-regex = '''((..)|(.))*'''
-input = '''aaaaaa'''
-captures = [[[0, 6], [4, 6], [4, 6], []]]
-match_limit = 1
-unescape = true
-
diff --git a/vendor/regex-automata/tests/data/iter.toml b/vendor/regex-automata/tests/data/iter.toml
deleted file mode 100644
index 6c0539fd4..000000000
--- a/vendor/regex-automata/tests/data/iter.toml
+++ /dev/null
@@ -1,119 +0,0 @@
-[[tests]]
-name = "1"
-regex = "a"
-input = "aaa"
-matches = [[0, 1], [1, 2], [2, 3]]
-
-[[tests]]
-name = "2"
-regex = "a"
-input = "aba"
-matches = [[0, 1], [2, 3]]
-
-[[tests]]
-name = "empty1"
-regex = ''
-input = ''
-matches = [[0, 0]]
-
-[[tests]]
-name = "empty2"
-regex = ''
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty3"
-regex = '()'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty4"
-regex = '()*'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty5"
-regex = '()+'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty6"
-regex = '()?'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty7"
-regex = '()()'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty8"
-regex = '()+|z'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty9"
-regex = 'z|()+'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty10"
-regex = '()+|b'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "empty11"
-regex = 'b|()+'
-input = 'abc'
-matches = [[0, 0], [1, 2], [3, 3]]
-
-[[tests]]
-name = "start1"
-regex = "^a"
-input = "a"
-matches = [[0, 1]]
-
-[[tests]]
-name = "start2"
-regex = "^a"
-input = "aa"
-matches = [[0, 1]]
-
-[[tests]]
-name = "anchored1"
-regex = "a"
-input = "a"
-matches = [[0, 1]]
-anchored = true
-
-# This test is pretty subtle. It demonstrates the crucial difference between
-# '^a' and 'a' compiled in 'anchored' mode. The former regex exclusively
-# matches at the start of a haystack and nowhere else. The latter regex has
-# no such restriction, but its automaton is constructed such that it lacks a
-# `.*?` prefix. So it can actually produce matches at multiple locations.
-# The anchored3 test drives this point home.
-[[tests]]
-name = "anchored2"
-regex = "a"
-input = "aa"
-matches = [[0, 1], [1, 2]]
-anchored = true
-
-# Unlike anchored2, this test stops matching anything after it sees `b`
-# since it lacks a `.*?` prefix. Since it is looking for 'a' but sees 'b', it
-# determines that there are no remaining matches.
-[[tests]]
-name = "anchored3"
-regex = "a"
-input = "aaba"
-matches = [[0, 1], [1, 2]]
-anchored = true
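The comments above on anchored2/anchored3 capture a subtlety of anchored iteration. The following is a minimal sketch of the same behavior, assuming the meta regex API of the regex-automata version vendored here (Regex, Input, and Anchored exist in the 0.3+ API, but the snippet is an illustration, not part of the deleted tests):

    use regex_automata::{meta::Regex, Anchored, Input};

    fn main() {
        let re = Regex::new("a").unwrap();

        // Unanchored search: the implicit `.*?` prefix lets a match start
        // anywhere, so the 'a' after the 'b' is still found.
        let spans: Vec<(usize, usize)> = re
            .find_iter("aaba")
            .map(|m| (m.start(), m.end()))
            .collect();
        assert_eq!(spans, vec![(0, 1), (1, 2), (3, 4)]);

        // Anchored search: each match must begin exactly where the search
        // starts, so iteration stops as soon as 'b' is seen. This mirrors
        // the anchored3 test above: matches = [[0, 1], [1, 2]].
        let input = Input::new("aaba").anchored(Anchored::Yes);
        let spans: Vec<(usize, usize)> = re
            .find_iter(input)
            .map(|m| (m.start(), m.end()))
            .collect();
        assert_eq!(spans, vec![(0, 1), (1, 2)]);
    }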
diff --git a/vendor/regex-automata/tests/data/misc.toml b/vendor/regex-automata/tests/data/misc.toml
deleted file mode 100644
index c05418dd6..000000000
--- a/vendor/regex-automata/tests/data/misc.toml
+++ /dev/null
@@ -1,99 +0,0 @@
-[[tests]]
-name = "ascii-literal"
-regex = "a"
-input = "a"
-matches = [[0, 1]]
-
-[[tests]]
-name = "ascii-literal-not"
-regex = "a"
-input = "z"
-matches = []
-
-[[tests]]
-name = "ascii-literal-anchored"
-regex = "a"
-input = "a"
-matches = [[0, 1]]
-anchored = true
-
-[[tests]]
-name = "ascii-literal-anchored-not"
-regex = "a"
-input = "z"
-matches = []
-anchored = true
-
-[[tests]]
-name = "anchor-start-end-line"
-regex = '(?m)^bar$'
-input = "foo\nbar\nbaz"
-matches = [[4, 7]]
-
-[[tests]]
-name = "prefix-literal-match"
-regex = '^abc'
-input = "abc"
-matches = [[0, 3]]
-
-[[tests]]
-name = "prefix-literal-match-ascii"
-regex = '^abc'
-input = "abc"
-matches = [[0, 3]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "prefix-literal-no-match"
-regex = '^abc'
-input = "zabc"
-matches = []
-
-[[tests]]
-name = "one-literal-edge"
-regex = 'abc'
-input = "xxxxxab"
-matches = []
-
-[[tests]]
-name = "terminates"
-regex = 'a$'
-input = "a"
-matches = [[0, 1]]
-
-[[tests]]
-name = "suffix-100"
-regex = '.*abcd'
-input = "abcd"
-matches = [[0, 4]]
-
-[[tests]]
-name = "suffix-200"
-regex = '.*(?:abcd)+'
-input = "abcd"
-matches = [[0, 4]]
-
-[[tests]]
-name = "suffix-300"
-regex = '.*(?:abcd)+'
-input = "abcdabcd"
-matches = [[0, 8]]
-
-[[tests]]
-name = "suffix-400"
-regex = '.*(?:abcd)+'
-input = "abcdxabcd"
-matches = [[0, 9]]
-
-[[tests]]
-name = "suffix-500"
-regex = '.*x(?:abcd)+'
-input = "abcdxabcd"
-matches = [[0, 9]]
-
-[[tests]]
-name = "suffix-600"
-regex = '[^abcd]*x(?:abcd)+'
-input = "abcdxabcd"
-matches = [[4, 9]]
diff --git a/vendor/regex-automata/tests/data/multiline.toml b/vendor/regex-automata/tests/data/multiline.toml
deleted file mode 100644
index cefdb2629..000000000
--- a/vendor/regex-automata/tests/data/multiline.toml
+++ /dev/null
@@ -1,275 +0,0 @@
-[[tests]]
-name = "basic1"
-regex = '(?m)^[a-z]+$'
-input = "abc\ndef\nxyz"
-matches = [[0, 3], [4, 7], [8, 11]]
-
-[[tests]]
-name = "basic2"
-regex = '(?m)^$'
-input = "abc\ndef\nxyz"
-matches = []
-
-[[tests]]
-name = "basic3"
-regex = '(?m)^'
-input = "abc\ndef\nxyz"
-matches = [[0, 0], [4, 4], [8, 8]]
-
-[[tests]]
-name = "basic4"
-regex = '(?m)$'
-input = "abc\ndef\nxyz"
-matches = [[3, 3], [7, 7], [11, 11]]
-
-[[tests]]
-name = "basic5"
-regex = '(?m)^[a-z]'
-input = "abc\ndef\nxyz"
-matches = [[0, 1], [4, 5], [8, 9]]
-
-[[tests]]
-name = "basic6"
-regex = '(?m)[a-z]^'
-input = "abc\ndef\nxyz"
-matches = []
-
-[[tests]]
-name = "basic7"
-regex = '(?m)[a-z]$'
-input = "abc\ndef\nxyz"
-matches = [[2, 3], [6, 7], [10, 11]]
-
-[[tests]]
-name = "basic8"
-regex = '(?m)$[a-z]'
-input = "abc\ndef\nxyz"
-matches = []
-
-[[tests]]
-name = "basic9"
-regex = '(?m)^$'
-input = ""
-matches = [[0, 0]]
-
-[[tests]]
-name = "repeat1"
-regex = '(?m)(?:^$)*'
-input = "a\nb\nc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
-
-[[tests]]
-name = "repeat1-no-multi"
-regex = '(?:^$)*'
-input = "a\nb\nc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
-
-[[tests]]
-name = "repeat2"
-regex = '(?m)(?:^|a)+'
-input = "a\naaa\n"
-matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
-
-[[tests]]
-name = "repeat100"
-regex = '(?m)(?:^|a)+'
-input = "a\naaa\n"
-matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
-
-[[tests]]
-name = "repeat2-no-multi"
-regex = '(?:^|a)+'
-input = "a\naaa\n"
-matches = [[0, 0], [2, 5]]
-
-[[tests]]
-name = "repeat3"
-regex = '(?m)(?:^|a)*'
-input = "a\naaa\n"
-matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]]
-
-[[tests]]
-name = "repeat3-no-multi"
-regex = '(?:^|a)*'
-input = "a\naaa\n"
-matches = [[0, 0], [1, 1], [2, 5], [6, 6]]
-
-[[tests]]
-name = "repeat4"
-regex = '(?m)(?:^|a+)'
-input = "a\naaa\n"
-matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
-
-[[tests]]
-name = "repeat4-no-multi"
-regex = '(?:^|a+)'
-input = "a\naaa\n"
-matches = [[0, 0], [2, 5]]
-
-[[tests]]
-name = "repeat5"
-regex = '(?m)(?:^|a*)'
-input = "a\naaa\n"
-matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]]
-
-[[tests]]
-name = "repeat5-no-multi"
-regex = '(?:^|a*)'
-input = "a\naaa\n"
-matches = [[0, 0], [1, 1], [2, 5], [6, 6]]
-
-[[tests]]
-name = "repeat6"
-regex = '(?m)(?:^[a-z])+'
-input = "abc\ndef\nxyz"
-matches = [[0, 1], [4, 5], [8, 9]]
-
-[[tests]]
-name = "repeat6-no-multi"
-regex = '(?:^[a-z])+'
-input = "abc\ndef\nxyz"
-matches = [[0, 1]]
-
-[[tests]]
-name = "repeat7"
-regex = '(?m)(?:^[a-z]{3}\n?)+'
-input = "abc\ndef\nxyz"
-matches = [[0, 11]]
-
-[[tests]]
-name = "repeat7-no-multi"
-regex = '(?:^[a-z]{3}\n?)+'
-input = "abc\ndef\nxyz"
-matches = [[0, 4]]
-
-[[tests]]
-name = "repeat8"
-regex = '(?m)(?:^[a-z]{3}\n?)*'
-input = "abc\ndef\nxyz"
-matches = [[0, 11]]
-
-[[tests]]
-name = "repeat8-no-multi"
-regex = '(?:^[a-z]{3}\n?)*'
-input = "abc\ndef\nxyz"
-matches = [[0, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11]]
-
-[[tests]]
-name = "repeat9"
-regex = '(?m)(?:\n?[a-z]{3}$)+'
-input = "abc\ndef\nxyz"
-matches = [[0, 11]]
-
-[[tests]]
-name = "repeat9-no-multi"
-regex = '(?:\n?[a-z]{3}$)+'
-input = "abc\ndef\nxyz"
-matches = [[7, 11]]
-
-[[tests]]
-name = "repeat10"
-regex = '(?m)(?:\n?[a-z]{3}$)*'
-input = "abc\ndef\nxyz"
-matches = [[0, 11]]
-
-[[tests]]
-name = "repeat10-no-multi"
-regex = '(?:\n?[a-z]{3}$)*'
-input = "abc\ndef\nxyz"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 11]]
-
-[[tests]]
-name = "repeat11"
-regex = '(?m)^*'
-input = "\naa\n"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
-
-[[tests]]
-name = "repeat11-no-multi"
-regex = '^*'
-input = "\naa\n"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
-
-[[tests]]
-name = "repeat12"
-regex = '(?m)^+'
-input = "\naa\n"
-matches = [[0, 0], [1, 1], [4, 4]]
-
-[[tests]]
-name = "repeat12-no-multi"
-regex = '^+'
-input = "\naa\n"
-matches = [[0, 0]]
-
-[[tests]]
-name = "repeat13"
-regex = '(?m)$*'
-input = "\naa\n"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
-
-[[tests]]
-name = "repeat13-no-multi"
-regex = '$*'
-input = "\naa\n"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
-
-[[tests]]
-name = "repeat14"
-regex = '(?m)$+'
-input = "\naa\n"
-matches = [[0, 0], [3, 3], [4, 4]]
-
-[[tests]]
-name = "repeat14-no-multi"
-regex = '$+'
-input = "\naa\n"
-matches = [[4, 4]]
-
-[[tests]]
-name = "repeat15"
-regex = '(?m)(?:$\n)+'
-input = "\n\naaa\n\n"
-matches = [[0, 2], [5, 7]]
-
-[[tests]]
-name = "repeat15-no-multi"
-regex = '(?:$\n)+'
-input = "\n\naaa\n\n"
-matches = []
-
-[[tests]]
-name = "repeat16"
-regex = '(?m)(?:$\n)*'
-input = "\n\naaa\n\n"
-matches = [[0, 2], [3, 3], [4, 4], [5, 7]]
-
-[[tests]]
-name = "repeat16-no-multi"
-regex = '(?:$\n)*'
-input = "\n\naaa\n\n"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]
-
-[[tests]]
-name = "repeat17"
-regex = '(?m)(?:$\n^)+'
-input = "\n\naaa\n\n"
-matches = [[0, 2], [5, 7]]
-
-[[tests]]
-name = "repeat17-no-multi"
-regex = '(?:$\n^)+'
-input = "\n\naaa\n\n"
-matches = []
-
-[[tests]]
-name = "repeat18"
-regex = '(?m)(?:^|$)+'
-input = "\n\naaa\n\n"
-matches = [[0, 0], [1, 1], [2, 2], [5, 5], [6, 6], [7, 7]]
-
-[[tests]]
-name = "repeat18-no-multi"
-regex = '(?:^|$)+'
-input = "\n\naaa\n\n"
-matches = [[0, 0], [7, 7]]
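
A hedged sketch of the `(?m)` semantics the deleted multiline tests exercise, again assuming the vendored 0.4 `meta::Regex` API: with the multi-line flag set, `^` and `$` also match around every `\n`.

```rust
use regex_automata::meta::Regex;

fn main() {
    // Mirrors "basic1" above: one match per line of the haystack.
    let re = Regex::new(r"(?m)^[a-z]+$").unwrap();
    let spans: Vec<(usize, usize)> = re
        .find_iter("abc\ndef\nxyz")
        .map(|m| (m.start(), m.end()))
        .collect();
    assert_eq!(spans, vec![(0, 3), (4, 7), (8, 11)]);
}
```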
diff --git a/vendor/regex-automata/tests/data/no-unicode.toml b/vendor/regex-automata/tests/data/no-unicode.toml
deleted file mode 100644
index c7fc9664f..000000000
--- a/vendor/regex-automata/tests/data/no-unicode.toml
+++ /dev/null
@@ -1,158 +0,0 @@
-[[tests]]
-name = "invalid-utf8-literal1"
-regex = '\xFF'
-input = '\xFF'
-matches = [[0, 1]]
-unicode = false
-utf8 = false
-unescape = true
-
-
-[[tests]]
-name = "mixed"
-regex = '(.+)(?-u)(.+)'
-input = '\xCE\x93\xCE\x94\xFF'
-matches = [[0, 5]]
-utf8 = false
-unescape = true
-
-
-[[tests]]
-name = "case1"
-regex = "a"
-input = "A"
-matches = [[0, 1]]
-case_insensitive = true
-unicode = false
-
-[[tests]]
-name = "case2"
-regex = "[a-z]+"
-input = "AaAaA"
-matches = [[0, 5]]
-case_insensitive = true
-unicode = false
-
-[[tests]]
-name = "case3"
-regex = "[a-z]+"
-input = "aA\u212AaA"
-matches = [[0, 7]]
-case_insensitive = true
-
-[[tests]]
-name = "case4"
-regex = "[a-z]+"
-input = "aA\u212AaA"
-matches = [[0, 2], [5, 7]]
-case_insensitive = true
-unicode = false
-
-
-[[tests]]
-name = "negate1"
-regex = "[^a]"
-input = "δ"
-matches = [[0, 2]]
-
-[[tests]]
-name = "negate2"
-regex = "[^a]"
-input = "δ"
-matches = [[0, 1], [1, 2]]
-unicode = false
-utf8 = false
-
-
-[[tests]]
-name = "dotstar-prefix1"
-regex = "a"
-input = '\xFFa'
-matches = [[1, 2]]
-unicode = false
-utf8 = false
-unescape = true
-
-[[tests]]
-name = "dotstar-prefix2"
-regex = "a"
-input = '\xFFa'
-matches = [[1, 2]]
-utf8 = false
-unescape = true
-
-
-[[tests]]
-name = "null-bytes1"
-regex = '[^\x00]+\x00'
-input = 'foo\x00'
-matches = [[0, 4]]
-unicode = false
-utf8 = false
-unescape = true
-
-
-[[tests]]
-name = "word-ascii"
-regex = '\w+'
-input = "aδ"
-matches = [[0, 1]]
-unicode = false
-
-[[tests]]
-name = "word-unicode"
-regex = '\w+'
-input = "aδ"
-matches = [[0, 3]]
-
-[[tests]]
-name = "decimal-ascii"
-regex = '\d+'
-input = "1२३9"
-matches = [[0, 1], [7, 8]]
-unicode = false
-
-[[tests]]
-name = "decimal-unicode"
-regex = '\d+'
-input = "1२३9"
-matches = [[0, 8]]
-
-[[tests]]
-name = "space-ascii"
-regex = '\s+'
-input = " \u1680"
-matches = [[0, 1]]
-unicode = false
-
-[[tests]]
-name = "space-unicode"
-regex = '\s+'
-input = " \u1680"
-matches = [[0, 4]]
-
-
-[[tests]]
-# See: https://github.com/rust-lang/regex/issues/484
-name = "iter1-bytes"
-regex = ''
-input = "☃"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-utf8 = false
-
-[[tests]]
-# See: https://github.com/rust-lang/regex/issues/484
-name = "iter1-utf8"
-regex = ''
-input = "☃"
-matches = [[0, 0], [3, 3]]
-
-[[tests]]
-# See: https://github.com/rust-lang/regex/issues/484
-# Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8.
-name = "iter2-bytes"
-regex = ''
-input = 'b\xFFr'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-unescape = true
-utf8 = false
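
The `unicode = false` / `utf8 = false` knobs in these entries plausibly correspond to the syntax-level options of the same names in regex-automata 0.4; that mapping is an assumption of this sketch, not something the test data itself states.

```rust
use regex_automata::{meta::Regex, util::syntax};

fn main() {
    // Mirrors "negate2" above: with the Unicode and UTF-8 syntax modes
    // off, [^a] matches one byte at a time, so the two UTF-8 bytes of
    // "δ" are reported as two separate matches.
    let re = Regex::builder()
        .syntax(syntax::Config::new().unicode(false).utf8(false))
        .build(r"[^a]")
        .unwrap();
    let spans: Vec<(usize, usize)> = re
        .find_iter("δ".as_bytes())
        .map(|m| (m.start(), m.end()))
        .collect();
    assert_eq!(spans, vec![(0, 1), (1, 2)]);
}
```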
diff --git a/vendor/regex-automata/tests/data/overlapping.toml b/vendor/regex-automata/tests/data/overlapping.toml
deleted file mode 100644
index 6662876b4..000000000
--- a/vendor/regex-automata/tests/data/overlapping.toml
+++ /dev/null
@@ -1,126 +0,0 @@
-[[tests]]
-name = "repetition-plus-leftmost-first-100"
-regex = 'a+'
-input = "aaa"
-matches = [[0, 1], [0, 2], [0, 3]]
-match_kind = "leftmost-first"
-search_kind = "overlapping"
-
-[[tests]]
-name = "repetition-plus-all-100"
-regex = 'a+'
-input = "aaa"
-matches = [[0, 1], [0, 2], [0, 3]]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "repetition-plus-leftmost-first-200"
-regex = '(abc)+'
-input = "zzabcabczzabc"
-matches = [[2, 5], [2, 8]]
-match_kind = "leftmost-first"
-search_kind = "overlapping"
-
-[[tests]]
-name = "repetition-plus-all-200"
-regex = '(abc)+'
-input = "zzabcabczzabc"
-matches = [[2, 5], [2, 8], [10, 13]]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "repetition-star-leftmost-first-100"
-regex = 'a*'
-input = "aaa"
-matches = [[0, 0], [0, 1], [0, 2], [0, 3]]
-match_kind = "leftmost-first"
-search_kind = "overlapping"
-
-[[tests]]
-name = "repetition-star-all-100"
-regex = 'a*'
-input = "aaa"
-matches = [[0, 0], [0, 1], [0, 2], [0, 3]]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "repetition-star-leftmost-first-200"
-regex = '(abc)*'
-input = "zzabcabczzabc"
-matches = [[0, 0]]
-match_kind = "leftmost-first"
-search_kind = "overlapping"
-
-[[tests]]
-name = "repetition-star-all-200"
-regex = '(abc)*'
-input = "zzabcabczzabc"
-matches = [
- [0, 0], [1, 1], [2, 2], [3, 3], [4, 4],
- [2, 5],
- [6, 6], [7, 7],
- [2, 8],
- [9, 9], [10, 10], [11, 11], [12, 12],
- [10, 13],
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "start-end-rep-leftmost-first"
-regex = '(^$)*'
-input = "abc"
-matches = [[0, 0]]
-match_kind = "leftmost-first"
-search_kind = "overlapping"
-
-[[tests]]
-name = "start-end-rep-all"
-regex = '(^$)*'
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "alt-leftmost-first-100"
-regex = 'abc|a'
-input = "zzabcazzaabc"
-matches = [[2, 3], [2, 5]]
-match_kind = "leftmost-first"
-search_kind = "overlapping"
-
-[[tests]]
-name = "alt-all-100"
-regex = 'abc|a'
-input = "zzabcazzaabc"
-matches = [[2, 3], [2, 5], [5, 6], [8, 9], [9, 10], [9, 12]]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty-000"
-regex = ""
-input = "abc"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty-alt-000"
-regex = "|b"
-input = "abc"
-matches = [[0, 0], [1, 1], [1, 2], [3, 3]]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty-alt-010"
-regex = "b|"
-input = "abc"
-matches = [[0, 0], [1, 1], [1, 2], [3, 3]]
-match_kind = "all"
-search_kind = "overlapping"
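
Overlapping searches drive a DFA directly through an `OverlappingState`, and a forward DFA reports only end offsets (`HalfMatch`), not full spans. A sketch of that protocol, under those assumptions, using the same `try_search_overlapping_fwd` API that the api.rs hunk further below exercises:

```rust
use regex_automata::{
    dfa::{dense, Automaton, OverlappingState},
    Input, MatchKind,
};

fn main() {
    // Mirrors "repetition-plus-all-100": with MatchKind::All, an
    // overlapping forward search yields a match end at every offset
    // where 'a+' can stop.
    let dfa = dense::Builder::new()
        .configure(dense::Config::new().match_kind(MatchKind::All))
        .build(r"a+")
        .unwrap();
    let input = Input::new("aaa");
    let mut state = OverlappingState::start();
    let mut ends = vec![];
    loop {
        dfa.try_search_overlapping_fwd(&input, &mut state).unwrap();
        match state.get_match() {
            None => break,
            Some(half) => ends.push(half.offset()),
        }
    }
    assert_eq!(ends, vec![1, 2, 3]);
}
```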
diff --git a/vendor/regex-automata/tests/data/regression.toml b/vendor/regex-automata/tests/data/regression.toml
deleted file mode 100644
index 6a4dbb151..000000000
--- a/vendor/regex-automata/tests/data/regression.toml
+++ /dev/null
@@ -1,423 +0,0 @@
-# See: https://github.com/rust-lang/regex/issues/48
-[[tests]]
-name = "invalid-regex-no-crash-100"
-regex = '(*)'
-input = ""
-matches = []
-compiles = false
-
-# See: https://github.com/rust-lang/regex/issues/48
-[[tests]]
-name = "invalid-regex-no-crash-200"
-regex = '(?:?)'
-input = ""
-matches = []
-compiles = false
-
-# See: https://github.com/rust-lang/regex/issues/48
-[[tests]]
-name = "invalid-regex-no-crash-300"
-regex = '(?)'
-input = ""
-matches = []
-compiles = false
-
-# See: https://github.com/rust-lang/regex/issues/48
-[[tests]]
-name = "invalid-regex-no-crash-400"
-regex = '*'
-input = ""
-matches = []
-compiles = false
-
-# See: https://github.com/rust-lang/regex/issues/75
-[[tests]]
-name = "unsorted-binary-search-100"
-regex = '(?i-u)[a_]+'
-input = "A_"
-matches = [[0, 2]]
-
-# See: https://github.com/rust-lang/regex/issues/75
-[[tests]]
-name = "unsorted-binary-search-200"
-regex = '(?i-u)[A_]+'
-input = "a_"
-matches = [[0, 2]]
-
-# See: https://github.com/rust-lang/regex/issues/76
-[[tests]]
-name = "unicode-case-lower-nocase-flag"
-regex = '(?i)\p{Ll}+'
-input = "ΛΘΓΔα"
-matches = [[0, 10]]
-
-# See: https://github.com/rust-lang/regex/issues/99
-[[tests]]
-name = "negated-char-class-100"
-regex = '(?i)[^x]'
-input = "x"
-matches = []
-
-# See: https://github.com/rust-lang/regex/issues/99
-[[tests]]
-name = "negated-char-class-200"
-regex = '(?i)[^x]'
-input = "X"
-matches = []
-
-# See: https://github.com/rust-lang/regex/issues/101
-[[tests]]
-name = "ascii-word-underscore"
-regex = '[[:word:]]'
-input = "_"
-matches = [[0, 1]]
-
-# See: https://github.com/rust-lang/regex/issues/129
-[[tests]]
-name = "captures-repeat"
-regex = '([a-f]){2}(?P<foo>[x-z])'
-input = "abx"
-captures = [
- [[0, 3], [0, 2], [2, 3]],
-]
-
-# See: https://github.com/rust-lang/regex/issues/153
-[[tests]]
-name = "alt-in-alt-100"
-regex = 'ab?|$'
-input = "az"
-matches = [[0, 1], [2, 2]]
-
-# See: https://github.com/rust-lang/regex/issues/153
-[[tests]]
-name = "alt-in-alt-200"
-regex = '^(.*?)(\n|\r\n?|$)'
-input = "ab\rcd"
-matches = [[0, 3]]
-
-# See: https://github.com/rust-lang/regex/issues/169
-[[tests]]
-name = "leftmost-first-prefix"
-regex = 'z*azb'
-input = "azb"
-matches = [[0, 3]]
-
-# See: https://github.com/rust-lang/regex/issues/191
-[[tests]]
-name = "many-alternates"
-regex = '1|2|3|4|5|6|7|8|9|10|int'
-input = "int"
-matches = [[0, 3]]
-
-# See: https://github.com/rust-lang/regex/issues/204
-[[tests]]
-name = "word-boundary-alone-100"
-regex = '\b'
-input = "Should this (work?)"
-matches = [[0, 0], [6, 6], [7, 7], [11, 11], [13, 13], [17, 17]]
-
-# See: https://github.com/rust-lang/regex/issues/204
-[[tests]]
-name = "word-boundary-alone-200"
-regex = '\b'
-input = "a b c"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
-
-# See: https://github.com/rust-lang/regex/issues/264
-[[tests]]
-name = "word-boundary-ascii-no-capture"
-regex = '\B'
-input = "\U00028F3E"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
-unicode = false
-utf8 = false
-
-# See: https://github.com/rust-lang/regex/issues/264
-[[tests]]
-name = "word-boundary-ascii-capture"
-regex = '(\B)'
-input = "\U00028F3E"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
-unicode = false
-utf8 = false
-
-# See: https://github.com/rust-lang/regex/issues/268
-[[tests]]
-name = "partial-anchor"
-regex = '^a|b'
-input = "ba"
-matches = [[0, 1]]
-
-# See: https://github.com/rust-lang/regex/issues/271
-[[tests]]
-name = "endl-or-word-boundary"
-regex = '(?m:$)|(?-u:\b)'
-input = "\U0006084E"
-matches = [[4, 4]]
-
-# See: https://github.com/rust-lang/regex/issues/271
-[[tests]]
-name = "zero-or-end"
-regex = '(?i-u:\x00)|$'
-input = "\U000E682F"
-matches = [[4, 4]]
-
-# See: https://github.com/rust-lang/regex/issues/271
-[[tests]]
-name = "y-or-endl"
-regex = '(?i-u:y)|(?m:$)'
-input = "\U000B4331"
-matches = [[4, 4]]
-
-# See: https://github.com/rust-lang/regex/issues/271
-[[tests]]
-name = "word-boundary-start-x"
-regex = '(?u:\b)^(?-u:X)'
-input = "X"
-matches = [[0, 1]]
-
-# See: https://github.com/rust-lang/regex/issues/271
-[[tests]]
-name = "word-boundary-ascii-start-x"
-regex = '(?-u:\b)^(?-u:X)'
-input = "X"
-matches = [[0, 1]]
-
-# See: https://github.com/rust-lang/regex/issues/271
-[[tests]]
-name = "end-not-word-boundary"
-regex = '$\B'
-input = "\U0005C124\U000B576C"
-matches = [[8, 8]]
-unicode = false
-utf8 = false
-
-# See: https://github.com/rust-lang/regex/issues/280
-[[tests]]
-name = "partial-anchor-alternate-begin"
-regex = '^a|z'
-input = "yyyyya"
-matches = []
-
-# See: https://github.com/rust-lang/regex/issues/280
-[[tests]]
-name = "partial-anchor-alternate-end"
-regex = 'a$|z'
-input = "ayyyyy"
-matches = []
-
-# See: https://github.com/rust-lang/regex/issues/289
-[[tests]]
-name = "lits-unambiguous-100"
-regex = '(ABC|CDA|BC)X'
-input = "CDAX"
-matches = [[0, 4]]
-
-# See: https://github.com/rust-lang/regex/issues/291
-[[tests]]
-name = "lits-unambiguous-200"
-regex = '((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$'
-input = "CIMG2341"
-captures = [
- [[0, 8], [0, 4], [], [0, 4], [4, 8]],
-]
-
-# See: https://github.com/rust-lang/regex/issues/303
-[[tests]]
-name = "negated-full-byte-range"
-regex = '[^\x00-\xFF]'
-input = ""
-matches = []
-compiles = false
-unicode = false
-utf8 = false
-
-# See: https://github.com/rust-lang/regex/issues/321
-[[tests]]
-name = "strange-anchor-non-complete-prefix"
-regex = 'a^{2}'
-input = ""
-matches = []
-
-# See: https://github.com/rust-lang/regex/issues/321
-[[tests]]
-name = "strange-anchor-non-complete-suffix"
-regex = '${2}a'
-input = ""
-matches = []
-
-# See: https://github.com/rust-lang/regex/issues/334
-# See: https://github.com/rust-lang/regex/issues/557
-[[tests]]
-name = "captures-after-dfa-premature-end-100"
-regex = 'a(b*(X|$))?'
-input = "abcbX"
-captures = [
- [[0, 1], [], []],
-]
-
-# See: https://github.com/rust-lang/regex/issues/334
-# See: https://github.com/rust-lang/regex/issues/557
-[[tests]]
-name = "captures-after-dfa-premature-end-200"
-regex = 'a(bc*(X|$))?'
-input = "abcbX"
-captures = [
- [[0, 1], [], []],
-]
-
-# See: https://github.com/rust-lang/regex/issues/334
-# See: https://github.com/rust-lang/regex/issues/557
-[[tests]]
-name = "captures-after-dfa-premature-end-300"
-regex = '(aa$)?'
-input = "aaz"
-captures = [
- [[0, 0]],
- [[1, 1]],
- [[2, 2]],
- [[3, 3]],
-]
-
-# See: https://github.com/rust-lang/regex/issues/437
-[[tests]]
-name = "literal-panic"
-regex = 'typename type\-parameter\-[0-9]+\-[0-9]+::.+'
-input = "test"
-matches = []
-
-# See: https://github.com/rust-lang/regex/issues/527
-[[tests]]
-name = "empty-flag-expr"
-regex = '(((?x)))'
-input = ""
-matches = [[0, 0]]
-
-# See: https://github.com/rust-lang/regex/issues/533
-[[tests]]
-name = "blank-matches-nothing-between-space-and-tab"
-regex = '[[:blank:]]'
-input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
-match = false
-unescape = true
-
-# See: https://github.com/rust-lang/regex/issues/533
-[[tests]]
-name = "blank-matches-nothing-between-space-and-tab-inverted"
-regex = '^[[:^blank:]]+$'
-input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
-match = true
-unescape = true
-
-# See: https://github.com/rust-lang/regex/issues/555
-[[tests]]
-name = "invalid-repetition"
-regex = '(?m){1,1}'
-input = ""
-matches = []
-compiles = false
-
-# See: https://github.com/rust-lang/regex/issues/640
-[[tests]]
-name = "flags-are-unset"
-regex = '((?i)foo)|Bar'
-input = "foo Foo bar Bar"
-matches = [[0, 3], [4, 7], [12, 15]]
-
-# Note that 'Ј' is not 'j', but Cyrillic Je
-# https://en.wikipedia.org/wiki/Je_(Cyrillic)
-#
-# See: https://github.com/rust-lang/regex/issues/659
-[[tests]]
-name = "empty-group-with-unicode"
-regex = '()Ј01'
-input = 'zЈ01'
-matches = [[1, 5]]
-
-# See: https://github.com/rust-lang/regex/issues/579
-[[tests]]
-name = "word-boundary-weird"
-regex = '\b..\b'
-input = "I have 12, he has 2!"
-matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]]
-
-# See: https://github.com/rust-lang/regex/issues/579
-[[tests]]
-name = "word-boundary-weird-ascii"
-regex = '\b..\b'
-input = "I have 12, he has 2!"
-matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]]
-unicode = false
-utf8 = false
-
-# See: https://github.com/rust-lang/regex/issues/579
-[[tests]]
-name = "word-boundary-weird-minimal-ascii"
-regex = '\b..\b'
-input = "az,,b"
-matches = [[0, 2], [2, 4]]
-unicode = false
-utf8 = false
-
-# See: https://github.com/BurntSushi/ripgrep/issues/1203
-[[tests]]
-name = "reverse-suffix-100"
-regex = '[0-4][0-4][0-4]000'
-input = "153.230000"
-matches = [[4, 10]]
-
-# See: https://github.com/BurntSushi/ripgrep/issues/1203
-[[tests]]
-name = "reverse-suffix-200"
-regex = '[0-9][0-9][0-9]000'
-input = "153.230000\n"
-matches = [[4, 10]]
-
-# See: https://github.com/BurntSushi/ripgrep/issues/1247
-[[tests]]
-name = "stops"
-regex = '\bs(?:[ab])'
-input = 's\xE4'
-matches = []
-unescape = true
-
-# See: https://github.com/BurntSushi/ripgrep/issues/1247
-[[tests]]
-name = "stops-ascii"
-regex = '(?-u:\b)s(?:[ab])'
-input = 's\xE4'
-matches = []
-unescape = true
-
-# There is no issue for this bug.
-[[tests]]
-name = "anchored-prefix-100"
-regex = '^a[[:^space:]]'
-input = "a "
-matches = []
-
-# There is no issue for this bug.
-[[tests]]
-name = "anchored-prefix-200"
-regex = '^a[[:^space:]]'
-input = "foo boo a"
-matches = []
-
-# There is no issue for this bug.
-[[tests]]
-name = "anchored-prefix-300"
-regex = '^-[a-z]'
-input = "r-f"
-matches = []
-
-# Tests that a possible Aho-Corasick optimization works correctly. It only
-# kicks in when we have a lot of literals. By "works correctly," we mean that
-# leftmost-first match semantics are properly respected. That is, samwise
-# should match, not sam.
-#
-# There is no issue for this bug.
-[[tests]]
-name = "aho-corasick-100"
-regex = 'samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z'
-input = "samwise"
-matches = [[0, 7]]
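
The point of that last regression test, reduced to two alternates: leftmost-first semantics prefer the earlier branch at the same start position, so `samwise` wins over `sam` even when a literal prefilter could report the shorter hit first. A minimal check:

```rust
use regex_automata::meta::Regex;

fn main() {
    // Reduced form of "aho-corasick-100": the earlier alternate wins.
    let re = Regex::new(r"samwise|sam").unwrap();
    let m = re.find("samwise").unwrap();
    assert_eq!((m.start(), m.end()), (0, 7));
}
```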
diff --git a/vendor/regex-automata/tests/data/set.toml b/vendor/regex-automata/tests/data/set.toml
deleted file mode 100644
index e0eb0583e..000000000
--- a/vendor/regex-automata/tests/data/set.toml
+++ /dev/null
@@ -1,523 +0,0 @@
-[[tests]]
-name = "basic10"
-regexes = ["a", "a"]
-input = "a"
-matches = [
- { id = 0, offsets = [0, 1] },
- { id = 1, offsets = [0, 1] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic10-leftmost-first"
-regexes = ["a", "a"]
-input = "a"
-matches = [
- { id = 0, offsets = [0, 1] },
-]
-match_kind = "leftmost-first"
-search_kind = "leftmost"
-
-[[tests]]
-name = "basic20"
-regexes = ["a", "a"]
-input = "ba"
-matches = [
- { id = 0, offsets = [1, 2] },
- { id = 1, offsets = [1, 2] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic30"
-regexes = ["a", "b"]
-input = "a"
-matches = [
- { id = 0, offsets = [0, 1] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic40"
-regexes = ["a", "b"]
-input = "b"
-matches = [
- { id = 1, offsets = [0, 1] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic50"
-regexes = ["a|b", "b|a"]
-input = "b"
-matches = [
- { id = 0, offsets = [0, 1] },
- { id = 1, offsets = [0, 1] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic60"
-regexes = ["foo", "oo"]
-input = "foo"
-matches = [
- { id = 0, offsets = [0, 3] },
- { id = 1, offsets = [1, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic60-leftmost-first"
-regexes = ["foo", "oo"]
-input = "foo"
-matches = [
- { id = 0, offsets = [0, 3] },
-]
-match_kind = "leftmost-first"
-search_kind = "leftmost"
-
-[[tests]]
-name = "basic61"
-regexes = ["oo", "foo"]
-input = "foo"
-matches = [
- { id = 1, offsets = [0, 3] },
- { id = 0, offsets = [1, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic61-leftmost-first"
-regexes = ["oo", "foo"]
-input = "foo"
-matches = [
- { id = 1, offsets = [0, 3] },
-]
-match_kind = "leftmost-first"
-search_kind = "leftmost"
-
-[[tests]]
-name = "basic70"
-regexes = ["abcd", "bcd", "cd", "d"]
-input = "abcd"
-matches = [
- { id = 0, offsets = [0, 4] },
- { id = 1, offsets = [1, 4] },
- { id = 2, offsets = [2, 4] },
- { id = 3, offsets = [3, 4] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic71"
-regexes = ["bcd", "cd", "d", "abcd"]
-input = "abcd"
-matches = [
- { id = 3, offsets = [0, 4] },
-]
-match_kind = "leftmost-first"
-search_kind = "leftmost"
-
-[[tests]]
-name = "basic80"
-regexes = ["^foo", "bar$"]
-input = "foo"
-matches = [
- { id = 0, offsets = [0, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic81"
-regexes = ["^foo", "bar$"]
-input = "foo bar"
-matches = [
- { id = 0, offsets = [0, 3] },
- { id = 1, offsets = [4, 7] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic82"
-regexes = ["^foo", "bar$"]
-input = "bar"
-matches = [
- { id = 1, offsets = [0, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic90"
-regexes = ["[a-z]+$", "foo"]
-input = "01234 foo"
-matches = [
- { id = 0, offsets = [6, 9] },
- { id = 1, offsets = [6, 9] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic91"
-regexes = ["[a-z]+$", "foo"]
-input = "foo 01234"
-matches = [
- { id = 1, offsets = [0, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic100"
-regexes = [".*?", "a"]
-input = "zzza"
-matches = [
- { id = 0, offsets = [0, 0] },
- { id = 0, offsets = [0, 1] },
- { id = 0, offsets = [0, 2] },
- { id = 0, offsets = [0, 3] },
- { id = 0, offsets = [0, 4] },
- { id = 1, offsets = [3, 4] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic101"
-regexes = [".*", "a"]
-input = "zzza"
-matches = [
- { id = 0, offsets = [0, 0] },
- { id = 0, offsets = [0, 1] },
- { id = 0, offsets = [0, 2] },
- { id = 0, offsets = [0, 3] },
- { id = 0, offsets = [0, 4] },
- { id = 1, offsets = [3, 4] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic102"
-regexes = [".*", "a"]
-input = "zzz"
-matches = [
- { id = 0, offsets = [0, 0] },
- { id = 0, offsets = [0, 1] },
- { id = 0, offsets = [0, 2] },
- { id = 0, offsets = [0, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic110"
-regexes = ['\ba\b']
-input = "hello a bye"
-matches = [
- { id = 0, offsets = [6, 7] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic111"
-regexes = ['\ba\b', '\be\b']
-input = "hello a bye e"
-matches = [
- { id = 0, offsets = [6, 7] },
- { id = 1, offsets = [12, 13] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic120"
-regexes = ["a"]
-input = "a"
-matches = [
- { id = 0, offsets = [0, 1] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic121"
-regexes = [".*a"]
-input = "a"
-matches = [
- { id = 0, offsets = [0, 1] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic122"
-regexes = [".*a", "β"]
-input = "β"
-matches = [
- { id = 1, offsets = [0, 2] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "basic130"
-regexes = ["ab", "b"]
-input = "ba"
-matches = [
- { id = 1, offsets = [0, 1] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty10"
-regexes = ["", "a"]
-input = "abc"
-matches = [
- { id = 0, offsets = [0, 0] },
- { id = 1, offsets = [0, 1] },
- { id = 0, offsets = [1, 1] },
- { id = 0, offsets = [2, 2] },
- { id = 0, offsets = [3, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty10-leftmost-first"
-regexes = ["", "a"]
-input = "abc"
-matches = [
- { id = 0, offsets = [0, 0] },
- { id = 0, offsets = [1, 1] },
- { id = 0, offsets = [2, 2] },
- { id = 0, offsets = [3, 3] },
-]
-match_kind = "leftmost-first"
-search_kind = "leftmost"
-
-[[tests]]
-name = "empty11"
-regexes = ["a", ""]
-input = "abc"
-matches = [
- { id = 1, offsets = [0, 0] },
- { id = 0, offsets = [0, 1] },
- { id = 1, offsets = [1, 1] },
- { id = 1, offsets = [2, 2] },
- { id = 1, offsets = [3, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty11-leftmost-first"
-regexes = ["a", ""]
-input = "abc"
-matches = [
- { id = 0, offsets = [0, 1] },
- { id = 1, offsets = [2, 2] },
- { id = 1, offsets = [3, 3] },
-]
-match_kind = "leftmost-first"
-search_kind = "leftmost"
-
-[[tests]]
-name = "empty20"
-regexes = ["", "b"]
-input = "abc"
-matches = [
- { id = 0, offsets = [0, 0] },
- { id = 0, offsets = [1, 1] },
- { id = 1, offsets = [1, 2] },
- { id = 0, offsets = [2, 2] },
- { id = 0, offsets = [3, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty20-leftmost-first"
-regexes = ["", "b"]
-input = "abc"
-matches = [
- { id = 0, offsets = [0, 0] },
- { id = 0, offsets = [1, 1] },
- { id = 0, offsets = [2, 2] },
- { id = 0, offsets = [3, 3] },
-]
-match_kind = "leftmost-first"
-search_kind = "leftmost"
-
-[[tests]]
-name = "empty21"
-regexes = ["b", ""]
-input = "abc"
-matches = [
- { id = 1, offsets = [0, 0] },
- { id = 1, offsets = [1, 1] },
- { id = 0, offsets = [1, 2] },
- { id = 1, offsets = [2, 2] },
- { id = 1, offsets = [3, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty21-leftmost-first"
-regexes = ["b", ""]
-input = "abc"
-matches = [
- { id = 1, offsets = [0, 0] },
- { id = 0, offsets = [1, 2] },
- { id = 1, offsets = [3, 3] },
-]
-match_kind = "leftmost-first"
-search_kind = "leftmost"
-
-[[tests]]
-name = "empty22"
-regexes = ["(?:)", "b"]
-input = "abc"
-matches = [
- { id = 0, offsets = [0, 0] },
- { id = 0, offsets = [1, 1] },
- { id = 1, offsets = [1, 2] },
- { id = 0, offsets = [2, 2] },
- { id = 0, offsets = [3, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty23"
-regexes = ["b", "(?:)"]
-input = "abc"
-matches = [
- { id = 1, offsets = [0, 0] },
- { id = 1, offsets = [1, 1] },
- { id = 0, offsets = [1, 2] },
- { id = 1, offsets = [2, 2] },
- { id = 1, offsets = [3, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty30"
-regexes = ["", "z"]
-input = "abc"
-matches = [
- { id = 0, offsets = [0, 0] },
- { id = 0, offsets = [1, 1] },
- { id = 0, offsets = [2, 2] },
- { id = 0, offsets = [3, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty30-leftmost-first"
-regexes = ["", "z"]
-input = "abc"
-matches = [
- { id = 0, offsets = [0, 0] },
- { id = 0, offsets = [1, 1] },
- { id = 0, offsets = [2, 2] },
- { id = 0, offsets = [3, 3] },
-]
-match_kind = "leftmost-first"
-search_kind = "leftmost"
-
-[[tests]]
-name = "empty31"
-regexes = ["z", ""]
-input = "abc"
-matches = [
- { id = 1, offsets = [0, 0] },
- { id = 1, offsets = [1, 1] },
- { id = 1, offsets = [2, 2] },
- { id = 1, offsets = [3, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty31-leftmost-first"
-regexes = ["z", ""]
-input = "abc"
-matches = [
- { id = 1, offsets = [0, 0] },
- { id = 1, offsets = [1, 1] },
- { id = 1, offsets = [2, 2] },
- { id = 1, offsets = [3, 3] },
-]
-match_kind = "leftmost-first"
-search_kind = "leftmost"
-
-[[tests]]
-name = "empty40"
-regexes = ["c(?:)", "b"]
-input = "abc"
-matches = [
- { id = 1, offsets = [1, 2] },
- { id = 0, offsets = [2, 3] },
-]
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "empty40-leftmost-first"
-regexes = ["c(?:)", "b"]
-input = "abc"
-matches = [
- { id = 1, offsets = [1, 2] },
- { id = 0, offsets = [2, 3] },
-]
-match_kind = "leftmost-first"
-search_kind = "leftmost"
-
-[[tests]]
-name = "nomatch10"
-regexes = ["a", "a"]
-input = "b"
-matches = []
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "nomatch20"
-regexes = ["^foo", "bar$"]
-input = "bar foo"
-matches = []
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "nomatch30"
-regexes = []
-input = "a"
-matches = []
-match_kind = "all"
-search_kind = "overlapping"
-
-[[tests]]
-name = "nomatch40"
-regexes = ["^rooted$", '\.log$']
-input = "notrooted"
-matches = []
-match_kind = "all"
-search_kind = "overlapping"
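
The set tests attach a pattern id to every match. A sketch of multi-pattern construction, assuming `meta::Regex::new_many` from the vendored 0.4 API:

```rust
use regex_automata::meta::Regex;

fn main() {
    // Mirrors "basic61-leftmost-first": the winning match reports the
    // PatternID of the pattern that produced it.
    let re = Regex::new_many(&["oo", "foo"]).unwrap();
    let m = re.find("foo").unwrap();
    assert_eq!(m.pattern().as_usize(), 1);
    assert_eq!((m.start(), m.end()), (0, 3));
}
```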
diff --git a/vendor/regex-automata/tests/data/unicode.toml b/vendor/regex-automata/tests/data/unicode.toml
deleted file mode 100644
index 016bbfd9b..000000000
--- a/vendor/regex-automata/tests/data/unicode.toml
+++ /dev/null
@@ -1,514 +0,0 @@
-# Basic Unicode literal support.
-[[tests]]
-name = "literal1"
-regex = '☃'
-input = "☃"
-matches = [[0, 3]]
-
-[[tests]]
-name = "literal2"
-regex = '☃+'
-input = "☃"
-matches = [[0, 3]]
-
-[[tests]]
-name = "literal3"
-regex = '(?i)☃+'
-input = "☃"
-matches = [[0, 3]]
-
-[[tests]]
-name = "literal4"
-regex = '(?i)Δ'
-input = "δ"
-matches = [[0, 2]]
-
-# Unicode word boundaries.
-[[tests]]
-name = "wb-100"
-regex = '\d\b'
-input = "6δ"
-matches = []
-
-[[tests]]
-name = "wb-200"
-regex = '\d\b'
-input = "6 "
-matches = [[0, 1]]
-
-[[tests]]
-name = "wb-300"
-regex = '\d\B'
-input = "6δ"
-matches = [[0, 1]]
-
-[[tests]]
-name = "wb-400"
-regex = '\d\B'
-input = "6 "
-matches = []
-
-# Unicode character class support.
-[[tests]]
-name = "class1"
-regex = '[☃Ⅰ]+'
-input = "☃"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class2"
-regex = '\pN'
-input = "Ⅰ"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class3"
-regex = '\pN+'
-input = "Ⅰ1Ⅱ2"
-matches = [[0, 8]]
-
-[[tests]]
-name = "class4"
-regex = '\PN+'
-input = "abⅠ"
-matches = [[0, 2]]
-
-[[tests]]
-name = "class5"
-regex = '[\PN]+'
-input = "abⅠ"
-matches = [[0, 2]]
-
-[[tests]]
-name = "class6"
-regex = '[^\PN]+'
-input = "abⅠ"
-matches = [[2, 5]]
-
-[[tests]]
-name = "class7"
-regex = '\p{Lu}+'
-input = "ΛΘΓΔα"
-matches = [[0, 8]]
-
-[[tests]]
-name = "class8"
-regex = '(?i)\p{Lu}+'
-input = "ΛΘΓΔα"
-matches = [[0, 10]]
-
-[[tests]]
-name = "class9"
-regex = '\pL+'
-input = "ΛΘΓΔα"
-matches = [[0, 10]]
-
-[[tests]]
-name = "class10"
-regex = '\p{Ll}+'
-input = "ΛΘΓΔα"
-matches = [[8, 10]]
-
-# Unicode aware "Perl" character classes.
-[[tests]]
-name = "perl1"
-regex = '\w+'
-input = "dδd"
-matches = [[0, 4]]
-
-[[tests]]
-name = "perl2"
-regex = '\w+'
-input = "⥡"
-matches = []
-
-[[tests]]
-name = "perl3"
-regex = '\W+'
-input = "⥡"
-matches = [[0, 3]]
-
-[[tests]]
-name = "perl4"
-regex = '\d+'
-input = "1२३9"
-matches = [[0, 8]]
-
-[[tests]]
-name = "perl5"
-regex = '\d+'
-input = "Ⅱ"
-matches = []
-
-[[tests]]
-name = "perl6"
-regex = '\D+'
-input = "Ⅱ"
-matches = [[0, 3]]
-
-[[tests]]
-name = "perl7"
-regex = '\s+'
-input = " "
-matches = [[0, 3]]
-
-[[tests]]
-name = "perl8"
-regex = '\s+'
-input = "☃"
-matches = []
-
-[[tests]]
-name = "perl9"
-regex = '\S+'
-input = "☃"
-matches = [[0, 3]]
-
-# Specific tests for Unicode general category classes.
-[[tests]]
-name = "class-gencat1"
-regex = '\p{Cased_Letter}'
-input = "A"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat2"
-regex = '\p{Close_Punctuation}'
-input = "❯"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat3"
-regex = '\p{Connector_Punctuation}'
-input = "⁀"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat4"
-regex = '\p{Control}'
-input = "\u009F"
-matches = [[0, 2]]
-
-[[tests]]
-name = "class-gencat5"
-regex = '\p{Currency_Symbol}'
-input = "£"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat6"
-regex = '\p{Dash_Punctuation}'
-input = "〰"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat7"
-regex = '\p{Decimal_Number}'
-input = "𑓙"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gencat8"
-regex = '\p{Enclosing_Mark}'
-input = "\uA672"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat9"
-regex = '\p{Final_Punctuation}'
-input = "⸡"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat10"
-regex = '\p{Format}'
-input = "\U000E007F"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gencat11"
-regex = '\p{Initial_Punctuation}'
-input = "⸜"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat12"
-regex = '\p{Letter}'
-input = "Έ"
-matches = [[0, 2]]
-
-[[tests]]
-name = "class-gencat13"
-regex = '\p{Letter_Number}'
-input = "ↂ"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat14"
-regex = '\p{Line_Separator}'
-input = "\u2028"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat15"
-regex = '\p{Lowercase_Letter}'
-input = "ϛ"
-matches = [[0, 2]]
-
-[[tests]]
-name = "class-gencat16"
-regex = '\p{Mark}'
-input = "\U000E01EF"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gencat17"
-regex = '\p{Math}'
-input = "⋿"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat18"
-regex = '\p{Modifier_Letter}'
-input = "𖭃"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gencat19"
-regex = '\p{Modifier_Symbol}'
-input = "🏿"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gencat20"
-regex = '\p{Nonspacing_Mark}'
-input = "\U0001E94A"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gencat21"
-regex = '\p{Number}'
-input = "⓿"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat22"
-regex = '\p{Open_Punctuation}'
-input = "⦅"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat23"
-regex = '\p{Other}'
-input = "\u0BC9"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat24"
-regex = '\p{Other_Letter}'
-input = "ꓷ"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat25"
-regex = '\p{Other_Number}'
-input = "㉏"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat26"
-regex = '\p{Other_Punctuation}'
-input = "𞥞"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gencat27"
-regex = '\p{Other_Symbol}'
-input = "⅌"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat28"
-regex = '\p{Paragraph_Separator}'
-input = "\u2029"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat29"
-regex = '\p{Private_Use}'
-input = "\U0010FFFD"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gencat30"
-regex = '\p{Punctuation}'
-input = "𑁍"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gencat31"
-regex = '\p{Separator}'
-input = "\u3000"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat32"
-regex = '\p{Space_Separator}'
-input = "\u205F"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat33"
-regex = '\p{Spacing_Mark}'
-input = "\U00016F7E"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gencat34"
-regex = '\p{Symbol}'
-input = "⯈"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat35"
-regex = '\p{Titlecase_Letter}'
-input = "ῼ"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gencat36"
-regex = '\p{Unassigned}'
-input = "\U0010FFFF"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gencat37"
-regex = '\p{Uppercase_Letter}'
-input = "Ꝋ"
-matches = [[0, 3]]
-
-
-# Tests for Unicode emoji properties.
-[[tests]]
-name = "class-emoji1"
-regex = '\p{Emoji}'
-input = "\u23E9"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-emoji2"
-regex = '\p{emoji}'
-input = "\U0001F21A"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-emoji3"
-regex = '\p{extendedpictographic}'
-input = "\U0001FA6E"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-emoji4"
-regex = '\p{extendedpictographic}'
-input = "\U0001FFFD"
-matches = [[0, 4]]
-
-
-# Tests for Unicode grapheme cluster properties.
-[[tests]]
-name = "class-gcb1"
-regex = '\p{grapheme_cluster_break=prepend}'
-input = "\U00011D46"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gcb2"
-regex = '\p{gcb=regional_indicator}'
-input = "\U0001F1E6"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gcb3"
-regex = '\p{gcb=ri}'
-input = "\U0001F1E7"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gcb4"
-regex = '\p{regionalindicator}'
-input = "\U0001F1FF"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-gcb5"
-regex = '\p{gcb=lvt}'
-input = "\uC989"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-gcb6"
-regex = '\p{gcb=zwj}'
-input = "\u200D"
-matches = [[0, 3]]
-
-# Tests for Unicode word boundary properties.
-[[tests]]
-name = "class-word-break1"
-regex = '\p{word_break=Hebrew_Letter}'
-input = "\uFB46"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-word-break2"
-regex = '\p{wb=hebrewletter}'
-input = "\uFB46"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-word-break3"
-regex = '\p{wb=ExtendNumLet}'
-input = "\uFF3F"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-word-break4"
-regex = '\p{wb=WSegSpace}'
-input = "\u3000"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-word-break5"
-regex = '\p{wb=numeric}'
-input = "\U0001E950"
-matches = [[0, 4]]
-
-# Tests for Unicode sentence boundary properties.
-[[tests]]
-name = "class-sentence-break1"
-regex = '\p{sentence_break=Lower}'
-input = "\u0469"
-matches = [[0, 2]]
-
-[[tests]]
-name = "class-sentence-break2"
-regex = '\p{sb=lower}'
-input = "\u0469"
-matches = [[0, 2]]
-
-[[tests]]
-name = "class-sentence-break3"
-regex = '\p{sb=Close}'
-input = "\uFF60"
-matches = [[0, 3]]
-
-[[tests]]
-name = "class-sentence-break4"
-regex = '\p{sb=Close}'
-input = "\U0001F677"
-matches = [[0, 4]]
-
-[[tests]]
-name = "class-sentence-break5"
-regex = '\p{sb=SContinue}'
-input = "\uFF64"
-matches = [[0, 3]]
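
All spans in these Unicode tests are byte offsets, which is why single "characters" often produce widths of 2, 3 or 4. For instance, mirroring "class3" above:

```rust
use regex_automata::meta::Regex;

fn main() {
    // "Ⅰ" and "Ⅱ" are three UTF-8 bytes each and the ASCII digits one
    // byte each, so \pN+ over "Ⅰ1Ⅱ2" spans bytes [0, 8).
    let re = Regex::new(r"\pN+").unwrap();
    let m = re.find("Ⅰ1Ⅱ2").unwrap();
    assert_eq!((m.start(), m.end()), (0, 8));
}
```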
diff --git a/vendor/regex-automata/tests/data/word-boundary.toml b/vendor/regex-automata/tests/data/word-boundary.toml
deleted file mode 100644
index e84b25c2a..000000000
--- a/vendor/regex-automata/tests/data/word-boundary.toml
+++ /dev/null
@@ -1,771 +0,0 @@
-# Some of these are cribbed from RE2's test suite.
-
-# These test \b. Below are tests for \B.
-[[tests]]
-name = "wb1"
-regex = '\b'
-input = ""
-matches = []
-unicode = false
-
-[[tests]]
-name = "wb2"
-regex = '\b'
-input = "a"
-matches = [[0, 0], [1, 1]]
-unicode = false
-
-[[tests]]
-name = "wb3"
-regex = '\b'
-input = "ab"
-matches = [[0, 0], [2, 2]]
-unicode = false
-
-[[tests]]
-name = "wb4"
-regex = '^\b'
-input = "ab"
-matches = [[0, 0]]
-unicode = false
-
-[[tests]]
-name = "wb5"
-regex = '\b$'
-input = "ab"
-matches = [[2, 2]]
-unicode = false
-
-[[tests]]
-name = "wb6"
-regex = '^\b$'
-input = "ab"
-matches = []
-unicode = false
-
-[[tests]]
-name = "wb7"
-regex = '\bbar\b'
-input = "nobar bar foo bar"
-matches = [[6, 9], [14, 17]]
-unicode = false
-
-[[tests]]
-name = "wb8"
-regex = 'a\b'
-input = "faoa x"
-matches = [[3, 4]]
-unicode = false
-
-[[tests]]
-name = "wb9"
-regex = '\bbar'
-input = "bar x"
-matches = [[0, 3]]
-unicode = false
-
-[[tests]]
-name = "wb10"
-regex = '\bbar'
-input = "foo\nbar x"
-matches = [[4, 7]]
-unicode = false
-
-[[tests]]
-name = "wb11"
-regex = 'bar\b'
-input = "foobar"
-matches = [[3, 6]]
-unicode = false
-
-[[tests]]
-name = "wb12"
-regex = 'bar\b'
-input = "foobar\nxxx"
-matches = [[3, 6]]
-unicode = false
-
-[[tests]]
-name = "wb13"
-regex = '(foo|bar|[A-Z])\b'
-input = "foo"
-matches = [[0, 3]]
-unicode = false
-
-[[tests]]
-name = "wb14"
-regex = '(foo|bar|[A-Z])\b'
-input = "foo\n"
-matches = [[0, 3]]
-unicode = false
-
-[[tests]]
-name = "wb15"
-regex = '\b(foo|bar|[A-Z])'
-input = "foo"
-matches = [[0, 3]]
-unicode = false
-
-[[tests]]
-name = "wb16"
-regex = '\b(foo|bar|[A-Z])\b'
-input = "X"
-matches = [[0, 1]]
-unicode = false
-
-[[tests]]
-name = "wb17"
-regex = '\b(foo|bar|[A-Z])\b'
-input = "XY"
-matches = []
-unicode = false
-
-[[tests]]
-name = "wb18"
-regex = '\b(foo|bar|[A-Z])\b'
-input = "bar"
-matches = [[0, 3]]
-unicode = false
-
-[[tests]]
-name = "wb19"
-regex = '\b(foo|bar|[A-Z])\b'
-input = "foo"
-matches = [[0, 3]]
-unicode = false
-
-[[tests]]
-name = "wb20"
-regex = '\b(foo|bar|[A-Z])\b'
-input = "foo\n"
-matches = [[0, 3]]
-unicode = false
-
-[[tests]]
-name = "wb21"
-regex = '\b(foo|bar|[A-Z])\b'
-input = "ffoo bbar N x"
-matches = [[10, 11]]
-unicode = false
-
-[[tests]]
-name = "wb22"
-regex = '\b(fo|foo)\b'
-input = "fo"
-matches = [[0, 2]]
-unicode = false
-
-[[tests]]
-name = "wb23"
-regex = '\b(fo|foo)\b'
-input = "foo"
-matches = [[0, 3]]
-unicode = false
-
-[[tests]]
-name = "wb24"
-regex = '\b\b'
-input = ""
-matches = []
-unicode = false
-
-[[tests]]
-name = "wb25"
-regex = '\b\b'
-input = "a"
-matches = [[0, 0], [1, 1]]
-unicode = false
-
-[[tests]]
-name = "wb26"
-regex = '\b$'
-input = ""
-matches = []
-unicode = false
-
-[[tests]]
-name = "wb27"
-regex = '\b$'
-input = "x"
-matches = [[1, 1]]
-unicode = false
-
-[[tests]]
-name = "wb28"
-regex = '\b$'
-input = "y x"
-matches = [[3, 3]]
-unicode = false
-
-[[tests]]
-name = "wb29"
-regex = '(?-u:\b).$'
-input = "x"
-matches = [[0, 1]]
-
-[[tests]]
-name = "wb30"
-regex = '^\b(fo|foo)\b'
-input = "fo"
-matches = [[0, 2]]
-unicode = false
-
-[[tests]]
-name = "wb31"
-regex = '^\b(fo|foo)\b'
-input = "foo"
-matches = [[0, 3]]
-unicode = false
-
-[[tests]]
-name = "wb32"
-regex = '^\b$'
-input = ""
-matches = []
-unicode = false
-
-[[tests]]
-name = "wb33"
-regex = '^\b$'
-input = "x"
-matches = []
-unicode = false
-
-[[tests]]
-name = "wb34"
-regex = '^(?-u:\b).$'
-input = "x"
-matches = [[0, 1]]
-
-[[tests]]
-name = "wb35"
-regex = '^(?-u:\b).(?-u:\b)$'
-input = "x"
-matches = [[0, 1]]
-
-[[tests]]
-name = "wb36"
-regex = '^^^^^\b$$$$$'
-input = ""
-matches = []
-unicode = false
-
-[[tests]]
-name = "wb37"
-regex = '^^^^^(?-u:\b).$$$$$'
-input = "x"
-matches = [[0, 1]]
-
-[[tests]]
-name = "wb38"
-regex = '^^^^^\b$$$$$'
-input = "x"
-matches = []
-unicode = false
-
-[[tests]]
-name = "wb39"
-regex = '^^^^^(?-u:\b\b\b).(?-u:\b\b\b)$$$$$'
-input = "x"
-matches = [[0, 1]]
-
-[[tests]]
-name = "wb40"
-regex = '(?-u:\b).+(?-u:\b)'
-input = "$$abc$$"
-matches = [[2, 5]]
-
-[[tests]]
-name = "wb41"
-regex = '\b'
-input = "a b c"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
-unicode = false
-
-[[tests]]
-name = "wb42"
-regex = '\bfoo\b'
-input = "zzz foo zzz"
-matches = [[4, 7]]
-unicode = false
-
-[[tests]]
-name = "wb43"
-regex = '\b^'
-input = "ab"
-matches = [[0, 0]]
-unicode = false
-
-[[tests]]
-name = "wb44"
-regex = '$\b'
-input = "ab"
-matches = [[2, 2]]
-unicode = false
-
-
-# Tests for \B. Note that \B is not allowed if UTF-8 mode is enabled, so we
-# have to disable UTF-8 mode for most of these tests. This is because \B can
-# match at positions that are not valid UTF-8 codepoint boundaries.
-[[tests]]
-name = "nb1"
-regex = '\Bfoo\B'
-input = "n foo xfoox that"
-matches = [[7, 10]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb2"
-regex = 'a\B'
-input = "faoa x"
-matches = [[1, 2]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb3"
-regex = '\Bbar'
-input = "bar x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb4"
-regex = '\Bbar'
-input = "foo\nbar x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb5"
-regex = 'bar\B'
-input = "foobar"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb6"
-regex = 'bar\B'
-input = "foobar\nxxx"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb7"
-regex = '(foo|bar|[A-Z])\B'
-input = "foox"
-matches = [[0, 3]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb8"
-regex = '(foo|bar|[A-Z])\B'
-input = "foo\n"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb9"
-regex = '\B'
-input = ""
-matches = [[0, 0]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb10"
-regex = '\B'
-input = "x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb11"
-regex = '\B(foo|bar|[A-Z])'
-input = "foo"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb12"
-regex = '\B(foo|bar|[A-Z])\B'
-input = "xXy"
-matches = [[1, 2]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb13"
-regex = '\B(foo|bar|[A-Z])\B'
-input = "XY"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb14"
-regex = '\B(foo|bar|[A-Z])\B'
-input = "XYZ"
-matches = [[1, 2]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb15"
-regex = '\B(foo|bar|[A-Z])\B'
-input = "abara"
-matches = [[1, 4]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb16"
-regex = '\B(foo|bar|[A-Z])\B'
-input = "xfoo_"
-matches = [[1, 4]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb17"
-regex = '\B(foo|bar|[A-Z])\B'
-input = "xfoo\n"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb18"
-regex = '\B(foo|bar|[A-Z])\B'
-input = "foo bar vNX"
-matches = [[9, 10]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb19"
-regex = '\B(fo|foo)\B'
-input = "xfoo"
-matches = [[1, 3]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb20"
-regex = '\B(foo|fo)\B'
-input = "xfooo"
-matches = [[1, 4]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb21"
-regex = '\B\B'
-input = ""
-matches = [[0, 0]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb22"
-regex = '\B\B'
-input = "x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb23"
-regex = '\B$'
-input = ""
-matches = [[0, 0]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb24"
-regex = '\B$'
-input = "x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb25"
-regex = '\B$'
-input = "y x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb26"
-regex = '\B.$'
-input = "x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb27"
-regex = '^\B(fo|foo)\B'
-input = "fo"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb28"
-regex = '^\B(fo|foo)\B'
-input = "fo"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb29"
-regex = '^\B'
-input = ""
-matches = [[0, 0]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb30"
-regex = '^\B'
-input = "x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb31"
-regex = '^\B\B'
-input = ""
-matches = [[0, 0]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb32"
-regex = '^\B\B'
-input = "x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb33"
-regex = '^\B$'
-input = ""
-matches = [[0, 0]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb34"
-regex = '^\B$'
-input = "x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb35"
-regex = '^\B.$'
-input = "x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb36"
-regex = '^\B.\B$'
-input = "x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb37"
-regex = '^^^^^\B$$$$$'
-input = ""
-matches = [[0, 0]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb38"
-regex = '^^^^^\B.$$$$$'
-input = "x"
-matches = []
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "nb39"
-regex = '^^^^^\B$$$$$'
-input = "x"
-matches = []
-unicode = false
-utf8 = false
-
-
-# unicode1* and unicode2* work for both Unicode and ASCII because all matches
-# are reported as byte offsets, and « and » are not word characters at either
-# the codepoint or byte level.
-[[tests]]
-name = "unicode1"
-regex = '\bx\b'
-input = "«x"
-matches = [[2, 3]]
-
-[[tests]]
-name = "unicode1-only-ascii"
-regex = '\bx\b'
-input = "«x"
-matches = [[2, 3]]
-unicode = false
-
-[[tests]]
-name = "unicode2"
-regex = '\bx\b'
-input = "x»"
-matches = [[0, 1]]
-
-[[tests]]
-name = "unicode2-only-ascii"
-regex = '\bx\b'
-input = "x»"
-matches = [[0, 1]]
-unicode = false
-
-# ASCII word boundaries are completely oblivious to Unicode characters, so
-# even though β is a character, an ASCII \b treats it as a word boundary
-# when it is adjacent to another ASCII character. (The ASCII \b only looks
-# at the leading byte of β.) For Unicode \b, the tests are precisely inverted.
-[[tests]]
-name = "unicode3"
-regex = '\bx\b'
-input = 'áxβ'
-matches = []
-
-[[tests]]
-name = "unicode3-only-ascii"
-regex = '\bx\b'
-input = 'áxβ'
-matches = [[2, 3]]
-unicode = false
-
-[[tests]]
-name = "unicode4"
-regex = '\Bx\B'
-input = 'áxβ'
-matches = [[2, 3]]
-
-[[tests]]
-name = "unicode4-only-ascii"
-regex = '\Bx\B'
-input = 'áxβ'
-matches = []
-unicode = false
-utf8 = false
-
-# The same as above, but with \b instead of \B as a sanity check.
-[[tests]]
-name = "unicode5"
-regex = '\b'
-input = "0\U0007EF5E"
-matches = [[0, 0], [1, 1]]
-
-[[tests]]
-name = "unicode5-only-ascii"
-regex = '\b'
-input = "0\U0007EF5E"
-matches = [[0, 0], [1, 1]]
-unicode = false
-utf8 = false
-
-[[tests]]
-name = "unicode5-noutf8"
-regex = '\b'
-input = '0\xFF\xFF\xFF\xFF'
-matches = [[0, 0], [1, 1]]
-unescape = true
-utf8 = false
-
-[[tests]]
-name = "unicode5-noutf8-only-ascii"
-regex = '\b'
-input = '0\xFF\xFF\xFF\xFF'
-matches = [[0, 0], [1, 1]]
-unescape = true
-unicode = false
-utf8 = false
-
-# Weird special case to ensure that ASCII \B treats each individual code unit
-# as a non-word byte. (The specific codepoint is irrelevant. It's an arbitrary
-# codepoint that uses 4 bytes in its UTF-8 encoding and is not a member of the
-# \w character class.)
-[[tests]]
-name = "unicode5-not"
-regex = '\B'
-input = "0\U0007EF5E"
-matches = [[5, 5]]
-
-[[tests]]
-name = "unicode5-not-only-ascii"
-regex = '\B'
-input = "0\U0007EF5E"
-matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
-unicode = false
-utf8 = false
-
-# This gets no matches since \B only matches in the presence of valid UTF-8
-# when Unicode is enabled, even when UTF-8 mode is disabled.
-[[tests]]
-name = "unicode5-not-noutf8"
-regex = '\B'
-input = '0\xFF\xFF\xFF\xFF'
-matches = []
-unescape = true
-utf8 = false
-
-# But this DOES get matches since \B in ASCII mode only looks at individual
-# bytes.
-[[tests]]
-name = "unicode5-not-noutf8-only-ascii"
-regex = '\B'
-input = '0\xFF\xFF\xFF\xFF'
-matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
-unescape = true
-unicode = false
-utf8 = false
-
-# Some tests of no particular significance.
-[[tests]]
-name = "unicode6"
-regex = '\b[0-9]+\b'
-input = "foo 123 bar 456 quux 789"
-matches = [[4, 7], [12, 15], [21, 24]]
-
-[[tests]]
-name = "unicode7"
-regex = '\b[0-9]+\b'
-input = "foo 123 bar a456 quux 789"
-matches = [[4, 7], [22, 25]]
-
-[[tests]]
-name = "unicode8"
-regex = '\b[0-9]+\b'
-input = "foo 123 bar 456a quux 789"
-matches = [[4, 7], [22, 25]]
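
A sketch of the Unicode-vs-ASCII `\b` contrast that the `unicode3*` pair above pins down, assuming the TOML `unicode` knob maps onto `syntax::Config::unicode`:

```rust
use regex_automata::{meta::Regex, util::syntax};

fn main() {
    // Unicode \b: á and β are word characters, so x sits between two
    // word characters and is not on a boundary.
    let re = Regex::new(r"\bx\b").unwrap();
    assert!(re.find("áxβ").is_none());

    // ASCII \b: only bytes matter, and the bytes around x are non-word
    // bytes, so both sides of x are boundaries (cf. "unicode3-only-ascii").
    let re = Regex::builder()
        .syntax(syntax::Config::new().unicode(false))
        .build(r"\bx\b")
        .unwrap();
    let m = re.find("áxβ").unwrap();
    assert_eq!((m.start(), m.end()), (2, 3));
}
```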
diff --git a/vendor/regex-automata/tests/dfa/api.rs b/vendor/regex-automata/tests/dfa/api.rs
index 80d7d704c..96e73af6c 100644
--- a/vendor/regex-automata/tests/dfa/api.rs
+++ b/vendor/regex-automata/tests/dfa/api.rs
@@ -1,13 +1,11 @@
use std::error::Error;
use regex_automata::{
- dfa::{dense, regex::Regex, Automaton, OverlappingState},
+ dfa::{dense, Automaton, OverlappingState},
nfa::thompson,
- HalfMatch, MatchError, MatchKind, MultiMatch,
+ HalfMatch, Input, MatchError,
};
-use crate::util::{BunkPrefilter, SubstringPrefilter};
-
// Tests that quit bytes in the forward direction work correctly.
#[test]
fn quit_fwd() -> Result<(), Box<dyn Error>> {
@@ -16,16 +14,15 @@ fn quit_fwd() -> Result<(), Box<dyn Error>> {
.build("[[:word:]]+$")?;
assert_eq!(
- dfa.find_earliest_fwd(b"abcxyz"),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
- );
- assert_eq!(
- dfa.find_leftmost_fwd(b"abcxyz"),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
+ Err(MatchError::quit(b'x', 3)),
+ dfa.try_search_fwd(&Input::new(b"abcxyz"))
);
assert_eq!(
- dfa.find_overlapping_fwd(b"abcxyz", &mut OverlappingState::start()),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
+ dfa.try_search_overlapping_fwd(
+ &Input::new(b"abcxyz"),
+ &mut OverlappingState::start()
+ ),
+ Err(MatchError::quit(b'x', 3)),
);
Ok(())
@@ -40,12 +37,8 @@ fn quit_rev() -> Result<(), Box<dyn Error>> {
.build("^[[:word:]]+")?;
assert_eq!(
- dfa.find_earliest_rev(b"abcxyz"),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
- );
- assert_eq!(
- dfa.find_leftmost_rev(b"abcxyz"),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
+ Err(MatchError::quit(b'x', 3)),
+ dfa.try_search_rev(&Input::new(b"abcxyz"))
);
Ok(())
@@ -60,28 +53,6 @@ fn quit_panics() {
dense::Config::new().unicode_word_boundary(true).quit(b'\xFF', false);
}
-// Tests that if we attempt an overlapping search using a regex without a
-// reverse DFA compiled with 'starts_for_each_pattern', then we get a panic.
-#[test]
-#[should_panic]
-fn incorrect_config_overlapping_search_panics() {
- let forward = dense::DFA::new(r"abca").unwrap();
- let reverse = dense::Builder::new()
- .configure(
- dense::Config::new()
- .anchored(true)
- .match_kind(MatchKind::All)
- .starts_for_each_pattern(false),
- )
- .thompson(thompson::Config::new().reverse(true))
- .build(r"abca")
- .unwrap();
-
- let re = Regex::builder().build_from_dfas(forward, reverse);
- let haystack = "bar abcabcabca abca foo".as_bytes();
- re.find_overlapping(haystack, &mut OverlappingState::start());
-}
-
// This tests an interesting case where even if the Unicode word boundary option
// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
// word boundaries to be enabled.
@@ -93,41 +64,6 @@ fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
}
let dfa = dense::Builder::new().configure(config).build(r"\b")?;
let expected = HalfMatch::must(0, 1);
- assert_eq!(dfa.find_leftmost_fwd(b" a"), Ok(Some(expected)));
- Ok(())
-}
-
-// Tests that we can provide a prefilter to a Regex, and the search reports
-// correct results.
-#[test]
-fn prefilter_works() -> Result<(), Box<dyn Error>> {
- let re = Regex::new(r"a[0-9]+")
- .unwrap()
- .with_prefilter(SubstringPrefilter::new("a"));
- let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
- let matches: Vec<(usize, usize)> =
- re.find_leftmost_iter(text).map(|m| (m.start(), m.end())).collect();
- assert_eq!(
- matches,
- vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
- );
- Ok(())
-}
-
-// This test confirms that a prefilter is active by using a prefilter that
-// reports false negatives.
-#[test]
-fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
- let text = b"za123";
- let re = Regex::new(r"a[0-9]+")
- .unwrap()
- .with_prefilter(SubstringPrefilter::new("a"));
- assert_eq!(re.find_leftmost(b"za123"), Some(MultiMatch::must(0, 1, 5)));
- assert_eq!(re.find_leftmost(b"a123"), Some(MultiMatch::must(0, 0, 4)));
- let re = re.with_prefilter(BunkPrefilter::new());
- assert_eq!(re.find_leftmost(b"za123"), None);
- // This checks that the prefilter is used when first starting the search,
- // instead of waiting until at least one transition has occurred.
- assert_eq!(re.find_leftmost(b"a123"), None);
+ assert_eq!(Ok(Some(expected)), dfa.try_search_fwd(&Input::new(b" a")));
Ok(())
}
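A self-contained sketch of the quit-byte behavior the rewritten `quit_fwd` test asserts: marking a byte as a quit byte makes the DFA return an error, rather than a wrong answer, when the search encounters it. The `quit(b'x', true)` config call is an assumption reconstructed from the truncated hunk above.

```rust
use regex_automata::{
    dfa::{dense, Automaton},
    Input, MatchError,
};

fn main() {
    // Treat b'x' as a quit byte; the search then fails at offset 3 of
    // "abcxyz", matching the assertion in the diff above.
    let dfa = dense::Builder::new()
        .configure(dense::Config::new().quit(b'x', true))
        .build(r"[[:word:]]+$")
        .unwrap();
    assert_eq!(
        Err(MatchError::quit(b'x', 3)),
        dfa.try_search_fwd(&Input::new("abcxyz")),
    );
}
```

Quit bytes exist so a DFA can bail out of inputs it cannot model exactly; as the `unicode_word_implicitly_works` test above notes, making every non-ASCII byte a quit byte is also what lets a DFA support Unicode word boundaries on ASCII-only haystacks.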
diff --git a/vendor/regex-automata/tests/dfa/mod.rs b/vendor/regex-automata/tests/dfa/mod.rs
index f4299510c..0d8f539db 100644
--- a/vendor/regex-automata/tests/dfa/mod.rs
+++ b/vendor/regex-automata/tests/dfa/mod.rs
@@ -1,2 +1,8 @@
+#[cfg(all(feature = "dfa-build", feature = "dfa-search"))]
mod api;
+#[cfg(feature = "dfa-onepass")]
+mod onepass;
+#[cfg(all(feature = "dfa-build", feature = "dfa-search"))]
+mod regression;
+#[cfg(all(not(miri), feature = "dfa-build", feature = "dfa-search"))]
mod suite;
diff --git a/vendor/regex-automata/tests/dfa/onepass/mod.rs b/vendor/regex-automata/tests/dfa/onepass/mod.rs
new file mode 100644
index 000000000..9d6ab475e
--- /dev/null
+++ b/vendor/regex-automata/tests/dfa/onepass/mod.rs
@@ -0,0 +1,2 @@
+#[cfg(not(miri))]
+mod suite;
diff --git a/vendor/regex-automata/tests/dfa/onepass/suite.rs b/vendor/regex-automata/tests/dfa/onepass/suite.rs
new file mode 100644
index 000000000..20bd6965c
--- /dev/null
+++ b/vendor/regex-automata/tests/dfa/onepass/suite.rs
@@ -0,0 +1,197 @@
+use {
+ anyhow::Result,
+ regex_automata::{
+ dfa::onepass::{self, DFA},
+ nfa::thompson,
+ util::{iter, syntax},
+ },
+ regex_test::{
+ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
+ TestRunner,
+ },
+};
+
+use crate::{create_input, suite, testify_captures, untestify_kind};
+
+const EXPANSIONS: &[&str] = &["is_match", "find", "captures"];
+
+/// Tests the default configuration of the one-pass DFA.
+#[test]
+fn default() -> Result<()> {
+ let builder = DFA::builder();
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Tests the one-pass DFA when 'starts_for_each_pattern' is enabled for all
+/// tests.
+#[test]
+fn starts_for_each_pattern() -> Result<()> {
+ let mut builder = DFA::builder();
+ builder.configure(DFA::config().starts_for_each_pattern(true));
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Tests the one-pass DFA when byte classes are disabled.
+///
+/// N.B. Disabling byte classes doesn't avoid any indirection at search time.
+/// All it does is cause every byte value to be its own distinct equivalence
+/// class.
+#[test]
+fn no_byte_classes() -> Result<()> {
+ let mut builder = DFA::builder();
+ builder.configure(DFA::config().byte_classes(false));
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
+ Ok(())
+}
+
+fn compiler(
+ mut builder: onepass::Builder,
+) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
+ move |test, regexes| {
+ // Check if our regex contains things that aren't supported by DFAs.
+ // That is, Unicode word boundaries when searching non-ASCII text.
+ if !configure_onepass_builder(test, &mut builder) {
+ return Ok(CompiledRegex::skip());
+ }
+ let re = match builder.build_many(&regexes) {
+ Ok(re) => re,
+ Err(err) => {
+ let msg = err.to_string();
+ // This is pretty gross, but when a regex fails to compile as
+ // a one-pass regex, then we want to be OK with that and just
+ // skip the test. But we have to be careful to only skip it
+ // when the expected result is that the regex compiles. If
+ // the test is specifically checking that the regex does not
+ // compile, then we should bubble up that error and allow the
+ // test to pass.
+ //
+ // Since our error types are all generally opaque, we just
+ // look for an error string. Not great, but not the end of the
+ // world.
+ if test.compiles() && msg.contains("not one-pass") {
+ return Ok(CompiledRegex::skip());
+ }
+ return Err(err.into());
+ }
+ };
+ let mut cache = re.create_cache();
+ Ok(CompiledRegex::compiled(move |test| -> TestResult {
+ run_test(&re, &mut cache, test)
+ }))
+ }
+}
+
+fn run_test(
+ re: &DFA,
+ cache: &mut onepass::Cache,
+ test: &RegexTest,
+) -> TestResult {
+ let input = create_input(test);
+ match test.additional_name() {
+ "is_match" => {
+ TestResult::matched(re.is_match(cache, input.earliest(true)))
+ }
+ "find" => match test.search_kind() {
+ SearchKind::Earliest | SearchKind::Leftmost => {
+ let input =
+ input.earliest(test.search_kind() == SearchKind::Earliest);
+ let mut caps = re.create_captures();
+ let it = iter::Searcher::new(input)
+ .into_matches_iter(|input| {
+ re.try_search(cache, input, &mut caps)?;
+ Ok(caps.get_match())
+ })
+ .infallible()
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ span: Span { start: m.start(), end: m.end() },
+ });
+ TestResult::matches(it)
+ }
+ SearchKind::Overlapping => {
+ // The one-pass DFA does not support any kind of overlapping
+ // search. This is not just a matter of not having the API.
+ // It's fundamentally incompatible with the one-pass concept.
+ // If overlapping matches were possible, then the one-pass DFA
+ // would fail to build.
+ TestResult::skip()
+ }
+ },
+ "captures" => match test.search_kind() {
+ SearchKind::Earliest | SearchKind::Leftmost => {
+ let input =
+ input.earliest(test.search_kind() == SearchKind::Earliest);
+ let it = iter::Searcher::new(input)
+ .into_captures_iter(re.create_captures(), |input, caps| {
+ re.try_search(cache, input, caps)
+ })
+ .infallible()
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|caps| testify_captures(&caps));
+ TestResult::captures(it)
+ }
+ SearchKind::Overlapping => {
+ // The one-pass DFA does not support any kind of overlapping
+ // search. This is not just a matter of not having the API.
+ // It's fundamentally incompatible with the one-pass concept.
+ // If overlapping matches were possible, then the one-pass DFA
+ // would fail to build.
+ TestResult::skip()
+ }
+ },
+ name => TestResult::fail(&format!("unrecognized test name: {}", name)),
+ }
+}
+
+/// Configures the given regex builder with all relevant settings on the given
+/// regex test.
+///
+/// If the regex test has a setting that is unsupported, then this returns
+/// false (implying the test should be skipped).
+fn configure_onepass_builder(
+ test: &RegexTest,
+ builder: &mut onepass::Builder,
+) -> bool {
+ if !test.anchored() {
+ return false;
+ }
+ let match_kind = match untestify_kind(test.match_kind()) {
+ None => return false,
+ Some(k) => k,
+ };
+
+ let config = DFA::config().match_kind(match_kind);
+ builder
+ .configure(config)
+ .syntax(config_syntax(test))
+ .thompson(config_thompson(test));
+ true
+}
+
+/// Configuration of a Thompson NFA compiler from a regex test.
+fn config_thompson(test: &RegexTest) -> thompson::Config {
+ let mut lookm = regex_automata::util::look::LookMatcher::new();
+ lookm.set_line_terminator(test.line_terminator());
+ thompson::Config::new().utf8(test.utf8()).look_matcher(lookm)
+}
+
+/// Configuration of the regex parser from a regex test.
+fn config_syntax(test: &RegexTest) -> syntax::Config {
+ syntax::Config::new()
+ .case_insensitive(test.case_insensitive())
+ .unicode(test.unicode())
+ .utf8(test.utf8())
+ .line_terminator(test.line_terminator())
+}
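For context, a minimal sketch of the one-pass DFA API that `run_test` above drives, i.e., build, create a cache and captures, then search. The pattern and group names here are illustrative, not taken from this diff:

```
use regex_automata::{dfa::onepass::DFA, Match};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // One-pass DFAs are always anchored and support capture groups.
    let re = DFA::new(r"(?P<y>[0-9]{4})-(?P<m>[0-9]{2})")?;
    let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    let haystack = "2023-08";
    re.captures(&mut cache, haystack, &mut caps);
    assert_eq!(Some(Match::must(0, 0..7)), caps.get_match());
    // Named capture groups are available too.
    let span = caps.get_group_by_name("y").unwrap();
    assert_eq!("2023", &haystack[span.range()]);
    Ok(())
}
```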
diff --git a/vendor/regex-automata/tests/regression.rs b/vendor/regex-automata/tests/dfa/regression.rs
index e5355fed7..09caffabc 100644
--- a/vendor/regex-automata/tests/regression.rs
+++ b/vendor/regex-automata/tests/dfa/regression.rs
@@ -1,13 +1,14 @@
-use regex_automata::{
- dfa::{dense, Automaton},
- MatchError,
-};
-
// A regression test for checking that minimization correctly translates
// whether a state is a match state or not. Previously, it was possible for
// minimization to mark a non-matching state as matching.
#[test]
+#[cfg(not(miri))]
fn minimize_sets_correct_match_states() {
+ use regex_automata::{
+ dfa::{dense::DFA, Automaton, StartKind},
+ Anchored, Input,
+ };
+
let pattern =
// This is a subset of the grapheme matching regex. I couldn't seem
// to get a repro any smaller than this unfortunately.
@@ -36,9 +37,12 @@ fn minimize_sets_correct_match_states() {
)
";
- let dfa = dense::Builder::new()
- .configure(dense::Config::new().anchored(true).minimize(true))
+ let dfa = DFA::builder()
+ .configure(
+ DFA::config().start_kind(StartKind::Anchored).minimize(true),
+ )
.build(pattern)
.unwrap();
- assert_eq!(Ok(None), dfa.find_leftmost_fwd(b"\xE2"));
+ let input = Input::new(b"\xE2").anchored(Anchored::Yes);
+ assert_eq!(Ok(None), dfa.try_search_fwd(&input));
}
diff --git a/vendor/regex-automata/tests/dfa/suite.rs b/vendor/regex-automata/tests/dfa/suite.rs
index 426ae346d..f3445e02a 100644
--- a/vendor/regex-automata/tests/dfa/suite.rs
+++ b/vendor/regex-automata/tests/dfa/suite.rs
@@ -1,23 +1,77 @@
-use regex_automata::{
- dfa::{self, dense, regex::Regex, sparse, Automaton},
- nfa::thompson,
- MatchKind, SyntaxConfig,
+use {
+ anyhow::Result,
+ regex_automata::{
+ dfa::{
+ self, dense, regex::Regex, sparse, Automaton, OverlappingState,
+ StartKind,
+ },
+ nfa::thompson,
+ util::{prefilter::Prefilter, syntax},
+ Anchored, Input, PatternSet,
+ },
+ regex_syntax::hir,
+ regex_test::{
+ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
+ TestRunner,
+ },
};
-use regex_syntax as syntax;
-use regex_test::{
- bstr::{BString, ByteSlice},
- CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests,
- SearchKind as TestSearchKind, TestResult, TestRunner,
-};
+use crate::{create_input, suite, untestify_kind};
-use crate::{suite, Result};
+const EXPANSIONS: &[&str] = &["is_match", "find", "which"];
/// Runs the test suite with the default configuration.
#[test]
fn unminimized_default() -> Result<()> {
let builder = Regex::builder();
TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .blacklist("expensive")
+ .test_iter(suite()?.iter(), dense_compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Runs the test suite with the default configuration and a prefilter enabled,
+/// if one can be built.
+#[test]
+fn unminimized_prefilter() -> Result<()> {
+ let my_compiler = |test: &RegexTest, regexes: &[String]| {
+ // Parse regexes as HIRs so we can get literals to build a prefilter.
+ let mut hirs = vec![];
+ for pattern in regexes.iter() {
+ hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
+ }
+ let kind = match untestify_kind(test.match_kind()) {
+ None => return Ok(CompiledRegex::skip()),
+ Some(kind) => kind,
+ };
+ let pre = Prefilter::from_hirs_prefix(kind, &hirs);
+ let mut builder = Regex::builder();
+ builder.dense(dense::DFA::config().prefilter(pre));
+ compiler(builder, |_, _, re| {
+ Ok(CompiledRegex::compiled(move |test| -> TestResult {
+ run_test(&re, test)
+ }))
+ })(test, regexes)
+ };
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .blacklist("expensive")
+ .test_iter(suite()?.iter(), my_compiler)
+ .assert();
+ Ok(())
+}
+
+/// Runs the test suite with start states specialized.
+#[test]
+fn unminimized_specialized_start_states() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.dense(dense::Config::new().specialize_start_states(true));
+
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .blacklist("expensive")
.test_iter(suite()?.iter(), dense_compiler(builder))
.assert();
Ok(())
@@ -30,18 +84,22 @@ fn unminimized_no_byte_class() -> Result<()> {
builder.dense(dense::Config::new().byte_classes(false));
TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .blacklist("expensive")
.test_iter(suite()?.iter(), dense_compiler(builder))
.assert();
Ok(())
}
-/// Runs the test suite with NFA shrinking disabled.
+/// Runs the test suite with NFA shrinking enabled.
#[test]
-fn unminimized_no_nfa_shrink() -> Result<()> {
+fn unminimized_nfa_shrink() -> Result<()> {
let mut builder = Regex::builder();
- builder.thompson(thompson::Config::new().shrink(false));
+ builder.thompson(thompson::Config::new().shrink(true));
TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .blacklist("expensive")
.test_iter(suite()?.iter(), dense_compiler(builder))
.assert();
Ok(())
@@ -54,7 +112,7 @@ fn minimized_default() -> Result<()> {
let mut builder = Regex::builder();
builder.dense(dense::Config::new().minimize(true));
TestRunner::new()?
- // These regexes tend to be too big. Minimization takes... forever.
+ .expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), dense_compiler(builder))
.assert();
@@ -68,7 +126,7 @@ fn minimized_no_byte_class() -> Result<()> {
builder.dense(dense::Config::new().minimize(true).byte_classes(false));
TestRunner::new()?
- // These regexes tend to be too big. Minimization takes... forever.
+ .expand(EXPANSIONS, |t| t.compiles())
.blacklist("expensive")
.test_iter(suite()?.iter(), dense_compiler(builder))
.assert();
@@ -80,22 +138,57 @@ fn minimized_no_byte_class() -> Result<()> {
fn sparse_unminimized_default() -> Result<()> {
let builder = Regex::builder();
TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .blacklist("expensive")
.test_iter(suite()?.iter(), sparse_compiler(builder))
.assert();
Ok(())
}
+/// Runs the test suite on a sparse unminimized DFA with prefilters enabled.
+#[test]
+fn sparse_unminimized_prefilter() -> Result<()> {
+ let my_compiler = |test: &RegexTest, regexes: &[String]| {
+ // Parse regexes as HIRs so we can get literals to build a prefilter.
+ let mut hirs = vec![];
+ for pattern in regexes.iter() {
+ hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
+ }
+ let kind = match untestify_kind(test.match_kind()) {
+ None => return Ok(CompiledRegex::skip()),
+ Some(kind) => kind,
+ };
+ let pre = Prefilter::from_hirs_prefix(kind, &hirs);
+ let mut builder = Regex::builder();
+ builder.dense(dense::DFA::config().prefilter(pre));
+ compiler(builder, |builder, _, re| {
+ let fwd = re.forward().to_sparse()?;
+ let rev = re.reverse().to_sparse()?;
+ let re = builder.build_from_dfas(fwd, rev);
+ Ok(CompiledRegex::compiled(move |test| -> TestResult {
+ run_test(&re, test)
+ }))
+ })(test, regexes)
+ };
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .blacklist("expensive")
+ .test_iter(suite()?.iter(), my_compiler)
+ .assert();
+ Ok(())
+}
+
/// Another basic sanity test that checks we can serialize and then deserialize
/// a regex, and that the resulting regex can be used for searching correctly.
#[test]
fn serialization_unminimized_default() -> Result<()> {
let builder = Regex::builder();
let my_compiler = |builder| {
- compiler(builder, |builder, re| {
+ compiler(builder, |builder, _, re| {
let builder = builder.clone();
let (fwd_bytes, _) = re.forward().to_bytes_native_endian();
let (rev_bytes, _) = re.reverse().to_bytes_native_endian();
- Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+ Ok(CompiledRegex::compiled(move |test| -> TestResult {
let fwd: dense::DFA<&[u32]> =
dense::DFA::from_bytes(&fwd_bytes).unwrap().0;
let rev: dense::DFA<&[u32]> =
@@ -107,6 +200,8 @@ fn serialization_unminimized_default() -> Result<()> {
})
};
TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .blacklist("expensive")
.test_iter(suite()?.iter(), my_compiler(builder))
.assert();
Ok(())
@@ -119,11 +214,11 @@ fn serialization_unminimized_default() -> Result<()> {
fn sparse_serialization_unminimized_default() -> Result<()> {
let builder = Regex::builder();
let my_compiler = |builder| {
- compiler(builder, |builder, re| {
+ compiler(builder, |builder, _, re| {
let builder = builder.clone();
let fwd_bytes = re.forward().to_sparse()?.to_bytes_native_endian();
let rev_bytes = re.reverse().to_sparse()?.to_bytes_native_endian();
- Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+ Ok(CompiledRegex::compiled(move |test| -> TestResult {
let fwd: sparse::DFA<&[u8]> =
sparse::DFA::from_bytes(&fwd_bytes).unwrap().0;
let rev: sparse::DFA<&[u8]> =
@@ -134,6 +229,8 @@ fn sparse_serialization_unminimized_default() -> Result<()> {
})
};
TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .blacklist("expensive")
.test_iter(suite()?.iter(), my_compiler(builder))
.assert();
Ok(())
@@ -141,9 +238,9 @@ fn sparse_serialization_unminimized_default() -> Result<()> {
fn dense_compiler(
builder: dfa::regex::Builder,
-) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
- compiler(builder, |_, re| {
- Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
+ compiler(builder, |_, _, re| {
+ Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, test)
}))
})
@@ -151,12 +248,12 @@ fn dense_compiler(
fn sparse_compiler(
builder: dfa::regex::Builder,
-) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
- compiler(builder, |builder, re| {
+) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
+ compiler(builder, |builder, _, re| {
let fwd = re.forward().to_sparse()?;
let rev = re.reverse().to_sparse()?;
let re = builder.build_from_dfas(fwd, rev);
- Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+ Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, test)
}))
})
@@ -166,79 +263,79 @@ fn compiler(
mut builder: dfa::regex::Builder,
mut create_matcher: impl FnMut(
&dfa::regex::Builder,
+ Option<Prefilter>,
Regex,
) -> Result<CompiledRegex>,
-) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
move |test, regexes| {
- let regexes = regexes
- .iter()
- .map(|r| r.to_str().map(|s| s.to_string()))
- .collect::<std::result::Result<Vec<String>, _>>()?;
+ // Parse regexes as HIRs for some analysis below.
+ let mut hirs = vec![];
+ for pattern in regexes.iter() {
+ hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
+ }
+
+ // Get a prefilter in case the test wants it.
+ let kind = match untestify_kind(test.match_kind()) {
+ None => return Ok(CompiledRegex::skip()),
+ Some(kind) => kind,
+ };
+ let pre = Prefilter::from_hirs_prefix(kind, &hirs);
// Check if our regex contains things that aren't supported by DFAs.
// That is, Unicode word boundaries when searching non-ASCII text.
- let mut thompson = thompson::Builder::new();
- thompson.configure(config_thompson(test));
- // TODO: Modify Hir to report facts like this, instead of needing to
- // build an NFA to do it.
- if let Ok(nfa) = thompson.build_many(&regexes) {
- let non_ascii = test.input().iter().any(|&b| !b.is_ascii());
- if nfa.has_word_boundary_unicode() && non_ascii {
- return Ok(CompiledRegex::skip());
+ if !test.haystack().is_ascii() {
+ for hir in hirs.iter() {
+ let looks = hir.properties().look_set();
+ if looks.contains(hir::Look::WordUnicode)
+ || looks.contains(hir::Look::WordUnicodeNegate)
+ {
+ return Ok(CompiledRegex::skip());
+ }
}
}
if !configure_regex_builder(test, &mut builder) {
return Ok(CompiledRegex::skip());
}
- create_matcher(&builder, builder.build_many(&regexes)?)
+ create_matcher(&builder, pre, builder.build_many(&regexes)?)
}
}
-fn run_test<A: Automaton>(re: &Regex<A>, test: &RegexTest) -> Vec<TestResult> {
- let is_match = if re.is_match(test.input()) {
- TestResult::matched()
- } else {
- TestResult::no_match()
- };
- let is_match = is_match.name("is_match");
-
- let find_matches = match test.search_kind() {
- TestSearchKind::Earliest => {
- let it = re
- .find_earliest_iter(test.input())
- .take(test.match_limit().unwrap_or(std::usize::MAX))
- .map(|m| Match {
- id: m.pattern().as_usize(),
- start: m.start(),
- end: m.end(),
- });
- TestResult::matches(it).name("find_earliest_iter")
- }
- TestSearchKind::Leftmost => {
- let it = re
- .find_leftmost_iter(test.input())
- .take(test.match_limit().unwrap_or(std::usize::MAX))
- .map(|m| Match {
- id: m.pattern().as_usize(),
- start: m.start(),
- end: m.end(),
- });
- TestResult::matches(it).name("find_leftmost_iter")
- }
- TestSearchKind::Overlapping => {
- let it = re
- .find_overlapping_iter(test.input())
- .take(test.match_limit().unwrap_or(std::usize::MAX))
- .map(|m| Match {
- id: m.pattern().as_usize(),
- start: m.start(),
- end: m.end(),
- });
- TestResult::matches(it).name("find_overlapping_iter")
- }
- };
-
- vec![is_match, find_matches]
+fn run_test<A: Automaton>(re: &Regex<A>, test: &RegexTest) -> TestResult {
+ let input = create_input(test);
+ match test.additional_name() {
+ "is_match" => TestResult::matched(re.is_match(input.earliest(true))),
+ "find" => match test.search_kind() {
+ SearchKind::Earliest | SearchKind::Leftmost => {
+ let input =
+ input.earliest(test.search_kind() == SearchKind::Earliest);
+ TestResult::matches(
+ re.find_iter(input)
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ span: Span { start: m.start(), end: m.end() },
+ }),
+ )
+ }
+ SearchKind::Overlapping => {
+ try_search_overlapping(re, &input).unwrap()
+ }
+ },
+ "which" => match test.search_kind() {
+ SearchKind::Earliest | SearchKind::Leftmost => {
+ // There are no "which" APIs for standard searches.
+ TestResult::skip()
+ }
+ SearchKind::Overlapping => {
+ let dfa = re.forward();
+ let mut patset = PatternSet::new(dfa.pattern_len());
+ dfa.try_which_overlapping_matches(&input, &mut patset)
+ .unwrap();
+ TestResult::which(patset.iter().map(|p| p.as_usize()))
+ }
+ },
+ name => TestResult::fail(&format!("unrecognized test name: {}", name)),
+ }
}
/// Configures the given regex builder with all relevant settings on the given
@@ -250,25 +347,32 @@ fn configure_regex_builder(
test: &RegexTest,
builder: &mut dfa::regex::Builder,
) -> bool {
- let match_kind = match test.match_kind() {
- TestMatchKind::All => MatchKind::All,
- TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst,
- TestMatchKind::LeftmostLongest => return false,
+ let match_kind = match untestify_kind(test.match_kind()) {
+ None => return false,
+ Some(k) => k,
};
- let syntax_config = SyntaxConfig::new()
- .case_insensitive(test.case_insensitive())
- .unicode(test.unicode())
- .utf8(test.utf8());
- let dense_config = dense::Config::new()
- .anchored(test.anchored())
+ let starts = if test.anchored() {
+ StartKind::Anchored
+ } else {
+ StartKind::Unanchored
+ };
+ let mut dense_config = dense::Config::new()
+ .start_kind(starts)
.match_kind(match_kind)
.unicode_word_boundary(true);
- let regex_config = Regex::config().utf8(test.utf8());
+ // When doing an overlapping search, we might try to find the start of each
+ // match with a custom search routine. In that case, we need to tell the
+ // reverse search (for the start offset) which pattern to look for. The
+ // only way that API works is when anchored starting states are compiled
+ // for each pattern. This does technically also enable it for the forward
+ // DFA, but we're okay with that.
+ if test.search_kind() == SearchKind::Overlapping {
+ dense_config = dense_config.starts_for_each_pattern(true);
+ }
builder
- .configure(regex_config)
- .syntax(syntax_config)
+ .syntax(config_syntax(test))
.thompson(config_thompson(test))
.dense(dense_config);
true
@@ -276,5 +380,68 @@ fn configure_regex_builder(
/// Configuration of a Thompson NFA compiler from a regex test.
fn config_thompson(test: &RegexTest) -> thompson::Config {
- thompson::Config::new().utf8(test.utf8())
+ let mut lookm = regex_automata::util::look::LookMatcher::new();
+ lookm.set_line_terminator(test.line_terminator());
+ thompson::Config::new().utf8(test.utf8()).look_matcher(lookm)
+}
+
+/// Configuration of the regex syntax from a regex test.
+fn config_syntax(test: &RegexTest) -> syntax::Config {
+ syntax::Config::new()
+ .case_insensitive(test.case_insensitive())
+ .unicode(test.unicode())
+ .utf8(test.utf8())
+ .line_terminator(test.line_terminator())
+}
+
+/// Execute an overlapping search, and for each match found, also find its
+/// overlapping starting positions.
+///
+/// N.B. This routine used to be part of the crate API, but 1) it wasn't clear
+/// to me how useful it was and 2) it wasn't clear to me what its semantics
+/// should be. In particular, a potentially surprising footgun of this routine
+/// is that it is worst case *quadratic* in the size of the haystack. Namely, it's
+/// possible to report a match at every position, and for every such position,
+/// scan all the way to the beginning of the haystack to find the starting
+/// position. Typical leftmost non-overlapping searches don't suffer from this
+/// because, well, matches can't overlap. So subsequent searches after a match
+/// is found don't revisit previously scanned parts of the haystack.
+///
+/// Its semantics can be strange for other reasons too. For example, given
+/// the regex '.*' and the haystack 'zz', the full set of overlapping matches
+/// is: [0, 0], [1, 1], [0, 1], [2, 2], [1, 2], [0, 2]. The ordering of
+/// those matches is quite strange, but makes sense when you think about the
+/// implementation: an end offset is found left-to-right, and then one or more
+/// starting offsets are found right-to-left.
+///
+/// Nevertheless, we provide this routine in our test suite because it's
+/// useful to test the low level DFA overlapping search and our test suite
+/// is written in a way that requires starting offsets.
+fn try_search_overlapping<A: Automaton>(
+ re: &Regex<A>,
+ input: &Input<'_>,
+) -> Result<TestResult> {
+ let mut matches = vec![];
+ let mut fwd_state = OverlappingState::start();
+ let (fwd_dfa, rev_dfa) = (re.forward(), re.reverse());
+ while let Some(end) = {
+ fwd_dfa.try_search_overlapping_fwd(input, &mut fwd_state)?;
+ fwd_state.get_match()
+ } {
+ let revsearch = input
+ .clone()
+ .range(input.start()..end.offset())
+ .anchored(Anchored::Pattern(end.pattern()))
+ .earliest(false);
+ let mut rev_state = OverlappingState::start();
+ while let Some(start) = {
+ rev_dfa.try_search_overlapping_rev(&revsearch, &mut rev_state)?;
+ rev_state.get_match()
+ } {
+ let span = Span { start: start.offset(), end: end.offset() };
+ let mat = Match { id: end.pattern().as_usize(), span };
+ matches.push(mat);
+ }
+ }
+ Ok(TestResult::matches(matches))
}
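To make the ordering described above concrete, here is a sketch that drives only the forward half of the overlapping search on the `.*`/`"zz"` example from the comment: end offsets are discovered left to right, after which the reverse scan recovers each match's starts right to left. The pattern and haystack come from the comment above; the rest assumes the documented overlapping API, and the exact offsets follow from the delayed-by-one match semantics.

```
use regex_automata::{
    dfa::{dense, Automaton, OverlappingState},
    Input, MatchKind,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Overlapping searches require MatchKind::All, since leftmost match
    // kinds prune exactly the states an overlapping search must visit.
    let dfa = dense::Builder::new()
        .configure(dense::DFA::config().match_kind(MatchKind::All))
        .build(r".*")?;
    let input = Input::new("zz");
    let mut state = OverlappingState::start();
    let mut ends = vec![];
    while let Some(m) = {
        dfa.try_search_overlapping_fwd(&input, &mut state)?;
        state.get_match()
    } {
        ends.push(m.offset());
    }
    // One match end per haystack position, reported left to right.
    assert_eq!(vec![0, 1, 2], ends);
    Ok(())
}
```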
diff --git a/vendor/regex-automata/tests/fuzz/dense.rs b/vendor/regex-automata/tests/fuzz/dense.rs
new file mode 100644
index 000000000..213891b3e
--- /dev/null
+++ b/vendor/regex-automata/tests/fuzz/dense.rs
@@ -0,0 +1,52 @@
+// This test was found by a fuzzer input that crafted a way to provide
+// an invalid serialization of ByteClasses that passed our verification.
+// Specifically, the verification step in the deserialization of ByteClasses
+// used an iterator that depends on part of the serialized bytes being correct.
+// (Specifically, the encoding of the number of classes.)
+#[test]
+fn invalid_byte_classes() {
+ let data = include_bytes!(
+ "testdata/deserialize_dense_crash-9486fb7c8a93b12c12a62166b43d31640c0208a9",
+ );
+ let _ = fuzz_run(data);
+}
+
+#[test]
+fn invalid_byte_classes_min() {
+ let data = include_bytes!(
+ "testdata/deserialize_dense_minimized-from-9486fb7c8a93b12c12a62166b43d31640c0208a9",
+ );
+ let _ = fuzz_run(data);
+}
+
+// This is the code from the fuzz target. Kind of sucks to duplicate it here,
+// but this is fundamentally how we interpret the data.
+fn fuzz_run(given_data: &[u8]) -> Option<()> {
+ use regex_automata::dfa::Automaton;
+
+ if given_data.len() < 2 {
+ return None;
+ }
+ let haystack_len = usize::from(given_data[0]);
+ let haystack = given_data.get(1..1 + haystack_len)?;
+ let given_dfa_bytes = given_data.get(1 + haystack_len..)?;
+
+ // We help the fuzzer along by adding a preamble to the bytes that should
+ // at least make these first parts valid. The preamble expects a very
+ // specific sequence of bytes, so it makes sense to just force this.
+ let label = "rust-regex-automata-dfa-dense\x00\x00\x00";
+ assert_eq!(0, label.len() % 4);
+ let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec();
+ let version_check = 2u32.to_ne_bytes().to_vec();
+ let mut dfa_bytes: Vec<u8> = vec![];
+ dfa_bytes.extend(label.as_bytes());
+ dfa_bytes.extend(&endianness_check);
+ dfa_bytes.extend(&version_check);
+ dfa_bytes.extend(given_dfa_bytes);
+ // This is the real test: checking that any input we give to
+ // DFA::from_bytes will never result in a panic.
+ let (dfa, _) =
+ regex_automata::dfa::dense::DFA::from_bytes(&dfa_bytes).ok()?;
+ let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack));
+ Some(())
+}
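The property both copies of this harness check is that deserialization is total: arbitrary input to `from_bytes` must produce an error, never a panic or undefined behavior. A standalone sketch of that contract (the junk bytes are arbitrary):

```
use regex_automata::dfa::dense;

fn main() {
    // These bytes do not form a valid serialized DFA, so deserialization
    // must be rejected gracefully via an error rather than a panic.
    let junk = vec![0xFFu8; 64];
    assert!(dense::DFA::from_bytes(&junk).is_err());
}
```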
diff --git a/vendor/regex-automata/tests/fuzz/mod.rs b/vendor/regex-automata/tests/fuzz/mod.rs
new file mode 100644
index 000000000..960cb4251
--- /dev/null
+++ b/vendor/regex-automata/tests/fuzz/mod.rs
@@ -0,0 +1,2 @@
+mod dense;
+mod sparse;
diff --git a/vendor/regex-automata/tests/fuzz/sparse.rs b/vendor/regex-automata/tests/fuzz/sparse.rs
new file mode 100644
index 000000000..837ad1014
--- /dev/null
+++ b/vendor/regex-automata/tests/fuzz/sparse.rs
@@ -0,0 +1,132 @@
+// This is a regression test for a bug in how special states are handled. The
+// fuzzer found a case where a state returned true for 'is_special_state' but
+// *didn't* return true for 'is_dead_state', 'is_quit_state', 'is_match_state',
+// 'is_start_state' or 'is_accel_state'. This in turn tripped a debug assertion
+// in the core matching loop that requires 'is_special_state' being true to
+// imply that one of the other routines returns true.
+//
+// We fixed this by adding some validation to both dense and sparse DFAs that
+// checks that this property is true for every state ID in the DFA.
+#[test]
+fn invalid_special_state() {
+ let data = include_bytes!(
+ "testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421505838",
+ );
+ let _ = fuzz_run(data);
+}
+
+// This is an interesting case where a fuzzer generated a DFA with
+// a transition to a state ID that decoded as a valid state, but
+// where the ID itself did not point to one of the two existing
+// states for this particular DFA. This combined with marking this
+// transition's state ID as special but without actually making one of the
+// 'is_{dead,quit,match,start,accel}_state' predicates return true ended up
+// tripping the 'debug_assert(dfa.is_quit_state(sid))' code in the search
+// routine.
+//
+// We fixed this in alloc mode by checking that every transition points to a
+// valid state ID. Technically this bug still exists in core-only mode, but
+// it's not clear how to fix it. And it's worth pointing out that the search
+// routine won't panic in production. It will just provide invalid results. And
+// that's acceptable within the contract of DFA::from_bytes.
+#[test]
+fn transition_to_invalid_but_valid_state() {
+ let data = include_bytes!(
+ "testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd1460eff9",
+ );
+ let _ = fuzz_run(data);
+}
+
+// Another one caught by the fuzzer where it generated a DFA that reported a
+// start state as a match state. Since matches are always delayed by one byte,
+// start states specifically cannot be match states. And indeed, the search
+// code relies on this.
+#[test]
+fn start_state_is_not_match_state() {
+ let data = include_bytes!(
+ "testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c72b000",
+ );
+ let _ = fuzz_run(data);
+}
+
+// This is a variation on 'transition_to_invalid_but_valid_state', but happens
+// to a start state. Namely, the fuzz data here builds a DFA with a start
+// state ID that is incorrect but points to a sequence of bytes that satisfies
+// state decoding validation. This errant state in turn has a non-zero number
+// of transitions, and it's those transitions that point to a state that does
+// *not* satisfy state decoding validation. But we never checked those. So the
+// fix here was to add validation of the transitions off of the start state.
+#[test]
+fn start_state_has_valid_transitions() {
+ let data = include_bytes!(
+ "testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd234b98",
+ );
+ let _ = fuzz_run(data);
+}
+
+// This fuzz input generated a DFA with a state whose ID was in the match state
+// ID range, but where the state itself was encoded with zero pattern IDs. We
+// added validation code to check this case.
+#[test]
+fn match_state_inconsistency() {
+ let data = include_bytes!(
+ "testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011816570",
+ );
+ let _ = fuzz_run(data);
+}
+
+// This fuzz input generated a DFA with a state whose ID was in the accelerator
+// range, but who didn't have any accelerators. This violated an invariant that
+// assumes that if 'dfa.is_accel_state(sid)' returns true, then the state must
+// have some accelerators.
+#[test]
+fn invalid_accelerators() {
+ let data = include_bytes!(
+ "testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420449e2b",
+ );
+ let _ = fuzz_run(data);
+}
+
+// This fuzz input generated a DFA with a state whose EOI transition led to
+// a quit state, which is generally considered illegal. Why? Because the EOI
+// transition is defined over a special sentinel alphabet element and one
+// cannot configure a DFA to "quit" on that sentinel.
+#[test]
+fn eoi_transition_to_quit_state() {
+ let data = include_bytes!(
+ "testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf65efa9",
+ );
+ let _ = fuzz_run(data);
+}
+
+// This is the code from the fuzz target. Kind of sucks to duplicate it here,
+// but this is fundamentally how we interpret the data.
+fn fuzz_run(given_data: &[u8]) -> Option<()> {
+ use regex_automata::dfa::Automaton;
+
+ if given_data.len() < 2 {
+ return None;
+ }
+ let haystack_len = usize::from(given_data[0]);
+ let haystack = given_data.get(1..1 + haystack_len)?;
+ let given_dfa_bytes = given_data.get(1 + haystack_len..)?;
+
+ // We help the fuzzer along by adding a preamble to the bytes that should
+ // at least make these first parts valid. The preamble expects a very
+ // specific sequence of bytes, so it makes sense to just force this.
+ let label = "rust-regex-automata-dfa-sparse\x00\x00";
+ assert_eq!(0, label.len() % 4);
+ let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec();
+ let version_check = 2u32.to_ne_bytes().to_vec();
+ let mut dfa_bytes: Vec<u8> = vec![];
+ dfa_bytes.extend(label.as_bytes());
+ dfa_bytes.extend(&endianness_check);
+ dfa_bytes.extend(&version_check);
+ dfa_bytes.extend(given_dfa_bytes);
+ // This is the real test: checking that any input we give to
+ // DFA::from_bytes will never result in a panic.
+ let (dfa, _) =
+ regex_automata::dfa::sparse::DFA::from_bytes(&dfa_bytes).ok()?;
+ let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack));
+ Some(())
+}
diff --git a/vendor/regex-automata/tests/fuzz/testdata/deserialize_dense_crash-9486fb7c8a93b12c12a62166b43d31640c0208a9 b/vendor/regex-automata/tests/fuzz/testdata/deserialize_dense_crash-9486fb7c8a93b12c12a62166b43d31640c0208a9
new file mode 100644
index 000000000..972bfb2cd
--- /dev/null
+++ b/vendor/regex-automata/tests/fuzz/testdata/deserialize_dense_crash-9486fb7c8a93b12c12a62166b43d31640c0208a9
Binary files differ
diff --git a/vendor/regex-automata/tests/fuzz/testdata/deserialize_dense_minimized-from-9486fb7c8a93b12c12a62166b43d31640c0208a9 b/vendor/regex-automata/tests/fuzz/testdata/deserialize_dense_minimized-from-9486fb7c8a93b12c12a62166b43d31640c0208a9
new file mode 100644
index 000000000..72dbdad82
--- /dev/null
+++ b/vendor/regex-automata/tests/fuzz/testdata/deserialize_dense_minimized-from-9486fb7c8a93b12c12a62166b43d31640c0208a9
Binary files differ
diff --git a/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c72b000 b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c72b000
new file mode 100644
index 000000000..5ce508803
--- /dev/null
+++ b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c72b000
Binary files differ
diff --git a/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf65efa9 b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf65efa9
new file mode 100644
index 000000000..4fa13fbed
--- /dev/null
+++ b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf65efa9
Binary files differ
diff --git a/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd234b98 b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd234b98
new file mode 100644
index 000000000..0f809f33f
--- /dev/null
+++ b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd234b98
Binary files differ
diff --git a/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421505838 b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421505838
new file mode 100644
index 000000000..8b435fd26
--- /dev/null
+++ b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421505838
Binary files differ
diff --git a/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011816570 b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011816570
new file mode 100644
index 000000000..69b65160c
--- /dev/null
+++ b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011816570
Binary files differ
diff --git a/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420449e2b b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420449e2b
new file mode 100644
index 000000000..15b43e47f
--- /dev/null
+++ b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420449e2b
Binary files differ
diff --git a/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd1460eff9 b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd1460eff9
new file mode 100644
index 000000000..aa72eb1dd
--- /dev/null
+++ b/vendor/regex-automata/tests/fuzz/testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd1460eff9
Binary files differ
diff --git a/vendor/regex-automata/tests/gen/README.md b/vendor/regex-automata/tests/gen/README.md
new file mode 100644
index 000000000..59439a11f
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/README.md
@@ -0,0 +1,65 @@
+This directory contains tests for serialized objects from the regex-automata
+crate. Currently, there are only two supported such objects: dense and sparse
+DFAs.
+
+The idea behind these tests is to commit some serialized objects and run some
+basic tests by deserializing them and running searches and ensuring they are
+correct. We also make sure these are run under Miri, since deserialization is
+one of the biggest places where undefined behavior might occur in this crate
+(at the time of writing).
+
+The main thing we're testing is that the *current* code can still deserialize
+*old* objects correctly. Generally speaking, compatibility extends to semver
+compatible releases of this crate. Beyond that, no promises are made, although
+in practice callers can at least depend on errors occurring. (The serialized
+format always includes a version number, and incompatible changes increment
+that version number such that an error will occur if an unsupported version is
+detected.)
+
+To generate the dense DFAs, I used this command:
+
+```
+$ regex-cli generate serialize dense regex \
+ MULTI_PATTERN_V2 \
+ tests/gen/dense/ \
+ --rustfmt \
+ --safe \
+ --starts-for-each-pattern \
+ --specialize-start-states \
+ --start-kind both \
+ --unicode-word-boundary \
+ --minimize \
+ '\b[a-zA-Z]+\b' \
+ '(?m)^\S+$' \
+ '(?Rm)^\S+$'
+```
+
+And to generate the sparse DFAs, I used this command, which is the same as
+above, but with `s/dense/sparse/g`.
+
+```
+$ regex-cli generate serialize sparse regex \
+ MULTI_PATTERN_V2 \
+ tests/gen/sparse/ \
+ --rustfmt \
+ --safe \
+ --starts-for-each-pattern \
+ --specialize-start-states \
+ --start-kind both \
+ --unicode-word-boundary \
+ --minimize \
+ '\b[a-zA-Z]+\b' \
+ '(?m)^\S+$' \
+ '(?Rm)^\S+$'
+```
+
+The idea is to try to enable as many of the DFA's options as possible in order
+to test that serialization works for all of them.
+
+Arguably we should increase test coverage here, but this is a start. Note
+that in particular, this does not need to test that serialization and
+deserialization correctly roundtrips on its own. Indeed, the normal regex test
+suite has a test that does a serialization round trip for every test supported
+by DFAs. So that has very good coverage. What we're interested in testing here
+is our compatibility promise: do DFAs generated with an older revision of the
+code still deserialize correctly?
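For contrast with the compatibility tests described here, the serialization round trip mentioned in the last paragraph looks roughly like this (a sketch assuming the documented `to_bytes_native_endian`/`from_bytes` API; the pattern and haystack are illustrative):

```
use regex_automata::{
    dfa::{dense, Automaton},
    Input,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let original = dense::DFA::new(r"[a-z]+")?;
    // Serialize, then deserialize a zero-copy DFA borrowing the buffer.
    // `pad` is the offset of the first properly aligned byte.
    let (buf, pad) = original.to_bytes_native_endian();
    let (copy, _) = dense::DFA::from_bytes(&buf[pad..])?;
    // The deserialized DFA reports the same results as the original.
    let input = Input::new("xyz 123");
    assert_eq!(
        original.try_search_fwd(&input)?,
        copy.try_search_fwd(&input)?,
    );
    Ok(())
}
```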
diff --git a/vendor/regex-automata/tests/gen/dense/mod.rs b/vendor/regex-automata/tests/gen/dense/mod.rs
new file mode 100644
index 000000000..b4365d4e1
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/dense/mod.rs
@@ -0,0 +1,22 @@
+use regex_automata::{Input, Match};
+
+mod multi_pattern_v2;
+
+#[test]
+fn multi_pattern_v2() {
+ use multi_pattern_v2::MULTI_PATTERN_V2 as RE;
+
+ assert_eq!(Some(Match::must(0, 0..4)), RE.find("abcd"));
+ assert_eq!(Some(Match::must(0, 2..6)), RE.find("@ abcd @"));
+ assert_eq!(Some(Match::must(1, 0..6)), RE.find("@abcd@"));
+ assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd\n"));
+ assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd wxyz\n"));
+ assert_eq!(Some(Match::must(1, 1..7)), RE.find("\n@abcd@\n"));
+ assert_eq!(Some(Match::must(2, 0..6)), RE.find("@abcd@\r\n"));
+ assert_eq!(Some(Match::must(1, 2..8)), RE.find("\r\n@abcd@"));
+ assert_eq!(Some(Match::must(2, 2..8)), RE.find("\r\n@abcd@\r\n"));
+
+ // Fails because we have heuristic support for Unicode word boundaries
+ // enabled.
+ assert!(RE.try_search(&Input::new(b"\xFF@abcd@\xFF")).is_err());
+}
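The final assertion above exercises heuristic Unicode word boundary support: every non-ASCII byte becomes a quit byte, so a search over non-ASCII input reports an error instead of a possibly wrong answer. A sketch of that behavior on a freshly built dense DFA, adapted from the crate documentation:

```
use regex_automata::{
    dfa::{dense, Automaton},
    Input, MatchError,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = dense::Builder::new()
        .configure(dense::DFA::config().unicode_word_boundary(true))
        .build(r"\b[0-9]+\b")?;
    // The snowman is non-ASCII: its first byte (0xE2, at offset 8) is a
    // quit byte, so the search gives up with a quit error even though a
    // match was seen earlier in the haystack.
    let haystack = "foo 123 ☃";
    assert_eq!(
        Err(MatchError::quit(0xE2, 8)),
        dfa.try_search_fwd(&Input::new(haystack)),
    );
    Ok(())
}
```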
diff --git a/vendor/regex-automata/tests/gen/dense/multi_pattern_v2.rs b/vendor/regex-automata/tests/gen/dense/multi_pattern_v2.rs
new file mode 100644
index 000000000..a95fd204b
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/dense/multi_pattern_v2.rs
@@ -0,0 +1,43 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// regex-cli generate serialize dense regex MULTI_PATTERN_V2 tests/gen/dense/ --rustfmt --safe --starts-for-each-pattern --specialize-start-states --start-kind both --unicode-word-boundary --minimize \b[a-zA-Z]+\b (?m)^\S+$ (?Rm)^\S+$
+//
+// regex-cli 0.0.1 is available on crates.io.
+
+use regex_automata::{
+ dfa::{dense::DFA, regex::Regex},
+ util::{lazy::Lazy, wire::AlignAs},
+};
+
+pub static MULTI_PATTERN_V2: Lazy<Regex<DFA<&'static [u32]>>> =
+ Lazy::new(|| {
+ let dfafwd = {
+ static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
+ _align: [],
+ #[cfg(target_endian = "big")]
+ bytes: *include_bytes!("multi_pattern_v2_fwd.bigendian.dfa"),
+ #[cfg(target_endian = "little")]
+ bytes: *include_bytes!(
+ "multi_pattern_v2_fwd.littleendian.dfa"
+ ),
+ };
+ DFA::from_bytes(&ALIGNED.bytes)
+ .expect("serialized forward DFA should be valid")
+ .0
+ };
+ let dfarev = {
+ static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
+ _align: [],
+ #[cfg(target_endian = "big")]
+ bytes: *include_bytes!("multi_pattern_v2_rev.bigendian.dfa"),
+ #[cfg(target_endian = "little")]
+ bytes: *include_bytes!(
+ "multi_pattern_v2_rev.littleendian.dfa"
+ ),
+ };
+ DFA::from_bytes(&ALIGNED.bytes)
+ .expect("serialized reverse DFA should be valid")
+ .0
+ };
+ Regex::builder().build_from_dfas(dfafwd, dfarev)
+ });
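The `AlignAs` wrapper above exists because `dense::DFA::from_bytes` requires its input to be 4-byte aligned (the transition table is reinterpreted as `&[u32]`), and `include_bytes!` alone guarantees no alignment; the sparse variant further down borrows a plain `&[u8]` with no such constraint. A sketch of the requirement, assuming the documented behavior that misalignment yields an error:

```
use regex_automata::dfa::dense;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = dense::DFA::new(r"abc")?;
    let (buf, pad) = dfa.to_bytes_native_endian();
    // `&buf[pad..]` is the first 4-byte-aligned view into the buffer:
    // deserializing from it succeeds, while a one-byte shift is rejected.
    assert!(dense::DFA::from_bytes(&buf[pad..]).is_ok());
    assert!(dense::DFA::from_bytes(&buf[pad + 1..]).is_err());
    Ok(())
}
```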
diff --git a/vendor/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.bigendian.dfa b/vendor/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.bigendian.dfa
new file mode 100644
index 000000000..6d6e040c3
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.bigendian.dfa
Binary files differ
diff --git a/vendor/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.littleendian.dfa b/vendor/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.littleendian.dfa
new file mode 100644
index 000000000..a1f4b3da1
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.littleendian.dfa
Binary files differ
diff --git a/vendor/regex-automata/tests/gen/dense/multi_pattern_v2_rev.bigendian.dfa b/vendor/regex-automata/tests/gen/dense/multi_pattern_v2_rev.bigendian.dfa
new file mode 100644
index 000000000..74f74ec2a
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/dense/multi_pattern_v2_rev.bigendian.dfa
Binary files differ
diff --git a/vendor/regex-automata/tests/gen/dense/multi_pattern_v2_rev.littleendian.dfa b/vendor/regex-automata/tests/gen/dense/multi_pattern_v2_rev.littleendian.dfa
new file mode 100644
index 000000000..663bdb9ea
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/dense/multi_pattern_v2_rev.littleendian.dfa
Binary files differ
diff --git a/vendor/regex-automata/tests/gen/mod.rs b/vendor/regex-automata/tests/gen/mod.rs
new file mode 100644
index 000000000..960cb4251
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/mod.rs
@@ -0,0 +1,2 @@
+mod dense;
+mod sparse;
diff --git a/vendor/regex-automata/tests/gen/sparse/mod.rs b/vendor/regex-automata/tests/gen/sparse/mod.rs
new file mode 100644
index 000000000..b4365d4e1
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/sparse/mod.rs
@@ -0,0 +1,22 @@
+use regex_automata::{Input, Match};
+
+mod multi_pattern_v2;
+
+#[test]
+fn multi_pattern_v2() {
+ use multi_pattern_v2::MULTI_PATTERN_V2 as RE;
+
+ assert_eq!(Some(Match::must(0, 0..4)), RE.find("abcd"));
+ assert_eq!(Some(Match::must(0, 2..6)), RE.find("@ abcd @"));
+ assert_eq!(Some(Match::must(1, 0..6)), RE.find("@abcd@"));
+ assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd\n"));
+ assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd wxyz\n"));
+ assert_eq!(Some(Match::must(1, 1..7)), RE.find("\n@abcd@\n"));
+ assert_eq!(Some(Match::must(2, 0..6)), RE.find("@abcd@\r\n"));
+ assert_eq!(Some(Match::must(1, 2..8)), RE.find("\r\n@abcd@"));
+ assert_eq!(Some(Match::must(2, 2..8)), RE.find("\r\n@abcd@\r\n"));
+
+ // Fails because we have heuristic support for Unicode word boundaries
+ // enabled.
+ assert!(RE.try_search(&Input::new(b"\xFF@abcd@\xFF")).is_err());
+}
diff --git a/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2.rs b/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2.rs
new file mode 100644
index 000000000..911e3f5dd
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2.rs
@@ -0,0 +1,37 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+// regex-cli generate serialize sparse regex MULTI_PATTERN_V2 regex-automata/tests/gen/sparse/ --rustfmt --safe --starts-for-each-pattern --specialize-start-states --start-kind both --unicode-word-boundary --minimize \b[a-zA-Z]+\b (?m)^\S+$ (?Rm)^\S+$
+//
+// regex-cli 0.0.1 is available on crates.io.
+
+use regex_automata::{
+ dfa::{regex::Regex, sparse::DFA},
+ util::lazy::Lazy,
+};
+
+pub static MULTI_PATTERN_V2: Lazy<Regex<DFA<&'static [u8]>>> =
+ Lazy::new(|| {
+ let dfafwd = {
+ #[cfg(target_endian = "big")]
+ static BYTES: &'static [u8] =
+ include_bytes!("multi_pattern_v2_fwd.bigendian.dfa");
+ #[cfg(target_endian = "little")]
+ static BYTES: &'static [u8] =
+ include_bytes!("multi_pattern_v2_fwd.littleendian.dfa");
+ DFA::from_bytes(BYTES)
+ .expect("serialized forward DFA should be valid")
+ .0
+ };
+ let dfarev = {
+ #[cfg(target_endian = "big")]
+ static BYTES: &'static [u8] =
+ include_bytes!("multi_pattern_v2_rev.bigendian.dfa");
+ #[cfg(target_endian = "little")]
+ static BYTES: &'static [u8] =
+ include_bytes!("multi_pattern_v2_rev.littleendian.dfa");
+ DFA::from_bytes(BYTES)
+ .expect("serialized reverse DFA should be valid")
+ .0
+ };
+ Regex::builder().build_from_dfas(dfafwd, dfarev)
+ });
diff --git a/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.bigendian.dfa b/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.bigendian.dfa
new file mode 100644
index 000000000..aa04f6316
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.bigendian.dfa
Binary files differ
diff --git a/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.littleendian.dfa b/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.littleendian.dfa
new file mode 100644
index 000000000..c27d92abe
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.littleendian.dfa
Binary files differ
diff --git a/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.bigendian.dfa b/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.bigendian.dfa
new file mode 100644
index 000000000..89867d30f
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.bigendian.dfa
Binary files differ
diff --git a/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.littleendian.dfa b/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.littleendian.dfa
new file mode 100644
index 000000000..c0ca807f8
--- /dev/null
+++ b/vendor/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.littleendian.dfa
Binary files differ
diff --git a/vendor/regex-automata/tests/hybrid/api.rs b/vendor/regex-automata/tests/hybrid/api.rs
index 9a834dbb8..e82d808e3 100644
--- a/vendor/regex-automata/tests/hybrid/api.rs
+++ b/vendor/regex-automata/tests/hybrid/api.rs
@@ -1,25 +1,29 @@
use std::error::Error;
use regex_automata::{
- hybrid::{
- dfa::{self, DFA},
- regex::Regex,
- OverlappingState,
- },
+ hybrid::dfa::{OverlappingState, DFA},
nfa::thompson,
- HalfMatch, MatchError, MatchKind, MultiMatch,
+ HalfMatch, Input, MatchError,
};
-use crate::util::{BunkPrefilter, SubstringPrefilter};
-
// Tests that too many cache resets cause the lazy DFA to quit.
//
// We only test this on 64-bit because the test is gingerly crafted based on
// implementation details of cache sizes. It's not a great test because of
// that, but it does check some interesting properties around how positions are
// reported when a search "gives up."
+//
+// NOTE: If you change something in lazy DFA implementation that causes this
+// test to fail by reporting different "gave up" positions, then it's generally
+// okay to update the positions in the test below as long as you're sure your
+// changes are correct. Namely, it is expected that if there are changes in the
+// cache size (or changes in how big things are inside the cache), then its
+// utilization may change slightly and thus impact where a search gives up.
+// Precisely where a search gives up is not an API guarantee, so changing the
+// offsets here is OK.
#[test]
#[cfg(target_pointer_width = "64")]
+#[cfg(not(miri))]
fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
// This is a carefully chosen regex. The idea is to pick one that requires
// some decent number of states (hence the bounded repetition). But we
@@ -27,9 +31,16 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
// non-ASCII letter so that we can check that no new states are created
// once the cache is full. Namely, if we fill up the cache on a haystack
// of 'a's, then in order to match one 'β', a new state will need to be
- // created since a 'β' is encoded with multiple bytes. Since there's no
- // room for this state, the search should quit at the very first position.
- let pattern = r"[aβ]{100}";
+ // created since a 'β' is encoded with multiple bytes.
+ //
+ // So we proceed by "filling" up the cache by searching a haystack of just
+ // 'a's. The cache won't have enough room to add enough states to find the
+ // match (because of the bounded repetition), which should result in it
+ // giving up before it finds a match.
+ //
+ // Since there's now no more room to create states, we search a haystack
+ // of 'β' and confirm that it gives up immediately.
+ let pattern = r"[aβ]{99}";
let dfa = DFA::builder()
.configure(
// Configure it so that we have the minimum cache capacity
@@ -39,38 +50,53 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
.cache_capacity(0)
.minimum_cache_clear_count(Some(0)),
)
+ .thompson(thompson::NFA::config())
.build(pattern)?;
let mut cache = dfa.create_cache();
let haystack = "a".repeat(101).into_bytes();
- let err = MatchError::GaveUp { offset: 25 };
- assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone()));
- assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone()));
+ let err = MatchError::gave_up(25);
+ // Notice that we make the same amount of progress in each search! That's
+ // because the cache is reused and already has states to handle the first
+ // N bytes.
+ assert_eq!(
+ Err(err.clone()),
+ dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
+ );
assert_eq!(
- dfa.find_overlapping_fwd(
+ Err(err.clone()),
+ dfa.try_search_overlapping_fwd(
&mut cache,
- &haystack,
+ &Input::new(&haystack),
&mut OverlappingState::start()
),
- Err(err.clone())
);
let haystack = "β".repeat(101).into_bytes();
- let err = MatchError::GaveUp { offset: 0 };
- assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+ let err = MatchError::gave_up(2);
+ assert_eq!(
+ Err(err),
+ dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
+ );
// no need to test that other find routines quit, since we did that above
// OK, if we reset the cache, then we should be able to create more states
// and make more progress with searching for betas.
cache.reset(&dfa);
- let err = MatchError::GaveUp { offset: 26 };
- assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+ let err = MatchError::gave_up(27);
+ assert_eq!(
+ Err(err),
+ dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
+ );
// ... switching back to ASCII still makes progress since it just needs to
// set transitions on existing states!
let haystack = "a".repeat(101).into_bytes();
- let err = MatchError::GaveUp { offset: 13 };
- assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+ let err = MatchError::gave_up(13);
+ assert_eq!(
+ Err(err),
+ dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
+ );
Ok(())
}
@@ -84,20 +110,16 @@ fn quit_fwd() -> Result<(), Box<dyn Error>> {
let mut cache = dfa.create_cache();
assert_eq!(
- dfa.find_earliest_fwd(&mut cache, b"abcxyz"),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
+ dfa.try_search_fwd(&mut cache, &Input::new("abcxyz")),
+ Err(MatchError::quit(b'x', 3)),
);
assert_eq!(
- dfa.find_leftmost_fwd(&mut cache, b"abcxyz"),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
- );
- assert_eq!(
- dfa.find_overlapping_fwd(
+ dfa.try_search_overlapping_fwd(
&mut cache,
- b"abcxyz",
+ &Input::new(b"abcxyz"),
&mut OverlappingState::start()
),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
+ Err(MatchError::quit(b'x', 3)),
);
Ok(())
@@ -113,12 +135,8 @@ fn quit_rev() -> Result<(), Box<dyn Error>> {
let mut cache = dfa.create_cache();
assert_eq!(
- dfa.find_earliest_rev(&mut cache, b"abcxyz"),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
- );
- assert_eq!(
- dfa.find_leftmost_rev(&mut cache, b"abcxyz"),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
+ dfa.try_search_rev(&mut cache, &Input::new("abcxyz")),
+ Err(MatchError::quit(b'x', 3)),
);
Ok(())
@@ -145,51 +163,9 @@ fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
let dfa = DFA::builder().configure(config).build(r"\b")?;
let mut cache = dfa.create_cache();
let expected = HalfMatch::must(0, 1);
- assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected)));
- Ok(())
-}
-
-// Tests that we can provide a prefilter to a Regex, and the search reports
-// correct results.
-#[test]
-fn prefilter_works() -> Result<(), Box<dyn Error>> {
- let mut re = Regex::new(r"a[0-9]+").unwrap();
- re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
- let mut cache = re.create_cache();
-
- let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
- let matches: Vec<(usize, usize)> = re
- .find_leftmost_iter(&mut cache, text)
- .map(|m| (m.start(), m.end()))
- .collect();
- assert_eq!(
- matches,
- vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
- );
- Ok(())
-}
-
-// This test confirms that a prefilter is active by using a prefilter that
-// reports false negatives.
-#[test]
-fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
- let text = b"za123";
- let mut re = Regex::new(r"a[0-9]+").unwrap();
- let mut cache = re.create_cache();
-
- re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
- assert_eq!(
- re.find_leftmost(&mut cache, b"za123"),
- Some(MultiMatch::must(0, 1, 5))
- );
assert_eq!(
- re.find_leftmost(&mut cache, b"a123"),
- Some(MultiMatch::must(0, 0, 4))
+ Ok(Some(expected)),
+ dfa.try_search_fwd(&mut cache, &Input::new(" a")),
);
- re.set_prefilter(Some(Box::new(BunkPrefilter::new())));
- assert_eq!(re.find_leftmost(&mut cache, b"za123"), None);
- // This checks that the prefilter is used when first starting the search,
- // instead of waiting until at least one transition has occurred.
- assert_eq!(re.find_leftmost(&mut cache, b"a123"), None);
Ok(())
}
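The test above deliberately starves the lazy DFA's cache; for contrast, a sketch of the ordinary cache lifecycle (create once, reuse across searches, reset to discard lazily built states), with an illustrative pattern and haystack:

```
use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = DFA::new(r"[0-9]{4}")?;
    // All transient search state lives in the cache, so the DFA itself can
    // be shared freely; each thread just needs its own cache.
    let mut cache = dfa.create_cache();
    let expected = HalfMatch::must(0, 10);
    assert_eq!(
        Some(expected),
        dfa.try_search_fwd(&mut cache, &Input::new("date: 2024"))?,
    );
    // Reusing the cache lets later searches hit already-computed states;
    // resetting it throws those states away, as the test above does.
    cache.reset(&dfa);
    assert_eq!(
        Some(expected),
        dfa.try_search_fwd(&mut cache, &Input::new("date: 2024"))?,
    );
    Ok(())
}
```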
diff --git a/vendor/regex-automata/tests/hybrid/mod.rs b/vendor/regex-automata/tests/hybrid/mod.rs
index f4299510c..36667d09c 100644
--- a/vendor/regex-automata/tests/hybrid/mod.rs
+++ b/vendor/regex-automata/tests/hybrid/mod.rs
@@ -1,2 +1,3 @@
mod api;
+#[cfg(not(miri))]
mod suite;
diff --git a/vendor/regex-automata/tests/hybrid/suite.rs b/vendor/regex-automata/tests/hybrid/suite.rs
index d60570d84..4aaca6698 100644
--- a/vendor/regex-automata/tests/hybrid/suite.rs
+++ b/vendor/regex-automata/tests/hybrid/suite.rs
@@ -1,55 +1,113 @@
-use regex_automata::{
- hybrid::{
- dfa::DFA,
- regex::{self, Regex},
+use {
+ anyhow::Result,
+ regex_automata::{
+ hybrid::{
+ dfa::{OverlappingState, DFA},
+ regex::{self, Regex},
+ },
+ nfa::thompson,
+ util::{prefilter::Prefilter, syntax},
+ Anchored, Input, PatternSet,
+ },
+ regex_test::{
+ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
+ TestRunner,
},
- nfa::thompson,
- MatchKind, SyntaxConfig,
};
-use regex_syntax as syntax;
-use regex_test::{
- bstr::{BString, ByteSlice},
- CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests,
- SearchKind as TestSearchKind, TestResult, TestRunner,
-};
+use crate::{create_input, suite, untestify_kind};
-use crate::{suite, Result};
+const EXPANSIONS: &[&str] = &["is_match", "find", "which"];
/// Tests the default configuration of the hybrid NFA/DFA.
#[test]
fn default() -> Result<()> {
let builder = Regex::builder();
- TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ // Without NFA shrinking, this test blows the default cache capacity.
+ .blacklist("expensive/regression-many-repeat-no-stack-overflow")
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Tests the hybrid NFA/DFA with prefilters enabled.
+#[test]
+fn prefilter() -> Result<()> {
+ let my_compiler = |test: &RegexTest, regexes: &[String]| {
+ // Parse regexes as HIRs so we can get literals to build a prefilter.
+ let mut hirs = vec![];
+ for pattern in regexes.iter() {
+ hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
+ }
+ let kind = match untestify_kind(test.match_kind()) {
+ None => return Ok(CompiledRegex::skip()),
+ Some(kind) => kind,
+ };
+ let pre = Prefilter::from_hirs_prefix(kind, &hirs);
+ let mut builder = Regex::builder();
+ builder.dfa(DFA::config().prefilter(pre));
+ compiler(builder)(test, regexes)
+ };
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ // Without NFA shrinking, this test blows the default cache capacity.
+ .blacklist("expensive/regression-many-repeat-no-stack-overflow")
+ .test_iter(suite()?.iter(), my_compiler)
+ .assert();
Ok(())
}
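
// A compact sketch of the prefilter wiring exercised above; the pattern is
// an illustrative assumption. Note that `from_hirs_prefix` returns an
// Option (None when no useful literal prefix exists), which
// `Config::prefilter` accepts directly.

fn prefilter_sketch() -> Result<()> {
    let hir = syntax::parse_with(r"a[0-9]+", &syntax::Config::new())?;
    let pre = Prefilter::from_hirs_prefix(
        regex_automata::MatchKind::LeftmostFirst,
        &[hir],
    );
    let mut builder = Regex::builder();
    builder.dfa(DFA::config().prefilter(pre));
    let re = builder.build(r"a[0-9]+")?;
    let mut cache = re.create_cache();
    assert!(re.is_match(&mut cache, "a123"));
    Ok(())
}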
-/// Tests the hybrid NFA/DFA with NFA shrinking disabled.
+/// Tests the hybrid NFA/DFA with NFA shrinking enabled.
///
-/// This is actually the typical configuration one wants for a lazy DFA. NFA
+/// This is *usually* not the configuration one wants for a lazy DFA. NFA
/// shrinking is mostly only advantageous when building a full DFA since it
/// can sharply decrease the amount of time determinization takes. But NFA
-/// shrinking is itself otherwise fairly expensive. Since a lazy DFA has
-/// no compilation time (other than for building the NFA of course) before
+/// shrinking is itself otherwise fairly expensive currently. Since a lazy DFA
+/// has no compilation time (other than for building the NFA of course) before
/// executing a search, it's usually worth it to forgo NFA shrinking.
+///
+/// Nevertheless, we test to make sure everything is OK with NFA shrinking. As
+/// a bonus, there are some tests we don't need to skip because they now fit in
+/// the default cache capacity.
#[test]
-fn no_nfa_shrink() -> Result<()> {
+fn nfa_shrink() -> Result<()> {
let mut builder = Regex::builder();
- builder.thompson(thompson::Config::new().shrink(false));
+ builder.thompson(thompson::Config::new().shrink(true));
TestRunner::new()?
- // Without NFA shrinking, this test blows the default cache capacity.
- .blacklist("expensive/regression-many-repeat-no-stack-overflow")
+ .expand(EXPANSIONS, |t| t.compiles())
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
-/// Tests the hybrid NFA/DFA when 'starts_for_each_pattern' is enabled.
+/// Tests the hybrid NFA/DFA when 'starts_for_each_pattern' is enabled for all
+/// tests.
#[test]
fn starts_for_each_pattern() -> Result<()> {
let mut builder = Regex::builder();
builder.dfa(DFA::config().starts_for_each_pattern(true));
- TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ // Without NFA shrinking, this test blows the default cache capacity.
+ .blacklist("expensive/regression-many-repeat-no-stack-overflow")
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Tests the hybrid NFA/DFA when 'specialize_start_states' is enabled.
+#[test]
+fn specialize_start_states() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.dfa(DFA::config().specialize_start_states(true));
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ // Without NFA shrinking, this test blows the default cache capacity.
+ .blacklist("expensive/regression-many-repeat-no-stack-overflow")
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
Ok(())
}
@@ -62,7 +120,12 @@ fn starts_for_each_pattern() -> Result<()> {
fn no_byte_classes() -> Result<()> {
let mut builder = Regex::builder();
builder.dfa(DFA::config().byte_classes(false));
- TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ // Without NFA shrinking, this test blows the default cache capacity.
+ .blacklist("expensive/regression-many-repeat-no-stack-overflow")
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
Ok(())
}
@@ -76,7 +139,12 @@ fn no_byte_classes() -> Result<()> {
fn no_cache_clearing() -> Result<()> {
let mut builder = Regex::builder();
builder.dfa(DFA::config().minimum_cache_clear_count(Some(0)));
- TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ // Without NFA shrinking, this test blows the default cache capacity.
+ .blacklist("expensive/regression-many-repeat-no-stack-overflow")
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
Ok(())
}
@@ -86,27 +154,30 @@ fn min_cache_capacity() -> Result<()> {
let mut builder = Regex::builder();
builder
.dfa(DFA::config().cache_capacity(0).skip_cache_capacity_check(true));
- TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ TestRunner::new()?
+ .expand(EXPANSIONS, |t| t.compiles())
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
Ok(())
}
fn compiler(
mut builder: regex::Builder,
-) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
move |test, regexes| {
- let regexes = regexes
- .iter()
- .map(|r| r.to_str().map(|s| s.to_string()))
- .collect::<std::result::Result<Vec<String>, _>>()?;
+ // Parse regexes as HIRs for some analysis below.
+ let mut hirs = vec![];
+ for pattern in regexes.iter() {
+ hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
+ }
// Check if our regex contains things that aren't supported by DFAs.
// That is, Unicode word boundaries when searching non-ASCII text.
- let mut thompson = thompson::Builder::new();
- thompson.syntax(config_syntax(test)).configure(config_thompson(test));
- if let Ok(nfa) = thompson.build_many(&regexes) {
- let non_ascii = test.input().iter().any(|&b| !b.is_ascii());
- if nfa.has_word_boundary_unicode() && non_ascii {
- return Ok(CompiledRegex::skip());
+ if !test.haystack().is_ascii() {
+ for hir in hirs.iter() {
+ if hir.properties().look_set().contains_word_unicode() {
+ return Ok(CompiledRegex::skip());
+ }
}
}
if !configure_regex_builder(test, &mut builder) {
@@ -114,7 +185,7 @@ fn compiler(
}
let re = builder.build_many(&regexes)?;
let mut cache = re.create_cache();
- Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+ Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, &mut cache, test)
}))
}
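
// A small sketch of the capability probe above, assuming the default
// syntax configuration (Unicode enabled):

fn has_unicode_word_boundary() -> Result<bool> {
    let hir = syntax::parse_with(r"\b", &syntax::Config::new())?;
    // A lazy DFA cannot handle Unicode word boundaries on non-ASCII
    // haystacks, so such tests are skipped rather than failed.
    Ok(hir.properties().look_set().contains_word_unicode())
}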
@@ -124,50 +195,45 @@ fn run_test(
re: &Regex,
cache: &mut regex::Cache,
test: &RegexTest,
-) -> Vec<TestResult> {
- let is_match = if re.is_match(cache, test.input()) {
- TestResult::matched()
- } else {
- TestResult::no_match()
- };
- let is_match = is_match.name("is_match");
-
- let find_matches = match test.search_kind() {
- TestSearchKind::Earliest => {
- let it = re
- .find_earliest_iter(cache, test.input())
- .take(test.match_limit().unwrap_or(std::usize::MAX))
- .map(|m| Match {
- id: m.pattern().as_usize(),
- start: m.start(),
- end: m.end(),
- });
- TestResult::matches(it).name("find_earliest_iter")
- }
- TestSearchKind::Leftmost => {
- let it = re
- .find_leftmost_iter(cache, test.input())
- .take(test.match_limit().unwrap_or(std::usize::MAX))
- .map(|m| Match {
- id: m.pattern().as_usize(),
- start: m.start(),
- end: m.end(),
- });
- TestResult::matches(it).name("find_leftmost_iter")
+) -> TestResult {
+ let input = create_input(test);
+ match test.additional_name() {
+ "is_match" => {
+ TestResult::matched(re.is_match(cache, input.earliest(true)))
}
- TestSearchKind::Overlapping => {
- let it = re
- .find_overlapping_iter(cache, test.input())
- .take(test.match_limit().unwrap_or(std::usize::MAX))
- .map(|m| Match {
- id: m.pattern().as_usize(),
- start: m.start(),
- end: m.end(),
- });
- TestResult::matches(it).name("find_overlapping_iter")
- }
- };
- vec![is_match, find_matches]
+ "find" => match test.search_kind() {
+ SearchKind::Earliest | SearchKind::Leftmost => {
+ let input =
+ input.earliest(test.search_kind() == SearchKind::Earliest);
+ TestResult::matches(
+ re.find_iter(cache, input)
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ span: Span { start: m.start(), end: m.end() },
+ }),
+ )
+ }
+ SearchKind::Overlapping => {
+ try_search_overlapping(re, cache, &input).unwrap()
+ }
+ },
+ "which" => match test.search_kind() {
+ SearchKind::Earliest | SearchKind::Leftmost => {
+ // There are no "which" APIs for standard searches.
+ TestResult::skip()
+ }
+ SearchKind::Overlapping => {
+ let dfa = re.forward();
+ let cache = cache.as_parts_mut().0;
+ let mut patset = PatternSet::new(dfa.pattern_len());
+ dfa.try_which_overlapping_matches(cache, &input, &mut patset)
+ .unwrap();
+ TestResult::which(patset.iter().map(|p| p.as_usize()))
+ }
+ },
+ name => TestResult::fail(&format!("unrecognized test name: {}", name)),
+ }
}
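
// A minimal sketch of the "which" query used above, assuming a lazy DFA
// built with MatchKind::All (overlapping searches require it); the
// patterns and haystack are illustrative:

fn which_sketch() -> Result<()> {
    let dfa = DFA::builder()
        .configure(DFA::config().match_kind(regex_automata::MatchKind::All))
        .build_many(&[r"[a-z]+", r"[0-9]+"])?;
    let mut cache = dfa.create_cache();
    let mut patset = PatternSet::new(dfa.pattern_len());
    dfa.try_which_overlapping_matches(
        &mut cache,
        &Input::new("abc123"),
        &mut patset,
    )?;
    // Both patterns match somewhere in "abc123".
    assert_eq!(
        vec![0, 1],
        patset.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(),
    );
    Ok(())
}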
/// Configures the given regex builder with all relevant settings on the given
@@ -179,34 +245,103 @@ fn configure_regex_builder(
test: &RegexTest,
builder: &mut regex::Builder,
) -> bool {
- let match_kind = match test.match_kind() {
- TestMatchKind::All => MatchKind::All,
- TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst,
- TestMatchKind::LeftmostLongest => return false,
+ let match_kind = match untestify_kind(test.match_kind()) {
+ None => return false,
+ Some(k) => k,
};
- let dense_config = DFA::config()
- .anchored(test.anchored())
- .match_kind(match_kind)
- .unicode_word_boundary(true);
- let regex_config = Regex::config().utf8(test.utf8());
+ let mut dfa_config =
+ DFA::config().match_kind(match_kind).unicode_word_boundary(true);
+ // When doing an overlapping search, we might try to find the start of each
+ // match with a custom search routine. In that case, we need to tell the
+ // reverse search (for the start offset) which pattern to look for. The
+ // only way that API works is when anchored starting states are compiled
+ // for each pattern. This does technically also enable it for the forward
+ // DFA, but we're okay with that.
+ if test.search_kind() == SearchKind::Overlapping {
+ dfa_config = dfa_config.starts_for_each_pattern(true);
+ }
builder
- .configure(regex_config)
.syntax(config_syntax(test))
.thompson(config_thompson(test))
- .dfa(dense_config);
+ .dfa(dfa_config);
true
}
/// Configuration of a Thompson NFA compiler from a regex test.
fn config_thompson(test: &RegexTest) -> thompson::Config {
- thompson::Config::new().utf8(test.utf8())
+ let mut lookm = regex_automata::util::look::LookMatcher::new();
+ lookm.set_line_terminator(test.line_terminator());
+ thompson::Config::new().utf8(test.utf8()).look_matcher(lookm)
}
/// Configuration of the regex parser from a regex test.
-fn config_syntax(test: &RegexTest) -> SyntaxConfig {
- SyntaxConfig::new()
+fn config_syntax(test: &RegexTest) -> syntax::Config {
+ syntax::Config::new()
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
+ .line_terminator(test.line_terminator())
+}
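
// A quick sketch of the LookMatcher hook configured above: swapping the
// line terminator (to NUL here, an assumed choice) changes what '(?m:^)'
// and '(?m:$)' match on:

fn line_terminator_sketch() -> Result<()> {
    let mut lookm = regex_automata::util::look::LookMatcher::new();
    lookm.set_line_terminator(b'\x00');
    let re = Regex::builder()
        .thompson(thompson::Config::new().look_matcher(lookm))
        .build(r"(?m)^[a-z]+$")?;
    let mut cache = re.create_cache();
    // '$' now matches just before the NUL byte.
    assert!(re.is_match(&mut cache, "abc\x00def"));
    Ok(())
}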
+
+/// Execute an overlapping search, and for each match found, also find its
+/// overlapping starting positions.
+///
+/// N.B. This routine used to be part of the crate API, but 1) it wasn't clear
+/// to me how useful it was and 2) it wasn't clear to me what its semantics
+/// should be. In particular, a potentially surprising footgun of this routine
+/// is that it is worst case *quadratic* in the size of the haystack. Namely,
+/// possible to report a match at every position, and for every such position,
+/// scan all the way to the beginning of the haystack to find the starting
+/// position. Typical leftmost non-overlapping searches don't suffer from this
+/// because, well, matches can't overlap. So subsequent searches after a match
+/// is found don't revisit previously scanned parts of the haystack.
+///
+/// Its semantics can be strange for other reasons too. For example, given
+/// the regex '.*' and the haystack 'zz', the full set of overlapping matches
+/// is: [0, 0], [1, 1], [0, 1], [2, 2], [1, 2], [0, 2]. The ordering of
+/// those matches is quite strange, but makes sense when you think about the
+/// implementation: an end offset is found left-to-right, and then one or more
+/// starting offsets are found right-to-left.
+///
+/// Nevertheless, we provide this routine in our test suite because it's
+/// useful to test the low level DFA overlapping search and our test suite
+/// is written in a way that requires starting offsets.
+fn try_search_overlapping(
+ re: &Regex,
+ cache: &mut regex::Cache,
+ input: &Input<'_>,
+) -> Result<TestResult> {
+ let mut matches = vec![];
+ let mut fwd_state = OverlappingState::start();
+ let (fwd_dfa, rev_dfa) = (re.forward(), re.reverse());
+ let (fwd_cache, rev_cache) = cache.as_parts_mut();
+ while let Some(end) = {
+ fwd_dfa.try_search_overlapping_fwd(
+ fwd_cache,
+ input,
+ &mut fwd_state,
+ )?;
+ fwd_state.get_match()
+ } {
+ let revsearch = input
+ .clone()
+ .range(input.start()..end.offset())
+ .anchored(Anchored::Pattern(end.pattern()))
+ .earliest(false);
+ let mut rev_state = OverlappingState::start();
+ while let Some(start) = {
+ rev_dfa.try_search_overlapping_rev(
+ rev_cache,
+ &revsearch,
+ &mut rev_state,
+ )?;
+ rev_state.get_match()
+ } {
+ let span = Span { start: start.offset(), end: end.offset() };
+ let mat = Match { id: end.pattern().as_usize(), span };
+ matches.push(mat);
+ }
+ }
+ Ok(TestResult::matches(matches))
}
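
// A worked sketch of the ordering described above, assuming a hybrid Regex
// configured for overlapping search (MatchKind::All plus per-pattern start
// states, as configure_regex_builder arranges):

fn overlap_order_sketch() -> Result<()> {
    let mut builder = Regex::builder();
    builder.dfa(
        DFA::config()
            .match_kind(regex_automata::MatchKind::All)
            .starts_for_each_pattern(true),
    );
    let re = builder.build(r".*")?;
    let mut cache = re.create_cache();
    // End offsets are found left-to-right; for each end, start offsets are
    // found right-to-left. For the haystack "zz" that yields, in order:
    // [0, 0], [1, 1], [0, 1], [2, 2], [1, 2], [0, 2]
    let _results = try_search_overlapping(&re, &mut cache, &Input::new("zz"))?;
    Ok(())
}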
diff --git a/vendor/regex-automata/tests/lib.rs b/vendor/regex-automata/tests/lib.rs
new file mode 100644
index 000000000..1465e51eb
--- /dev/null
+++ b/vendor/regex-automata/tests/lib.rs
@@ -0,0 +1,114 @@
+// We have a similar config in the regex-automata crate root. Basically, it is
+// just too annoying to deal with dead code when a subset of features is
+// enabled.
+#![cfg_attr(
+ not(all(
+ feature = "std",
+ feature = "nfa",
+ feature = "dfa",
+ feature = "hybrid",
+ feature = "perf-literal-substring",
+ feature = "perf-literal-multisubstring",
+ )),
+ allow(dead_code, unused_imports, unused_variables)
+)]
+// Similar deal with Miri. Just let dead code warnings be.
+#![cfg_attr(miri, allow(dead_code, unused_imports, unused_variables))]
+
+#[cfg(any(feature = "dfa-search", feature = "dfa-onepass"))]
+mod dfa;
+#[cfg(feature = "dfa-search")]
+mod fuzz;
+#[cfg(feature = "dfa-search")]
+mod gen;
+#[cfg(feature = "hybrid")]
+mod hybrid;
+#[cfg(feature = "meta")]
+mod meta;
+#[cfg(any(feature = "nfa-backtrack", feature = "nfa-pikevm"))]
+mod nfa;
+
+fn suite() -> anyhow::Result<regex_test::RegexTests> {
+ let _ = env_logger::try_init();
+
+ let mut tests = regex_test::RegexTests::new();
+ macro_rules! load {
+ ($name:expr) => {{
+ const DATA: &[u8] =
+ include_bytes!(concat!("../../testdata/", $name, ".toml"));
+ tests.load_slice($name, DATA)?;
+ }};
+ }
+
+ load!("anchored");
+ load!("bytes");
+ load!("crazy");
+ load!("crlf");
+ load!("earliest");
+ load!("empty");
+ load!("expensive");
+ load!("flags");
+ load!("iter");
+ load!("leftmost-all");
+ load!("line-terminator");
+ load!("misc");
+ load!("multiline");
+ load!("no-unicode");
+ load!("overlapping");
+ load!("regression");
+ load!("set");
+ load!("substring");
+ load!("unicode");
+ load!("utf8");
+ load!("word-boundary");
+ load!("fowler/basic");
+ load!("fowler/nullsubexpr");
+ load!("fowler/repetition");
+
+ Ok(tests)
+}
+
+/// Configure a regex_automata::Input with the given test configuration.
+fn create_input<'h>(
+ test: &'h regex_test::RegexTest,
+) -> regex_automata::Input<'h> {
+ use regex_automata::Anchored;
+
+ let bounds = test.bounds();
+ let anchored = if test.anchored() { Anchored::Yes } else { Anchored::No };
+ regex_automata::Input::new(test.haystack())
+ .range(bounds.start..bounds.end)
+ .anchored(anchored)
+}
+
+/// Convert capture matches into the test suite's capture values.
+///
+/// The given captures must represent a valid match, where the first capturing
+/// group has a non-None span. Otherwise this panics.
+fn testify_captures(
+ caps: &regex_automata::util::captures::Captures,
+) -> regex_test::Captures {
+ assert!(caps.is_match(), "expected captures to represent a match");
+ let spans = caps.iter().map(|group| {
+ group.map(|m| regex_test::Span { start: m.start, end: m.end })
+ });
+ // These unwraps are OK because we assume our 'caps' represents a match,
+ // and a match always gives a non-zero number of groups with the first
+ // group being non-None.
+ regex_test::Captures::new(caps.pattern().unwrap().as_usize(), spans)
+ .unwrap()
+}
+
+/// Convert a test harness match kind to a regex-automata match kind. If
+/// regex-automata doesn't support the harness kind, then `None` is returned.
+fn untestify_kind(
+ kind: regex_test::MatchKind,
+) -> Option<regex_automata::MatchKind> {
+ match kind {
+ regex_test::MatchKind::All => Some(regex_automata::MatchKind::All),
+ regex_test::MatchKind::LeftmostFirst => {
+ Some(regex_automata::MatchKind::LeftmostFirst)
+ }
+ regex_test::MatchKind::LeftmostLongest => None,
+ }
+}
diff --git a/vendor/regex-automata/tests/meta/mod.rs b/vendor/regex-automata/tests/meta/mod.rs
new file mode 100644
index 000000000..9d6ab475e
--- /dev/null
+++ b/vendor/regex-automata/tests/meta/mod.rs
@@ -0,0 +1,2 @@
+#[cfg(not(miri))]
+mod suite;
diff --git a/vendor/regex-automata/tests/meta/suite.rs b/vendor/regex-automata/tests/meta/suite.rs
new file mode 100644
index 000000000..20f97b4bb
--- /dev/null
+++ b/vendor/regex-automata/tests/meta/suite.rs
@@ -0,0 +1,200 @@
+use {
+ anyhow::Result,
+ regex_automata::{
+ meta::{self, Regex},
+ util::syntax,
+ MatchKind, PatternSet,
+ },
+ regex_test::{
+ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
+ TestRunner,
+ },
+};
+
+use crate::{create_input, suite, testify_captures};
+
+const BLACKLIST: &[&str] = &[
+ // These 'earliest' tests are blacklisted because the meta searcher doesn't
+ // give the same offsets that the test expects. This is legal because the
+ // 'earliest' routines don't guarantee a particular match offset other
+ // than "the earliest the regex engine can report a match." Some regex
+ // engines will quit earlier than others. The backtracker, for example,
+ // can't really quit before finding the full leftmost-first match. Many of
+ // the literal searchers also don't have the ability to quit fully or it's
+ // otherwise not worth doing. (A literal searcher not quitting as early as
+ // possible usually means looking at a few more bytes. That's no biggie.)
+ "earliest/",
+];
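
// A hedged illustration of the engine-dependent 'earliest' offsets noted
// above; the pattern and haystack are assumptions for the example:

fn earliest_sketch() -> Result<()> {
    let re = Regex::new(r"a+")?;
    let input = regex_automata::Input::new("aaa").earliest(true);
    let m = re.find(input).unwrap();
    // A finite automata engine may stop at end == 1, the first position at
    // which a match is known, while the backtracker reports the full
    // leftmost-first match at end == 3. Both satisfy 'earliest'.
    assert!(m.end() == 1 || m.end() == 3);
    Ok(())
}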
+
+/// Tests the default configuration of the meta regex engine.
+#[test]
+fn default() -> Result<()> {
+ let builder = Regex::builder();
+ let mut runner = TestRunner::new()?;
+ runner
+ .expand(&["is_match", "find", "captures"], |test| test.compiles())
+ .blacklist_iter(BLACKLIST)
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Tests the default configuration minus the full DFA.
+#[test]
+fn no_dfa() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.configure(Regex::config().dfa(false));
+ let mut runner = TestRunner::new()?;
+ runner
+ .expand(&["is_match", "find", "captures"], |test| test.compiles())
+ .blacklist_iter(BLACKLIST)
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Tests the default configuration minus the full DFA and lazy DFA.
+#[test]
+fn no_dfa_hybrid() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.configure(Regex::config().dfa(false).hybrid(false));
+ let mut runner = TestRunner::new()?;
+ runner
+ .expand(&["is_match", "find", "captures"], |test| test.compiles())
+ .blacklist_iter(BLACKLIST)
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Tests the default configuration minus the full DFA, lazy DFA and one-pass
+/// DFA.
+#[test]
+fn no_dfa_hybrid_onepass() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.configure(Regex::config().dfa(false).hybrid(false).onepass(false));
+ let mut runner = TestRunner::new()?;
+ runner
+ .expand(&["is_match", "find", "captures"], |test| test.compiles())
+ .blacklist_iter(BLACKLIST)
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Tests the default configuration minus the full DFA, lazy DFA, one-pass
+/// DFA and backtracker.
+#[test]
+fn no_dfa_hybrid_onepass_backtrack() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.configure(
+ Regex::config()
+ .dfa(false)
+ .hybrid(false)
+ .onepass(false)
+ .backtrack(false),
+ );
+ let mut runner = TestRunner::new()?;
+ runner
+ .expand(&["is_match", "find", "captures"], |test| test.compiles())
+ .blacklist_iter(BLACKLIST)
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
+ Ok(())
+}
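
// With the full DFA, lazy DFA, one-pass DFA and backtracker all disabled,
// only the PikeVM remains, and it cannot be disabled, so searches still
// run. A tiny sketch (the pattern is an assumption):

fn pikevm_only_sketch() -> Result<()> {
    let mut builder = Regex::builder();
    builder.configure(
        Regex::config()
            .dfa(false)
            .hybrid(false)
            .onepass(false)
            .backtrack(false),
    );
    let re = builder.build(r"(foo|bar)+")?;
    assert!(re.is_match("foobar"));
    Ok(())
}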
+
+fn compiler(
+ mut builder: meta::Builder,
+) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
+ move |test, regexes| {
+ if !configure_meta_builder(test, &mut builder) {
+ return Ok(CompiledRegex::skip());
+ }
+ let re = builder.build_many(&regexes)?;
+ Ok(CompiledRegex::compiled(move |test| -> TestResult {
+ run_test(&re, test)
+ }))
+ }
+}
+
+fn run_test(re: &Regex, test: &RegexTest) -> TestResult {
+ let input = create_input(test);
+ match test.additional_name() {
+ "is_match" => TestResult::matched(re.is_match(input)),
+ "find" => match test.search_kind() {
+ SearchKind::Earliest => TestResult::matches(
+ re.find_iter(input.earliest(true))
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ span: Span { start: m.start(), end: m.end() },
+ }),
+ ),
+ SearchKind::Leftmost => TestResult::matches(
+ re.find_iter(input)
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ span: Span { start: m.start(), end: m.end() },
+ }),
+ ),
+ SearchKind::Overlapping => {
+ let mut patset = PatternSet::new(re.pattern_len());
+ re.which_overlapping_matches(&input, &mut patset);
+ TestResult::which(patset.iter().map(|p| p.as_usize()))
+ }
+ },
+ "captures" => match test.search_kind() {
+ SearchKind::Earliest => {
+ let it = re
+ .captures_iter(input.earliest(true))
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|caps| testify_captures(&caps));
+ TestResult::captures(it)
+ }
+ SearchKind::Leftmost => {
+ let it = re
+ .captures_iter(input)
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|caps| testify_captures(&caps));
+ TestResult::captures(it)
+ }
+ SearchKind::Overlapping => {
+ // There is no overlapping regex API that supports captures.
+ TestResult::skip()
+ }
+ },
+ name => TestResult::fail(&format!("unrecognized test name: {}", name)),
+ }
+}
+
+/// Configures the given regex builder with all relevant settings on the given
+/// regex test.
+///
+/// If the regex test has a setting that is unsupported, then this returns
+/// false (implying the test should be skipped).
+fn configure_meta_builder(
+ test: &RegexTest,
+ builder: &mut meta::Builder,
+) -> bool {
+ let match_kind = match test.match_kind() {
+ regex_test::MatchKind::All => MatchKind::All,
+ regex_test::MatchKind::LeftmostFirst => MatchKind::LeftmostFirst,
+ regex_test::MatchKind::LeftmostLongest => return false,
+ };
+ let meta_config = Regex::config()
+ .match_kind(match_kind)
+ .utf8_empty(test.utf8())
+ .line_terminator(test.line_terminator());
+ builder.configure(meta_config).syntax(config_syntax(test));
+ true
+}
+
+/// Configuration of the regex parser from a regex test.
+fn config_syntax(test: &RegexTest) -> syntax::Config {
+ syntax::Config::new()
+ .case_insensitive(test.case_insensitive())
+ .unicode(test.unicode())
+ .utf8(test.utf8())
+ .line_terminator(test.line_terminator())
+}
diff --git a/vendor/regex-automata/tests/nfa/thompson/backtrack/mod.rs b/vendor/regex-automata/tests/nfa/thompson/backtrack/mod.rs
new file mode 100644
index 000000000..9d6ab475e
--- /dev/null
+++ b/vendor/regex-automata/tests/nfa/thompson/backtrack/mod.rs
@@ -0,0 +1,2 @@
+#[cfg(not(miri))]
+mod suite;
diff --git a/vendor/regex-automata/tests/nfa/thompson/backtrack/suite.rs b/vendor/regex-automata/tests/nfa/thompson/backtrack/suite.rs
new file mode 100644
index 000000000..bce0eef40
--- /dev/null
+++ b/vendor/regex-automata/tests/nfa/thompson/backtrack/suite.rs
@@ -0,0 +1,213 @@
+use {
+ anyhow::Result,
+ regex_automata::{
+ nfa::thompson::{
+ self,
+ backtrack::{self, BoundedBacktracker},
+ NFA,
+ },
+ util::{prefilter::Prefilter, syntax},
+ Input,
+ },
+ regex_test::{
+ CompiledRegex, Match, MatchKind, RegexTest, SearchKind, Span,
+ TestResult, TestRunner,
+ },
+};
+
+use crate::{create_input, suite, testify_captures};
+
+/// Tests the default configuration of the bounded backtracker.
+#[test]
+fn default() -> Result<()> {
+ let builder = BoundedBacktracker::builder();
+ let mut runner = TestRunner::new()?;
+ runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
+ // At the time of writing, every regex search in the test suite fits
+ // into the backtracker's default visited capacity (except for the
+ // blacklisted tests below). If regexes are added that blow that capacity,
+ // then they should be blacklisted here. A tempting alternative is to
+ // automatically skip them by checking the haystack length against
+ // BoundedBacktracker::max_haystack_len, but that could wind up hiding
+// interesting failure modes, e.g., if the visited capacity is somehow
+ // wrong or smaller than it should be.
+ runner.blacklist("expensive/backtrack-blow-visited-capacity");
+ runner.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
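
// The alternative rejected in the comment above would look roughly like
// this hedged sketch, where searches are skipped instead of failing when
// the haystack exceeds the visited capacity:

fn guarded_search_sketch() -> Result<()> {
    let re = BoundedBacktracker::new(r"[a-z]+")?;
    let mut cache = re.create_cache();
    let haystack = "abc";
    if haystack.len() <= re.max_haystack_len() {
        for result in re.try_find_iter(&mut cache, haystack) {
            let _m = result?;
        }
    }
    Ok(())
}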
+
+/// Tests the backtracker with prefilters enabled.
+#[test]
+fn prefilter() -> Result<()> {
+ let my_compiler = |test: &RegexTest, regexes: &[String]| {
+ // Parse regexes as HIRs so we can get literals to build a prefilter.
+ let mut hirs = vec![];
+ for pattern in regexes.iter() {
+ hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
+ }
+ // We can always select leftmost-first here because the backtracker
+ // only supports leftmost-first matching.
+ let pre = Prefilter::from_hirs_prefix(
+ regex_automata::MatchKind::LeftmostFirst,
+ &hirs,
+ );
+ let mut builder = BoundedBacktracker::builder();
+ builder.configure(BoundedBacktracker::config().prefilter(pre));
+ compiler(builder)(test, regexes)
+ };
+ let mut runner = TestRunner::new()?;
+ runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
+ runner.blacklist("expensive/backtrack-blow-visited-capacity");
+ runner.test_iter(suite()?.iter(), my_compiler).assert();
+ Ok(())
+}
+
+/// Tests the bounded backtracker when its visited capacity is set to its
+/// minimum amount.
+#[test]
+fn min_visited_capacity() -> Result<()> {
+ let mut runner = TestRunner::new()?;
+ runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
+ runner
+ .test_iter(suite()?.iter(), move |test, regexes| {
+ let nfa = NFA::compiler()
+ .configure(config_thompson(test))
+ .syntax(config_syntax(test))
+ .build_many(&regexes)?;
+ let mut builder = BoundedBacktracker::builder();
+ if !configure_backtrack_builder(test, &mut builder) {
+ return Ok(CompiledRegex::skip());
+ }
+ // Setup the bounded backtracker so that its visited capacity is
+ // the absolute minimum required for the test's haystack.
+ builder.configure(BoundedBacktracker::config().visited_capacity(
+ backtrack::min_visited_capacity(
+ &nfa,
+ &Input::new(test.haystack()),
+ ),
+ ));
+
+ let re = builder.build_from_nfa(nfa)?;
+ let mut cache = re.create_cache();
+ Ok(CompiledRegex::compiled(move |test| -> TestResult {
+ run_test(&re, &mut cache, test)
+ }))
+ })
+ .assert();
+ Ok(())
+}
+
+fn compiler(
+ mut builder: backtrack::Builder,
+) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
+ move |test, regexes| {
+ if !configure_backtrack_builder(test, &mut builder) {
+ return Ok(CompiledRegex::skip());
+ }
+ let re = builder.build_many(&regexes)?;
+ let mut cache = re.create_cache();
+ Ok(CompiledRegex::compiled(move |test| -> TestResult {
+ run_test(&re, &mut cache, test)
+ }))
+ }
+}
+
+fn run_test(
+ re: &BoundedBacktracker,
+ cache: &mut backtrack::Cache,
+ test: &RegexTest,
+) -> TestResult {
+ let input = create_input(test);
+ match test.additional_name() {
+ "is_match" => match test.search_kind() {
+ SearchKind::Earliest | SearchKind::Overlapping => {
+ TestResult::skip()
+ }
+ SearchKind::Leftmost => {
+ let input = input.earliest(true);
+ TestResult::matched(re.try_is_match(cache, input).unwrap())
+ }
+ },
+ "find" => match test.search_kind() {
+ SearchKind::Earliest | SearchKind::Overlapping => {
+ TestResult::skip()
+ }
+ SearchKind::Leftmost => TestResult::matches(
+ re.try_find_iter(cache, input)
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|result| result.unwrap())
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ span: Span { start: m.start(), end: m.end() },
+ }),
+ ),
+ },
+ "captures" => match test.search_kind() {
+ SearchKind::Earliest | SearchKind::Overlapping => {
+ TestResult::skip()
+ }
+ SearchKind::Leftmost => TestResult::captures(
+ re.try_captures_iter(cache, input)
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|result| result.unwrap())
+ .map(|caps| testify_captures(&caps)),
+ ),
+ },
+ name => TestResult::fail(&format!("unrecognized test name: {}", name)),
+ }
+}
+
+/// Configures the given regex builder with all relevant settings on the given
+/// regex test.
+///
+/// If the regex test has a setting that is unsupported, then this returns
+/// false (implying the test should be skipped).
+fn configure_backtrack_builder(
+ test: &RegexTest,
+ builder: &mut backtrack::Builder,
+) -> bool {
+ match (test.search_kind(), test.match_kind()) {
+ // For testing the standard search APIs. This is the only supported
+ // configuration for the backtracker.
+ (SearchKind::Leftmost, MatchKind::LeftmostFirst) => {}
+ // Overlapping APIs not supported at all for backtracker.
+ (SearchKind::Overlapping, _) => return false,
+ // Backtracking doesn't really support the notion of 'earliest'.
+ // Namely, backtracking already works by returning as soon as it knows
+ // it has found a match. It just so happens that this corresponds to
+ // the standard 'leftmost' formulation.
+ //
+ // The 'earliest' definition in this crate does indeed permit this
+ // behavior, so this is "fine," but our test suite specifically looks
+ // for the earliest position at which a match is known, which our
+ // finite automata based regex engines have no problem providing. So
+ // for backtracking, we just skip these tests.
+ (SearchKind::Earliest, _) => return false,
+ // For backtracking, 'all' semantics don't really make sense.
+ (_, MatchKind::All) => return false,
+ // Not supported at all in regex-automata.
+ (_, MatchKind::LeftmostLongest) => return false,
+ };
+ let backtrack_config = BoundedBacktracker::config();
+ builder
+ .configure(backtrack_config)
+ .syntax(config_syntax(test))
+ .thompson(config_thompson(test));
+ true
+}
+
+/// Configuration of a Thompson NFA compiler from a regex test.
+fn config_thompson(test: &RegexTest) -> thompson::Config {
+ let mut lookm = regex_automata::util::look::LookMatcher::new();
+ lookm.set_line_terminator(test.line_terminator());
+ thompson::Config::new().utf8(test.utf8()).look_matcher(lookm)
+}
+
+/// Configuration of the regex parser from a regex test.
+fn config_syntax(test: &RegexTest) -> syntax::Config {
+ syntax::Config::new()
+ .case_insensitive(test.case_insensitive())
+ .unicode(test.unicode())
+ .utf8(test.utf8())
+ .line_terminator(test.line_terminator())
+}
diff --git a/vendor/regex-automata/tests/nfa/thompson/mod.rs b/vendor/regex-automata/tests/nfa/thompson/mod.rs
index 3a03f52ce..b2558f704 100644
--- a/vendor/regex-automata/tests/nfa/thompson/mod.rs
+++ b/vendor/regex-automata/tests/nfa/thompson/mod.rs
@@ -1 +1,4 @@
+#[cfg(feature = "nfa-backtrack")]
+mod backtrack;
+#[cfg(feature = "nfa-pikevm")]
mod pikevm;
diff --git a/vendor/regex-automata/tests/nfa/thompson/pikevm/api.rs b/vendor/regex-automata/tests/nfa/thompson/pikevm/api.rs
deleted file mode 100644
index c8199f709..000000000
--- a/vendor/regex-automata/tests/nfa/thompson/pikevm/api.rs
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
-use std::error::Error;
-
-use regex_automata::{
- hybrid::{
- dfa::{self, DFA},
- regex::Regex,
- OverlappingState,
- },
- nfa::thompson,
- HalfMatch, MatchError, MatchKind, MultiMatch,
-};
-
-use crate::util::{BunkPrefilter, SubstringPrefilter};
-
-// Tests that too many cache resets cause the lazy DFA to quit.
-#[test]
-fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
- // This is a carefully chosen regex. The idea is to pick one that requires
- // some decent number of states (hence the bounded repetition). But we
- // specifically choose to create a class with an ASCII letter and a
- // non-ASCII letter so that we can check that no new states are created
- // once the cache is full. Namely, if we fill up the cache on a haystack
- // of 'a's, then in order to match one 'β', a new state will need to be
- // created since a 'β' is encoded with multiple bytes. Since there's no
- // room for this state, the search should quit at the very first position.
- let pattern = r"[aβ]{100}";
- let dfa = DFA::builder()
- .configure(
- // Configure it so that we have the minimum cache capacity
- // possible. And that if any resets occur, the search quits.
- DFA::config()
- .skip_cache_capacity_check(true)
- .cache_capacity(0)
- .minimum_cache_clear_count(Some(0)),
- )
- .build(pattern)?;
- let mut cache = dfa.create_cache();
-
- let haystack = "a".repeat(101).into_bytes();
- let err = MatchError::GaveUp { offset: 25 };
- assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone()));
- assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone()));
- assert_eq!(
- dfa.find_overlapping_fwd(
- &mut cache,
- &haystack,
- &mut OverlappingState::start()
- ),
- Err(err.clone())
- );
-
- let haystack = "β".repeat(101).into_bytes();
- let err = MatchError::GaveUp { offset: 0 };
- assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
- // no need to test that other find routines quit, since we did that above
-
- // OK, if we reset the cache, then we should be able to create more states
- // and make more progress with searching for betas.
- cache.reset(&dfa);
- let err = MatchError::GaveUp { offset: 26 };
- assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
-
- // ... switching back to ASCII still makes progress since it just needs to
- // set transitions on existing states!
- let haystack = "a".repeat(101).into_bytes();
- let err = MatchError::GaveUp { offset: 13 };
- assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
-
- Ok(())
-}
-
-// Tests that quit bytes in the forward direction work correctly.
-#[test]
-fn quit_fwd() -> Result<(), Box<dyn Error>> {
- let dfa = DFA::builder()
- .configure(DFA::config().quit(b'x', true))
- .build("[[:word:]]+$")?;
- let mut cache = dfa.create_cache();
-
- assert_eq!(
- dfa.find_earliest_fwd(&mut cache, b"abcxyz"),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
- );
- assert_eq!(
- dfa.find_leftmost_fwd(&mut cache, b"abcxyz"),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
- );
- assert_eq!(
- dfa.find_overlapping_fwd(
- &mut cache,
- b"abcxyz",
- &mut OverlappingState::start()
- ),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
- );
-
- Ok(())
-}
-
-// Tests that quit bytes in the reverse direction work correctly.
-#[test]
-fn quit_rev() -> Result<(), Box<dyn Error>> {
- let dfa = DFA::builder()
- .configure(DFA::config().quit(b'x', true))
- .thompson(thompson::Config::new().reverse(true))
- .build("^[[:word:]]+")?;
- let mut cache = dfa.create_cache();
-
- assert_eq!(
- dfa.find_earliest_rev(&mut cache, b"abcxyz"),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
- );
- assert_eq!(
- dfa.find_leftmost_rev(&mut cache, b"abcxyz"),
- Err(MatchError::Quit { byte: b'x', offset: 3 })
- );
-
- Ok(())
-}
-
-// Tests that if we heuristically enable Unicode word boundaries but then
-// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
-// will panic.
-#[test]
-#[should_panic]
-fn quit_panics() {
- DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
-}
-
-// This tests an interesting case where even if the Unicode word boundary option
-// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
-// word boundaries to be enabled.
-#[test]
-fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
- let mut config = DFA::config();
- for b in 0x80..=0xFF {
- config = config.quit(b, true);
- }
- let dfa = DFA::builder().configure(config).build(r"\b")?;
- let mut cache = dfa.create_cache();
- let expected = HalfMatch::must(0, 1);
- assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected)));
- Ok(())
-}
-
-// Tests that we can provide a prefilter to a Regex, and the search reports
-// correct results.
-#[test]
-fn prefilter_works() -> Result<(), Box<dyn Error>> {
- let mut re = Regex::new(r"a[0-9]+").unwrap();
- re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
- let mut cache = re.create_cache();
-
- let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
- let matches: Vec<(usize, usize)> = re
- .find_leftmost_iter(&mut cache, text)
- .map(|m| (m.start(), m.end()))
- .collect();
- assert_eq!(
- matches,
- vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
- );
- Ok(())
-}
-
-// This test confirms that a prefilter is active by using a prefilter that
-// reports false negatives.
-#[test]
-fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
- let text = b"za123";
- let mut re = Regex::new(r"a[0-9]+").unwrap();
- let mut cache = re.create_cache();
-
- re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
- assert_eq!(
- re.find_leftmost(&mut cache, b"za123"),
- Some(MultiMatch::must(0, 1, 5))
- );
- assert_eq!(
- re.find_leftmost(&mut cache, b"a123"),
- Some(MultiMatch::must(0, 0, 4))
- );
- re.set_prefilter(Some(Box::new(BunkPrefilter::new())));
- assert_eq!(re.find_leftmost(&mut cache, b"za123"), None);
- // This checks that the prefilter is used when first starting the search,
- // instead of waiting until at least one transition has occurred.
- assert_eq!(re.find_leftmost(&mut cache, b"a123"), None);
- Ok(())
-}
-*/
diff --git a/vendor/regex-automata/tests/nfa/thompson/pikevm/mod.rs b/vendor/regex-automata/tests/nfa/thompson/pikevm/mod.rs
index f4299510c..9d6ab475e 100644
--- a/vendor/regex-automata/tests/nfa/thompson/pikevm/mod.rs
+++ b/vendor/regex-automata/tests/nfa/thompson/pikevm/mod.rs
@@ -1,2 +1,2 @@
-mod api;
+#[cfg(not(miri))]
mod suite;
diff --git a/vendor/regex-automata/tests/nfa/thompson/pikevm/suite.rs b/vendor/regex-automata/tests/nfa/thompson/pikevm/suite.rs
index e5505d59a..d32842a15 100644
--- a/vendor/regex-automata/tests/nfa/thompson/pikevm/suite.rs
+++ b/vendor/regex-automata/tests/nfa/thompson/pikevm/suite.rs
@@ -1,42 +1,65 @@
-use regex_automata::{
- nfa::thompson::{
- self,
- pikevm::{self, PikeVM},
+use {
+ anyhow::Result,
+ regex_automata::{
+ nfa::thompson::{
+ self,
+ pikevm::{self, PikeVM},
+ },
+ util::{prefilter::Prefilter, syntax},
+ PatternSet,
+ },
+ regex_test::{
+ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
+ TestRunner,
},
- MatchKind, SyntaxConfig,
-};
-use regex_syntax as syntax;
-
-use regex_test::{
- bstr::{BString, ByteSlice},
- CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests,
- SearchKind as TestSearchKind, TestResult, TestRunner,
};
-use crate::{suite, Result};
+use crate::{create_input, suite, testify_captures, untestify_kind};
/// Tests the default configuration of the PikeVM.
#[test]
fn default() -> Result<()> {
let builder = PikeVM::builder();
- TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ let mut runner = TestRunner::new()?;
+ runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
+ runner.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
+
+/// Tests the PikeVM with prefilters enabled.
+#[test]
+fn prefilter() -> Result<()> {
+ let my_compiler = |test: &RegexTest, regexes: &[String]| {
+ // Parse regexes as HIRs so we can get literals to build a prefilter.
+ let mut hirs = vec![];
+ for pattern in regexes.iter() {
+ hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
+ }
+ let kind = match untestify_kind(test.match_kind()) {
+ None => return Ok(CompiledRegex::skip()),
+ Some(kind) => kind,
+ };
+ let pre = Prefilter::from_hirs_prefix(kind, &hirs);
+ let mut builder = PikeVM::builder();
+ builder.configure(PikeVM::config().prefilter(pre));
+ compiler(builder)(test, regexes)
+ };
+ let mut runner = TestRunner::new()?;
+ runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
+ runner.test_iter(suite()?.iter(), my_compiler).assert();
Ok(())
}
fn compiler(
mut builder: pikevm::Builder,
-) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
move |test, regexes| {
- let regexes = regexes
- .iter()
- .map(|r| r.to_str().map(|s| s.to_string()))
- .collect::<std::result::Result<Vec<String>, _>>()?;
if !configure_pikevm_builder(test, &mut builder) {
return Ok(CompiledRegex::skip());
}
let re = builder.build_many(&regexes)?;
let mut cache = re.create_cache();
- Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+ Ok(CompiledRegex::compiled(move |test| -> TestResult {
run_test(&re, &mut cache, test)
}))
}
@@ -46,35 +69,59 @@ fn run_test(
re: &PikeVM,
cache: &mut pikevm::Cache,
test: &RegexTest,
-) -> Vec<TestResult> {
- // let is_match = if re.is_match(cache, test.input()) {
- // TestResult::matched()
- // } else {
- // TestResult::no_match()
- // };
- // let is_match = is_match.name("is_match");
-
- let find_matches = match test.search_kind() {
- TestSearchKind::Earliest => {
- TestResult::skip().name("find_earliest_iter")
- }
- TestSearchKind::Leftmost => {
- let it = re
- .find_leftmost_iter(cache, test.input())
- .take(test.match_limit().unwrap_or(std::usize::MAX))
- .map(|m| Match {
- id: m.pattern().as_usize(),
- start: m.start(),
- end: m.end(),
- });
- TestResult::matches(it).name("find_leftmost_iter")
- }
- TestSearchKind::Overlapping => {
- TestResult::skip().name("find_overlapping_iter")
- }
- };
- // vec![is_match, find_matches]
- vec![find_matches]
+) -> TestResult {
+ let input = create_input(test);
+ match test.additional_name() {
+ "is_match" => TestResult::matched(re.is_match(cache, input)),
+ "find" => match test.search_kind() {
+ SearchKind::Earliest => {
+ let it = re
+ .find_iter(cache, input.earliest(true))
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ span: Span { start: m.start(), end: m.end() },
+ });
+ TestResult::matches(it)
+ }
+ SearchKind::Leftmost => {
+ let it = re
+ .find_iter(cache, input)
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ span: Span { start: m.start(), end: m.end() },
+ });
+ TestResult::matches(it)
+ }
+ SearchKind::Overlapping => {
+ let mut patset = PatternSet::new(re.get_nfa().pattern_len());
+ re.which_overlapping_matches(cache, &input, &mut patset);
+ TestResult::which(patset.iter().map(|p| p.as_usize()))
+ }
+ },
+ "captures" => match test.search_kind() {
+ SearchKind::Earliest => {
+ let it = re
+ .captures_iter(cache, input.earliest(true))
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|caps| testify_captures(&caps));
+ TestResult::captures(it)
+ }
+ SearchKind::Leftmost => {
+ let it = re
+ .captures_iter(cache, input)
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|caps| testify_captures(&caps));
+ TestResult::captures(it)
+ }
+ SearchKind::Overlapping => {
+ // There is no overlapping PikeVM API that supports captures.
+ TestResult::skip()
+ }
+ },
+ name => TestResult::fail(&format!("unrecognized test name: {}", name)),
+ }
}
/// Configures the given regex builder with all relevant settings on the given
@@ -86,8 +133,11 @@ fn configure_pikevm_builder(
test: &RegexTest,
builder: &mut pikevm::Builder,
) -> bool {
- let pikevm_config =
- PikeVM::config().anchored(test.anchored()).utf8(test.utf8());
+ let match_kind = match untestify_kind(test.match_kind()) {
+ None => return false,
+ Some(k) => k,
+ };
+ let pikevm_config = PikeVM::config().match_kind(match_kind);
builder
.configure(pikevm_config)
.syntax(config_syntax(test))
@@ -97,13 +147,16 @@ fn configure_pikevm_builder(
/// Configuration of a Thompson NFA compiler from a regex test.
fn config_thompson(test: &RegexTest) -> thompson::Config {
- thompson::Config::new().utf8(test.utf8())
+ let mut lookm = regex_automata::util::look::LookMatcher::new();
+ lookm.set_line_terminator(test.line_terminator());
+ thompson::Config::new().utf8(test.utf8()).look_matcher(lookm)
}
/// Configuration of the regex parser from a regex test.
-fn config_syntax(test: &RegexTest) -> SyntaxConfig {
- SyntaxConfig::new()
+fn config_syntax(test: &RegexTest) -> syntax::Config {
+ syntax::Config::new()
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
+ .line_terminator(test.line_terminator())
}
diff --git a/vendor/regex-automata/tests/tests.rs b/vendor/regex-automata/tests/tests.rs
deleted file mode 100644
index e4728470c..000000000
--- a/vendor/regex-automata/tests/tests.rs
+++ /dev/null
@@ -1,44 +0,0 @@
-#![allow(warnings)]
-
-use regex_test::RegexTests;
-
-mod dfa;
-mod hybrid;
-mod nfa;
-mod regression;
-mod util;
-
-type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
-
-fn suite() -> Result<RegexTests> {
- let mut tests = RegexTests::new();
- macro_rules! load {
- ($name:expr) => {{
- const DATA: &[u8] =
- include_bytes!(concat!("data/", $name, ".toml"));
- tests.load_slice($name, DATA)?;
- }};
- }
-
- load!("bytes");
- load!("crazy");
- load!("earliest");
- load!("empty");
- load!("expensive");
- load!("flags");
- load!("iter");
- load!("misc");
- load!("multiline");
- load!("no-unicode");
- load!("overlapping");
- load!("regression");
- load!("set");
- load!("unicode");
- load!("word-boundary");
- load!("fowler/basic");
- load!("fowler/nullsubexpr");
- load!("fowler/repetition");
- load!("fowler/repetition-expensive");
-
- Ok(tests)
-}
diff --git a/vendor/regex-automata/tests/util.rs b/vendor/regex-automata/tests/util.rs
deleted file mode 100644
index 499aa8c6d..000000000
--- a/vendor/regex-automata/tests/util.rs
+++ /dev/null
@@ -1,57 +0,0 @@
-use regex_automata::util::prefilter::{self, Candidate, Prefilter};
-
-#[derive(Clone, Debug)]
-pub struct SubstringPrefilter(bstr::Finder<'static>);
-
-impl SubstringPrefilter {
- pub fn new<B: AsRef<[u8]>>(needle: B) -> SubstringPrefilter {
- SubstringPrefilter(bstr::Finder::new(needle.as_ref()).into_owned())
- }
-}
-
-impl Prefilter for SubstringPrefilter {
- #[inline]
- fn next_candidate(
- &self,
- state: &mut prefilter::State,
- haystack: &[u8],
- at: usize,
- ) -> Candidate {
- self.0
- .find(&haystack[at..])
- .map(|i| Candidate::PossibleStartOfMatch(at + i))
- .unwrap_or(Candidate::None)
- }
-
- fn heap_bytes(&self) -> usize {
- self.0.needle().len()
- }
-}
-
-/// A prefilter that always returns `Candidate::None`, even if it's a false
-/// negative. This is useful for confirming that a prefilter is actually
-/// active by asserting an incorrect result.
-#[derive(Clone, Debug)]
-pub struct BunkPrefilter(());
-
-impl BunkPrefilter {
- pub fn new() -> BunkPrefilter {
- BunkPrefilter(())
- }
-}
-
-impl Prefilter for BunkPrefilter {
- #[inline]
- fn next_candidate(
- &self,
- _state: &mut prefilter::State,
- _haystack: &[u8],
- _at: usize,
- ) -> Candidate {
- Candidate::None
- }
-
- fn heap_bytes(&self) -> usize {
- 0
- }
-}